structure saas with tools
@@ -0,0 +1,187 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import dataclasses
import functools
import hashlib
import os
import tempfile
import uuid
from typing import Optional

import requests
import sentencepiece as spm
from sentencepiece import sentencepiece_model_pb2


@dataclasses.dataclass(frozen=True)
class _TokenizerConfig:
    model_url: str
    model_hash: str


_GEMMA_TOKENIZER = "google/gemma"

# SoT: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models
_GEMINI_MODEL_NAMES = ["gemini-1.0-pro", "gemini-1.5-pro", "gemini-1.5-flash"]
_GEMINI_STABLE_MODEL_NAMES = [
    "gemini-1.0-pro-001",
    "gemini-1.0-pro-002",
    "gemini-1.5-pro-001",
    "gemini-1.5-flash-001",
    "gemini-1.5-flash-002",
    "gemini-1.5-pro-002",
]

_TOKENIZERS = {
    _GEMMA_TOKENIZER: _TokenizerConfig(
        model_url="https://raw.githubusercontent.com/google/gemma_pytorch/33b652c465537c6158f9a472ea5700e5e770ad3f/tokenizer/tokenizer.model",
        model_hash="61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2",
    )
}


def _load_file(file_url_path: str) -> bytes:
    """Loads file bytes from the given file url path."""
    resp = requests.get(file_url_path)
    resp.raise_for_status()
    return resp.content


def _is_valid_model(*, model_data: bytes, expected_hash: str) -> bool:
    """Returns true if the content is valid by checking the hash."""
    if not expected_hash:
        raise ValueError("expected_hash is required")
    return hashlib.sha256(model_data).hexdigest() == expected_hash


def _maybe_remove_file(file_path: str) -> None:
    """Removes the file if it exists."""
    if not os.path.exists(file_path):
        return
    try:
        os.remove(file_path)
    except OSError:
        # Don't raise if we cannot remove the file.
        pass


def _maybe_load_from_cache(*, file_path: str, expected_hash: str) -> Optional[bytes]:
    """Loads the content from the cache path if present and not corrupted."""
    if not os.path.exists(file_path):
        return None
    with open(file_path, "rb") as f:
        content = f.read()
    if _is_valid_model(model_data=content, expected_hash=expected_hash):
        return content

    # Cached file is corrupted; remove it so it gets re-downloaded.
    _maybe_remove_file(file_path)
    return None


def _maybe_save_to_cache(*, cache_dir: str, cache_path: str, content: bytes) -> None:
    """Saves the content to the cache path."""
    try:
        os.makedirs(cache_dir, exist_ok=True)
        # Write to a uniquely named temporary file first, then rename it into
        # place so a concurrent reader never sees a partially written file.
        tmp_path = cache_dir + "." + str(uuid.uuid4()) + ".tmp"
        with open(tmp_path, "wb") as f:
            f.write(content)
        os.rename(tmp_path, cache_path)
    except OSError:
        # Don't raise if we cannot write the file.
        pass


def _load_from_url(*, file_url: str, expected_hash: str) -> bytes:
    """Loads model bytes from the given file url."""
    content = _load_file(file_url)
    if not _is_valid_model(model_data=content, expected_hash=expected_hash):
        actual_hash = hashlib.sha256(content).hexdigest()
        raise ValueError(
            "Downloaded model file is corrupted."
            f" Expected hash {expected_hash}. Got file hash {actual_hash}."
        )
    return content


def _load(*, file_url: str, expected_hash: str) -> bytes:
    """Loads model bytes from the given file url.

    1. If a locally cached file exists for the given url and its hash matches
       the expected hash, the cached bytes are returned.
    2. If no cached file is found or the hash does not match, the file is
       downloaded from the given url, written to the local cache, and its
       bytes are returned.
    3. If the file downloaded from the given url does not match the expected
       hash, a ValueError is raised.

    Args:
        file_url: The url of the file to load.
        expected_hash: The expected hash of the file.

    Returns:
        The file bytes.
    """
    model_dir = os.path.join(tempfile.gettempdir(), "vertexai_tokenizer_model")
    filename = hashlib.sha1(file_url.encode()).hexdigest()
    model_path = os.path.join(model_dir, filename)

    model_data = _maybe_load_from_cache(
        file_path=model_path, expected_hash=expected_hash
    )
    if not model_data:
        model_data = _load_from_url(file_url=file_url, expected_hash=expected_hash)
        _maybe_save_to_cache(
            cache_dir=model_dir, cache_path=model_path, content=model_data
        )
    return model_data


def _load_model_proto_bytes(tokenizer_name: str) -> bytes:
    """Loads model proto bytes for the given tokenizer name."""
    if tokenizer_name not in _TOKENIZERS:
        raise ValueError(
            f"Tokenizer {tokenizer_name} is not supported. "
            f"Supported tokenizers: {list(_TOKENIZERS.keys())}"
        )
    return _load(
        file_url=_TOKENIZERS[tokenizer_name].model_url,
        expected_hash=_TOKENIZERS[tokenizer_name].model_hash,
    )


@functools.lru_cache()
def load_model_proto(tokenizer_name: str) -> sentencepiece_model_pb2.ModelProto:
    """Loads the model proto for the given tokenizer name."""
    model_proto = sentencepiece_model_pb2.ModelProto()
    model_proto.ParseFromString(_load_model_proto_bytes(tokenizer_name))
    return model_proto


def get_tokenizer_name(model_name: str) -> str:
    """Gets the tokenizer name for the given model name."""
    if model_name in _GEMINI_MODEL_NAMES:
        return _GEMMA_TOKENIZER
    if model_name in _GEMINI_STABLE_MODEL_NAMES:
        return _GEMMA_TOKENIZER
    raise ValueError(
        f"Model {model_name} is not supported. Supported models:"
        f" {', '.join(_GEMINI_MODEL_NAMES + _GEMINI_STABLE_MODEL_NAMES)}."
    )


@functools.lru_cache()
def get_sentencepiece(tokenizer_name: str) -> spm.SentencePieceProcessor:
    """Loads a sentencepiece tokenizer for the given tokenizer name."""
    processor = spm.SentencePieceProcessor()
    processor.LoadFromSerializedProto(_load_model_proto_bytes(tokenizer_name))
    return processor
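

# --- Illustrative usage sketch (not part of the module above; added here as a
# hedged example). It assumes network access to download the Gemma tokenizer
# model and uses the standard sentencepiece EncodeAsIds/DecodeIds APIs; the
# model name "gemini-1.5-flash-001" is taken from _GEMINI_STABLE_MODEL_NAMES.
if __name__ == "__main__":
    name = get_tokenizer_name("gemini-1.5-flash-001")  # -> "google/gemma"
    processor = get_sentencepiece(name)
    token_ids = processor.EncodeAsIds("Hello, tokenizer!")
    print(f"{len(token_ids)} tokens: {token_ids}")
    print(processor.DecodeIds(token_ids))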