structure saas with tools
870
.venv/lib/python3.10/site-packages/vertexai/rag/rag_data.py
Normal file
@@ -0,0 +1,870 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""RAG data management SDK."""

from typing import Optional, Sequence, Union

from google import auth
from google.api_core import operation_async
from google.auth.transport import requests as google_auth_requests
from google.cloud import aiplatform
from google.cloud.aiplatform import initializer
from google.cloud.aiplatform import utils
from google.cloud.aiplatform_v1 import (
    CreateRagCorpusRequest,
    DeleteRagCorpusRequest,
    DeleteRagFileRequest,
    GetRagCorpusRequest,
    GetRagFileRequest,
    ImportRagFilesResponse,
    ListRagCorporaRequest,
    ListRagFilesRequest,
    RagCorpus as GapicRagCorpus,
    UpdateRagCorpusRequest,
)
from google.cloud.aiplatform_v1.services.vertex_rag_data_service.pagers import (
    ListRagCorporaPager,
    ListRagFilesPager,
)
from vertexai.rag.utils import (
    _gapic_utils,
)
from vertexai.rag.utils.resources import (
    JiraSource,
    LayoutParserConfig,
    RagCorpus,
    RagFile,
    RagVectorDbConfig,
    SharePointSources,
    SlackChannelsSource,
    VertexAiSearchConfig,
    TransformationConfig,
)


def create_corpus(
    display_name: Optional[str] = None,
    description: Optional[str] = None,
    vertex_ai_search_config: Optional[VertexAiSearchConfig] = None,
    backend_config: Optional[
        Union[
            RagVectorDbConfig,
            None,
        ]
    ] = None,
) -> RagCorpus:
    """Creates a new RagCorpus resource.

    Example usage:
    ```
    import vertexai
    from vertexai import rag

    vertexai.init(project="my-project")

    rag_corpus = rag.create_corpus(
        display_name="my-corpus-1",
    )
    ```

    Args:
        display_name: The display name of the RagCorpus. The name can be up to
            128 characters long and can consist of any UTF-8 characters. If
            not provided, the SDK generates one.
        description: The description of the RagCorpus.
        vertex_ai_search_config: The Vertex AI Search config of the RagCorpus.
            Note: backend_config cannot be set if vertex_ai_search_config is
            specified.
        backend_config: The backend config of the RagCorpus, specifying a
            data store and/or embedding model.
    Returns:
        RagCorpus.
    Raises:
        RuntimeError: Failed in RagCorpus creation due to exception.
        RuntimeError: Failed in RagCorpus creation due to operation error.
    """
    if vertex_ai_search_config and backend_config:
        raise ValueError(
            "Only one of vertex_ai_search_config or backend_config can be set."
        )

    if not display_name:
        display_name = "vertex-" + utils.timestamped_unique_name()
    parent = initializer.global_config.common_location_path(project=None, location=None)

    rag_corpus = GapicRagCorpus(display_name=display_name, description=description)

    if backend_config:
        _gapic_utils.set_backend_config(
            backend_config=backend_config,
            rag_corpus=rag_corpus,
        )
    elif vertex_ai_search_config:
        _gapic_utils.set_vertex_ai_search_config(
            vertex_ai_search_config=vertex_ai_search_config,
            rag_corpus=rag_corpus,
        )

    request = CreateRagCorpusRequest(
        parent=parent,
        rag_corpus=rag_corpus,
    )
    client = _gapic_utils.create_rag_data_service_client()

    try:
        response = client.create_rag_corpus(request=request)
    except Exception as e:
        raise RuntimeError("Failed in RagCorpus creation due to: ", e) from e
    return _gapic_utils.convert_gapic_to_rag_corpus(response.result(timeout=600))
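
# A minimal usage sketch for creating a corpus with a customized embedding
# model via backend_config (illustrative only; it assumes
# RagEmbeddingModelConfig and VertexPredictionEndpoint are exported by
# vertexai.rag alongside RagVectorDbConfig, and that the publisher model path
# is available in your project):
#
#   from vertexai import rag
#
#   backend_config = rag.RagVectorDbConfig(
#       rag_embedding_model_config=rag.RagEmbeddingModelConfig(
#           vertex_prediction_endpoint=rag.VertexPredictionEndpoint(
#               publisher_model="publishers/google/models/text-embedding-005",
#           ),
#       ),
#   )
#   corpus = create_corpus(display_name="my-corpus", backend_config=backend_config)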


def update_corpus(
    corpus_name: str,
    display_name: Optional[str] = None,
    description: Optional[str] = None,
    vertex_ai_search_config: Optional[VertexAiSearchConfig] = None,
    backend_config: Optional[
        Union[
            RagVectorDbConfig,
            None,
        ]
    ] = None,
) -> RagCorpus:
    """Updates a RagCorpus resource. It is intended to update 3rd party vector
    DBs (Vector Search, Vertex AI Feature Store, Weaviate, Pinecone) but not
    Vertex RagManagedDb.

    Example usage:
    ```
    import vertexai
    from vertexai import rag

    vertexai.init(project="my-project")

    rag_corpus = rag.update_corpus(
        corpus_name="projects/my-project/locations/us-central1/ragCorpora/my-corpus-1",
        display_name="my-corpus-1",
    )
    ```

    Args:
        corpus_name: The name of the RagCorpus resource to update. Format:
            ``projects/{project}/locations/{location}/ragCorpora/{rag_corpus}`` or
            ``{rag_corpus}``.
        display_name: The display name of the RagCorpus. The name can be up to
            128 characters long and can consist of any UTF-8 characters. If
            not provided, the display name will not be updated.
        description: The description of the RagCorpus. If not provided, the
            description will not be updated.
        vertex_ai_search_config: The Vertex AI Search config of the RagCorpus.
            If not provided, the Vertex AI Search config will not be updated.
            Note: backend_config cannot be set if vertex_ai_search_config is
            specified.
        backend_config: The backend config of the RagCorpus, specifying a
            data store and/or embedding model.

    Returns:
        RagCorpus.
    Raises:
        RuntimeError: Failed in RagCorpus update due to exception.
        RuntimeError: Failed in RagCorpus update due to operation error.
    """
    if vertex_ai_search_config and backend_config:
        raise ValueError(
            "Only one of vertex_ai_search_config or backend_config can be set."
        )

    corpus_name = _gapic_utils.get_corpus_name(corpus_name)
    if display_name and description:
        rag_corpus = GapicRagCorpus(
            name=corpus_name, display_name=display_name, description=description
        )
    elif display_name:
        rag_corpus = GapicRagCorpus(name=corpus_name, display_name=display_name)
    elif description:
        rag_corpus = GapicRagCorpus(name=corpus_name, description=description)
    else:
        rag_corpus = GapicRagCorpus(name=corpus_name)

    if backend_config:
        _gapic_utils.set_backend_config(
            backend_config=backend_config,
            rag_corpus=rag_corpus,
        )

    if vertex_ai_search_config:
        _gapic_utils.set_vertex_ai_search_config(
            vertex_ai_search_config=vertex_ai_search_config,
            rag_corpus=rag_corpus,
        )

    request = UpdateRagCorpusRequest(
        rag_corpus=rag_corpus,
    )
    client = _gapic_utils.create_rag_data_service_client()

    try:
        response = client.update_rag_corpus(request=request)
    except Exception as e:
        raise RuntimeError("Failed in RagCorpus update due to: ", e) from e
    return _gapic_utils.convert_gapic_to_rag_corpus_no_embedding_model_config(
        response.result(timeout=600)
    )


def get_corpus(name: str) -> RagCorpus:
    """
    Get an existing RagCorpus.

    Args:
        name: An existing RagCorpus resource name. Format:
            ``projects/{project}/locations/{location}/ragCorpora/{rag_corpus}``
            or ``{rag_corpus}``.
    Returns:
        RagCorpus.
    """
    corpus_name = _gapic_utils.get_corpus_name(name)
    request = GetRagCorpusRequest(name=corpus_name)
    client = _gapic_utils.create_rag_data_service_client()
    try:
        response = client.get_rag_corpus(request=request)
    except Exception as e:
        raise RuntimeError("Failed in getting the RagCorpus due to: ", e) from e
    return _gapic_utils.convert_gapic_to_rag_corpus(response)
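
# A minimal usage sketch (illustrative only): both addressing forms from the
# docstring resolve against the project and location set via vertexai.init().
#
#   corpus = get_corpus(
#       "projects/my-project/locations/us-central1/ragCorpora/my-corpus-1"
#   )
#   corpus = get_corpus("my-corpus-1")  # short form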


def list_corpora(
    page_size: Optional[int] = None, page_token: Optional[str] = None
) -> ListRagCorporaPager:
    """
    List all RagCorpora in the same project and location.

    Example usage:
    ```
    import vertexai
    from vertexai import rag

    vertexai.init(project="my-project")

    # List all corpora.
    rag_corpora = list(rag.list_corpora())

    # Alternatively, return a ListRagCorporaPager.
    pager_1 = rag.list_corpora(page_size=10)
    # To get the next page, use the next_page_token from the previous pager.
    pager_2 = rag.list_corpora(page_size=10, page_token=pager_1.next_page_token)

    ```
    Args:
        page_size: The standard list page size. Leaving out the page_size
            causes all of the results to be returned.
        page_token: The standard list page token.

    Returns:
        ListRagCorporaPager.
    """
    parent = initializer.global_config.common_location_path(project=None, location=None)
    request = ListRagCorporaRequest(
        parent=parent,
        page_size=page_size,
        page_token=page_token,
    )
    client = _gapic_utils.create_rag_data_service_client()
    try:
        pager = client.list_rag_corpora(request=request)
    except Exception as e:
        raise RuntimeError("Failed in listing the RagCorpora due to: ", e) from e

    return pager


def delete_corpus(name: str) -> None:
    """
    Delete an existing RagCorpus.

    Args:
        name: An existing RagCorpus resource name. Format:
            ``projects/{project}/locations/{location}/ragCorpora/{rag_corpus}``
            or ``{rag_corpus}``.
    """
    corpus_name = _gapic_utils.get_corpus_name(name)
    request = DeleteRagCorpusRequest(name=corpus_name)

    client = _gapic_utils.create_rag_data_service_client()
    try:
        client.delete_rag_corpus(request=request)
        print("Successfully deleted the RagCorpus.")
    except Exception as e:
        raise RuntimeError("Failed in RagCorpus deletion due to: ", e) from e
    return None
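
# A minimal usage sketch (illustrative only); both addressing forms accepted
# by get_corpus work here as well:
#
#   delete_corpus(
#       "projects/my-project/locations/us-central1/ragCorpora/my-corpus-1"
#   )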


def upload_file(
    corpus_name: str,
    path: Union[str, Sequence[str]],
    display_name: Optional[str] = None,
    description: Optional[str] = None,
    transformation_config: Optional[TransformationConfig] = None,
) -> RagFile:
    """
    Synchronous file upload to an existing RagCorpus.

    Example usage:

    ```
    import vertexai
    from vertexai import rag

    vertexai.init(project="my-project")

    # Optional.
    transformation_config = TransformationConfig(
        chunking_config=ChunkingConfig(
            chunk_size=1024,
            chunk_overlap=200,
        ),
    )

    rag_file = rag.upload_file(
        corpus_name="projects/my-project/locations/us-central1/ragCorpora/my-corpus-1",
        display_name="my_file.txt",
        path="usr/home/my_file.txt",
        transformation_config=transformation_config,
    )
    ```

    Args:
        corpus_name: The name of the RagCorpus resource into which to upload the file.
            Format: ``projects/{project}/locations/{location}/ragCorpora/{rag_corpus}``
            or ``{rag_corpus}``.
        path: A local file path. For example,
            "usr/home/my_file.txt".
        display_name: The display name of the data file.
        description: The description of the RagFile.
        transformation_config: The config for transforming the RagFile, like chunking.

    Returns:
        RagFile.
    Raises:
        RuntimeError: Failed in RagFile upload.
        ValueError: RagCorpus is not found.
        RuntimeError: Failed in indexing the RagFile.
    """
    corpus_name = _gapic_utils.get_corpus_name(corpus_name)
    location = initializer.global_config.location
    # GAPIC doesn't expose an upload (Scotty) path, so use the REST API via an
    # authorized requests session instead.
    if display_name is None:
        display_name = "vertex-" + utils.timestamped_unique_name()
    headers = {"X-Goog-Upload-Protocol": "multipart"}
    if not initializer.global_config.api_endpoint:
        request_endpoint = "{}-{}".format(
            location, aiplatform.constants.base.API_BASE_PATH
        )
    else:
        request_endpoint = initializer.global_config.api_endpoint
    upload_request_uri = "https://{}/upload/v1/{}/ragFiles:upload".format(
        request_endpoint,
        corpus_name,
    )
    js_rag_file = {"rag_file": {"display_name": display_name}}

    if description:
        js_rag_file["rag_file"]["description"] = description

    if transformation_config and transformation_config.chunking_config:
        chunk_size = transformation_config.chunking_config.chunk_size
        chunk_overlap = transformation_config.chunking_config.chunk_overlap
        js_rag_file["upload_rag_file_config"] = {
            "rag_file_transformation_config": {
                "rag_file_chunking_config": {
                    "fixed_length_chunking": {
                        "chunk_size": chunk_size,
                        "chunk_overlap": chunk_overlap,
                    }
                }
            }
        }

    files = {
        "metadata": (None, str(js_rag_file)),
        "file": open(path, "rb"),
    }
    credentials, _ = auth.default()
    authorized_session = google_auth_requests.AuthorizedSession(credentials=credentials)
    try:
        response = authorized_session.post(
            url=upload_request_uri,
            files=files,
            headers=headers,
        )
    except Exception as e:
        raise RuntimeError("Failed in uploading the RagFile due to: ", e) from e

    if response.status_code == 404:
        raise ValueError(
            f"RagCorpus '{corpus_name}' is not found: {upload_request_uri}"
        )
    if response.json().get("error"):
        raise RuntimeError(
            "Failed in indexing the RagFile due to: ", response.json().get("error")
        )
    return _gapic_utils.convert_json_to_rag_file(response.json())


def import_files(
    corpus_name: str,
    paths: Optional[Sequence[str]] = None,
    source: Optional[Union[SlackChannelsSource, JiraSource, SharePointSources]] = None,
    transformation_config: Optional[TransformationConfig] = None,
    timeout: int = 600,
    max_embedding_requests_per_min: int = 1000,
    import_result_sink: Optional[str] = None,
    partial_failures_sink: Optional[str] = None,
    parser: Optional[LayoutParserConfig] = None,
) -> ImportRagFilesResponse:
    """
    Import files to an existing RagCorpus, wait until completion.

    Example usage:

    ```
    import vertexai
    from vertexai import rag
    from google.protobuf import timestamp_pb2

    vertexai.init(project="my-project")
    # Google Drive example
    paths = [
        "https://drive.google.com/file/d/123",
        "https://drive.google.com/drive/folders/456"
    ]
    # Google Cloud Storage example
    paths = ["gs://my_bucket/my_files_dir", ...]

    transformation_config = TransformationConfig(
        chunking_config=ChunkingConfig(
            chunk_size=1024,
            chunk_overlap=200,
        ),
    )

    response = rag.import_files(
        corpus_name="projects/my-project/locations/us-central1/ragCorpora/my-corpus-1",
        paths=paths,
        transformation_config=transformation_config,
    )

    # Slack example
    start_time = timestamp_pb2.Timestamp()
    start_time.FromJsonString('2020-12-31T21:33:44Z')
    end_time = timestamp_pb2.Timestamp()
    end_time.GetCurrentTime()
    source = rag.SlackChannelsSource(
        channels = [
            SlackChannel("channel1", "api_key1"),
            SlackChannel("channel2", "api_key2", start_time, end_time)
        ],
    )
    # Jira Example
    jira_query = rag.JiraQuery(
        email="xxx@yyy.com",
        jira_projects=["project1", "project2"],
        custom_queries=["query1", "query2"],
        api_key="api_key",
        server_uri="server.atlassian.net"
    )
    source = rag.JiraSource(
        queries=[jira_query],
    )

    response = rag.import_files(
        corpus_name="projects/my-project/locations/us-central1/ragCorpora/my-corpus-1",
        source=source,
        transformation_config=transformation_config,
    )

    # SharePoint Example.
    sharepoint_query = rag.SharePointSource(
        sharepoint_folder_path="https://my-sharepoint-site.com/my-folder",
        sharepoint_site_name="my-sharepoint-site.com",
        client_id="my-client-id",
        client_secret="my-client-secret",
        tenant_id="my-tenant-id",
        drive_id="my-drive-id",
    )
    source = rag.SharePointSources(
        share_point_sources=[sharepoint_query],
    )

    # Return the number of imported RagFiles after completion.
    print(response.imported_rag_files_count)

    # Document AI Layout Parser example.
    parser = LayoutParserConfig(
        processor_name="projects/my-project/locations/us-central1/processors/my-processor-id",
        max_parsing_requests_per_min=120,
    )
    response = rag.import_files(
        corpus_name="projects/my-project/locations/us-central1/ragCorpora/my-corpus-1",
        paths=paths,
        parser=parser,
    )

    ```
    Args:
        corpus_name: The name of the RagCorpus resource into which to import files.
            Format: ``projects/{project}/locations/{location}/ragCorpora/{rag_corpus}``
            or ``{rag_corpus}``.
        paths: A list of URIs. Eligible URIs are a Google Cloud Storage
            directory ("gs://my-bucket/my_dir") or a Google Drive URL for a
            file (https://drive.google.com/file/...) or folder
            ("https://drive.google.com/corp/drive/folders/...").
        source: The source of the Slack, Jira, or SharePoint import. Must be a
            SlackChannelsSource, JiraSource, or SharePointSources.
        transformation_config: The config for transforming the imported
            RagFiles.
        max_embedding_requests_per_min: Optional. The max number of queries per
            minute that this job is allowed to make to the embedding model
            specified on the corpus. This value is specific to this job and not
            shared across other import jobs. Consult the Quotas page on the
            project to set an appropriate value here. If unspecified, a default
            value of 1,000 QPM will be used.
        timeout: The timeout in seconds to wait for import completion. Default
            is 600 seconds.
        import_result_sink: Either a GCS path to store import results or a
            BigQuery table to store import results. The format is
            "gs://my-bucket/my/object.ndjson" for GCS or
            "bq://my-project.my-dataset.my-table" for BigQuery. An existing GCS
            object cannot be used. However, the BigQuery table may or may not
            exist - if it does not exist, it will be created. If it does exist,
            the schema will be checked and the import results will be appended
            to the table.
        partial_failures_sink: Deprecated. Prefer to use `import_result_sink`.
            Either a GCS path to store partial failures or a BigQuery table to
            store partial failures. The format is
            "gs://my-bucket/my/object.ndjson" for GCS or
            "bq://my-project.my-dataset.my-table" for BigQuery. An existing GCS
            object cannot be used. However, the BigQuery table may or may not
            exist - if it does not exist, it will be created. If it does exist,
            the schema will be checked and the partial failures will be appended
            to the table.
        parser: Document parser to use. Should be either None (default parser),
            or a LayoutParserConfig (to parse documents using a Document AI
            Layout Parser processor).
    Returns:
        ImportRagFilesResponse.
    """
    if source is not None and paths is not None:
        raise ValueError("Only one of source or paths must be passed in at a time")
    if source is None and paths is None:
        raise ValueError("One of source or paths must be passed in")
    corpus_name = _gapic_utils.get_corpus_name(corpus_name)
    request = _gapic_utils.prepare_import_files_request(
        corpus_name=corpus_name,
        paths=paths,
        source=source,
        transformation_config=transformation_config,
        max_embedding_requests_per_min=max_embedding_requests_per_min,
        import_result_sink=import_result_sink,
        partial_failures_sink=partial_failures_sink,
        parser=parser,
    )
    client = _gapic_utils.create_rag_data_service_client()
    try:
        response = client.import_rag_files(request=request)
    except Exception as e:
        raise RuntimeError("Failed in importing the RagFiles due to: ", e) from e

    return response.result(timeout=timeout)


async def import_files_async(
    corpus_name: str,
    paths: Optional[Sequence[str]] = None,
    source: Optional[Union[SlackChannelsSource, JiraSource, SharePointSources]] = None,
    transformation_config: Optional[TransformationConfig] = None,
    max_embedding_requests_per_min: int = 1000,
    import_result_sink: Optional[str] = None,
    partial_failures_sink: Optional[str] = None,
    parser: Optional[LayoutParserConfig] = None,
) -> operation_async.AsyncOperation:
    """
    Import files to an existing RagCorpus asynchronously.

    Example usage:

    ```
    import vertexai
    from vertexai import rag
    from google.protobuf import timestamp_pb2

    vertexai.init(project="my-project")

    # Google Drive example
    paths = [
        "https://drive.google.com/file/d/123",
        "https://drive.google.com/drive/folders/456"
    ]
    # Google Cloud Storage example
    paths = ["gs://my_bucket/my_files_dir", ...]

    transformation_config = TransformationConfig(
        chunking_config=ChunkingConfig(
            chunk_size=1024,
            chunk_overlap=200,
        ),
    )

    response = await rag.import_files_async(
        corpus_name="projects/my-project/locations/us-central1/ragCorpora/my-corpus-1",
        paths=paths,
        transformation_config=transformation_config,
    )

    # Slack example
    start_time = timestamp_pb2.Timestamp()
    start_time.FromJsonString('2020-12-31T21:33:44Z')
    end_time = timestamp_pb2.Timestamp()
    end_time.GetCurrentTime()
    source = rag.SlackChannelsSource(
        channels = [
            SlackChannel("channel1", "api_key1"),
            SlackChannel("channel2", "api_key2", start_time, end_time)
        ],
    )
    # Jira Example
    jira_query = rag.JiraQuery(
        email="xxx@yyy.com",
        jira_projects=["project1", "project2"],
        custom_queries=["query1", "query2"],
        api_key="api_key",
        server_uri="server.atlassian.net"
    )
    source = rag.JiraSource(
        queries=[jira_query],
    )

    response = await rag.import_files_async(
        corpus_name="projects/my-project/locations/us-central1/ragCorpora/my-corpus-1",
        source=source,
        transformation_config=transformation_config,
    )

    # SharePoint Example.
    sharepoint_query = rag.SharePointSource(
        sharepoint_folder_path="https://my-sharepoint-site.com/my-folder",
        sharepoint_site_name="my-sharepoint-site.com",
        client_id="my-client-id",
        client_secret="my-client-secret",
        tenant_id="my-tenant-id",
        drive_id="my-drive-id",
    )
    source = rag.SharePointSources(
        share_point_sources=[sharepoint_query],
    )

    # Document AI Layout Parser example.
    parser = LayoutParserConfig(
        processor_name="projects/my-project/locations/us-central1/processors/my-processor-id",
        max_parsing_requests_per_min=120,
    )
    response = await rag.import_files_async(
        corpus_name="projects/my-project/locations/us-central1/ragCorpora/my-corpus-1",
        paths=paths,
        parser=parser,
    )

    # Get the result.
    await response.result()

    ```
    Args:
        corpus_name: The name of the RagCorpus resource into which to import files.
            Format: ``projects/{project}/locations/{location}/ragCorpora/{rag_corpus}``
            or ``{rag_corpus}``.
        paths: A list of URIs. Eligible URIs are a Google Cloud Storage
            directory ("gs://my-bucket/my_dir") or a Google Drive URL for a
            file (https://drive.google.com/file/...) or folder
            ("https://drive.google.com/corp/drive/folders/...").
        source: The source of the Slack, Jira, or SharePoint import. Must be a
            SlackChannelsSource, JiraSource, or SharePointSources.
        transformation_config: The config for transforming the imported
            RagFiles.
        max_embedding_requests_per_min: Optional. The max number of queries per
            minute that this job is allowed to make to the embedding model
            specified on the corpus. This value is specific to this job and not
            shared across other import jobs. Consult the Quotas page on the
            project to set an appropriate value here. If unspecified, a default
            value of 1,000 QPM will be used.
        import_result_sink: Either a GCS path to store import results or a
            BigQuery table to store import results. The format is
            "gs://my-bucket/my/object.ndjson" for GCS or
            "bq://my-project.my-dataset.my-table" for BigQuery. An existing GCS
            object cannot be used. However, the BigQuery table may or may not
            exist - if it does not exist, it will be created. If it does exist,
            the schema will be checked and the import results will be appended
            to the table.
        partial_failures_sink: Deprecated. Prefer to use `import_result_sink`.
            Either a GCS path to store partial failures or a BigQuery table to
            store partial failures. The format is
            "gs://my-bucket/my/object.ndjson" for GCS or
            "bq://my-project.my-dataset.my-table" for BigQuery. An existing GCS
            object cannot be used. However, the BigQuery table may or may not
            exist - if it does not exist, it will be created. If it does exist,
            the schema will be checked and the partial failures will be appended
            to the table.
        parser: Document parser to use. Should be either None (default parser),
            or a LayoutParserConfig (to parse documents using a Document AI
            Layout Parser processor).
    Returns:
        operation_async.AsyncOperation.
    """
    if source is not None and paths is not None:
        raise ValueError("Only one of source or paths must be passed in at a time")
    if source is None and paths is None:
        raise ValueError("One of source or paths must be passed in")
    corpus_name = _gapic_utils.get_corpus_name(corpus_name)
    request = _gapic_utils.prepare_import_files_request(
        corpus_name=corpus_name,
        paths=paths,
        source=source,
        transformation_config=transformation_config,
        max_embedding_requests_per_min=max_embedding_requests_per_min,
        import_result_sink=import_result_sink,
        partial_failures_sink=partial_failures_sink,
        parser=parser,
    )
    async_client = _gapic_utils.create_rag_data_service_async_client()
    try:
        response = await async_client.import_rag_files(request=request)
    except Exception as e:
        raise RuntimeError("Failed in importing the RagFiles due to: ", e) from e
    return response
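
# A minimal driver sketch for calling the async variant from synchronous code
# (illustrative only; it follows the docstring's `await response.result()`
# pattern and assumes the corpus and bucket names exist):
#
#   import asyncio
#
#   async def _run() -> None:
#       operation = await import_files_async(
#           corpus_name="my-corpus-1",
#           paths=["gs://my_bucket/my_files_dir"],
#       )
#       result = await operation.result()
#       print(result.imported_rag_files_count)
#
#   asyncio.run(_run())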


def get_file(name: str, corpus_name: Optional[str] = None) -> RagFile:
    """
    Get an existing RagFile.

    Args:
        name: Either a full RagFile resource name, or a RagFile name combined
            with an existing RagCorpus name in ``corpus_name``. Format:
            ``projects/{project}/locations/{location}/ragCorpora/{rag_corpus}/ragFiles/{rag_file}``
            or ``{rag_file}``.
        corpus_name: If `name` is not a full resource name, an existing RagCorpus
            name must be provided. Format:
            ``projects/{project}/locations/{location}/ragCorpora/{rag_corpus}``
            or ``{rag_corpus}``.
    Returns:
        RagFile.
    """
    corpus_name = _gapic_utils.get_corpus_name(corpus_name)
    name = _gapic_utils.get_file_name(name, corpus_name)
    request = GetRagFileRequest(name=name)
    client = _gapic_utils.create_rag_data_service_client()
    try:
        response = client.get_rag_file(request=request)
    except Exception as e:
        raise RuntimeError("Failed in getting the RagFile due to: ", e) from e
    return _gapic_utils.convert_gapic_to_rag_file(response)
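
# A minimal usage sketch (illustrative only): either pass a full resource
# name, or a short RagFile name together with corpus_name.
#
#   rag_file = get_file(
#       "projects/my-project/locations/us-central1/ragCorpora/my-corpus-1"
#       "/ragFiles/my-file-id"
#   )
#   rag_file = get_file(name="my-file-id", corpus_name="my-corpus-1")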


def list_files(
    corpus_name: str, page_size: Optional[int] = None, page_token: Optional[str] = None
) -> ListRagFilesPager:
    """
    List all RagFiles in an existing RagCorpus.

    Example usage:
    ```
    import vertexai
    from vertexai import rag

    vertexai.init(project="my-project")
    # List all corpora.
    rag_corpora = list(rag.list_corpora())

    # List all files of the first corpus.
    rag_files = list(rag.list_files(corpus_name=rag_corpora[0].name))

    # Alternatively, return a ListRagFilesPager.
    pager_1 = rag.list_files(
        corpus_name=rag_corpora[0].name,
        page_size=10
    )
    # To get the next page, use the next_page_token from the previous pager.
    pager_2 = rag.list_files(
        corpus_name=rag_corpora[0].name,
        page_size=10,
        page_token=pager_1.next_page_token
    )

    ```

    Args:
        corpus_name: An existing RagCorpus name. Format:
            ``projects/{project}/locations/{location}/ragCorpora/{rag_corpus}``
            or ``{rag_corpus}``.
        page_size: The standard list page size. Leaving out the page_size
            causes all of the results to be returned.
        page_token: The standard list page token.
    Returns:
        ListRagFilesPager.
    """
    corpus_name = _gapic_utils.get_corpus_name(corpus_name)
    request = ListRagFilesRequest(
        parent=corpus_name,
        page_size=page_size,
        page_token=page_token,
    )
    client = _gapic_utils.create_rag_data_service_client()
    try:
        pager = client.list_rag_files(request=request)
    except Exception as e:
        raise RuntimeError("Failed in listing the RagFiles due to: ", e) from e

    return pager


def delete_file(name: str, corpus_name: Optional[str] = None) -> None:
    """
    Delete RagFile from an existing RagCorpus.

    Args:
        name: Either a full RagFile resource name, or a RagFile name combined
            with an existing RagCorpus name in ``corpus_name``. Format:
            ``projects/{project}/locations/{location}/ragCorpora/{rag_corpus}/ragFiles/{rag_file}``
            or ``{rag_file}``.
        corpus_name: If `name` is not a full resource name, an existing RagCorpus
            name must be provided. Format:
            ``projects/{project}/locations/{location}/ragCorpora/{rag_corpus}``
            or ``{rag_corpus}``.
    """
    corpus_name = _gapic_utils.get_corpus_name(corpus_name)
    name = _gapic_utils.get_file_name(name, corpus_name)
    request = DeleteRagFileRequest(name=name)

    client = _gapic_utils.create_rag_data_service_client()
    try:
        client.delete_rag_file(request=request)
        print("Successfully deleted the RagFile.")
    except Exception as e:
        raise RuntimeError("Failed in RagFile deletion due to: ", e) from e
    return None
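
# A minimal usage sketch (illustrative only), mirroring get_file's addressing:
#
#   delete_file(name="my-file-id", corpus_name="my-corpus-1")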