structure saas with tools

.venv/lib/python3.10/site-packages/huggingface_hub/__init__.py (1446 lines, new file)
File diff suppressed because it is too large.

Binary files not shown.
@@ -0,0 +1,896 @@
"""
Type definitions and utilities for the `create_commit` API
"""

import base64
import io
import math
import os
import warnings
from collections import defaultdict
from contextlib import contextmanager
from dataclasses import dataclass, field
from itertools import groupby
from pathlib import Path, PurePosixPath
from typing import TYPE_CHECKING, Any, BinaryIO, Dict, Iterable, Iterator, List, Literal, Optional, Tuple, Union

from tqdm.contrib.concurrent import thread_map

from . import constants
from .errors import EntryNotFoundError, HfHubHTTPError, XetAuthorizationError, XetRefreshTokenError
from .file_download import hf_hub_url
from .lfs import UploadInfo, lfs_upload, post_lfs_batch_info
from .utils import (
    FORBIDDEN_FOLDERS,
    XetTokenType,
    chunk_iterable,
    fetch_xet_connection_info_from_repo_info,
    get_session,
    hf_raise_for_status,
    logging,
    sha,
    tqdm_stream_file,
    validate_hf_hub_args,
)
from .utils import tqdm as hf_tqdm
from .utils.tqdm import _get_progress_bar_context


if TYPE_CHECKING:
    from .hf_api import RepoFile


logger = logging.get_logger(__name__)


UploadMode = Literal["lfs", "regular"]

# Max is 1,000 per request on the Hub for HfApi.get_paths_info
# Otherwise we get:
# HfHubHTTPError: 413 Client Error: Payload Too Large for url: https://huggingface.co/api/datasets/xxx (Request ID: xxx)\n\ntoo many parameters
# See https://github.com/huggingface/huggingface_hub/issues/1503
FETCH_LFS_BATCH_SIZE = 500

UPLOAD_BATCH_MAX_NUM_FILES = 256


@dataclass
class CommitOperationDelete:
    """
    Data structure holding necessary info to delete a file or a folder from a repository
    on the Hub.

    Args:
        path_in_repo (`str`):
            Relative filepath in the repo, for example: `"checkpoints/1fec34a/weights.bin"`
            for a file or `"checkpoints/1fec34a/"` for a folder.
        is_folder (`bool` or `Literal["auto"]`, *optional*):
            Whether the Delete Operation applies to a folder or not. If "auto", the path
            type (file or folder) is guessed automatically by checking whether the path
            ends with a "/" (folder) or not (file). To explicitly set the path type, you
            can set `is_folder=True` or `is_folder=False`.
    """

    path_in_repo: str
    is_folder: Union[bool, Literal["auto"]] = "auto"

    def __post_init__(self):
        self.path_in_repo = _validate_path_in_repo(self.path_in_repo)

        if self.is_folder == "auto":
            self.is_folder = self.path_in_repo.endswith("/")
        if not isinstance(self.is_folder, bool):
            raise ValueError(
                f"Wrong value for `is_folder`. Must be one of [`True`, `False`, `'auto'`]. Got '{self.is_folder}'."
            )
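
# Illustrative sketch (not from the library source): how `is_folder="auto"` is
# resolved in `__post_init__` above. The paths are hypothetical.
#
#   CommitOperationDelete(path_in_repo="weights.bin")    # -> is_folder=False
#   CommitOperationDelete(path_in_repo="checkpoints/")   # -> is_folder=True (trailing slash)
#   CommitOperationDelete(path_in_repo="checkpoints", is_folder=True)  # explicit folder delete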


@dataclass
class CommitOperationCopy:
    """
    Data structure holding necessary info to copy a file in a repository on the Hub.

    Limitations:
        - Only LFS files can be copied. To copy a regular file, you need to download it locally and re-upload it.
        - Cross-repository copies are not supported.

    Note: you can combine a [`CommitOperationCopy`] and a [`CommitOperationDelete`] to rename an LFS file on the Hub.

    Args:
        src_path_in_repo (`str`):
            Relative filepath in the repo of the file to be copied, e.g. `"checkpoints/1fec34a/weights.bin"`.
        path_in_repo (`str`):
            Relative filepath in the repo where to copy the file, e.g. `"checkpoints/1fec34a/weights_copy.bin"`.
        src_revision (`str`, *optional*):
            The git revision of the file to be copied. Can be any valid git revision.
            Defaults to the target commit revision.
    """

    src_path_in_repo: str
    path_in_repo: str
    src_revision: Optional[str] = None
    # set to the OID of the file to be copied if it has already been uploaded
    # useful to determine if a commit will be empty or not.
    _src_oid: Optional[str] = None
    # set to the OID of the file to copy to if it has already been uploaded
    # useful to determine if a commit will be empty or not.
    _dest_oid: Optional[str] = None

    def __post_init__(self):
        self.src_path_in_repo = _validate_path_in_repo(self.src_path_in_repo)
        self.path_in_repo = _validate_path_in_repo(self.path_in_repo)
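
# Illustrative sketch (not from the library source): renaming an LFS file by
# combining a copy and a delete in a single commit, as the docstring above
# suggests. Paths are hypothetical.
#
#   operations = [
#       CommitOperationCopy(src_path_in_repo="old.bin", path_in_repo="new.bin"),
#       CommitOperationDelete(path_in_repo="old.bin"),
#   ]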


@dataclass
class CommitOperationAdd:
    """
    Data structure holding necessary info to upload a file to a repository on the Hub.

    Args:
        path_in_repo (`str`):
            Relative filepath in the repo, for example: `"checkpoints/1fec34a/weights.bin"`
        path_or_fileobj (`str`, `Path`, `bytes`, or `BinaryIO`):
            Either:
            - a path to a local file (as `str` or `pathlib.Path`) to upload
            - a buffer of bytes (`bytes`) holding the content of the file to upload
            - a "file object" (subclass of `io.BufferedIOBase`), typically obtained
                with `open(path, "rb")`. It must support `seek()` and `tell()` methods.

    Raises:
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If `path_or_fileobj` is not one of `str`, `Path`, `bytes` or `io.BufferedIOBase`.
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If `path_or_fileobj` is a `str` or `Path` but not a path to an existing file.
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If `path_or_fileobj` is an `io.BufferedIOBase` but it doesn't support both
            `seek()` and `tell()`.
    """

    path_in_repo: str
    path_or_fileobj: Union[str, Path, bytes, BinaryIO]
    upload_info: UploadInfo = field(init=False, repr=False)

    # Internal attributes

    # set to "lfs" or "regular" once known
    _upload_mode: Optional[UploadMode] = field(init=False, repr=False, default=None)

    # set to True if .gitignore rules prevent the file from being uploaded as LFS
    # (server-side check)
    _should_ignore: Optional[bool] = field(init=False, repr=False, default=None)

    # set to the remote OID of the file if it has already been uploaded
    # useful to determine if a commit will be empty or not
    _remote_oid: Optional[str] = field(init=False, repr=False, default=None)

    # set to True once the file has been uploaded as LFS
    _is_uploaded: bool = field(init=False, repr=False, default=False)

    # set to True once the file has been committed
    _is_committed: bool = field(init=False, repr=False, default=False)

    def __post_init__(self) -> None:
        """Validates `path_or_fileobj` and computes `upload_info`."""
        self.path_in_repo = _validate_path_in_repo(self.path_in_repo)

        # Validate `path_or_fileobj` value
        if isinstance(self.path_or_fileobj, Path):
            self.path_or_fileobj = str(self.path_or_fileobj)
        if isinstance(self.path_or_fileobj, str):
            path_or_fileobj = os.path.normpath(os.path.expanduser(self.path_or_fileobj))
            if not os.path.isfile(path_or_fileobj):
                raise ValueError(f"Provided path: '{path_or_fileobj}' is not a file on the local file system")
        elif not isinstance(self.path_or_fileobj, (io.BufferedIOBase, bytes)):
            # ^^ Inspired from: https://stackoverflow.com/questions/44584829/how-to-determine-if-file-is-opened-in-binary-or-text-mode
            raise ValueError(
                "path_or_fileobj must be either an instance of str, bytes or"
                " io.BufferedIOBase. If you passed a file-like object, make sure it is"
                " in binary mode."
            )
        if isinstance(self.path_or_fileobj, io.BufferedIOBase):
            try:
                self.path_or_fileobj.tell()
                self.path_or_fileobj.seek(0, os.SEEK_CUR)
            except (OSError, AttributeError) as exc:
                raise ValueError(
                    "path_or_fileobj is a file-like object but does not implement seek() and tell()"
                ) from exc

        # Compute "upload_info" attribute
        if isinstance(self.path_or_fileobj, str):
            self.upload_info = UploadInfo.from_path(self.path_or_fileobj)
        elif isinstance(self.path_or_fileobj, bytes):
            self.upload_info = UploadInfo.from_bytes(self.path_or_fileobj)
        else:
            self.upload_info = UploadInfo.from_fileobj(self.path_or_fileobj)

    @contextmanager
    def as_file(self, with_tqdm: bool = False) -> Iterator[BinaryIO]:
        """
        A context manager that yields a file-like object allowing to read the underlying
        data behind `path_or_fileobj`.

        Args:
            with_tqdm (`bool`, *optional*, defaults to `False`):
                If True, iterating over the file object will display a progress bar. Only
                works if the file-like object is a path to a file. Pure bytes and buffers
                are not supported.

        Example:

        ```python
        >>> operation = CommitOperationAdd(
        ...     path_in_repo="remote/dir/weights.h5",
        ...     path_or_fileobj="./local/weights.h5",
        ... )
        CommitOperationAdd(path_in_repo='remote/dir/weights.h5', path_or_fileobj='./local/weights.h5')

        >>> with operation.as_file() as file:
        ...     content = file.read()

        >>> with operation.as_file(with_tqdm=True) as file:
        ...     while True:
        ...         data = file.read(1024)
        ...         if not data:
        ...             break
        config.json: 100%|█████████████████████████| 8.19k/8.19k [00:02<00:00, 3.72kB/s]

        >>> with operation.as_file(with_tqdm=True) as file:
        ...     requests.put(..., data=file)
        config.json: 100%|█████████████████████████| 8.19k/8.19k [00:02<00:00, 3.72kB/s]
        ```
        """
        if isinstance(self.path_or_fileobj, str) or isinstance(self.path_or_fileobj, Path):
            if with_tqdm:
                with tqdm_stream_file(self.path_or_fileobj) as file:
                    yield file
            else:
                with open(self.path_or_fileobj, "rb") as file:
                    yield file
        elif isinstance(self.path_or_fileobj, bytes):
            yield io.BytesIO(self.path_or_fileobj)
        elif isinstance(self.path_or_fileobj, io.BufferedIOBase):
            prev_pos = self.path_or_fileobj.tell()
            yield self.path_or_fileobj
            self.path_or_fileobj.seek(prev_pos, io.SEEK_SET)

    def b64content(self) -> bytes:
        """
        The base64-encoded content of `path_or_fileobj`.

        Returns: `bytes`
        """
        with self.as_file() as file:
            return base64.b64encode(file.read())

    @property
    def _local_oid(self) -> Optional[str]:
        """Return the OID of the local file.

        This OID is then compared to `self._remote_oid` to check if the file has changed compared to the remote one.
        If the file did not change, we won't upload it again to prevent empty commits.

        For LFS files, the OID corresponds to the SHA256 of the file content (used as the LFS ref).
        For regular files, the OID corresponds to the SHA1 of the file content.
        Note: this is slightly different from git OID computation since the OID of an LFS file is usually the git-SHA1 of the
        pointer file content (not the actual file content). However, using the SHA256 is enough to detect changes
        and more convenient client-side.
        """
        if self._upload_mode is None:
            return None
        elif self._upload_mode == "lfs":
            return self.upload_info.sha256.hex()
        else:
            # Regular file => compute sha1
            # => no need to read by chunk since the file is guaranteed to be <=5MB.
            with self.as_file() as file:
                return sha.git_hash(file.read())


def _validate_path_in_repo(path_in_repo: str) -> str:
    # Validate `path_in_repo` value to prevent a server-side issue
    if path_in_repo.startswith("/"):
        path_in_repo = path_in_repo[1:]
    if path_in_repo == "." or path_in_repo == ".." or path_in_repo.startswith("../"):
        raise ValueError(f"Invalid `path_in_repo` in CommitOperation: '{path_in_repo}'")
    if path_in_repo.startswith("./"):
        path_in_repo = path_in_repo[2:]
    for forbidden in FORBIDDEN_FOLDERS:
        if any(part == forbidden for part in path_in_repo.split("/")):
            raise ValueError(
                f"Invalid `path_in_repo` in CommitOperation: cannot update files under a '{forbidden}/' folder (path:"
                f" '{path_in_repo}')."
            )
    return path_in_repo
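
# Illustrative sketch (not from the library source) of how paths are normalized
# by `_validate_path_in_repo`:
#
#   _validate_path_in_repo("/weights.bin")    # -> "weights.bin"  (leading "/" stripped)
#   _validate_path_in_repo("./weights.bin")   # -> "weights.bin"  (leading "./" stripped)
#   _validate_path_in_repo("../weights.bin")  # raises ValueError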


CommitOperation = Union[CommitOperationAdd, CommitOperationCopy, CommitOperationDelete]


def _warn_on_overwriting_operations(operations: List[CommitOperation]) -> None:
    """
    Warn user when a list of operations is expected to overwrite itself in a single
    commit.

    Rules:
    - If a filepath is updated by multiple `CommitOperationAdd` operations, a warning
      message is triggered.
    - If a filepath is updated at least once by a `CommitOperationAdd` and then deleted
      by a `CommitOperationDelete`, a warning is triggered.
    - If a `CommitOperationDelete` deletes a filepath that is then updated by a
      `CommitOperationAdd`, no warning is triggered. This is usually useless (no need to
      delete before upload) but can happen if a user deletes an entire folder and then
      adds new files to it.
    """
    nb_additions_per_path: Dict[str, int] = defaultdict(int)
    for operation in operations:
        path_in_repo = operation.path_in_repo
        if isinstance(operation, CommitOperationAdd):
            if nb_additions_per_path[path_in_repo] > 0:
                warnings.warn(
                    "About to update multiple times the same file in the same commit:"
                    f" '{path_in_repo}'. This can cause undesired inconsistencies in"
                    " your repo."
                )
            nb_additions_per_path[path_in_repo] += 1
            for parent in PurePosixPath(path_in_repo).parents:
                # Also keep track of number of updated files per folder
                # => warns if deleting a folder overwrites some contained files
                nb_additions_per_path[str(parent)] += 1
        if isinstance(operation, CommitOperationDelete):
            if nb_additions_per_path[str(PurePosixPath(path_in_repo))] > 0:
                if operation.is_folder:
                    warnings.warn(
                        "About to delete a folder containing files that have just been"
                        f" updated within the same commit: '{path_in_repo}'. This can"
                        " cause undesired inconsistencies in your repo."
                    )
                else:
                    warnings.warn(
                        "About to delete a file that has just been updated within the"
                        f" same commit: '{path_in_repo}'. This can cause undesired"
                        " inconsistencies in your repo."
                    )
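
# Illustrative sketch (not from the library source): an operation list that
# triggers the folder-delete warning above. Paths and contents are hypothetical.
#
#   _warn_on_overwriting_operations([
#       CommitOperationAdd(path_in_repo="logs/run1.txt", path_or_fileobj=b"..."),
#       CommitOperationDelete(path_in_repo="logs/"),  # warns: deletes a freshly updated folder
#   ])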


@validate_hf_hub_args
def _upload_lfs_files(
    *,
    additions: List[CommitOperationAdd],
    repo_type: str,
    repo_id: str,
    headers: Dict[str, str],
    endpoint: Optional[str] = None,
    num_threads: int = 5,
    revision: Optional[str] = None,
):
    """
    Uploads the content of `additions` to the Hub using the large file storage protocol.

    Relevant external documentation:
        - LFS Batch API: https://github.com/git-lfs/git-lfs/blob/main/docs/api/batch.md

    Args:
        additions (`List` of `CommitOperationAdd`):
            The files to be uploaded.
        repo_type (`str`):
            Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
        repo_id (`str`):
            A namespace (user or an organization) and a repo name separated
            by a `/`.
        headers (`Dict[str, str]`):
            Headers to use for the request, including authorization headers and user agent.
        num_threads (`int`, *optional*):
            The number of concurrent threads to use when uploading. Defaults to 5.
        revision (`str`, *optional*):
            The git revision to upload to.

    Raises:
        [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
            If an upload failed for any reason.
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If the server returns malformed responses.
        [`HTTPError`](https://requests.readthedocs.io/en/latest/api/#requests.HTTPError)
            If the LFS batch endpoint returned an HTTP error.
    """
    # Step 1: retrieve upload instructions from the LFS batch endpoint.
    # Upload instructions are retrieved by chunks of 256 files to avoid reaching
    # the payload limit.
    batch_actions: List[Dict] = []
    for chunk in chunk_iterable(additions, chunk_size=UPLOAD_BATCH_MAX_NUM_FILES):
        batch_actions_chunk, batch_errors_chunk = post_lfs_batch_info(
            upload_infos=[op.upload_info for op in chunk],
            repo_id=repo_id,
            repo_type=repo_type,
            revision=revision,
            endpoint=endpoint,
            headers=headers,
            token=None,  # already passed in 'headers'
        )

        # If at least 1 error, we do not retrieve information for other chunks
        if batch_errors_chunk:
            message = "\n".join(
                [
                    f"Encountered error for file with OID {err.get('oid')}: `{err.get('error', {}).get('message')}`"
                    for err in batch_errors_chunk
                ]
            )
            raise ValueError(f"LFS batch endpoint returned errors:\n{message}")

        batch_actions += batch_actions_chunk
    oid2addop = {add_op.upload_info.sha256.hex(): add_op for add_op in additions}

    # Step 2: ignore files that have already been uploaded
    filtered_actions = []
    for action in batch_actions:
        if action.get("actions") is None:
            logger.debug(
                f"Content of file {oid2addop[action['oid']].path_in_repo} is already"
                " present upstream - skipping upload."
            )
        else:
            filtered_actions.append(action)

    if len(filtered_actions) == 0:
        logger.debug("No LFS files to upload.")
        return

    # Step 3: upload files concurrently according to these instructions
    def _wrapped_lfs_upload(batch_action) -> None:
        try:
            operation = oid2addop[batch_action["oid"]]
            lfs_upload(operation=operation, lfs_batch_action=batch_action, headers=headers, endpoint=endpoint)
        except Exception as exc:
            raise RuntimeError(f"Error while uploading '{operation.path_in_repo}' to the Hub.") from exc

    if constants.HF_HUB_ENABLE_HF_TRANSFER:
        logger.debug(f"Uploading {len(filtered_actions)} LFS files to the Hub using `hf_transfer`.")
        for action in hf_tqdm(filtered_actions, name="huggingface_hub.lfs_upload"):
            _wrapped_lfs_upload(action)
    elif len(filtered_actions) == 1:
        logger.debug("Uploading 1 LFS file to the Hub")
        _wrapped_lfs_upload(filtered_actions[0])
    else:
        logger.debug(
            f"Uploading {len(filtered_actions)} LFS files to the Hub using up to {num_threads} threads concurrently"
        )
        thread_map(
            _wrapped_lfs_upload,
            filtered_actions,
            desc=f"Upload {len(filtered_actions)} LFS files",
            max_workers=num_threads,
            tqdm_class=hf_tqdm,
        )
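
# Illustrative sketch (not from the library source) of the shape of one LFS
# batch action consumed above. Field values are hypothetical; per the Batch API
# linked in the docstring, an entry with no "actions" key is already stored
# upstream and is skipped.
#
#   {
#       "oid": "a1b2...ff",   # sha256 of the file content, hex-encoded
#       "size": 123456,
#       "actions": {"upload": {"href": "https://...", "header": {...}}},
#   }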


@validate_hf_hub_args
def _upload_xet_files(
    *,
    additions: List[CommitOperationAdd],
    repo_type: str,
    repo_id: str,
    headers: Dict[str, str],
    endpoint: Optional[str] = None,
    revision: Optional[str] = None,
    create_pr: Optional[bool] = None,
):
    """
    Uploads the content of `additions` to the Hub using the xet storage protocol.
    This chunks the files and deduplicates the chunks before uploading them to xetcas storage.

    Args:
        additions (`List` of `CommitOperationAdd`):
            The files to be uploaded.
        repo_type (`str`):
            Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
        repo_id (`str`):
            A namespace (user or an organization) and a repo name separated
            by a `/`.
        headers (`Dict[str, str]`):
            Headers to use for the request, including authorization headers and user agent.
        endpoint (`str`, *optional*):
            The endpoint to use for the xetcas service. Defaults to `constants.ENDPOINT`.
        revision (`str`, *optional*):
            The git revision to upload to.
        create_pr (`bool`, *optional*):
            Whether or not to create a Pull Request with that commit.

    Raises:
        [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
            If an upload failed for any reason.
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If the server returns malformed responses or if the user is unauthorized to upload to xet storage.
        [`HTTPError`](https://requests.readthedocs.io/en/latest/api/#requests.HTTPError)
            If the LFS batch endpoint returned an HTTP error.

    **How it works:**
        The file upload system uses Xet storage, which is a content-addressable storage system that breaks files into chunks
        for efficient storage and transfer.

        `hf_xet.upload_files` manages uploading files by:
        - Taking a list of file paths to upload
        - Breaking files into smaller chunks for efficient storage
        - Avoiding duplicate storage by recognizing identical chunks across files
        - Connecting to a storage server (CAS server) that manages these chunks

        The upload process works like this:
        1. Create a local folder at ~/.cache/huggingface/xet/chunk-cache to store file chunks for reuse.
        2. Process files in parallel (up to 8 files at once):
            2.1. Read the file content.
            2.2. Split the file content into smaller chunks based on content patterns: each chunk gets a unique ID based on what's in it.
            2.3. For each chunk:
                - Check if it already exists in storage.
                - Skip uploading chunks that already exist.
            2.4. Group chunks into larger blocks for efficient transfer.
            2.5. Upload these blocks to the storage server.
            2.6. Create and upload information about how the file is structured.
        3. Return reference files that contain information about the uploaded files, which can be used later to download them.
    """
    if len(additions) == 0:
        return
    # at this point, we know that hf_xet is installed
    from hf_xet import upload_files

    try:
        xet_connection_info = fetch_xet_connection_info_from_repo_info(
            token_type=XetTokenType.WRITE,
            repo_id=repo_id,
            repo_type=repo_type,
            revision=revision,
            headers=headers,
            endpoint=endpoint,
            params={"create_pr": "1"} if create_pr else None,
        )
    except HfHubHTTPError as e:
        if e.response.status_code == 401:
            raise XetAuthorizationError(
                f"You are unauthorized to upload to xet storage for {repo_type}/{repo_id}. "
                f"Please check that you have configured your access token with write access to the repo."
            ) from e
        raise

    xet_endpoint = xet_connection_info.endpoint
    access_token_info = (xet_connection_info.access_token, xet_connection_info.expiration_unix_epoch)

    def token_refresher() -> Tuple[str, int]:
        new_xet_connection = fetch_xet_connection_info_from_repo_info(
            token_type=XetTokenType.WRITE,
            repo_id=repo_id,
            repo_type=repo_type,
            revision=revision,
            headers=headers,
            endpoint=endpoint,
            params={"create_pr": "1"} if create_pr else None,
        )
        if new_xet_connection is None:
            raise XetRefreshTokenError("Failed to refresh xet token")
        return new_xet_connection.access_token, new_xet_connection.expiration_unix_epoch

    num_chunks = math.ceil(len(additions) / UPLOAD_BATCH_MAX_NUM_FILES)
    num_chunks_num_digits = int(math.log10(num_chunks)) + 1
    for i, chunk in enumerate(chunk_iterable(additions, chunk_size=UPLOAD_BATCH_MAX_NUM_FILES)):
        _chunk = [op for op in chunk]
        paths = [str(op.path_or_fileobj) for op in _chunk]
        expected_size = sum([os.path.getsize(path) for path in paths])

        if num_chunks > 1:
            description = f"Uploading Batch [{str(i + 1).zfill(num_chunks_num_digits)}/{num_chunks}]..."
        else:
            description = "Uploading..."
        progress_cm = _get_progress_bar_context(
            desc=description,
            total=expected_size,
            initial=0,
            unit="B",
            unit_scale=True,
            name="huggingface_hub.xet_put",
            log_level=logger.getEffectiveLevel(),
        )
        with progress_cm as progress:

            def update_progress(increment: int):
                progress.update(increment)

            upload_files(paths, xet_endpoint, access_token_info, token_refresher, update_progress, repo_type)
    return
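
# Illustrative note (not from the library source): with UPLOAD_BATCH_MAX_NUM_FILES = 256,
# e.g. 600 additions are uploaded as ceil(600 / 256) = 3 batches, with progress bars
# labelled "Uploading Batch [1/3]..." through "Uploading Batch [3/3]...".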


def _validate_preupload_info(preupload_info: dict):
    files = preupload_info.get("files")
    if not isinstance(files, list):
        raise ValueError("preupload_info is improperly formatted")
    for file_info in files:
        if not (
            isinstance(file_info, dict)
            and isinstance(file_info.get("path"), str)
            and isinstance(file_info.get("uploadMode"), str)
            and (file_info["uploadMode"] in ("lfs", "regular"))
        ):
            raise ValueError("preupload_info is improperly formatted")
    return preupload_info
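
# Illustrative sketch (not from the library source) of a preupload response that
# passes validation; `_fetch_upload_modes` below also reads the "shouldIgnore"
# and "oid" fields. Values are hypothetical.
#
#   {
#       "files": [
#           {"path": "config.json", "uploadMode": "regular", "shouldIgnore": False, "oid": "abc123"},
#           {"path": "model.safetensors", "uploadMode": "lfs", "shouldIgnore": False, "oid": None},
#       ]
#   }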


@validate_hf_hub_args
def _fetch_upload_modes(
    additions: Iterable[CommitOperationAdd],
    repo_type: str,
    repo_id: str,
    headers: Dict[str, str],
    revision: str,
    endpoint: Optional[str] = None,
    create_pr: bool = False,
    gitignore_content: Optional[str] = None,
) -> None:
    """
    Requests the Hub "preupload" endpoint to determine whether each input file should be uploaded as a regular git blob,
    as a git LFS blob, or as a XET file. Input `additions` are mutated in-place with the upload mode.

    Args:
        additions (`Iterable` of :class:`CommitOperationAdd`):
            Iterable of :class:`CommitOperationAdd` describing the files to
            upload to the Hub.
        repo_type (`str`):
            Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
        repo_id (`str`):
            A namespace (user or an organization) and a repo name separated
            by a `/`.
        headers (`Dict[str, str]`):
            Headers to use for the request, including authorization headers and user agent.
        revision (`str`):
            The git revision to upload the files to. Can be any valid git revision.
        gitignore_content (`str`, *optional*):
            The content of the `.gitignore` file to know which files should be ignored. The order of priority
            is to first check if `gitignore_content` is passed, then check if the `.gitignore` file is present
            in the list of files to commit and finally default to the `.gitignore` file already hosted on the Hub
            (if any).
    Raises:
        [`~utils.HfHubHTTPError`]
            If the Hub API returned an error.
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If the Hub API response is improperly formatted.
    """
    endpoint = endpoint if endpoint is not None else constants.ENDPOINT

    # Fetch upload mode (LFS or regular) chunk by chunk.
    upload_modes: Dict[str, UploadMode] = {}
    should_ignore_info: Dict[str, bool] = {}
    oid_info: Dict[str, Optional[str]] = {}

    for chunk in chunk_iterable(additions, 256):
        payload: Dict = {
            "files": [
                {
                    "path": op.path_in_repo,
                    "sample": base64.b64encode(op.upload_info.sample).decode("ascii"),
                    "size": op.upload_info.size,
                }
                for op in chunk
            ]
        }
        if gitignore_content is not None:
            payload["gitIgnore"] = gitignore_content

        resp = get_session().post(
            f"{endpoint}/api/{repo_type}s/{repo_id}/preupload/{revision}",
            json=payload,
            headers=headers,
            params={"create_pr": "1"} if create_pr else None,
        )
        hf_raise_for_status(resp)
        preupload_info = _validate_preupload_info(resp.json())
        upload_modes.update(**{file["path"]: file["uploadMode"] for file in preupload_info["files"]})
        should_ignore_info.update(**{file["path"]: file["shouldIgnore"] for file in preupload_info["files"]})
        oid_info.update(**{file["path"]: file.get("oid") for file in preupload_info["files"]})

    # Set upload mode for each addition operation
    for addition in additions:
        addition._upload_mode = upload_modes[addition.path_in_repo]
        addition._should_ignore = should_ignore_info[addition.path_in_repo]
        addition._remote_oid = oid_info[addition.path_in_repo]

    # Empty files cannot be uploaded as LFS (S3 would fail with a 501 Not Implemented)
    # => empty files are uploaded as "regular" to still allow users to commit them.
    for addition in additions:
        if addition.upload_info.size == 0:
            addition._upload_mode = "regular"


@validate_hf_hub_args
def _fetch_files_to_copy(
    copies: Iterable[CommitOperationCopy],
    repo_type: str,
    repo_id: str,
    headers: Dict[str, str],
    revision: str,
    endpoint: Optional[str] = None,
) -> Dict[Tuple[str, Optional[str]], Union["RepoFile", bytes]]:
    """
    Fetch information about the files to copy.

    For LFS files, we only need their metadata (file size and sha256) while for regular files
    we need to download the raw content from the Hub.

    Args:
        copies (`Iterable` of :class:`CommitOperationCopy`):
            Iterable of :class:`CommitOperationCopy` describing the files to
            copy on the Hub.
        repo_type (`str`):
            Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
        repo_id (`str`):
            A namespace (user or an organization) and a repo name separated
            by a `/`.
        headers (`Dict[str, str]`):
            Headers to use for the request, including authorization headers and user agent.
        revision (`str`):
            The git revision to upload the files to. Can be any valid git revision.

    Returns: `Dict[Tuple[str, Optional[str]], Union[RepoFile, bytes]]`
        Key is the file path and revision of the file to copy.
        Value is the raw content as bytes (for regular files) or the file information as a RepoFile (for LFS files).

    Raises:
        [`~utils.HfHubHTTPError`]
            If the Hub API returned an error.
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If the Hub API response is improperly formatted.
    """
    from .hf_api import HfApi, RepoFolder

    hf_api = HfApi(endpoint=endpoint, headers=headers)
    files_to_copy: Dict[Tuple[str, Optional[str]], Union["RepoFile", bytes]] = {}
    # Store (path, revision) -> oid mapping
    oid_info: Dict[Tuple[str, Optional[str]], Optional[str]] = {}
    # 1. Fetch OIDs for destination paths in batches.
    dest_paths = [op.path_in_repo for op in copies]
    for offset in range(0, len(dest_paths), FETCH_LFS_BATCH_SIZE):
        dest_repo_files = hf_api.get_paths_info(
            repo_id=repo_id,
            paths=dest_paths[offset : offset + FETCH_LFS_BATCH_SIZE],
            revision=revision,
            repo_type=repo_type,
        )
        for file in dest_repo_files:
            if not isinstance(file, RepoFolder):
                oid_info[(file.path, revision)] = file.blob_id

    # 2. Group by source revision and fetch source file info in batches.
    for src_revision, operations in groupby(copies, key=lambda op: op.src_revision):
        operations = list(operations)  # type: ignore
        src_paths = [op.src_path_in_repo for op in operations]
        for offset in range(0, len(src_paths), FETCH_LFS_BATCH_SIZE):
            src_repo_files = hf_api.get_paths_info(
                repo_id=repo_id,
                paths=src_paths[offset : offset + FETCH_LFS_BATCH_SIZE],
                revision=src_revision or revision,
                repo_type=repo_type,
            )

            for src_repo_file in src_repo_files:
                if isinstance(src_repo_file, RepoFolder):
                    raise NotImplementedError("Copying a folder is not implemented.")
                oid_info[(src_repo_file.path, src_revision)] = src_repo_file.blob_id
                # If it's an LFS file, store the RepoFile object. Otherwise, download raw bytes.
                if src_repo_file.lfs:
                    files_to_copy[(src_repo_file.path, src_revision)] = src_repo_file
                else:
                    # TODO: (optimization) download regular files to copy concurrently
                    url = hf_hub_url(
                        endpoint=endpoint,
                        repo_type=repo_type,
                        repo_id=repo_id,
                        revision=src_revision or revision,
                        filename=src_repo_file.path,
                    )
                    response = get_session().get(url, headers=headers)
                    hf_raise_for_status(response)
                    files_to_copy[(src_repo_file.path, src_revision)] = response.content
        # 3. Ensure all operations found a corresponding file on the Hub
        # and track src/dest OIDs for each operation.
        for operation in operations:
            if (operation.src_path_in_repo, src_revision) not in files_to_copy:
                raise EntryNotFoundError(
                    f"Cannot copy {operation.src_path_in_repo} at revision "
                    f"{src_revision or revision}: file is missing on repo."
                )
            operation._src_oid = oid_info.get((operation.src_path_in_repo, operation.src_revision))
            operation._dest_oid = oid_info.get((operation.path_in_repo, revision))
    return files_to_copy
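
# Illustrative note (not from the library source): the returned mapping is keyed
# by (src_path_in_repo, src_revision), e.g. {("weights.bin", None): <RepoFile ...>}
# for an LFS file, and {("config.json", None): b"{...}"} for a regular file whose
# raw bytes were downloaded.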


def _prepare_commit_payload(
    operations: Iterable[CommitOperation],
    files_to_copy: Dict[Tuple[str, Optional[str]], Union["RepoFile", bytes]],
    commit_message: str,
    commit_description: Optional[str] = None,
    parent_commit: Optional[str] = None,
) -> Iterable[Dict[str, Any]]:
    """
    Builds the payload to POST to the `/commit` API of the Hub.

    Payload is returned as an iterator so that it can be streamed as ndjson in the
    POST request.

    For more information, see:
        - https://github.com/huggingface/huggingface_hub/issues/1085#issuecomment-1265208073
        - http://ndjson.org/
    """
    commit_description = commit_description if commit_description is not None else ""

    # 1. Send a header item with the commit metadata
    header_value = {"summary": commit_message, "description": commit_description}
    if parent_commit is not None:
        header_value["parentCommit"] = parent_commit
    yield {"key": "header", "value": header_value}

    nb_ignored_files = 0

    # 2. Send operations, one per line
    for operation in operations:
        # Skip ignored files
        if isinstance(operation, CommitOperationAdd) and operation._should_ignore:
            logger.debug(f"Skipping file '{operation.path_in_repo}' in commit (ignored by gitignore file).")
            nb_ignored_files += 1
            continue

        # 2.a. Case adding a regular file
        if isinstance(operation, CommitOperationAdd) and operation._upload_mode == "regular":
            yield {
                "key": "file",
                "value": {
                    "content": operation.b64content().decode(),
                    "path": operation.path_in_repo,
                    "encoding": "base64",
                },
            }
        # 2.b. Case adding an LFS file
        elif isinstance(operation, CommitOperationAdd) and operation._upload_mode == "lfs":
            yield {
                "key": "lfsFile",
                "value": {
                    "path": operation.path_in_repo,
                    "algo": "sha256",
                    "oid": operation.upload_info.sha256.hex(),
                    "size": operation.upload_info.size,
                },
            }
        # 2.c. Case deleting a file or folder
        elif isinstance(operation, CommitOperationDelete):
            yield {
                "key": "deletedFolder" if operation.is_folder else "deletedFile",
                "value": {"path": operation.path_in_repo},
            }
        # 2.d. Case copying a file or folder
        elif isinstance(operation, CommitOperationCopy):
            file_to_copy = files_to_copy[(operation.src_path_in_repo, operation.src_revision)]
            if isinstance(file_to_copy, bytes):
                yield {
                    "key": "file",
                    "value": {
                        "content": base64.b64encode(file_to_copy).decode(),
                        "path": operation.path_in_repo,
                        "encoding": "base64",
                    },
                }
            elif file_to_copy.lfs:
                yield {
                    "key": "lfsFile",
                    "value": {
                        "path": operation.path_in_repo,
                        "algo": "sha256",
                        "oid": file_to_copy.lfs.sha256,
                    },
                }
            else:
                raise ValueError(
                    "Malformed files_to_copy (should be raw file content as bytes or RepoFile objects with LFS info)."
                )
        # 2.e. Never expected to happen
        else:
            raise ValueError(
                f"Unknown operation to commit. Operation: {operation}. Upload mode:"
                f" {getattr(operation, '_upload_mode', None)}"
            )

    if nb_ignored_files > 0:
        logger.info(f"Skipped {nb_ignored_files} file(s) in commit (ignored by gitignore file).")

@@ -0,0 +1,353 @@
import atexit
import logging
import os
import time
from concurrent.futures import Future
from dataclasses import dataclass
from io import SEEK_END, SEEK_SET, BytesIO
from pathlib import Path
from threading import Lock, Thread
from typing import Dict, List, Optional, Union

from .hf_api import DEFAULT_IGNORE_PATTERNS, CommitInfo, CommitOperationAdd, HfApi
from .utils import filter_repo_objects


logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class _FileToUpload:
    """Temporary dataclass to store info about files to upload. Not meant to be used directly."""

    local_path: Path
    path_in_repo: str
    size_limit: int
    last_modified: float


class CommitScheduler:
    """
    Scheduler to upload a local folder to the Hub at regular intervals (e.g. push to hub every 5 minutes).

    The recommended way to use the scheduler is to use it as a context manager. This ensures that the scheduler is
    properly stopped and the last commit is triggered when the script ends. The scheduler can also be stopped manually
    with the `stop` method. Check out the [upload guide](https://huggingface.co/docs/huggingface_hub/guides/upload#scheduled-uploads)
    to learn more about how to use it.

    Args:
        repo_id (`str`):
            The id of the repo to commit to.
        folder_path (`str` or `Path`):
            Path to the local folder to upload regularly.
        every (`int` or `float`, *optional*):
            The number of minutes between each commit. Defaults to 5 minutes.
        path_in_repo (`str`, *optional*):
            Relative path of the directory in the repo, for example: `"checkpoints/"`. Defaults to the root folder
            of the repository.
        repo_type (`str`, *optional*):
            The type of the repo to commit to. Defaults to `model`.
        revision (`str`, *optional*):
            The revision of the repo to commit to. Defaults to `main`.
        private (`bool`, *optional*):
            Whether to make the repo private. If `None` (default), the repo will be public unless the organization's
            default is private. This value is ignored if the repo already exists.
        token (`str`, *optional*):
            The token to use to commit to the repo. Defaults to the token saved on the machine.
        allow_patterns (`List[str]` or `str`, *optional*):
            If provided, only files matching at least one pattern are uploaded.
        ignore_patterns (`List[str]` or `str`, *optional*):
            If provided, files matching any of the patterns are not uploaded.
        squash_history (`bool`, *optional*):
            Whether to squash the history of the repo after each commit. Defaults to `False`. Squashing commits is
            useful to avoid degraded performance on the repo when it grows too large.
        hf_api (`HfApi`, *optional*):
            The [`HfApi`] client to use to commit to the Hub. Can be set with custom settings (user agent, token,...).

    Example:
    ```py
    >>> from pathlib import Path
    >>> from huggingface_hub import CommitScheduler

    # Scheduler uploads every 10 minutes
    >>> csv_path = Path("watched_folder/data.csv")
    >>> CommitScheduler(repo_id="test_scheduler", repo_type="dataset", folder_path=csv_path.parent, every=10)

    >>> with csv_path.open("a") as f:
    ...     f.write("first line")

    # Some time later (...)
    >>> with csv_path.open("a") as f:
    ...     f.write("second line")
    ```

    Example using a context manager:
    ```py
    >>> from pathlib import Path
    >>> from huggingface_hub import CommitScheduler

    >>> with CommitScheduler(repo_id="test_scheduler", repo_type="dataset", folder_path="watched_folder", every=10) as scheduler:
    ...     csv_path = Path("watched_folder/data.csv")
    ...     with csv_path.open("a") as f:
    ...         f.write("first line")
    ...     (...)
    ...     with csv_path.open("a") as f:
    ...         f.write("second line")

    # Scheduler is now stopped and the last commit has been triggered
    ```
    """

    def __init__(
        self,
        *,
        repo_id: str,
        folder_path: Union[str, Path],
        every: Union[int, float] = 5,
        path_in_repo: Optional[str] = None,
        repo_type: Optional[str] = None,
        revision: Optional[str] = None,
        private: Optional[bool] = None,
        token: Optional[str] = None,
        allow_patterns: Optional[Union[List[str], str]] = None,
        ignore_patterns: Optional[Union[List[str], str]] = None,
        squash_history: bool = False,
        hf_api: Optional["HfApi"] = None,
    ) -> None:
        self.api = hf_api or HfApi(token=token)

        # Folder
        self.folder_path = Path(folder_path).expanduser().resolve()
        self.path_in_repo = path_in_repo or ""
        self.allow_patterns = allow_patterns

        if ignore_patterns is None:
            ignore_patterns = []
        elif isinstance(ignore_patterns, str):
            ignore_patterns = [ignore_patterns]
        self.ignore_patterns = ignore_patterns + DEFAULT_IGNORE_PATTERNS

        if self.folder_path.is_file():
            raise ValueError(f"'folder_path' must be a directory, not a file: '{self.folder_path}'.")
        self.folder_path.mkdir(parents=True, exist_ok=True)

        # Repository
        repo_url = self.api.create_repo(repo_id=repo_id, private=private, repo_type=repo_type, exist_ok=True)
        self.repo_id = repo_url.repo_id
        self.repo_type = repo_type
        self.revision = revision
        self.token = token

        # Keep track of already uploaded files
        self.last_uploaded: Dict[Path, float] = {}  # key is local path, value is timestamp

        # Scheduler
        if not every > 0:
            raise ValueError(f"'every' must be a positive number, not '{every}'.")
        self.lock = Lock()
        self.every = every
        self.squash_history = squash_history

        logger.info(f"Scheduled job to push '{self.folder_path}' to '{self.repo_id}' every {self.every} minutes.")
        self._scheduler_thread = Thread(target=self._run_scheduler, daemon=True)
        self._scheduler_thread.start()
        atexit.register(self._push_to_hub)

        self.__stopped = False

    def stop(self) -> None:
        """Stop the scheduler.

        A stopped scheduler cannot be restarted. Mostly for testing purposes.
        """
        self.__stopped = True

    def __enter__(self) -> "CommitScheduler":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        # Upload last changes before exiting
        self.trigger().result()
        self.stop()
        return

    def _run_scheduler(self) -> None:
        """Dumb thread waiting between each scheduled push to Hub."""
        while True:
            self.last_future = self.trigger()
            time.sleep(self.every * 60)
            if self.__stopped:
                break

    def trigger(self) -> Future:
        """Trigger a `push_to_hub` and return a future.

        This method is automatically called every `every` minutes. You can also call it manually to trigger a commit
        immediately, without waiting for the next scheduled commit.
        """
        return self.api.run_as_future(self._push_to_hub)

    def _push_to_hub(self) -> Optional[CommitInfo]:
        if self.__stopped:  # If stopped, already scheduled commits are ignored
            return None

        logger.info("(Background) scheduled commit triggered.")
        try:
            value = self.push_to_hub()
            if self.squash_history:
                logger.info("(Background) squashing repo history.")
                self.api.super_squash_history(repo_id=self.repo_id, repo_type=self.repo_type, branch=self.revision)
            return value
        except Exception as e:
            logger.error(f"Error while pushing to Hub: {e}")  # Depending on the setup, error might be silenced
            raise

    def push_to_hub(self) -> Optional[CommitInfo]:
        """
        Push folder to the Hub and return the commit info.

        <Tip warning={true}>

        This method is not meant to be called directly. It is run in the background by the scheduler, respecting a
        queue mechanism to avoid concurrent commits. Making a direct call to the method might lead to concurrency
        issues.

        </Tip>

        The default behavior of `push_to_hub` is to assume an append-only folder. It lists all files in the folder and
        uploads only changed files. If no changes are found, the method returns without committing anything. If you want
        to change this behavior, you can inherit from [`CommitScheduler`] and override this method. This can be useful
        for example to compress data together in a single file before committing. For more details and examples, check
        out our [integration guide](https://huggingface.co/docs/huggingface_hub/main/en/guides/upload#scheduled-uploads).
        """
        # Check files to upload (with lock)
        with self.lock:
            logger.debug("Listing files to upload for scheduled commit.")

            # List files from folder (taken from `_prepare_upload_folder_additions`)
            relpath_to_abspath = {
                path.relative_to(self.folder_path).as_posix(): path
                for path in sorted(self.folder_path.glob("**/*"))  # sorted to be deterministic
                if path.is_file()
            }
            prefix = f"{self.path_in_repo.strip('/')}/" if self.path_in_repo else ""

            # Filter with pattern + filter out unchanged files + retrieve current file size
            files_to_upload: List[_FileToUpload] = []
            for relpath in filter_repo_objects(
                relpath_to_abspath.keys(), allow_patterns=self.allow_patterns, ignore_patterns=self.ignore_patterns
            ):
                local_path = relpath_to_abspath[relpath]
                stat = local_path.stat()
                if self.last_uploaded.get(local_path) is None or self.last_uploaded[local_path] != stat.st_mtime:
                    files_to_upload.append(
                        _FileToUpload(
                            local_path=local_path,
                            path_in_repo=prefix + relpath,
                            size_limit=stat.st_size,
                            last_modified=stat.st_mtime,
                        )
                    )

        # Return if nothing to upload
        if len(files_to_upload) == 0:
            logger.debug("Dropping scheduled commit: no changed file to upload.")
            return None

        # Convert `_FileToUpload` to `CommitOperationAdd` (=> compute file shas + limit to file size)
        logger.debug("Removing unchanged files since previous scheduled commit.")
        add_operations = [
            CommitOperationAdd(
                # Cap the file to its current size, even if the user appends data to it while a scheduled commit is happening
                path_or_fileobj=PartialFileIO(file_to_upload.local_path, size_limit=file_to_upload.size_limit),
                path_in_repo=file_to_upload.path_in_repo,
            )
            for file_to_upload in files_to_upload
        ]

        # Upload files (append mode expected - no need for lock)
        logger.debug("Uploading files for scheduled commit.")
        commit_info = self.api.create_commit(
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            operations=add_operations,
            commit_message="Scheduled Commit",
            revision=self.revision,
        )

        # Successful commit: keep track of the latest "last_modified" for each file
        for file in files_to_upload:
            self.last_uploaded[file.local_path] = file.last_modified
        return commit_info
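
    # Illustrative sketch (not from the library source): the docstring above
    # suggests subclassing to change the commit behavior. A minimal hypothetical
    # override that zips the folder before committing could look like this
    # (assumes `import shutil`; `HfApi.upload_file` is the real Hub API used):
    #
    #   class ZipScheduler(CommitScheduler):
    #       def push_to_hub(self):
    #           with self.lock:
    #               archive = shutil.make_archive("data", "zip", self.folder_path)
    #           return self.api.upload_file(
    #               path_or_fileobj=archive,
    #               path_in_repo="data.zip",
    #               repo_id=self.repo_id,
    #               repo_type=self.repo_type,
    #           )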


class PartialFileIO(BytesIO):
    """A file-like object that reads only the first part of a file.

    Useful to upload a file to the Hub when the user might still be appending data to it. Only the first part of the
    file is uploaded (i.e. the part that was available when the filesystem was first scanned).

    In practice, only used internally by the CommitScheduler to regularly push a folder to the Hub with minimal
    disturbance for the user. The object is passed to `CommitOperationAdd`.

    Only supports `read`, `tell` and `seek` methods.

    Args:
        file_path (`str` or `Path`):
            Path to the file to read.
        size_limit (`int`):
            The maximum number of bytes to read from the file. If the file is larger than this, only the first part
            will be read (and uploaded).
    """

    def __init__(self, file_path: Union[str, Path], size_limit: int) -> None:
        self._file_path = Path(file_path)
        self._file = self._file_path.open("rb")
        self._size_limit = min(size_limit, os.fstat(self._file.fileno()).st_size)

    def __del__(self) -> None:
        self._file.close()
        return super().__del__()

    def __repr__(self) -> str:
        return f"<PartialFileIO file_path={self._file_path} size_limit={self._size_limit}>"

    def __len__(self) -> int:
        return self._size_limit

    def __getattribute__(self, name: str):
        if name.startswith("_") or name in ("read", "tell", "seek"):  # only 3 public methods supported
            return super().__getattribute__(name)
        raise NotImplementedError(f"PartialFileIO does not support '{name}'.")

    def tell(self) -> int:
        """Return the current file position."""
        return self._file.tell()

    def seek(self, __offset: int, __whence: int = SEEK_SET) -> int:
        """Change the stream position to the given offset.

        Behavior is the same as a regular file, except that the position is capped to the size limit.
        """
        if __whence == SEEK_END:
            # SEEK_END => set from the truncated end
            __offset = len(self) + __offset
            __whence = SEEK_SET

        pos = self._file.seek(__offset, __whence)
        if pos > self._size_limit:
            return self._file.seek(self._size_limit)
        return pos

    def read(self, __size: Optional[int] = -1) -> bytes:
        """Read at most `__size` bytes from the file.

        Behavior is the same as a regular file, except that it is capped to the size limit.
        """
        current = self._file.tell()
        if __size is None or __size < 0:
            # Read until file limit
            truncated_size = self._size_limit - current
        else:
            # Read until file limit or __size
            truncated_size = min(__size, self._size_limit - current)
        return self._file.read(truncated_size)
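
# Illustrative sketch (not from the library source): even if the underlying file
# keeps growing, reads are capped at the size recorded when it was scanned.
# The path below is hypothetical.
#
#   f = PartialFileIO("watched_folder/data.csv", size_limit=100)
#   len(f.read())  # <= 100 bytes, regardless of the file's current size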

@@ -0,0 +1,407 @@
import time
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import TYPE_CHECKING, Dict, Optional, Union

from huggingface_hub.errors import InferenceEndpointError, InferenceEndpointTimeoutError

from .inference._client import InferenceClient
from .inference._generated._async_client import AsyncInferenceClient
from .utils import get_session, logging, parse_datetime


if TYPE_CHECKING:
    from .hf_api import HfApi


logger = logging.get_logger(__name__)


class InferenceEndpointStatus(str, Enum):
    PENDING = "pending"
    INITIALIZING = "initializing"
    UPDATING = "updating"
    UPDATE_FAILED = "updateFailed"
    RUNNING = "running"
    PAUSED = "paused"
    FAILED = "failed"
    SCALED_TO_ZERO = "scaledToZero"


class InferenceEndpointType(str, Enum):
    PUBLIC = "public"
    PROTECTED = "protected"
    PRIVATE = "private"
|
||||
|
||||
|
||||
@dataclass
class InferenceEndpoint:
    """
    Contains information about a deployed Inference Endpoint.

    Args:
        name (`str`):
            The unique name of the Inference Endpoint.
        namespace (`str`):
            The namespace where the Inference Endpoint is located.
        repository (`str`):
            The name of the model repository deployed on this Inference Endpoint.
        status ([`InferenceEndpointStatus`]):
            The current status of the Inference Endpoint.
        url (`str`, *optional*):
            The URL of the Inference Endpoint, if available. Only a deployed Inference Endpoint will have a URL.
        framework (`str`):
            The machine learning framework used for the model.
        revision (`str`):
            The specific model revision deployed on the Inference Endpoint.
        task (`str`):
            The task associated with the deployed model.
        created_at (`datetime.datetime`):
            The timestamp when the Inference Endpoint was created.
        updated_at (`datetime.datetime`):
            The timestamp of the last update of the Inference Endpoint.
        type ([`InferenceEndpointType`]):
            The type of the Inference Endpoint (public, protected, private).
        raw (`Dict`):
            The raw dictionary data returned from the API.
        token (`str` or `bool`, *optional*):
            Authentication token for the Inference Endpoint, if set when requesting the API. Will default to the
            locally saved token if not provided. Pass `token=False` if you don't want to send your token to the server.

    Example:
    ```python
    >>> from huggingface_hub import get_inference_endpoint
    >>> endpoint = get_inference_endpoint("my-text-to-image")
    >>> endpoint
    InferenceEndpoint(name='my-text-to-image', ...)

    # Get status
    >>> endpoint.status
    'running'
    >>> endpoint.url
    'https://my-text-to-image.region.vendor.endpoints.huggingface.cloud'

    # Run inference
    >>> endpoint.client.text_to_image(...)

    # Pause endpoint to save $$$
    >>> endpoint.pause()

    # ...
    # Resume and wait for deployment
    >>> endpoint.resume()
    >>> endpoint.wait()
    >>> endpoint.client.text_to_image(...)
    ```
    """

    # Field in __repr__
    name: str = field(init=False)
    namespace: str
    repository: str = field(init=False)
    status: InferenceEndpointStatus = field(init=False)
    url: Optional[str] = field(init=False)

    # Other fields
    framework: str = field(repr=False, init=False)
    revision: str = field(repr=False, init=False)
    task: str = field(repr=False, init=False)
    created_at: datetime = field(repr=False, init=False)
    updated_at: datetime = field(repr=False, init=False)
    type: InferenceEndpointType = field(repr=False, init=False)

    # Raw dict from the API
    raw: Dict = field(repr=False)

    # Internal fields
    _token: Union[str, bool, None] = field(repr=False, compare=False)
    _api: "HfApi" = field(repr=False, compare=False)

    @classmethod
    def from_raw(
        cls, raw: Dict, namespace: str, token: Union[str, bool, None] = None, api: Optional["HfApi"] = None
    ) -> "InferenceEndpoint":
        """Initialize object from raw dictionary."""
        if api is None:
            from .hf_api import HfApi

            api = HfApi()
        if token is None:
            token = api.token

        # All other fields are populated in __post_init__
        return cls(raw=raw, namespace=namespace, _token=token, _api=api)

    def __post_init__(self) -> None:
        """Populate fields from raw dictionary."""
        self._populate_from_raw()

    @property
    def client(self) -> InferenceClient:
        """Returns a client to make predictions on this Inference Endpoint.

        Returns:
            [`InferenceClient`]: an inference client pointing to the deployed endpoint.

        Raises:
            [`InferenceEndpointError`]: If the Inference Endpoint is not yet deployed.
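
        Example (illustrative; the endpoint name and task are placeholders):
        ```python
        >>> from huggingface_hub import get_inference_endpoint
        >>> endpoint = get_inference_endpoint("my-text-to-image")
        >>> image = endpoint.client.text_to_image("a cat wearing a hat")
        ```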
"""
|
||||
if self.url is None:
|
||||
raise InferenceEndpointError(
|
||||
"Cannot create a client for this Inference Endpoint as it is not yet deployed. "
|
||||
"Please wait for the Inference Endpoint to be deployed using `endpoint.wait()` and try again."
|
||||
)
|
||||
return InferenceClient(
|
||||
model=self.url,
|
||||
token=self._token, # type: ignore[arg-type] # boolean token shouldn't be possible. In practice it's ok.
|
||||
)
|
||||
|
||||
@property
|
||||
def async_client(self) -> AsyncInferenceClient:
|
||||
"""Returns a client to make predictions on this Inference Endpoint.
|
||||
|
||||
Returns:
|
||||
[`AsyncInferenceClient`]: an asyncio-compatible inference client pointing to the deployed endpoint.
|
||||
|
||||
Raises:
|
||||
[`InferenceEndpointError`]: If the Inference Endpoint is not yet deployed.
|
||||
"""
|
||||
if self.url is None:
|
||||
raise InferenceEndpointError(
|
||||
"Cannot create a client for this Inference Endpoint as it is not yet deployed. "
|
||||
"Please wait for the Inference Endpoint to be deployed using `endpoint.wait()` and try again."
|
||||
)
|
||||
return AsyncInferenceClient(
|
||||
model=self.url,
|
||||
token=self._token, # type: ignore[arg-type] # boolean token shouldn't be possible. In practice it's ok.
|
||||
)
|
||||
|
||||
    def wait(self, timeout: Optional[int] = None, refresh_every: int = 5) -> "InferenceEndpoint":
        """Wait for the Inference Endpoint to be deployed.

        Information from the server is fetched every `refresh_every` seconds. If the Inference Endpoint is not
        deployed after `timeout` seconds, an [`InferenceEndpointTimeoutError`] is raised. The [`InferenceEndpoint`]
        will be mutated in place with the latest data.

        Args:
            timeout (`int`, *optional*):
                The maximum time to wait for the Inference Endpoint to be deployed, in seconds. If `None`, will wait
                indefinitely.
            refresh_every (`int`, *optional*):
                The time to wait between each fetch of the Inference Endpoint status, in seconds. Defaults to 5s.

        Returns:
            [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.

        Raises:
            [`InferenceEndpointError`]
                If the Inference Endpoint ended up in a failed state.
            [`InferenceEndpointTimeoutError`]
                If the Inference Endpoint is not deployed after `timeout` seconds.
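
        Example (illustrative; the endpoint name and timings are placeholders):
        ```python
        >>> from huggingface_hub import get_inference_endpoint
        >>> endpoint = get_inference_endpoint("my-endpoint")
        >>> endpoint.resume()
        >>> endpoint.wait(timeout=300, refresh_every=10)  # poll every 10s, give up after 5 minutes
        >>> endpoint.client.text_generation("Hello")
        ```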
"""
|
||||
if timeout is not None and timeout < 0:
|
||||
raise ValueError("`timeout` cannot be negative.")
|
||||
if refresh_every <= 0:
|
||||
raise ValueError("`refresh_every` must be positive.")
|
||||
|
||||
start = time.time()
|
||||
while True:
|
||||
if self.status == InferenceEndpointStatus.FAILED:
|
||||
raise InferenceEndpointError(
|
||||
f"Inference Endpoint {self.name} failed to deploy. Please check the logs for more information."
|
||||
)
|
||||
if self.status == InferenceEndpointStatus.UPDATE_FAILED:
|
||||
raise InferenceEndpointError(
|
||||
f"Inference Endpoint {self.name} failed to update. Please check the logs for more information."
|
||||
)
|
||||
if self.status == InferenceEndpointStatus.RUNNING and self.url is not None:
|
||||
# Verify the endpoint is actually reachable
|
||||
response = get_session().get(self.url, headers=self._api._build_hf_headers(token=self._token))
|
||||
if response.status_code == 200:
|
||||
logger.info("Inference Endpoint is ready to be used.")
|
||||
return self
|
||||
|
||||
if timeout is not None:
|
||||
if time.time() - start > timeout:
|
||||
raise InferenceEndpointTimeoutError("Timeout while waiting for Inference Endpoint to be deployed.")
|
||||
logger.info(f"Inference Endpoint is not deployed yet ({self.status}). Waiting {refresh_every}s...")
|
||||
time.sleep(refresh_every)
|
||||
self.fetch()
|
||||
|
||||
    def fetch(self) -> "InferenceEndpoint":
        """Fetch latest information about the Inference Endpoint.

        Returns:
            [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
        """
        obj = self._api.get_inference_endpoint(name=self.name, namespace=self.namespace, token=self._token)  # type: ignore [arg-type]
        self.raw = obj.raw
        self._populate_from_raw()
        return self

    def update(
        self,
        *,
        # Compute update
        accelerator: Optional[str] = None,
        instance_size: Optional[str] = None,
        instance_type: Optional[str] = None,
        min_replica: Optional[int] = None,
        max_replica: Optional[int] = None,
        scale_to_zero_timeout: Optional[int] = None,
        # Model update
        repository: Optional[str] = None,
        framework: Optional[str] = None,
        revision: Optional[str] = None,
        task: Optional[str] = None,
        custom_image: Optional[Dict] = None,
        secrets: Optional[Dict[str, str]] = None,
    ) -> "InferenceEndpoint":
        """Update the Inference Endpoint.

        This method allows the update of either the compute configuration, the deployed model, or both. All arguments
        are optional but at least one must be provided.

        This is an alias for [`HfApi.update_inference_endpoint`]. The current object is mutated in place with the
        latest data from the server.

        Args:
            accelerator (`str`, *optional*):
                The hardware accelerator to be used for inference (e.g. `"cpu"`).
            instance_size (`str`, *optional*):
                The size or type of the instance to be used for hosting the model (e.g. `"x4"`).
            instance_type (`str`, *optional*):
                The cloud instance type where the Inference Endpoint will be deployed (e.g. `"intel-icl"`).
            min_replica (`int`, *optional*):
                The minimum number of replicas (instances) to keep running for the Inference Endpoint.
            max_replica (`int`, *optional*):
                The maximum number of replicas (instances) to scale to for the Inference Endpoint.
            scale_to_zero_timeout (`int`, *optional*):
                The duration in minutes before an inactive endpoint is scaled to zero.

            repository (`str`, *optional*):
                The name of the model repository associated with the Inference Endpoint (e.g. `"gpt2"`).
            framework (`str`, *optional*):
                The machine learning framework used for the model (e.g. `"custom"`).
            revision (`str`, *optional*):
                The specific model revision to deploy on the Inference Endpoint (e.g. `"6c0e6080953db56375760c0471a8c5f2929baf11"`).
            task (`str`, *optional*):
                The task on which to deploy the model (e.g. `"text-classification"`).
            custom_image (`Dict`, *optional*):
                A custom Docker image to use for the Inference Endpoint. This is useful if you want to deploy an
                Inference Endpoint running on the `text-generation-inference` (TGI) framework (see examples).
            secrets (`Dict[str, str]`, *optional*):
                Secret values to inject in the container environment.

        Returns:
            [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
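
        Example (illustrative; values are placeholders):
        ```python
        # Scale out an existing endpoint without touching the deployed model
        >>> endpoint.update(min_replica=1, max_replica=4)

        # Or deploy another revision of the same repository
        >>> endpoint.update(repository="gpt2", revision="main")
        ```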
"""
|
||||
# Make API call
|
||||
obj = self._api.update_inference_endpoint(
|
||||
name=self.name,
|
||||
namespace=self.namespace,
|
||||
accelerator=accelerator,
|
||||
instance_size=instance_size,
|
||||
instance_type=instance_type,
|
||||
min_replica=min_replica,
|
||||
max_replica=max_replica,
|
||||
scale_to_zero_timeout=scale_to_zero_timeout,
|
||||
repository=repository,
|
||||
framework=framework,
|
||||
revision=revision,
|
||||
task=task,
|
||||
custom_image=custom_image,
|
||||
secrets=secrets,
|
||||
token=self._token, # type: ignore [arg-type]
|
||||
)
|
||||
|
||||
# Mutate current object
|
||||
self.raw = obj.raw
|
||||
self._populate_from_raw()
|
||||
return self
|
||||
|
||||
    def pause(self) -> "InferenceEndpoint":
        """Pause the Inference Endpoint.

        A paused Inference Endpoint will not be charged. It can be resumed at any time using
        [`InferenceEndpoint.resume`]. This is different from scaling the Inference Endpoint to zero with
        [`InferenceEndpoint.scale_to_zero`]: a scaled-to-zero endpoint is automatically restarted when a request is
        made to it, while a paused one must be resumed explicitly.

        This is an alias for [`HfApi.pause_inference_endpoint`]. The current object is mutated in place with the
        latest data from the server.

        Returns:
            [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
        """
        obj = self._api.pause_inference_endpoint(name=self.name, namespace=self.namespace, token=self._token)  # type: ignore [arg-type]
        self.raw = obj.raw
        self._populate_from_raw()
        return self

    def resume(self, running_ok: bool = True) -> "InferenceEndpoint":
        """Resume the Inference Endpoint.

        This is an alias for [`HfApi.resume_inference_endpoint`]. The current object is mutated in place with the
        latest data from the server.

        Args:
            running_ok (`bool`, *optional*):
                If `True`, the method will not raise an error if the Inference Endpoint is already running. Defaults
                to `True`.

        Returns:
            [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
        """
        obj = self._api.resume_inference_endpoint(
            name=self.name, namespace=self.namespace, running_ok=running_ok, token=self._token
        )  # type: ignore [arg-type]
        self.raw = obj.raw
        self._populate_from_raw()
        return self

    def scale_to_zero(self) -> "InferenceEndpoint":
        """Scale Inference Endpoint to zero.

        An Inference Endpoint scaled to zero will not be charged. It will be resumed on the next request to it, with
        a cold start delay. This is different from pausing the Inference Endpoint with [`InferenceEndpoint.pause`],
        which requires a manual resume with [`InferenceEndpoint.resume`].

        This is an alias for [`HfApi.scale_to_zero_inference_endpoint`]. The current object is mutated in place with
        the latest data from the server.

        Returns:
            [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
        """
        obj = self._api.scale_to_zero_inference_endpoint(name=self.name, namespace=self.namespace, token=self._token)  # type: ignore [arg-type]
        self.raw = obj.raw
        self._populate_from_raw()
        return self

    def delete(self) -> None:
        """Delete the Inference Endpoint.

        This operation is not reversible. If you don't want to be charged for an Inference Endpoint, it is preferable
        to pause it with [`InferenceEndpoint.pause`] or scale it to zero with [`InferenceEndpoint.scale_to_zero`].

        This is an alias for [`HfApi.delete_inference_endpoint`].
        """
        self._api.delete_inference_endpoint(name=self.name, namespace=self.namespace, token=self._token)  # type: ignore [arg-type]

    def _populate_from_raw(self) -> None:
        """Populate fields from raw dictionary.

        Called in __post_init__ + each time the Inference Endpoint is updated.
        """
        # Repr fields
        self.name = self.raw["name"]
        self.repository = self.raw["model"]["repository"]
        self.status = self.raw["status"]["state"]
        self.url = self.raw["status"].get("url")

        # Other fields
        self.framework = self.raw["model"]["framework"]
        self.revision = self.raw["model"]["revision"]
        self.task = self.raw["model"]["task"]
        self.created_at = parse_datetime(self.raw["status"]["createdAt"])
        self.updated_at = parse_datetime(self.raw["status"]["updatedAt"])
        self.type = self.raw["type"]

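
# --- Editorial sketch (not part of the library) ---
# `_populate_from_raw` documents the payload shape that `from_raw` expects. The
# dict below is a minimal, hypothetical example of such a payload; every value
# is invented for illustration.
def _example_from_raw() -> InferenceEndpoint:  # pragma: no cover
    raw = {
        "name": "my-endpoint",
        "type": "protected",
        "model": {
            "repository": "gpt2",
            "framework": "pytorch",
            "revision": "main",
            "task": "text-generation",
        },
        "status": {
            "state": "pending",
            "createdAt": "2024-01-01T00:00:00.000Z",
            "updatedAt": "2024-01-01T00:00:00.000Z",
            # "url" only appears once the endpoint is deployed
        },
    }
    return InferenceEndpoint.from_raw(raw, namespace="my-username")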
@@ -0,0 +1,432 @@
# coding=utf-8
# Copyright 2024-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains utilities to handle the `./.cache/huggingface` folder in local directories.

First discussed in https://github.com/huggingface/huggingface_hub/issues/1738 to store
download metadata when downloading files from the hub to a local directory (without
using the cache).

./.cache/huggingface folder structure:
[4.0K]  data
├── [4.0K]  .cache
│   └── [4.0K]  huggingface
│       └── [4.0K]  download
│           ├── [  16]  file.parquet.metadata
│           ├── [  16]  file.txt.metadata
│           └── [4.0K]  folder
│               └── [  16]  file.parquet.metadata
│
├── [6.5G]  file.parquet
├── [1.5K]  file.txt
└── [4.0K]  folder
    └── [  16]  file.parquet


Download metadata file structure:
```
# file.txt.metadata
11c5a3d5811f50298f278a704980280950aedb10
a16a55fda99d2f2e7b69cce5cf93ff4ad3049930
1712656091.123

# file.parquet.metadata
11c5a3d5811f50298f278a704980280950aedb10
7c5d3f4b8b76583b422fcb9189ad6c89d5d97a094541ce8932dce3ecabde1421
1712656091.123
```
"""
import base64
import hashlib
import logging
import os
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

from .utils import WeakFileLock


logger = logging.getLogger(__name__)


@dataclass
class LocalDownloadFilePaths:
    """
    Paths to the files related to a download process in a local dir.

    Returned by [`get_local_download_paths`].

    Attributes:
        file_path (`Path`):
            Path where the file will be saved.
        lock_path (`Path`):
            Path to the lock file used to ensure atomicity when reading/writing metadata.
        metadata_path (`Path`):
            Path to the metadata file.
    """

    file_path: Path
    lock_path: Path
    metadata_path: Path

    def incomplete_path(self, etag: str) -> Path:
        """Return the path where a file will be temporarily downloaded before being moved to `file_path`."""
        return self.metadata_path.parent / f"{_short_hash(self.metadata_path.name)}.{etag}.incomplete"

@dataclass(frozen=True)
class LocalUploadFilePaths:
    """
    Paths to the files related to an upload process in a local dir.

    Returned by [`get_local_upload_paths`].

    Attributes:
        path_in_repo (`str`):
            Path of the file in the repo.
        file_path (`Path`):
            Path where the file will be saved.
        lock_path (`Path`):
            Path to the lock file used to ensure atomicity when reading/writing metadata.
        metadata_path (`Path`):
            Path to the metadata file.
    """

    path_in_repo: str
    file_path: Path
    lock_path: Path
    metadata_path: Path

@dataclass
class LocalDownloadFileMetadata:
    """
    Metadata about a file in the local directory related to a download process.

    Attributes:
        filename (`str`):
            Path of the file in the repo.
        commit_hash (`str`):
            Commit hash of the file in the repo.
        etag (`str`):
            ETag of the file in the repo. Used to check if the file has changed.
            For LFS files, this is the sha256 of the file. For regular files, it corresponds to the git hash.
        timestamp (`float`):
            Unix timestamp of when the metadata was saved i.e. when the metadata was accurate.
    """

    filename: str
    commit_hash: str
    etag: str
    timestamp: float

@dataclass
class LocalUploadFileMetadata:
    """
    Metadata about a file in the local directory related to an upload process.
    """

    size: int

    # Default values correspond to "we don't know yet"
    timestamp: Optional[float] = None
    should_ignore: Optional[bool] = None
    sha256: Optional[str] = None
    upload_mode: Optional[str] = None
    is_uploaded: bool = False
    is_committed: bool = False

    def save(self, paths: LocalUploadFilePaths) -> None:
        """Save the metadata to disk."""
        with WeakFileLock(paths.lock_path):
            with paths.metadata_path.open("w") as f:
                new_timestamp = time.time()
                f.write(str(new_timestamp) + "\n")

                f.write(str(self.size))  # never None
                f.write("\n")

                # Optional values are written as empty lines while unknown so that
                # `read_upload_metadata` always finds fields at fixed line positions.
                if self.should_ignore is not None:
                    f.write(str(int(self.should_ignore)))
                f.write("\n")

                if self.sha256 is not None:
                    f.write(self.sha256)
                f.write("\n")

                if self.upload_mode is not None:
                    f.write(self.upload_mode)
                f.write("\n")

                f.write(str(int(self.is_uploaded)) + "\n")
                f.write(str(int(self.is_committed)) + "\n")

            self.timestamp = new_timestamp

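
# --- Editorial sketch (not part of the library) ---
# For reference, `LocalUploadFileMetadata.save` serializes one value per line,
# in this order (example values are invented):
#
#     1712656091.123    timestamp (time.time() at save)
#     1048576           size (always known)
#     0                 should_ignore (empty line while unknown)
#     7c5d3f4b8b76...   sha256 (empty line while not yet computed)
#     lfs               upload_mode (empty line while not yet determined)
#     1                 is_uploaded
#     0                 is_committed
#
# Empty lines parse back to `None` in `read_upload_metadata`.
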
def get_local_download_paths(local_dir: Path, filename: str) -> LocalDownloadFilePaths:
    """Compute paths to the files related to a download process.

    Folders containing the paths are all guaranteed to exist.

    Args:
        local_dir (`Path`):
            Path to the local directory in which files are downloaded.
        filename (`str`):
            Path of the file in the repo.

    Return:
        [`LocalDownloadFilePaths`]: the paths to the files (file_path, lock_path, metadata_path, incomplete_path).
    """
    # filename is the path in the Hub repository (separated by '/')
    # make sure to have a cross platform transcription
    sanitized_filename = os.path.join(*filename.split("/"))
    if os.name == "nt":
        if sanitized_filename.startswith("..\\") or "\\..\\" in sanitized_filename:
            raise ValueError(
                f"Invalid filename: cannot handle filename '{sanitized_filename}' on Windows. Please ask the repository"
                " owner to rename this file."
            )
    file_path = local_dir / sanitized_filename
    metadata_path = _huggingface_dir(local_dir) / "download" / f"{sanitized_filename}.metadata"
    lock_path = metadata_path.with_suffix(".lock")

    # Some Windows versions do not allow for paths longer than 255 characters.
    # In this case, we must specify it as an extended path by using the "\\?\" prefix
    if os.name == "nt":
        if not str(local_dir).startswith("\\\\?\\") and len(os.path.abspath(lock_path)) > 255:
            file_path = Path("\\\\?\\" + os.path.abspath(file_path))
            lock_path = Path("\\\\?\\" + os.path.abspath(lock_path))
            metadata_path = Path("\\\\?\\" + os.path.abspath(metadata_path))

    file_path.parent.mkdir(parents=True, exist_ok=True)
    metadata_path.parent.mkdir(parents=True, exist_ok=True)
    return LocalDownloadFilePaths(file_path=file_path, lock_path=lock_path, metadata_path=metadata_path)

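
# --- Editorial sketch (not part of the library) ---
# A hypothetical call: for repo file "folder/file.parquet" downloaded into
# "./data", the helper yields paths under "./data/.cache/huggingface/download".
def _example_download_paths() -> None:  # pragma: no cover
    paths = get_local_download_paths(Path("./data"), "folder/file.parquet")
    print(paths.file_path)      # data/folder/file.parquet
    print(paths.metadata_path)  # data/.cache/huggingface/download/folder/file.parquet.metadata
    print(paths.lock_path)      # same as metadata_path, with ".metadata" replaced by ".lock"
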
def get_local_upload_paths(local_dir: Path, filename: str) -> LocalUploadFilePaths:
    """Compute paths to the files related to an upload process.

    Folders containing the paths are all guaranteed to exist.

    Args:
        local_dir (`Path`):
            Path to the local directory that is uploaded.
        filename (`str`):
            Path of the file in the repo.

    Return:
        [`LocalUploadFilePaths`]: the paths to the files (file_path, lock_path, metadata_path).
    """
    # filename is the path in the Hub repository (separated by '/')
    # make sure to have a cross platform transcription
    sanitized_filename = os.path.join(*filename.split("/"))
    if os.name == "nt":
        if sanitized_filename.startswith("..\\") or "\\..\\" in sanitized_filename:
            raise ValueError(
                f"Invalid filename: cannot handle filename '{sanitized_filename}' on Windows. Please ask the repository"
                " owner to rename this file."
            )
    file_path = local_dir / sanitized_filename
    metadata_path = _huggingface_dir(local_dir) / "upload" / f"{sanitized_filename}.metadata"
    lock_path = metadata_path.with_suffix(".lock")

    # Some Windows versions do not allow for paths longer than 255 characters.
    # In this case, we must specify it as an extended path by using the "\\?\" prefix
    if os.name == "nt":
        if not str(local_dir).startswith("\\\\?\\") and len(os.path.abspath(lock_path)) > 255:
            file_path = Path("\\\\?\\" + os.path.abspath(file_path))
            lock_path = Path("\\\\?\\" + os.path.abspath(lock_path))
            metadata_path = Path("\\\\?\\" + os.path.abspath(metadata_path))

    file_path.parent.mkdir(parents=True, exist_ok=True)
    metadata_path.parent.mkdir(parents=True, exist_ok=True)
    return LocalUploadFilePaths(
        path_in_repo=filename, file_path=file_path, lock_path=lock_path, metadata_path=metadata_path
    )

def read_download_metadata(local_dir: Path, filename: str) -> Optional[LocalDownloadFileMetadata]:
    """Read metadata about a file in the local directory related to a download process.

    Args:
        local_dir (`Path`):
            Path to the local directory in which files are downloaded.
        filename (`str`):
            Path of the file in the repo.

    Return:
        [`LocalDownloadFileMetadata`] or `None`: the metadata if it exists, `None` otherwise.
    """
    paths = get_local_download_paths(local_dir, filename)
    with WeakFileLock(paths.lock_path):
        if paths.metadata_path.exists():
            metadata: Optional[LocalDownloadFileMetadata] = None
            try:
                with paths.metadata_path.open() as f:
                    commit_hash = f.readline().strip()
                    etag = f.readline().strip()
                    timestamp = float(f.readline().strip())
                metadata = LocalDownloadFileMetadata(
                    filename=filename,
                    commit_hash=commit_hash,
                    etag=etag,
                    timestamp=timestamp,
                )
            except Exception as e:
                # remove the metadata file if it is corrupted / not the right format
                logger.warning(
                    f"Invalid metadata file {paths.metadata_path}: {e}. Removing it from disk and continuing."
                )
                try:
                    paths.metadata_path.unlink()
                except Exception as e:
                    logger.warning(f"Could not remove corrupted metadata file {paths.metadata_path}: {e}")

            # guard against the corrupted-metadata case above, in which `metadata` was never built
            if metadata is not None:
                try:
                    # check if the file exists and hasn't been modified since the metadata was saved
                    stat = paths.file_path.stat()
                    if (
                        stat.st_mtime - 1 <= metadata.timestamp
                    ):  # allow 1s difference as stat.st_mtime might not be precise
                        return metadata
                    logger.info(f"Ignored metadata for '{filename}' (outdated). Will re-compute hash.")
                except FileNotFoundError:
                    # file does not exist => metadata is outdated
                    return None
    return None

def read_upload_metadata(local_dir: Path, filename: str) -> LocalUploadFileMetadata:
    """Read metadata about a file in the local directory related to an upload process.

    TODO: factorize logic with `read_download_metadata`.

    Args:
        local_dir (`Path`):
            Path to the local directory in which files are uploaded.
        filename (`str`):
            Path of the file in the repo.

    Return:
        [`LocalUploadFileMetadata`]: the saved metadata if it exists and is up to date, otherwise a fresh metadata
        object for which only the file size is known.
    """
    paths = get_local_upload_paths(local_dir, filename)
    with WeakFileLock(paths.lock_path):
        if paths.metadata_path.exists():
            metadata: Optional[LocalUploadFileMetadata] = None
            try:
                with paths.metadata_path.open() as f:
                    timestamp = float(f.readline().strip())

                    size = int(f.readline().strip())  # never None

                    _should_ignore = f.readline().strip()
                    should_ignore = None if _should_ignore == "" else bool(int(_should_ignore))

                    _sha256 = f.readline().strip()
                    sha256 = None if _sha256 == "" else _sha256

                    _upload_mode = f.readline().strip()
                    upload_mode = None if _upload_mode == "" else _upload_mode
                    if upload_mode not in (None, "regular", "lfs"):
                        raise ValueError(f"Invalid upload mode in metadata {paths.path_in_repo}: {upload_mode}")

                    is_uploaded = bool(int(f.readline().strip()))
                    is_committed = bool(int(f.readline().strip()))

                metadata = LocalUploadFileMetadata(
                    timestamp=timestamp,
                    size=size,
                    should_ignore=should_ignore,
                    sha256=sha256,
                    upload_mode=upload_mode,
                    is_uploaded=is_uploaded,
                    is_committed=is_committed,
                )
            except Exception as e:
                # remove the metadata file if it is corrupted / not the right format
                logger.warning(
                    f"Invalid metadata file {paths.metadata_path}: {e}. Removing it from disk and continuing."
                )
                try:
                    paths.metadata_path.unlink()
                except Exception as e:
                    logger.warning(f"Could not remove corrupted metadata file {paths.metadata_path}: {e}")

            # guard against the corrupted-metadata case above, in which `metadata` was never built
            if metadata is not None:
                # TODO: can we do better?
                if (
                    metadata.timestamp is not None
                    and metadata.is_uploaded  # file was uploaded
                    and not metadata.is_committed  # but not committed
                    and time.time() - metadata.timestamp > 20 * 3600  # and it's been more than 20 hours
                ):  # => we consider it as garbage-collected by S3
                    metadata.is_uploaded = False

                # check if the file exists and hasn't been modified since the metadata was saved
                try:
                    if metadata.timestamp is not None and paths.file_path.stat().st_mtime <= metadata.timestamp:
                        return metadata
                    logger.info(f"Ignored metadata for '{filename}' (outdated). Will re-compute hash.")
                except FileNotFoundError:
                    # file does not exist => metadata is outdated
                    pass

    # empty metadata => we don't know anything except its size
    return LocalUploadFileMetadata(size=paths.file_path.stat().st_size)

def write_download_metadata(local_dir: Path, filename: str, commit_hash: str, etag: str) -> None:
    """Write metadata about a file in the local directory related to a download process.

    Args:
        local_dir (`Path`):
            Path to the local directory in which files are downloaded.
        filename (`str`):
            Path of the file in the repo.
        commit_hash (`str`):
            Commit hash of the file in the repo.
        etag (`str`):
            ETag of the file in the repo.
    """
    paths = get_local_download_paths(local_dir, filename)
    with WeakFileLock(paths.lock_path):
        with paths.metadata_path.open("w") as f:
            f.write(f"{commit_hash}\n{etag}\n{time.time()}\n")

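
# --- Editorial sketch (not part of the library) ---
# A hypothetical write/read round trip: metadata written right after a file is
# saved is considered fresh by `read_download_metadata`. Hash values are invented.
def _example_metadata_round_trip() -> None:  # pragma: no cover
    local_dir = Path("./data")
    paths = get_local_download_paths(local_dir, "file.txt")
    paths.file_path.write_text("hello")  # simulate a completed download
    write_download_metadata(
        local_dir,
        "file.txt",
        commit_hash="11c5a3d5811f50298f278a704980280950aedb10",
        etag="a16a55fda99d2f2e7b69cce5cf93ff4ad3049930",
    )
    metadata = read_download_metadata(local_dir, "file.txt")
    assert metadata is not None and metadata.etag == "a16a55fda99d2f2e7b69cce5cf93ff4ad3049930"
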
def _huggingface_dir(local_dir: Path) -> Path:
    """Return the path to the `.cache/huggingface` directory in a local directory."""
    path = local_dir / ".cache" / "huggingface"
    path.mkdir(exist_ok=True, parents=True)

    # Create a .gitignore file in the .cache/huggingface directory if it doesn't exist
    # Should be thread-safe enough like this.
    gitignore = path / ".gitignore"
    gitignore_lock = path / ".gitignore.lock"
    if not gitignore.exists():
        try:
            with WeakFileLock(gitignore_lock, timeout=0.1):
                gitignore.write_text("*")
        except IndexError:
            pass
        except OSError:  # TimeoutError, FileNotFoundError, PermissionError, etc.
            pass
        try:
            gitignore_lock.unlink()
        except OSError:
            pass
    return path


def _short_hash(filename: str) -> str:
    return base64.urlsafe_b64encode(hashlib.sha1(filename.encode()).digest()).decode()
520
.venv/lib/python3.10/site-packages/huggingface_hub/_login.py
Normal file
@@ -0,0 +1,520 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains methods to log in to the Hub."""

import os
import subprocess
from getpass import getpass
from pathlib import Path
from typing import Optional

from . import constants
from .commands._cli_utils import ANSI
from .utils import (
    capture_output,
    get_token,
    is_google_colab,
    is_notebook,
    list_credential_helpers,
    logging,
    run_subprocess,
    set_git_credential,
    unset_git_credential,
)
from .utils._auth import (
    _get_token_by_name,
    _get_token_from_environment,
    _get_token_from_file,
    _get_token_from_google_colab,
    _save_stored_tokens,
    _save_token,
    get_stored_tokens,
)
from .utils._deprecation import _deprecate_arguments, _deprecate_positional_args


logger = logging.get_logger(__name__)

_HF_LOGO_ASCII = """
_|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
_|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
_|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
_|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
_|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
"""

@_deprecate_arguments(
    version="1.0",
    deprecated_args="write_permission",
    custom_message="Fine-grained tokens added complexity to the permissions, making it irrelevant to check if a token has 'write' access.",
)
@_deprecate_positional_args(version="1.0")
def login(
    token: Optional[str] = None,
    *,
    add_to_git_credential: bool = False,
    new_session: bool = True,
    write_permission: bool = False,
) -> None:
    """Login the machine to access the Hub.

    The `token` is persisted in cache and set as a git credential. Once done, the machine
    is logged in and the access token will be available across all `huggingface_hub`
    components. If `token` is not provided, the user will be prompted for one, either
    with a widget (in a notebook) or via the terminal.

    To log in from outside of a script, one can also use `huggingface-cli login`, a CLI
    command that wraps [`login`].

    <Tip>

    [`login`] is a drop-in replacement method for [`notebook_login`] as it wraps and
    extends its capabilities.

    </Tip>

    <Tip>

    When the token is not passed, [`login`] will automatically detect if the script runs
    in a notebook or not. However, this detection might not be accurate due to the
    variety of notebooks that exists nowadays. If that is the case, you can always force
    the UI by using [`notebook_login`] or [`interpreter_login`].

    </Tip>

    Args:
        token (`str`, *optional*):
            User access token to generate from https://huggingface.co/settings/token.
        add_to_git_credential (`bool`, defaults to `False`):
            If `True`, token will be set as git credential. If no git credential helper
            is configured, a warning will be displayed to the user. If `token` is `None`,
            the value of `add_to_git_credential` is ignored and will be prompted again
            to the end user.
        new_session (`bool`, defaults to `True`):
            If `True`, will request a token even if one is already saved on the machine.
        write_permission (`bool`):
            Ignored and deprecated argument.

    Raises:
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If an organization token is passed. Only personal account tokens are valid
            to log in.
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If token is invalid.
        [`ImportError`](https://docs.python.org/3/library/exceptions.html#ImportError)
            If running in a notebook but `ipywidgets` is not installed.
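
    Example:
    ```python
    >>> from huggingface_hub import login

    # Interactive login: prompts in the terminal, or shows a widget in a notebook.
    >>> login()

    # Non-interactive login (e.g. in CI). The token value below is a placeholder.
    >>> login(token="hf_xxx", add_to_git_credential=True)
    ```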
"""
|
||||
if token is not None:
|
||||
if not add_to_git_credential:
|
||||
logger.info(
|
||||
"The token has not been saved to the git credentials helper. Pass "
|
||||
"`add_to_git_credential=True` in this function directly or "
|
||||
"`--add-to-git-credential` if using via `huggingface-cli` if "
|
||||
"you want to set the git credential as well."
|
||||
)
|
||||
_login(token, add_to_git_credential=add_to_git_credential)
|
||||
elif is_notebook():
|
||||
notebook_login(new_session=new_session)
|
||||
else:
|
||||
interpreter_login(new_session=new_session)
|
||||
|
||||
|
||||
def logout(token_name: Optional[str] = None) -> None:
    """Logout the machine from the Hub.

    Token is deleted from the machine and removed from git credential.

    Args:
        token_name (`str`, *optional*):
            Name of the access token to logout from. If `None`, will logout from all saved access tokens.

    Raises:
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError):
            If the access token name is not found.
    """
    if get_token() is None and not get_stored_tokens():  # No active token and no saved access tokens
        logger.warning("Not logged in!")
        return
    if not token_name:
        # Delete all saved access tokens and token
        for file_path in (constants.HF_TOKEN_PATH, constants.HF_STORED_TOKENS_PATH):
            try:
                Path(file_path).unlink()
            except FileNotFoundError:
                pass
        logger.info("Successfully logged out from all access tokens.")
    else:
        _logout_from_token(token_name)
        logger.info(f"Successfully logged out from access token: {token_name}.")

    unset_git_credential()

    # Check if still logged in
    if _get_token_from_google_colab() is not None:
        raise EnvironmentError(
            "You are automatically logged in using a Google Colab secret.\n"
            "To log out, you must unset the `HF_TOKEN` secret in your Colab settings."
        )
    if _get_token_from_environment() is not None:
        raise EnvironmentError(
            "Token has been deleted from your machine but you are still logged in.\n"
            "To log out, you must clear out both `HF_TOKEN` and `HUGGING_FACE_HUB_TOKEN` environment variables."
        )

def auth_switch(token_name: str, add_to_git_credential: bool = False) -> None:
    """Switch to a different access token.

    Args:
        token_name (`str`):
            Name of the access token to switch to.
        add_to_git_credential (`bool`, defaults to `False`):
            If `True`, token will be set as git credential. If no git credential helper
            is configured, a warning will be displayed to the user.

    Raises:
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError):
            If the access token name is not found.
    """
    token = _get_token_by_name(token_name)
    if not token:
        raise ValueError(f"Access token {token_name} not found in {constants.HF_STORED_TOKENS_PATH}")
    # Write token to HF_TOKEN_PATH
    _set_active_token(token_name, add_to_git_credential)
    logger.info(f"The current active token is: {token_name}")
    token_from_environment = _get_token_from_environment()
    if token_from_environment is not None and token_from_environment != token:
        logger.warning(
            "The environment variable `HF_TOKEN` is set and will override the access token you've just switched to."
        )

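
# --- Editorial sketch (not part of the library) ---
# A hypothetical flow, assuming a token was previously saved under the name "work":
def _example_auth_switch() -> None:  # pragma: no cover
    auth_list()          # print all stored tokens, with "*" marking the active one
    auth_switch("work")  # make the token stored under "work" the active token
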
def auth_list() -> None:
    """List all stored access tokens."""
    tokens = get_stored_tokens()

    if not tokens:
        logger.info("No access tokens found.")
        return
    # Find current token
    current_token = get_token()
    current_token_name = None
    for token_name in tokens:
        if tokens.get(token_name) == current_token:
            current_token_name = token_name
    # Print header
    max_offset = max(len("token"), max(len(token) for token in tokens)) + 2
    print(f"  {{:<{max_offset}}}| {{:<15}}".format("name", "token"))
    print("-" * (max_offset + 2) + "|" + "-" * 15)

    # Print saved access tokens
    for token_name in tokens:
        token = tokens.get(token_name, "<not set>")
        masked_token = f"{token[:3]}****{token[-4:]}" if token != "<not set>" else token
        is_current = "*" if token == current_token else " "

        print(f"{is_current} {{:<{max_offset}}}| {{:<15}}".format(token_name, masked_token))

    if _get_token_from_environment():
        logger.warning(
            "\nNote: Environment variable `HF_TOKEN` is set and is the current active token independently from the stored tokens listed above."
        )
    elif current_token_name is None:
        logger.warning(
            "\nNote: No active token is set and no environment variable `HF_TOKEN` is found. Use `huggingface-cli login` to log in."
        )

###
# Interpreter-based login (text)
###


@_deprecate_arguments(
    version="1.0",
    deprecated_args="write_permission",
    custom_message="Fine-grained tokens added complexity to the permissions, making it irrelevant to check if a token has 'write' access.",
)
@_deprecate_positional_args(version="1.0")
def interpreter_login(*, new_session: bool = True, write_permission: bool = False) -> None:
    """
    Displays a prompt to log in to the HF website and store the token.

    This is equivalent to [`login`] without passing a token when not run in a notebook.
    [`interpreter_login`] is useful if you want to force the use of the terminal prompt
    instead of a notebook widget.

    For more details, see [`login`].

    Args:
        new_session (`bool`, defaults to `True`):
            If `True`, will request a token even if one is already saved on the machine.
        write_permission (`bool`):
            Ignored and deprecated argument.
    """
    if not new_session and get_token() is not None:
        logger.info("User is already logged in.")
        return

    from .commands.delete_cache import _ask_for_confirmation_no_tui

    print(_HF_LOGO_ASCII)
    if get_token() is not None:
        logger.info(
            " A token is already saved on your machine. Run `huggingface-cli"
            " whoami` to get more information or `huggingface-cli logout` if you want"
            " to log out."
        )
        logger.info(" Setting a new token will erase the existing one.")

    logger.info(
        " To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens ."
    )
    if os.name == "nt":
        logger.info("Token can be pasted using 'Right-Click'.")
    token = getpass("Enter your token (input will not be visible): ")
    add_to_git_credential = _ask_for_confirmation_no_tui("Add token as git credential?")

    _login(token=token, add_to_git_credential=add_to_git_credential)

###
# Notebook-based login (widget)
###

NOTEBOOK_LOGIN_PASSWORD_HTML = """<center> <img
src=https://huggingface.co/front/assets/huggingface_logo-noborder.svg
alt='Hugging Face'> <br> Immediately click login after typing your password or
it might be stored in plain text in this notebook file. </center>"""


NOTEBOOK_LOGIN_TOKEN_HTML_START = """<center> <img
src=https://huggingface.co/front/assets/huggingface_logo-noborder.svg
alt='Hugging Face'> <br> Copy a token from <a
href="https://huggingface.co/settings/tokens" target="_blank">your Hugging Face
tokens page</a> and paste it below. <br> Immediately click login after copying
your token or it might be stored in plain text in this notebook file. </center>"""


NOTEBOOK_LOGIN_TOKEN_HTML_END = """
<b>Pro Tip:</b> If you don't already have one, you can create a dedicated
'notebooks' token with 'write' access, that you can then easily reuse for all
notebooks. </center>"""

@_deprecate_arguments(
    version="1.0",
    deprecated_args="write_permission",
    custom_message="Fine-grained tokens added complexity to the permissions, making it irrelevant to check if a token has 'write' access.",
)
@_deprecate_positional_args(version="1.0")
def notebook_login(*, new_session: bool = True, write_permission: bool = False) -> None:
    """
    Displays a widget to log in to the HF website and store the token.

    This is equivalent to [`login`] without passing a token when run in a notebook.
    [`notebook_login`] is useful if you want to force the use of the notebook widget
    instead of a prompt in the terminal.

    For more details, see [`login`].

    Args:
        new_session (`bool`, defaults to `True`):
            If `True`, will request a token even if one is already saved on the machine.
        write_permission (`bool`):
            Ignored and deprecated argument.
    """
    try:
        import ipywidgets.widgets as widgets  # type: ignore
        from IPython.display import display  # type: ignore
    except ImportError:
        raise ImportError(
            "The `notebook_login` function can only be used in a notebook (Jupyter or"
            " Colab) and you need the `ipywidgets` module: `pip install ipywidgets`."
        )
    if not new_session and get_token() is not None:
        logger.info("User is already logged in.")
        return

    box_layout = widgets.Layout(display="flex", flex_flow="column", align_items="center", width="50%")

    token_widget = widgets.Password(description="Token:")
    git_checkbox_widget = widgets.Checkbox(value=True, description="Add token as git credential?")
    token_finish_button = widgets.Button(description="Login")

    login_token_widget = widgets.VBox(
        [
            widgets.HTML(NOTEBOOK_LOGIN_TOKEN_HTML_START),
            token_widget,
            git_checkbox_widget,
            token_finish_button,
            widgets.HTML(NOTEBOOK_LOGIN_TOKEN_HTML_END),
        ],
        layout=box_layout,
    )
    display(login_token_widget)

    # On click events
    def login_token_event(t):
        """Event handler for the login button."""
        token = token_widget.value
        add_to_git_credential = git_checkbox_widget.value
        # Erase token and clear value to make sure it's not saved in the notebook.
        token_widget.value = ""
        # Hide inputs
        login_token_widget.children = [widgets.Label("Connecting...")]
        try:
            with capture_output() as captured:
                _login(token, add_to_git_credential=add_to_git_credential)
            message = captured.getvalue()
        except Exception as error:
            message = str(error)
        # Print result (success message or error)
        login_token_widget.children = [widgets.Label(line) for line in message.split("\n") if line.strip()]

    token_finish_button.on_click(login_token_event)

###
# Login private helpers
###


def _login(
    token: str,
    add_to_git_credential: bool,
) -> None:
    from .hf_api import whoami  # avoid circular import

    if token.startswith("api_org"):
        raise ValueError("You must use your personal account token, not an organization token.")

    token_info = whoami(token)
    permission = token_info["auth"]["accessToken"]["role"]
    logger.info(f"Token is valid (permission: {permission}).")

    token_name = token_info["auth"]["accessToken"]["displayName"]
    # Store token locally
    _save_token(token=token, token_name=token_name)
    # Set active token
    _set_active_token(token_name=token_name, add_to_git_credential=add_to_git_credential)
    logger.info("Login successful.")
    if _get_token_from_environment():
        logger.warning(
            "Note: Environment variable `HF_TOKEN` is set and is the current active token independently from the token you've just configured."
        )
    else:
        logger.info(f"The current active token is: `{token_name}`")

def _logout_from_token(token_name: str) -> None:
    """Logout from a specific access token.

    Args:
        token_name (`str`):
            The name of the access token to logout from.

    Raises:
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError):
            If the access token name is not found.
    """
    stored_tokens = get_stored_tokens()
    # If there are no saved access tokens, or the access token name is not found, do nothing
    if not stored_tokens or token_name not in stored_tokens:
        return

    token = stored_tokens.pop(token_name)
    _save_stored_tokens(stored_tokens)

    if token == _get_token_from_file():
        logger.warning(f"Active token '{token_name}' has been deleted.")
        Path(constants.HF_TOKEN_PATH).unlink(missing_ok=True)

def _set_active_token(
    token_name: str,
    add_to_git_credential: bool,
) -> None:
    """Set the active access token.

    Args:
        token_name (`str`):
            The name of the token to set as active.
        add_to_git_credential (`bool`):
            If `True`, the token is also saved as a git credential (provided a git
            credential helper is configured).
    """
    token = _get_token_by_name(token_name)
    if not token:
        raise ValueError(f"Token {token_name} not found in {constants.HF_STORED_TOKENS_PATH}")
    if add_to_git_credential:
        if _is_git_credential_helper_configured():
            set_git_credential(token)
            logger.info(
                "Your token has been saved in your configured git credential helpers"
                + f" ({','.join(list_credential_helpers())})."
            )
        else:
            logger.warning("Token has not been saved to git credential helper.")
    # Write token to HF_TOKEN_PATH
    path = Path(constants.HF_TOKEN_PATH)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(token)
    logger.info(f"Your token has been saved to {constants.HF_TOKEN_PATH}")

def _is_git_credential_helper_configured() -> bool:
    """Check if a git credential helper is configured.

    Warns user if not the case (except for Google Colab where "store" is set by default
    by `huggingface_hub`).
    """
    helpers = list_credential_helpers()
    if len(helpers) > 0:
        return True  # Do not warn: at least 1 helper is set

    # Only in Google Colab to avoid the warning message
    # See https://github.com/huggingface/huggingface_hub/issues/1043#issuecomment-1247010710
    if is_google_colab():
        _set_store_as_git_credential_helper_globally()
        return True  # Do not warn: "store" is used by default in Google Colab

    # Otherwise, warn user
    print(
        ANSI.red(
            "Cannot authenticate through git-credential as no helper is defined on your"
            " machine.\nYou might have to re-authenticate when pushing to the Hugging"
            " Face Hub.\nRun the following command in your terminal in case you want to"
            " set the 'store' credential helper as default.\n\ngit config --global"
            " credential.helper store\n\nRead"
            " https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more"
            " details."
        )
    )
    return False

def _set_store_as_git_credential_helper_globally() -> None:
    """Set globally the credential.helper to `store`.

    To be used only in Google Colab as we assume the user doesn't care about the git
    credential config. It is the only particular case where we don't want to display
    the warning message in [`notebook_login()`].

    Related:
    - https://github.com/huggingface/huggingface_hub/issues/1043
    - https://github.com/huggingface/huggingface_hub/issues/1051
    - https://git-scm.com/docs/git-credential-store
    """
    try:
        run_subprocess("git config --global credential.helper store")
    except subprocess.CalledProcessError as exc:
        raise EnvironmentError(exc.stderr)
@@ -0,0 +1,307 @@
import os
from pathlib import Path
from typing import Dict, List, Literal, Optional, Union

import requests
from tqdm.auto import tqdm as base_tqdm
from tqdm.contrib.concurrent import thread_map

from . import constants
from .errors import GatedRepoError, LocalEntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError
from .file_download import REGEX_COMMIT_HASH, hf_hub_download, repo_folder_name
from .hf_api import DatasetInfo, HfApi, ModelInfo, SpaceInfo
from .utils import OfflineModeIsEnabled, filter_repo_objects, logging, validate_hf_hub_args
from .utils import tqdm as hf_tqdm


logger = logging.get_logger(__name__)

@validate_hf_hub_args
def snapshot_download(
    repo_id: str,
    *,
    repo_type: Optional[str] = None,
    revision: Optional[str] = None,
    cache_dir: Union[str, Path, None] = None,
    local_dir: Union[str, Path, None] = None,
    library_name: Optional[str] = None,
    library_version: Optional[str] = None,
    user_agent: Optional[Union[Dict, str]] = None,
    proxies: Optional[Dict] = None,
    etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
    force_download: bool = False,
    token: Optional[Union[bool, str]] = None,
    local_files_only: bool = False,
    allow_patterns: Optional[Union[List[str], str]] = None,
    ignore_patterns: Optional[Union[List[str], str]] = None,
    max_workers: int = 8,
    tqdm_class: Optional[base_tqdm] = None,
    headers: Optional[Dict[str, str]] = None,
    endpoint: Optional[str] = None,
    # Deprecated args
    local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
    resume_download: Optional[bool] = None,
) -> str:
"""Download repo files.
|
||||
|
||||
Download a whole snapshot of a repo's files at the specified revision. This is useful when you want all files from
|
||||
a repo, because you don't know which ones you will need a priori. All files are nested inside a folder in order
|
||||
to keep their actual filename relative to that folder. You can also filter which files to download using
|
||||
`allow_patterns` and `ignore_patterns`.
|
||||
|
||||
If `local_dir` is provided, the file structure from the repo will be replicated in this location. When using this
|
||||
option, the `cache_dir` will not be used and a `.cache/huggingface/` folder will be created at the root of `local_dir`
|
||||
to store some metadata related to the downloaded files. While this mechanism is not as robust as the main
|
||||
cache-system, it's optimized for regularly pulling the latest version of a repository.
|
||||
|
||||
An alternative would be to clone the repo but this requires git and git-lfs to be installed and properly
|
||||
configured. It is also not possible to filter which files to download when cloning a repository using git.
|
||||
|
||||
Args:
|
||||
repo_id (`str`):
|
||||
A user or an organization name and a repo name separated by a `/`.
|
||||
repo_type (`str`, *optional*):
|
||||
Set to `"dataset"` or `"space"` if downloading from a dataset or space,
|
||||
`None` or `"model"` if downloading from a model. Default is `None`.
|
||||
revision (`str`, *optional*):
|
||||
An optional Git revision id which can be a branch name, a tag, or a
|
||||
commit hash.
|
||||
cache_dir (`str`, `Path`, *optional*):
|
||||
Path to the folder where cached files are stored.
|
||||
local_dir (`str` or `Path`, *optional*):
|
||||
If provided, the downloaded files will be placed under this directory.
|
||||
library_name (`str`, *optional*):
|
||||
The name of the library to which the object corresponds.
|
||||
library_version (`str`, *optional*):
|
||||
The version of the library.
|
||||
user_agent (`str`, `dict`, *optional*):
|
||||
The user-agent info in the form of a dictionary or a string.
|
||||
proxies (`dict`, *optional*):
|
||||
Dictionary mapping protocol to the URL of the proxy passed to
|
||||
`requests.request`.
|
||||
etag_timeout (`float`, *optional*, defaults to `10`):
|
||||
When fetching ETag, how many seconds to wait for the server to send
|
||||
data before giving up which is passed to `requests.request`.
|
||||
force_download (`bool`, *optional*, defaults to `False`):
|
||||
Whether the file should be downloaded even if it already exists in the local cache.
|
||||
token (`str`, `bool`, *optional*):
|
||||
A token to be used for the download.
|
||||
- If `True`, the token is read from the HuggingFace config
|
||||
folder.
|
||||
- If a string, it's used as the authentication token.
|
||||
headers (`dict`, *optional*):
|
||||
Additional headers to include in the request. Those headers take precedence over the others.
|
||||
local_files_only (`bool`, *optional*, defaults to `False`):
|
||||
If `True`, avoid downloading the file and return the path to the
|
||||
local cached file if it exists.
|
||||
allow_patterns (`List[str]` or `str`, *optional*):
|
||||
If provided, only files matching at least one pattern are downloaded.
|
||||
ignore_patterns (`List[str]` or `str`, *optional*):
|
||||
If provided, files matching any of the patterns are not downloaded.
|
||||
max_workers (`int`, *optional*):
|
||||
Number of concurrent threads to download files (1 thread = 1 file download).
|
||||
Defaults to 8.
|
||||
tqdm_class (`tqdm`, *optional*):
|
||||
If provided, overwrites the default behavior for the progress bar. Passed
|
||||
argument must inherit from `tqdm.auto.tqdm` or at least mimic its behavior.
|
||||
Note that the `tqdm_class` is not passed to each individual download.
|
||||
Defaults to the custom HF progress bar that can be disabled by setting
|
||||
`HF_HUB_DISABLE_PROGRESS_BARS` environment variable.
|
||||
|
||||
Returns:
|
||||
`str`: folder path of the repo snapshot.
|
||||
|
||||
Raises:
|
||||
[`~utils.RepositoryNotFoundError`]
|
||||
If the repository to download from cannot be found. This may be because it doesn't exist,
|
||||
or because it is set to `private` and you do not have access.
|
||||
[`~utils.RevisionNotFoundError`]
|
||||
If the revision to download from cannot be found.
|
||||
[`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
|
||||
If `token=True` and the token cannot be found.
|
||||
[`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) if
|
||||
ETag cannot be determined.
|
||||
[`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
|
||||
if some parameter value is invalid.
|
||||
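Example (a minimal sketch; `"gpt2"` stands in for any public repo id and the
local paths/patterns are illustrative):

```py
>>> from huggingface_hub import snapshot_download

# Download a full snapshot of a repo into the cache and get its local path
>>> folder = snapshot_download("gpt2")

# Download only the JSON files of a given revision into a local folder
>>> folder = snapshot_download(
...     "gpt2",
...     revision="main",
...     local_dir="./gpt2-configs",
...     allow_patterns=["*.json"],
... )
```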
"""
|
||||
if cache_dir is None:
|
||||
cache_dir = constants.HF_HUB_CACHE
|
||||
if revision is None:
|
||||
revision = constants.DEFAULT_REVISION
|
||||
if isinstance(cache_dir, Path):
|
||||
cache_dir = str(cache_dir)
|
||||
|
||||
if repo_type is None:
|
||||
repo_type = "model"
|
||||
if repo_type not in constants.REPO_TYPES:
|
||||
raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(constants.REPO_TYPES)}")
|
||||
|
||||
storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))
|
||||
|
||||
repo_info: Union[ModelInfo, DatasetInfo, SpaceInfo, None] = None
|
||||
api_call_error: Optional[Exception] = None
|
||||
if not local_files_only:
|
||||
# try/except logic to handle different errors => taken from `hf_hub_download`
|
||||
try:
|
||||
# if we have internet connection we want to list files to download
|
||||
api = HfApi(
|
||||
library_name=library_name,
|
||||
library_version=library_version,
|
||||
user_agent=user_agent,
|
||||
endpoint=endpoint,
|
||||
headers=headers,
|
||||
)
|
||||
repo_info = api.repo_info(repo_id=repo_id, repo_type=repo_type, revision=revision, token=token)
|
||||
except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
|
||||
# Actually raise for those subclasses of ConnectionError
|
||||
raise
|
||||
except (
|
||||
requests.exceptions.ConnectionError,
|
||||
requests.exceptions.Timeout,
|
||||
OfflineModeIsEnabled,
|
||||
) as error:
|
||||
# Internet connection is down
|
||||
# => will try to use local files only
|
||||
api_call_error = error
|
||||
except RevisionNotFoundError:
|
||||
# The repo was found but the revision doesn't exist on the Hub (never existed or got deleted)
|
||||
raise
|
||||
except requests.HTTPError as error:
|
||||
# Multiple reasons for an http error:
|
||||
# - Repository is private and invalid/missing token sent
|
||||
# - Repository is gated and invalid/missing token sent
|
||||
# - Hub is down (error 500 or 504)
|
||||
# => let's switch to 'local_files_only=True' to check if the files are already cached.
|
||||
# (if it's not the case, the error will be re-raised)
|
||||
api_call_error = error
|
||||
|
||||
# At this stage, if `repo_info` is None it means either:
|
||||
# - internet connection is down
|
||||
# - internet connection is deactivated (local_files_only=True or HF_HUB_OFFLINE=True)
|
||||
# - repo is private/gated and invalid/missing token sent
|
||||
# - Hub is down
|
||||
# => let's look if we can find the appropriate folder in the cache:
|
||||
# - if the specified revision is a commit hash, look inside "snapshots".
|
||||
# - if the specified revision is a branch or tag, look inside "refs".
|
||||
# => if local_dir is not None, we will return the path to the local folder if it exists.
|
||||
if repo_info is None:
|
||||
# Try to get which commit hash corresponds to the specified revision
|
||||
commit_hash = None
|
||||
if REGEX_COMMIT_HASH.match(revision):
|
||||
commit_hash = revision
|
||||
else:
|
||||
ref_path = os.path.join(storage_folder, "refs", revision)
|
||||
if os.path.exists(ref_path):
|
||||
# retrieve commit_hash from refs file
|
||||
with open(ref_path) as f:
|
||||
commit_hash = f.read()
|
||||
|
||||
# Try to locate snapshot folder for this commit hash
|
||||
if commit_hash is not None:
|
||||
snapshot_folder = os.path.join(storage_folder, "snapshots", commit_hash)
|
||||
if os.path.exists(snapshot_folder):
|
||||
# Snapshot folder exists => let's return it
|
||||
# (but we can't check if all the files are actually there)
|
||||
return snapshot_folder
|
||||
# If local_dir is not None, return it if it exists and is not empty
|
||||
if local_dir is not None:
|
||||
local_dir = Path(local_dir)
|
||||
if local_dir.is_dir() and any(local_dir.iterdir()):
|
||||
logger.warning(
|
||||
f"Returning existing local_dir `{local_dir}` as remote repo cannot be accessed in `snapshot_download` ({api_call_error})."
|
||||
)
|
||||
return str(local_dir.resolve())
|
||||
# If we couldn't find the appropriate folder on disk, raise an error.
|
||||
if local_files_only:
|
||||
raise LocalEntryNotFoundError(
|
||||
"Cannot find an appropriate cached snapshot folder for the specified revision on the local disk and "
|
||||
"outgoing traffic has been disabled. To enable repo look-ups and downloads online, pass "
|
||||
"'local_files_only=False' as input."
|
||||
)
|
||||
elif isinstance(api_call_error, OfflineModeIsEnabled):
|
||||
raise LocalEntryNotFoundError(
|
||||
"Cannot find an appropriate cached snapshot folder for the specified revision on the local disk and "
|
||||
"outgoing traffic has been disabled. To enable repo look-ups and downloads online, set "
|
||||
"'HF_HUB_OFFLINE=0' as environment variable."
|
||||
) from api_call_error
|
||||
elif isinstance(api_call_error, RepositoryNotFoundError) or isinstance(api_call_error, GatedRepoError):
|
||||
# Repo not found => let's raise the actual error
|
||||
raise api_call_error
|
||||
else:
|
||||
# Otherwise: most likely a connection issue or Hub downtime => let's warn the user
|
||||
raise LocalEntryNotFoundError(
|
||||
"An error happened while trying to locate the files on the Hub and we cannot find the appropriate"
|
||||
" snapshot folder for the specified revision on the local disk. Please check your internet connection"
|
||||
" and try again."
|
||||
) from api_call_error
|
||||
|
||||
# At this stage, internet connection is up and running
|
||||
# => let's download the files!
|
||||
assert repo_info.sha is not None, "Repo info returned from server must have a revision sha."
|
||||
assert repo_info.siblings is not None, "Repo info returned from server must have a siblings list."
|
||||
filtered_repo_files = list(
|
||||
filter_repo_objects(
|
||||
items=[f.rfilename for f in repo_info.siblings],
|
||||
allow_patterns=allow_patterns,
|
||||
ignore_patterns=ignore_patterns,
|
||||
)
|
||||
)
|
||||
commit_hash = repo_info.sha
|
||||
snapshot_folder = os.path.join(storage_folder, "snapshots", commit_hash)
|
||||
# if passed revision is not identical to commit_hash
|
||||
# then revision has to be a branch name or tag name.
|
||||
# In that case store a ref.
|
||||
if revision != commit_hash:
|
||||
ref_path = os.path.join(storage_folder, "refs", revision)
|
||||
try:
|
||||
os.makedirs(os.path.dirname(ref_path), exist_ok=True)
|
||||
with open(ref_path, "w") as f:
|
||||
f.write(commit_hash)
|
||||
except OSError as e:
|
||||
logger.warning(f"Ignored error while writing commit hash to {ref_path}: {e}.")
|
||||
|
||||
# we pass the commit_hash to hf_hub_download
|
||||
# so no network call happens if we already
|
||||
# have the file locally.
|
||||
def _inner_hf_hub_download(repo_file: str):
|
||||
return hf_hub_download(
|
||||
repo_id,
|
||||
filename=repo_file,
|
||||
repo_type=repo_type,
|
||||
revision=commit_hash,
|
||||
endpoint=endpoint,
|
||||
cache_dir=cache_dir,
|
||||
local_dir=local_dir,
|
||||
local_dir_use_symlinks=local_dir_use_symlinks,
|
||||
library_name=library_name,
|
||||
library_version=library_version,
|
||||
user_agent=user_agent,
|
||||
proxies=proxies,
|
||||
etag_timeout=etag_timeout,
|
||||
resume_download=resume_download,
|
||||
force_download=force_download,
|
||||
token=token,
|
||||
headers=headers,
|
||||
)
|
||||
|
||||
if constants.HF_HUB_ENABLE_HF_TRANSFER:
|
||||
# when using hf_transfer we don't want extra parallelism
|
||||
# from the one hf_transfer provides
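# (hf_transfer is opt-in: install the `hf_transfer` package and set the
# `HF_HUB_ENABLE_HF_TRANSFER=1` environment variable to enable it.)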
|
||||
for file in filtered_repo_files:
|
||||
_inner_hf_hub_download(file)
|
||||
else:
|
||||
thread_map(
|
||||
_inner_hf_hub_download,
|
||||
filtered_repo_files,
|
||||
desc=f"Fetching {len(filtered_repo_files)} files",
|
||||
max_workers=max_workers,
|
||||
# User can use its own tqdm class or the default one from `huggingface_hub.utils`
|
||||
tqdm_class=tqdm_class or hf_tqdm,
|
||||
)
|
||||
|
||||
if local_dir is not None:
|
||||
return str(os.path.realpath(local_dir))
|
||||
return snapshot_folder
|
||||
173
.venv/lib/python3.10/site-packages/huggingface_hub/_space_api.py
Normal file
@@ -0,0 +1,173 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2019-present, the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Dict, Optional
|
||||
|
||||
from huggingface_hub.utils import parse_datetime
|
||||
|
||||
|
||||
class SpaceStage(str, Enum):
|
||||
"""
|
||||
Enumeration of the possible stages of a Space on the Hub.
|
||||
|
||||
Value can be compared to a string:
|
||||
```py
|
||||
assert SpaceStage.BUILDING == "BUILDING"
|
||||
```
|
||||
|
||||
Taken from https://github.com/huggingface/moon-landing/blob/main/server/repo_types/SpaceInfo.ts#L61 (private url).
|
||||
"""
|
||||
|
||||
# Copied from moon-landing > server > repo_types > SpaceInfo.ts (private repo)
|
||||
NO_APP_FILE = "NO_APP_FILE"
|
||||
CONFIG_ERROR = "CONFIG_ERROR"
|
||||
BUILDING = "BUILDING"
|
||||
BUILD_ERROR = "BUILD_ERROR"
|
||||
RUNNING = "RUNNING"
|
||||
RUNNING_BUILDING = "RUNNING_BUILDING"
|
||||
RUNTIME_ERROR = "RUNTIME_ERROR"
|
||||
DELETING = "DELETING"
|
||||
STOPPED = "STOPPED"
|
||||
PAUSED = "PAUSED"
|
||||
|
||||
|
||||
class SpaceHardware(str, Enum):
|
||||
"""
|
||||
Enumeration of the hardware options available to run your Space on the Hub.
|
||||
|
||||
Value can be compared to a string:
|
||||
```py
|
||||
assert SpaceHardware.CPU_BASIC == "cpu-basic"
|
||||
```
|
||||
|
||||
Taken from https://github.com/huggingface-internal/moon-landing/blob/main/server/repo_types/SpaceHardwareFlavor.ts (private url).
|
||||
"""
|
||||
|
||||
# CPU
|
||||
CPU_BASIC = "cpu-basic"
|
||||
CPU_UPGRADE = "cpu-upgrade"
|
||||
CPU_XL = "cpu-xl"
|
||||
|
||||
# ZeroGPU
|
||||
ZERO_A10G = "zero-a10g"
|
||||
|
||||
# GPU
|
||||
T4_SMALL = "t4-small"
|
||||
T4_MEDIUM = "t4-medium"
|
||||
L4X1 = "l4x1"
|
||||
L4X4 = "l4x4"
|
||||
L40SX1 = "l40sx1"
|
||||
L40SX4 = "l40sx4"
|
||||
L40SX8 = "l40sx8"
|
||||
A10G_SMALL = "a10g-small"
|
||||
A10G_LARGE = "a10g-large"
|
||||
A10G_LARGEX2 = "a10g-largex2"
|
||||
A10G_LARGEX4 = "a10g-largex4"
|
||||
A100_LARGE = "a100-large"
|
||||
H100 = "h100"
|
||||
H100X8 = "h100x8"
|
||||
|
||||
# TPU
|
||||
V5E_1X1 = "v5e-1x1"
|
||||
V5E_2X2 = "v5e-2x2"
|
||||
V5E_2X4 = "v5e-2x4"
|
||||
|
||||
|
||||
class SpaceStorage(str, Enum):
|
||||
"""
|
||||
Enumeration of persistent storage available for your Space on the Hub.
|
||||
|
||||
Value can be compared to a string:
|
||||
```py
|
||||
assert SpaceStorage.SMALL == "small"
|
||||
```
|
||||
|
||||
Taken from https://github.com/huggingface/moon-landing/blob/main/server/repo_types/SpaceHardwareFlavor.ts#L24 (private url).
|
||||
"""
|
||||
|
||||
SMALL = "small"
|
||||
MEDIUM = "medium"
|
||||
LARGE = "large"
|
||||
|
||||
|
||||
@dataclass
|
||||
class SpaceRuntime:
|
||||
"""
|
||||
Contains information about the current runtime of a Space.
|
||||
|
||||
Args:
|
||||
stage (`str`):
|
||||
Current stage of the space. Example: RUNNING.
|
||||
hardware (`str` or `None`):
|
||||
Current hardware of the space. Example: "cpu-basic". Can be `None` if Space
|
||||
is `BUILDING` for the first time.
|
||||
requested_hardware (`str` or `None`):
|
||||
Requested hardware. Can be different from `hardware`, especially if the request
|
||||
has just been made. Example: "t4-medium". Can be `None` if no hardware has
|
||||
been requested yet.
|
||||
sleep_time (`int` or `None`):
|
||||
Number of seconds the Space will be kept alive after the last request. By default (if value is `None`), the
|
||||
Space will never go to sleep if it's running on an upgraded hardware, while it will go to sleep after 48
|
||||
hours on a free 'cpu-basic' hardware. For more details, see https://huggingface.co/docs/hub/spaces-gpus#sleep-time.
|
||||
raw (`dict`):
|
||||
Raw response from the server. Contains more information about the Space
|
||||
runtime like number of replicas, number of cpu, memory size,...
|
||||
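Example (a sketch; `"user/my-space"` is a placeholder id of a Space you can access):

```py
>>> from huggingface_hub import HfApi

>>> runtime = HfApi().get_space_runtime("user/my-space")
>>> runtime.stage
'RUNNING'
```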
"""
|
||||
|
||||
stage: SpaceStage
|
||||
hardware: Optional[SpaceHardware]
|
||||
requested_hardware: Optional[SpaceHardware]
|
||||
sleep_time: Optional[int]
|
||||
storage: Optional[SpaceStorage]
|
||||
raw: Dict
|
||||
|
||||
def __init__(self, data: Dict) -> None:
|
||||
self.stage = data["stage"]
|
||||
self.hardware = data.get("hardware", {}).get("current")
|
||||
self.requested_hardware = data.get("hardware", {}).get("requested")
|
||||
self.sleep_time = data.get("gcTimeout")
|
||||
self.storage = data.get("storage")
|
||||
self.raw = data
|
||||
|
||||
|
||||
@dataclass
|
||||
class SpaceVariable:
|
||||
"""
|
||||
Contains information about the current variables of a Space.
|
||||
|
||||
Args:
|
||||
key (`str`):
|
||||
Variable key. Example: `"MODEL_REPO_ID"`
|
||||
value (`str`):
|
||||
Variable value. Example: `"the_model_repo_id"`.
|
||||
description (`str` or None):
|
||||
Description of the variable. Example: `"Model Repo ID of the implemented model"`.
|
||||
updated_at (`datetime` or `None`):
|
||||
datetime of the last update of the variable (if the variable has been updated at least once).
|
||||
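Example (a sketch; the Space id and variable key are placeholders):

```py
>>> from huggingface_hub import HfApi

>>> variables = HfApi().get_space_variables("user/my-space")
>>> variables["MODEL_REPO_ID"].value
'the_model_repo_id'
```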
"""
|
||||
|
||||
key: str
|
||||
value: str
|
||||
description: Optional[str]
|
||||
updated_at: Optional[datetime]
|
||||
|
||||
def __init__(self, key: str, values: Dict) -> None:
|
||||
self.key = key
|
||||
self.value = values["value"]
|
||||
self.description = values.get("description")
|
||||
updated_at = values.get("updatedAt")
|
||||
self.updated_at = parse_datetime(updated_at) if updated_at is not None else None
|
||||
@@ -0,0 +1,194 @@
|
||||
# Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Contains a logger to push training logs to the Hub, using Tensorboard."""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, List, Optional, Union
|
||||
|
||||
from ._commit_scheduler import CommitScheduler
|
||||
from .errors import EntryNotFoundError
|
||||
from .repocard import ModelCard
|
||||
from .utils import experimental
|
||||
|
||||
|
||||
# Depending on user's setup, SummaryWriter can come either from 'tensorboardX'
|
||||
# or from 'torch.utils.tensorboard'. Both are compatible so let's try to load
|
||||
# from either of them.
|
||||
try:
|
||||
from tensorboardX import SummaryWriter
|
||||
|
||||
is_summary_writer_available = True
|
||||
|
||||
except ImportError:
|
||||
try:
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
|
||||
is_summary_writer_available = True
|
||||
except ImportError:
|
||||
# Dummy class to avoid failing at import. Will raise on instance creation.
|
||||
SummaryWriter = object
|
||||
is_summary_writer_available = False
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from tensorboardX import SummaryWriter
|
||||
|
||||
|
||||
class HFSummaryWriter(SummaryWriter):
|
||||
"""
|
||||
Wrapper around the tensorboard's `SummaryWriter` to push training logs to the Hub.
|
||||
|
||||
Data is logged locally and then pushed to the Hub asynchronously. Pushing data to the Hub is done in a separate
|
||||
thread to avoid blocking the training script. In particular, if the upload fails for any reason (e.g. a connection
|
||||
issue), the main script will not be interrupted. Data is automatically pushed to the Hub every `commit_every`
|
||||
minutes (default to every 5 minutes).
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
`HFSummaryWriter` is experimental. Its API is subject to change in the future without prior notice.
|
||||
|
||||
</Tip>
|
||||
|
||||
Args:
|
||||
repo_id (`str`):
|
||||
The id of the repo to which the logs will be pushed.
|
||||
logdir (`str`, *optional*):
|
||||
The directory where the logs will be written. If not specified, a local directory will be created by the
|
||||
underlying `SummaryWriter` object.
|
||||
commit_every (`int` or `float`, *optional*):
|
||||
The frequency (in minutes) at which the logs will be pushed to the Hub. Defaults to 5 minutes.
|
||||
squash_history (`bool`, *optional*):
|
||||
Whether to squash the history of the repo after each commit. Defaults to `False`. Squashing commits is
|
||||
useful to avoid degraded performance on the repo when it grows too large.
|
||||
repo_type (`str`, *optional*):
|
||||
The type of the repo to which the logs will be pushed. Defaults to "model".
|
||||
repo_revision (`str`, *optional*):
|
||||
The revision of the repo to which the logs will be pushed. Defaults to "main".
|
||||
repo_private (`bool`, *optional*):
|
||||
Whether to make the repo private. If `None` (default), the repo will be public unless the organization's default is private. This value is ignored if the repo already exists.
|
||||
path_in_repo (`str`, *optional*):
|
||||
The path to the folder in the repo where the logs will be pushed. Defaults to "tensorboard/".
|
||||
repo_allow_patterns (`List[str]` or `str`, *optional*):
|
||||
A list of patterns to include in the upload. Defaults to `"*.tfevents.*"`. Check out the
|
||||
[upload guide](https://huggingface.co/docs/huggingface_hub/guides/upload#upload-a-folder) for more details.
|
||||
repo_ignore_patterns (`List[str]` or `str`, *optional*):
|
||||
A list of patterns to exclude in the upload. Check out the
|
||||
[upload guide](https://huggingface.co/docs/huggingface_hub/guides/upload#upload-a-folder) for more details.
|
||||
token (`str`, *optional*):
|
||||
Authentication token. Will default to the stored token. See https://huggingface.co/settings/token for more
|
||||
details
|
||||
kwargs:
|
||||
Additional keyword arguments passed to `SummaryWriter`.
|
||||
|
||||
Examples:
|
||||
```diff
|
||||
# Taken from https://pytorch.org/docs/stable/tensorboard.html
|
||||
- from torch.utils.tensorboard import SummaryWriter
|
||||
+ from huggingface_hub import HFSummaryWriter
|
||||
|
||||
import numpy as np
|
||||
|
||||
- writer = SummaryWriter()
|
||||
+ writer = HFSummaryWriter(repo_id="username/my-trained-model")
|
||||
|
||||
for n_iter in range(100):
|
||||
writer.add_scalar('Loss/train', np.random.random(), n_iter)
|
||||
writer.add_scalar('Loss/test', np.random.random(), n_iter)
|
||||
writer.add_scalar('Accuracy/train', np.random.random(), n_iter)
|
||||
writer.add_scalar('Accuracy/test', np.random.random(), n_iter)
|
||||
```
|
||||
|
||||
```py
|
||||
>>> from huggingface_hub import HFSummaryWriter
|
||||
|
||||
# Logs are automatically pushed every 15 minutes (5 by default) + when exiting the context manager
|
||||
>>> with HFSummaryWriter(repo_id="test_hf_logger", commit_every=15) as logger:
|
||||
... logger.add_scalar("a", 1)
|
||||
... logger.add_scalar("b", 2)
|
||||
```
|
||||
"""
|
||||
|
||||
@experimental
|
||||
def __new__(cls, *args, **kwargs) -> "HFSummaryWriter":
|
||||
if not is_summary_writer_available:
|
||||
raise ImportError(
|
||||
"You must have `tensorboard` installed to use `HFSummaryWriter`. Please run `pip install --upgrade"
|
||||
" tensorboardX` first."
|
||||
)
|
||||
return super().__new__(cls)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
repo_id: str,
|
||||
*,
|
||||
logdir: Optional[str] = None,
|
||||
commit_every: Union[int, float] = 5,
|
||||
squash_history: bool = False,
|
||||
repo_type: Optional[str] = None,
|
||||
repo_revision: Optional[str] = None,
|
||||
repo_private: Optional[bool] = None,
|
||||
path_in_repo: Optional[str] = "tensorboard",
|
||||
repo_allow_patterns: Optional[Union[List[str], str]] = "*.tfevents.*",
|
||||
repo_ignore_patterns: Optional[Union[List[str], str]] = None,
|
||||
token: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
# Initialize SummaryWriter
|
||||
super().__init__(logdir=logdir, **kwargs)
|
||||
|
||||
# Check logdir has been correctly initialized and fail early otherwise. In practice, SummaryWriter takes care of it.
|
||||
if not isinstance(self.logdir, str):
|
||||
raise ValueError(f"`self.logdir` must be a string. Got '{self.logdir}' of type {type(self.logdir)}.")
|
||||
|
||||
# Append logdir name to `path_in_repo`
|
||||
if path_in_repo is None or path_in_repo == "":
|
||||
path_in_repo = Path(self.logdir).name
|
||||
else:
|
||||
path_in_repo = path_in_repo.strip("/") + "/" + Path(self.logdir).name
|
||||
|
||||
# Initialize scheduler
|
||||
self.scheduler = CommitScheduler(
|
||||
folder_path=self.logdir,
|
||||
path_in_repo=path_in_repo,
|
||||
repo_id=repo_id,
|
||||
repo_type=repo_type,
|
||||
revision=repo_revision,
|
||||
private=repo_private,
|
||||
token=token,
|
||||
allow_patterns=repo_allow_patterns,
|
||||
ignore_patterns=repo_ignore_patterns,
|
||||
every=commit_every,
|
||||
squash_history=squash_history,
|
||||
)
|
||||
|
||||
# Exposing some high-level info at root level
|
||||
self.repo_id = self.scheduler.repo_id
|
||||
self.repo_type = self.scheduler.repo_type
|
||||
self.repo_revision = self.scheduler.revision
|
||||
|
||||
# Add `hf-summary-writer` tag to the model card metadata
|
||||
try:
|
||||
card = ModelCard.load(repo_id_or_path=self.repo_id, repo_type=self.repo_type)
|
||||
except EntryNotFoundError:
|
||||
card = ModelCard("")
|
||||
tags = card.data.get("tags", [])
|
||||
if "hf-summary-writer" not in tags:
|
||||
tags.append("hf-summary-writer")
|
||||
card.data["tags"] = tags
|
||||
card.push_to_hub(repo_id=self.repo_id, repo_type=self.repo_type)
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
"""Push to hub in a non-blocking way when exiting the logger's context manager."""
|
||||
super().__exit__(exc_type, exc_val, exc_tb)
|
||||
future = self.scheduler.trigger()
|
||||
future.result()
|
||||
@@ -0,0 +1,622 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024-present, the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import enum
|
||||
import logging
|
||||
import os
|
||||
import queue
|
||||
import shutil
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import traceback
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from threading import Lock
|
||||
from typing import TYPE_CHECKING, List, Optional, Tuple, Union
|
||||
from urllib.parse import quote
|
||||
|
||||
from . import constants
|
||||
from ._commit_api import CommitOperationAdd, UploadInfo, _fetch_upload_modes
|
||||
from ._local_folder import LocalUploadFileMetadata, LocalUploadFilePaths, get_local_upload_paths, read_upload_metadata
|
||||
from .constants import DEFAULT_REVISION, REPO_TYPES
|
||||
from .utils import DEFAULT_IGNORE_PATTERNS, filter_repo_objects, tqdm
|
||||
from .utils._cache_manager import _format_size
|
||||
from .utils.sha import sha_fileobj
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .hf_api import HfApi
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
WAITING_TIME_IF_NO_TASKS = 10 # seconds
|
||||
MAX_NB_REGULAR_FILES_PER_COMMIT = 75
|
||||
MAX_NB_LFS_FILES_PER_COMMIT = 150
|
||||
|
||||
|
||||
def upload_large_folder_internal(
|
||||
api: "HfApi",
|
||||
repo_id: str,
|
||||
folder_path: Union[str, Path],
|
||||
*,
|
||||
repo_type: str, # Repo type is required!
|
||||
revision: Optional[str] = None,
|
||||
private: Optional[bool] = None,
|
||||
allow_patterns: Optional[Union[List[str], str]] = None,
|
||||
ignore_patterns: Optional[Union[List[str], str]] = None,
|
||||
num_workers: Optional[int] = None,
|
||||
print_report: bool = True,
|
||||
print_report_every: int = 60,
|
||||
):
|
||||
"""Upload a large folder to the Hub in the most resilient way possible.
|
||||
|
||||
See [`HfApi.upload_large_folder`] for the full documentation.
|
||||
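Example (a sketch of the public entry point; the repo id and folder path are placeholders):

```py
>>> from huggingface_hub import HfApi

>>> HfApi().upload_large_folder(
...     repo_id="user/my-dataset",
...     folder_path="path/to/local/folder",
...     repo_type="dataset",  # repo_type is explicitly required for large uploads
... )
```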
"""
|
||||
# 1. Check args and setup
|
||||
if repo_type is None:
|
||||
raise ValueError(
|
||||
"For large uploads, `repo_type` is explicitly required. Please set it to `model`, `dataset` or `space`."
|
||||
" If you are using the CLI, pass it as `--repo-type=model`."
|
||||
)
|
||||
if repo_type not in REPO_TYPES:
|
||||
raise ValueError(f"Invalid repo type, must be one of {REPO_TYPES}")
|
||||
if revision is None:
|
||||
revision = DEFAULT_REVISION
|
||||
|
||||
folder_path = Path(folder_path).expanduser().resolve()
|
||||
if not folder_path.is_dir():
|
||||
raise ValueError(f"Provided path: '{folder_path}' is not a directory")
|
||||
|
||||
if ignore_patterns is None:
|
||||
ignore_patterns = []
|
||||
elif isinstance(ignore_patterns, str):
|
||||
ignore_patterns = [ignore_patterns]
|
||||
ignore_patterns += DEFAULT_IGNORE_PATTERNS
|
||||
|
||||
if num_workers is None:
|
||||
nb_cores = os.cpu_count() or 1
|
||||
num_workers = max(nb_cores - 2, 2) # Use all but 2 cores, or at least 2 cores
|
||||
|
||||
# 2. Create repo if missing
|
||||
repo_url = api.create_repo(repo_id=repo_id, repo_type=repo_type, private=private, exist_ok=True)
|
||||
logger.info(f"Repo created: {repo_url}")
|
||||
repo_id = repo_url.repo_id
|
||||
|
||||
# 3. List files to upload
|
||||
filtered_paths_list = filter_repo_objects(
|
||||
(path.relative_to(folder_path).as_posix() for path in folder_path.glob("**/*") if path.is_file()),
|
||||
allow_patterns=allow_patterns,
|
||||
ignore_patterns=ignore_patterns,
|
||||
)
|
||||
paths_list = [get_local_upload_paths(folder_path, relpath) for relpath in filtered_paths_list]
|
||||
logger.info(f"Found {len(paths_list)} candidate files to upload")
|
||||
|
||||
# Read metadata for each file
|
||||
items = [
|
||||
(paths, read_upload_metadata(folder_path, paths.path_in_repo))
|
||||
for paths in tqdm(paths_list, desc="Recovering from metadata files")
|
||||
]
|
||||
|
||||
# 4. Start workers
|
||||
status = LargeUploadStatus(items)
|
||||
threads = [
|
||||
threading.Thread(
|
||||
target=_worker_job,
|
||||
kwargs={
|
||||
"status": status,
|
||||
"api": api,
|
||||
"repo_id": repo_id,
|
||||
"repo_type": repo_type,
|
||||
"revision": revision,
|
||||
},
|
||||
)
|
||||
for _ in range(num_workers)
|
||||
]
|
||||
|
||||
for thread in threads:
|
||||
thread.start()
|
||||
|
||||
# 5. Print regular reports
|
||||
if print_report:
|
||||
print("\n\n" + status.current_report())
|
||||
last_report_ts = time.time()
|
||||
while True:
|
||||
time.sleep(1)
|
||||
if time.time() - last_report_ts >= print_report_every:
|
||||
if print_report:
|
||||
_print_overwrite(status.current_report())
|
||||
last_report_ts = time.time()
|
||||
if status.is_done():
|
||||
logging.info("Is done: exiting main loop")
|
||||
break
|
||||
|
||||
for thread in threads:
|
||||
thread.join()
|
||||
|
||||
logger.info(status.current_report())
|
||||
logging.info("Upload is complete!")
|
||||
|
||||
|
||||
####################
|
||||
# Logic to manage workers and synchronize tasks
|
||||
####################
|
||||
|
||||
|
||||
class WorkerJob(enum.Enum):
|
||||
SHA256 = enum.auto()
|
||||
GET_UPLOAD_MODE = enum.auto()
|
||||
PREUPLOAD_LFS = enum.auto()
|
||||
COMMIT = enum.auto()
|
||||
WAIT = enum.auto() # if no tasks are available but we don't want to exit
|
||||
|
||||
|
||||
JOB_ITEM_T = Tuple[LocalUploadFilePaths, LocalUploadFileMetadata]
|
||||
|
||||
|
||||
class LargeUploadStatus:
|
||||
"""Contains information, queues and tasks for a large upload process."""
|
||||
|
||||
def __init__(self, items: List[JOB_ITEM_T]):
|
||||
self.items = items
|
||||
self.queue_sha256: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
|
||||
self.queue_get_upload_mode: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
|
||||
self.queue_preupload_lfs: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
|
||||
self.queue_commit: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
|
||||
self.lock = Lock()
|
||||
|
||||
self.nb_workers_sha256: int = 0
|
||||
self.nb_workers_get_upload_mode: int = 0
|
||||
self.nb_workers_preupload_lfs: int = 0
|
||||
self.nb_workers_commit: int = 0
|
||||
self.nb_workers_waiting: int = 0
|
||||
self.last_commit_attempt: Optional[float] = None
|
||||
|
||||
self._started_at = datetime.now()
|
||||
|
||||
# Setup queues
|
||||
for item in self.items:
|
||||
paths, metadata = item
|
||||
if metadata.sha256 is None:
|
||||
self.queue_sha256.put(item)
|
||||
elif metadata.upload_mode is None:
|
||||
self.queue_get_upload_mode.put(item)
|
||||
elif metadata.upload_mode == "lfs" and not metadata.is_uploaded:
|
||||
self.queue_preupload_lfs.put(item)
|
||||
elif not metadata.is_committed:
|
||||
self.queue_commit.put(item)
|
||||
else:
|
||||
logger.debug(f"Skipping file {paths.path_in_repo} (already uploaded and committed)")
|
||||
|
||||
def current_report(self) -> str:
|
||||
"""Generate a report of the current status of the large upload."""
|
||||
nb_hashed = 0
|
||||
size_hashed = 0
|
||||
nb_preuploaded = 0
|
||||
nb_lfs = 0
|
||||
nb_lfs_unsure = 0
|
||||
size_preuploaded = 0
|
||||
nb_committed = 0
|
||||
size_committed = 0
|
||||
total_size = 0
|
||||
ignored_files = 0
|
||||
total_files = 0
|
||||
|
||||
with self.lock:
|
||||
for _, metadata in self.items:
|
||||
if metadata.should_ignore:
|
||||
ignored_files += 1
|
||||
continue
|
||||
total_size += metadata.size
|
||||
total_files += 1
|
||||
if metadata.sha256 is not None:
|
||||
nb_hashed += 1
|
||||
size_hashed += metadata.size
|
||||
if metadata.upload_mode == "lfs":
|
||||
nb_lfs += 1
|
||||
if metadata.upload_mode is None:
|
||||
nb_lfs_unsure += 1
|
||||
if metadata.is_uploaded:
|
||||
nb_preuploaded += 1
|
||||
size_preuploaded += metadata.size
|
||||
if metadata.is_committed:
|
||||
nb_committed += 1
|
||||
size_committed += metadata.size
|
||||
total_size_str = _format_size(total_size)
|
||||
|
||||
now = datetime.now()
|
||||
now_str = now.strftime("%Y-%m-%d %H:%M:%S")
|
||||
elapsed = now - self._started_at
|
||||
elapsed_str = str(elapsed).split(".")[0] # remove milliseconds
|
||||
|
||||
message = "\n" + "-" * 10
|
||||
message += f" {now_str} ({elapsed_str}) "
|
||||
message += "-" * 10 + "\n"
|
||||
|
||||
message += "Files: "
|
||||
message += f"hashed {nb_hashed}/{total_files} ({_format_size(size_hashed)}/{total_size_str}) | "
|
||||
message += f"pre-uploaded: {nb_preuploaded}/{nb_lfs} ({_format_size(size_preuploaded)}/{total_size_str})"
|
||||
if nb_lfs_unsure > 0:
|
||||
message += f" (+{nb_lfs_unsure} unsure)"
|
||||
message += f" | committed: {nb_committed}/{total_files} ({_format_size(size_committed)}/{total_size_str})"
|
||||
message += f" | ignored: {ignored_files}\n"
|
||||
|
||||
message += "Workers: "
|
||||
message += f"hashing: {self.nb_workers_sha256} | "
|
||||
message += f"get upload mode: {self.nb_workers_get_upload_mode} | "
|
||||
message += f"pre-uploading: {self.nb_workers_preupload_lfs} | "
|
||||
message += f"committing: {self.nb_workers_commit} | "
|
||||
message += f"waiting: {self.nb_workers_waiting}\n"
|
||||
message += "-" * 51
|
||||
|
||||
return message
|
||||
|
||||
def is_done(self) -> bool:
|
||||
with self.lock:
|
||||
return all(metadata.is_committed or metadata.should_ignore for _, metadata in self.items)
|
||||
|
||||
|
||||
def _worker_job(
|
||||
status: LargeUploadStatus,
|
||||
api: "HfApi",
|
||||
repo_id: str,
|
||||
repo_type: str,
|
||||
revision: str,
|
||||
):
|
||||
"""
|
||||
Main process for a worker. The worker will perform tasks based on the priority list until all files are uploaded
|
||||
and committed. If no tasks are available, the worker will wait for 10 seconds before checking again.
|
||||
|
||||
If a task fails for any reason, the item(s) are put back in the queue for another worker to pick up.
|
||||
|
||||
Read `upload_large_folder` docstring for more information on how tasks are prioritized.
|
||||
"""
|
||||
while True:
|
||||
next_job: Optional[Tuple[WorkerJob, List[JOB_ITEM_T]]] = None
|
||||
|
||||
# Determine next task
|
||||
next_job = _determine_next_job(status)
|
||||
if next_job is None:
|
||||
return
|
||||
job, items = next_job
|
||||
|
||||
# Perform task
|
||||
if job == WorkerJob.SHA256:
|
||||
item = items[0] # single item
|
||||
try:
|
||||
_compute_sha256(item)
|
||||
status.queue_get_upload_mode.put(item)
|
||||
except KeyboardInterrupt:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to compute sha256: {e}")
|
||||
logger.error(traceback.format_exc())
|
||||
status.queue_sha256.put(item)
|
||||
|
||||
with status.lock:
|
||||
status.nb_workers_sha256 -= 1
|
||||
|
||||
elif job == WorkerJob.GET_UPLOAD_MODE:
|
||||
try:
|
||||
_get_upload_mode(items, api=api, repo_id=repo_id, repo_type=repo_type, revision=revision)
|
||||
except KeyboardInterrupt:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get upload mode: {e}")
|
||||
logger.error(traceback.format_exc())
|
||||
|
||||
# Items are either:
|
||||
# - dropped (if should_ignore)
|
||||
# - put in LFS queue (if LFS)
|
||||
# - put in commit queue (if regular)
|
||||
# - or put back (if error occurred).
|
||||
for item in items:
|
||||
_, metadata = item
|
||||
if metadata.should_ignore:
|
||||
continue
|
||||
if metadata.upload_mode == "lfs":
|
||||
status.queue_preupload_lfs.put(item)
|
||||
elif metadata.upload_mode == "regular":
|
||||
status.queue_commit.put(item)
|
||||
else:
|
||||
status.queue_get_upload_mode.put(item)
|
||||
|
||||
with status.lock:
|
||||
status.nb_workers_get_upload_mode -= 1
|
||||
|
||||
elif job == WorkerJob.PREUPLOAD_LFS:
|
||||
item = items[0] # single item
|
||||
try:
|
||||
_preupload_lfs(item, api=api, repo_id=repo_id, repo_type=repo_type, revision=revision)
|
||||
status.queue_commit.put(item)
|
||||
except KeyboardInterrupt:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to preupload LFS: {e}")
|
||||
logger.error(traceback.format_exc())
|
||||
status.queue_preupload_lfs.put(item)
|
||||
|
||||
with status.lock:
|
||||
status.nb_workers_preupload_lfs -= 1
|
||||
|
||||
elif job == WorkerJob.COMMIT:
|
||||
try:
|
||||
_commit(items, api=api, repo_id=repo_id, repo_type=repo_type, revision=revision)
|
||||
except KeyboardInterrupt:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to commit: {e}")
|
||||
logger.error(traceback.format_exc())
|
||||
for item in items:
|
||||
status.queue_commit.put(item)
|
||||
with status.lock:
|
||||
status.last_commit_attempt = time.time()
|
||||
status.nb_workers_commit -= 1
|
||||
|
||||
elif job == WorkerJob.WAIT:
|
||||
time.sleep(WAITING_TIME_IF_NO_TASKS)
|
||||
with status.lock:
|
||||
status.nb_workers_waiting -= 1
|
||||
|
||||
|
||||
def _determine_next_job(status: LargeUploadStatus) -> Optional[Tuple[WorkerJob, List[JOB_ITEM_T]]]:
|
||||
with status.lock:
|
||||
# 1. Commit if more than 5 minutes since last commit attempt (and at least 1 file)
|
||||
if (
|
||||
status.nb_workers_commit == 0
|
||||
and status.queue_commit.qsize() > 0
|
||||
and status.last_commit_attempt is not None
|
||||
and time.time() - status.last_commit_attempt > 5 * 60
|
||||
):
|
||||
status.nb_workers_commit += 1
|
||||
logger.debug("Job: commit (more than 5 minutes since last commit attempt)")
|
||||
return (WorkerJob.COMMIT, _get_items_to_commit(status.queue_commit))
|
||||
|
||||
# 2. Commit if at least 150 files are ready to commit
|
||||
elif status.nb_workers_commit == 0 and status.queue_commit.qsize() >= 150:
|
||||
status.nb_workers_commit += 1
|
||||
logger.debug("Job: commit (>100 files ready)")
|
||||
return (WorkerJob.COMMIT, _get_items_to_commit(status.queue_commit))
|
||||
|
||||
# 3. Get upload mode if at least 10 files
|
||||
elif status.queue_get_upload_mode.qsize() >= 10:
|
||||
status.nb_workers_get_upload_mode += 1
|
||||
logger.debug("Job: get upload mode (>10 files ready)")
|
||||
return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, 50))
|
||||
|
||||
# 4. Preupload LFS file if at least 1 file and no worker is preuploading LFS
|
||||
elif status.queue_preupload_lfs.qsize() > 0 and status.nb_workers_preupload_lfs == 0:
|
||||
status.nb_workers_preupload_lfs += 1
|
||||
logger.debug("Job: preupload LFS (no other worker preuploading LFS)")
|
||||
return (WorkerJob.PREUPLOAD_LFS, _get_one(status.queue_preupload_lfs))
|
||||
|
||||
# 5. Compute sha256 if at least 1 file and no worker is computing sha256
|
||||
elif status.queue_sha256.qsize() > 0 and status.nb_workers_sha256 == 0:
|
||||
status.nb_workers_sha256 += 1
|
||||
logger.debug("Job: sha256 (no other worker computing sha256)")
|
||||
return (WorkerJob.SHA256, _get_one(status.queue_sha256))
|
||||
|
||||
# 6. Get upload mode if at least 1 file and no worker is getting upload mode
|
||||
elif status.queue_get_upload_mode.qsize() > 0 and status.nb_workers_get_upload_mode == 0:
|
||||
status.nb_workers_get_upload_mode += 1
|
||||
logger.debug("Job: get upload mode (no other worker getting upload mode)")
|
||||
return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, 50))
|
||||
|
||||
# 7. Preupload LFS file if at least 1 file
|
||||
# Skip if hf_transfer is enabled and there is already a worker preuploading LFS
|
||||
elif status.queue_preupload_lfs.qsize() > 0 and (
|
||||
status.nb_workers_preupload_lfs == 0 or not constants.HF_HUB_ENABLE_HF_TRANSFER
|
||||
):
|
||||
status.nb_workers_preupload_lfs += 1
|
||||
logger.debug("Job: preupload LFS")
|
||||
return (WorkerJob.PREUPLOAD_LFS, _get_one(status.queue_preupload_lfs))
|
||||
|
||||
# 8. Compute sha256 if at least 1 file
|
||||
elif status.queue_sha256.qsize() > 0:
|
||||
status.nb_workers_sha256 += 1
|
||||
logger.debug("Job: sha256")
|
||||
return (WorkerJob.SHA256, _get_one(status.queue_sha256))
|
||||
|
||||
# 9. Get upload mode if at least 1 file
|
||||
elif status.queue_get_upload_mode.qsize() > 0:
|
||||
status.nb_workers_get_upload_mode += 1
|
||||
logger.debug("Job: get upload mode")
|
||||
return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, 50))
|
||||
|
||||
# 10. Commit if at least 1 file and 1 min since last commit attempt
|
||||
elif (
|
||||
status.nb_workers_commit == 0
|
||||
and status.queue_commit.qsize() > 0
|
||||
and status.last_commit_attempt is not None
|
||||
and time.time() - status.last_commit_attempt > 1 * 60
|
||||
):
|
||||
status.nb_workers_commit += 1
|
||||
logger.debug("Job: commit (1 min since last commit attempt)")
|
||||
return (WorkerJob.COMMIT, _get_items_to_commit(status.queue_commit))
|
||||
|
||||
# 11. Commit if at least 1 file, all other queues are empty and all workers are waiting
|
||||
# e.g. when it's the last commit
|
||||
elif (
|
||||
status.nb_workers_commit == 0
|
||||
and status.queue_commit.qsize() > 0
|
||||
and status.queue_sha256.qsize() == 0
|
||||
and status.queue_get_upload_mode.qsize() == 0
|
||||
and status.queue_preupload_lfs.qsize() == 0
|
||||
and status.nb_workers_sha256 == 0
|
||||
and status.nb_workers_get_upload_mode == 0
|
||||
and status.nb_workers_preupload_lfs == 0
|
||||
):
|
||||
status.nb_workers_commit += 1
|
||||
logger.debug("Job: commit")
|
||||
return (WorkerJob.COMMIT, _get_items_to_commit(status.queue_commit))
|
||||
|
||||
# 12. If all queues are empty, exit
|
||||
elif all(metadata.is_committed or metadata.should_ignore for _, metadata in status.items):
|
||||
logger.info("All files have been processed! Exiting worker.")
|
||||
return None
|
||||
|
||||
# 13. If no task is available, wait
|
||||
else:
|
||||
status.nb_workers_waiting += 1
|
||||
logger.debug(f"No task available, waiting... ({WAITING_TIME_IF_NO_TASKS}s)")
|
||||
return (WorkerJob.WAIT, [])
|
||||
|
||||
|
||||
####################
|
||||
# Atomic jobs (sha256, get_upload_mode, preupload_lfs, commit)
|
||||
####################
|
||||
|
||||
|
||||
def _compute_sha256(item: JOB_ITEM_T) -> None:
|
||||
"""Compute sha256 of a file and save it in metadata."""
|
||||
paths, metadata = item
|
||||
if metadata.sha256 is None:
|
||||
with paths.file_path.open("rb") as f:
|
||||
metadata.sha256 = sha_fileobj(f).hex()
|
||||
metadata.save(paths)
|
||||
|
||||
|
||||
def _get_upload_mode(items: List[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None:
|
||||
"""Get upload mode for each file and update metadata.
|
||||
|
||||
Also receives info on whether the file should be ignored.
|
||||
"""
|
||||
additions = [_build_hacky_operation(item) for item in items]
|
||||
_fetch_upload_modes(
|
||||
additions=additions,
|
||||
repo_type=repo_type,
|
||||
repo_id=repo_id,
|
||||
headers=api._build_hf_headers(),
|
||||
revision=quote(revision, safe=""),
|
||||
)
|
||||
for item, addition in zip(items, additions):
|
||||
paths, metadata = item
|
||||
metadata.upload_mode = addition._upload_mode
|
||||
metadata.should_ignore = addition._should_ignore
|
||||
metadata.save(paths)
|
||||
|
||||
|
||||
def _preupload_lfs(item: JOB_ITEM_T, api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None:
|
||||
"""Preupload LFS file and update metadata."""
|
||||
paths, metadata = item
|
||||
addition = _build_hacky_operation(item)
|
||||
api.preupload_lfs_files(
|
||||
repo_id=repo_id,
|
||||
repo_type=repo_type,
|
||||
revision=revision,
|
||||
additions=[addition],
|
||||
)
|
||||
|
||||
metadata.is_uploaded = True
|
||||
metadata.save(paths)
|
||||
|
||||
|
||||
def _commit(items: List[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None:
|
||||
"""Commit files to the repo."""
|
||||
additions = [_build_hacky_operation(item) for item in items]
|
||||
api.create_commit(
|
||||
repo_id=repo_id,
|
||||
repo_type=repo_type,
|
||||
revision=revision,
|
||||
operations=additions,
|
||||
commit_message="Add files using upload-large-folder tool",
|
||||
)
|
||||
for paths, metadata in items:
|
||||
metadata.is_committed = True
|
||||
metadata.save(paths)
|
||||
|
||||
|
||||
####################
|
||||
# Hacks with CommitOperationAdd to bypass checks/sha256 calculation
|
||||
####################
|
||||
|
||||
|
||||
class HackyCommitOperationAdd(CommitOperationAdd):
|
||||
def __post_init__(self) -> None:
|
||||
if isinstance(self.path_or_fileobj, Path):
|
||||
self.path_or_fileobj = str(self.path_or_fileobj)
|
||||
|
||||
|
||||
def _build_hacky_operation(item: JOB_ITEM_T) -> HackyCommitOperationAdd:
|
||||
paths, metadata = item
|
||||
operation = HackyCommitOperationAdd(path_in_repo=paths.path_in_repo, path_or_fileobj=paths.file_path)
|
||||
with paths.file_path.open("rb") as file:
|
||||
sample = file.peek(512)[:512]
|
||||
if metadata.sha256 is None:
|
||||
raise ValueError("sha256 must have been computed by now!")
|
||||
operation.upload_info = UploadInfo(sha256=bytes.fromhex(metadata.sha256), size=metadata.size, sample=sample)
|
||||
return operation
|
||||
|
||||
|
||||
####################
|
||||
# Misc helpers
|
||||
####################
|
||||
|
||||
|
||||
def _get_one(queue: "queue.Queue[JOB_ITEM_T]") -> List[JOB_ITEM_T]:
|
||||
return [queue.get()]
|
||||
|
||||
|
||||
def _get_n(queue: "queue.Queue[JOB_ITEM_T]", n: int) -> List[JOB_ITEM_T]:
|
||||
return [queue.get() for _ in range(min(queue.qsize(), n))]
|
||||
|
||||
|
||||
def _get_items_to_commit(queue: "queue.Queue[JOB_ITEM_T]") -> List[JOB_ITEM_T]:
|
||||
"""Special case for commit job: the number of items to commit depends on the type of files."""
|
||||
# Can take at most 75 regular files and/or 150 LFS files in a single commit
|
||||
items: List[JOB_ITEM_T] = []
|
||||
nb_lfs, nb_regular = 0, 0
|
||||
while True:
|
||||
# If empty queue => commit everything
|
||||
if queue.qsize() == 0:
|
||||
return items
|
||||
|
||||
# If we have enough items => commit them
|
||||
if nb_lfs >= MAX_NB_LFS_FILES_PER_COMMIT or nb_regular >= MAX_NB_REGULAR_FILES_PER_COMMIT:
|
||||
return items
|
||||
|
||||
# Else, get a new item and increase counter
|
||||
item = queue.get()
|
||||
items.append(item)
|
||||
_, metadata = item
|
||||
if metadata.upload_mode == "lfs":
|
||||
nb_lfs += 1
|
||||
else:
|
||||
nb_regular += 1
|
||||
|
||||
|
||||
def _print_overwrite(report: str) -> None:
|
||||
"""Print a report, overwriting the previous lines.
|
||||
|
||||
Since tqdm is using `sys.stderr` to (re-)write progress bars, we need to use `sys.stdout`
|
||||
to print the report.
|
||||
|
||||
Note: works well only if no other process is writing to `sys.stdout`!
|
||||
"""
|
||||
report += "\n"
|
||||
# Get terminal width
|
||||
terminal_width = shutil.get_terminal_size().columns
|
||||
|
||||
# Count number of lines that should be cleared
|
||||
nb_lines = sum(len(line) // terminal_width + 1 for line in report.splitlines())
|
||||
|
||||
# Clear previous lines based on the number of lines in the report
|
||||
for _ in range(nb_lines):
|
||||
sys.stdout.write("\r\033[K") # Clear line
|
||||
sys.stdout.write("\033[F") # Move cursor up one line
|
||||
|
||||
# Print the new report, filling remaining space with whitespace
|
||||
sys.stdout.write(report)
|
||||
sys.stdout.write(" " * (terminal_width - len(report.splitlines()[-1])))
|
||||
sys.stdout.flush()
|
||||
@@ -0,0 +1,137 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present, the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Contains data structures to parse the webhooks payload."""
|
||||
|
||||
from typing import List, Literal, Optional
|
||||
|
||||
from .utils import is_pydantic_available
|
||||
|
||||
|
||||
if is_pydantic_available():
|
||||
from pydantic import BaseModel
|
||||
else:
|
||||
# Define a dummy BaseModel to avoid import errors when pydantic is not installed
|
||||
# Import error will be raised when trying to use the class
|
||||
|
||||
class BaseModel: # type: ignore [no-redef]
|
||||
def __init__(self, *args, **kwargs) -> None:
|
||||
raise ImportError(
|
||||
"You must have `pydantic` installed to use `WebhookPayload`. This is an optional dependency that"
|
||||
" should be installed separately. Please run `pip install --upgrade pydantic` and retry."
|
||||
)
|
||||
|
||||
|
||||
# This is an adaptation of the ReportV3 interface implemented in moon-landing. V0, V1 and V2 have been ignored as they
|
||||
# are not in use anymore. Keep in sync when the format is updated in
|
||||
# https://github.com/huggingface/moon-landing/blob/main/server/lib/HFWebhooks.ts (internal link).
|
||||
|
||||
|
||||
WebhookEvent_T = Literal[
|
||||
"create",
|
||||
"delete",
|
||||
"move",
|
||||
"update",
|
||||
]
|
||||
RepoChangeEvent_T = Literal[
|
||||
"add",
|
||||
"move",
|
||||
"remove",
|
||||
"update",
|
||||
]
|
||||
RepoType_T = Literal[
|
||||
"dataset",
|
||||
"model",
|
||||
"space",
|
||||
]
|
||||
DiscussionStatus_T = Literal[
|
||||
"closed",
|
||||
"draft",
|
||||
"open",
|
||||
"merged",
|
||||
]
|
||||
SupportedWebhookVersion = Literal[3]
|
||||
|
||||
|
||||
class ObjectId(BaseModel):
|
||||
id: str
|
||||
|
||||
|
||||
class WebhookPayloadUrl(BaseModel):
|
||||
web: str
|
||||
api: Optional[str] = None
|
||||
|
||||
|
||||
class WebhookPayloadMovedTo(BaseModel):
|
||||
name: str
|
||||
owner: ObjectId
|
||||
|
||||
|
||||
class WebhookPayloadWebhook(ObjectId):
|
||||
version: SupportedWebhookVersion
|
||||
|
||||
|
||||
class WebhookPayloadEvent(BaseModel):
|
||||
action: WebhookEvent_T
|
||||
scope: str
|
||||
|
||||
|
||||
class WebhookPayloadDiscussionChanges(BaseModel):
|
||||
base: str
|
||||
mergeCommitId: Optional[str] = None
|
||||
|
||||
|
||||
class WebhookPayloadComment(ObjectId):
|
||||
author: ObjectId
|
||||
hidden: bool
|
||||
content: Optional[str] = None
|
||||
url: WebhookPayloadUrl
|
||||
|
||||
|
||||
class WebhookPayloadDiscussion(ObjectId):
|
||||
num: int
|
||||
author: ObjectId
|
||||
url: WebhookPayloadUrl
|
||||
title: str
|
||||
isPullRequest: bool
|
||||
status: DiscussionStatus_T
|
||||
changes: Optional[WebhookPayloadDiscussionChanges] = None
|
||||
pinned: Optional[bool] = None
|
||||
|
||||
|
||||
class WebhookPayloadRepo(ObjectId):
|
||||
owner: ObjectId
|
||||
head_sha: Optional[str] = None
|
||||
name: str
|
||||
private: bool
|
||||
subdomain: Optional[str] = None
|
||||
tags: Optional[List[str]] = None
|
||||
type: Literal["dataset", "model", "space"]
|
||||
url: WebhookPayloadUrl
|
||||
|
||||
|
||||
class WebhookPayloadUpdatedRef(BaseModel):
|
||||
ref: str
|
||||
oldSha: Optional[str] = None
|
||||
newSha: Optional[str] = None
|
||||
|
||||
|
||||
class WebhookPayload(BaseModel):
|
||||
event: WebhookPayloadEvent
|
||||
repo: WebhookPayloadRepo
|
||||
discussion: Optional[WebhookPayloadDiscussion] = None
|
||||
comment: Optional[WebhookPayloadComment] = None
|
||||
webhook: WebhookPayloadWebhook
|
||||
movedTo: Optional[WebhookPayloadMovedTo] = None
|
||||
updatedRefs: Optional[List[WebhookPayloadUpdatedRef]] = None
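

# Example (a sketch, not part of the original module): parsing an incoming webhook
# body into a `WebhookPayload`, assuming `payload_dict` is the decoded JSON of the
# request and `pydantic` is installed:
#
#     payload = WebhookPayload.parse_obj(payload_dict)  # pydantic v1-style API
#     if payload.event.action == "update":
#         print(f"Repo {payload.repo.name} was updated")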
|
||||
@@ -0,0 +1,388 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present, the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Contains `WebhooksServer` and `webhook_endpoint` to create a webhook server easily."""
|
||||
|
||||
import atexit
|
||||
import inspect
|
||||
import os
|
||||
from functools import wraps
|
||||
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional
|
||||
|
||||
from .utils import experimental, is_fastapi_available, is_gradio_available
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import gradio as gr
|
||||
from fastapi import Request
|
||||
|
||||
if is_fastapi_available():
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
else:
|
||||
# Will fail at runtime if FastAPI is not available
|
||||
FastAPI = Request = JSONResponse = None # type: ignore [misc, assignment]
|
||||
|
||||
|
||||
_global_app: Optional["WebhooksServer"] = None
|
||||
_is_local = os.environ.get("SPACE_ID") is None
|
||||
|
||||
|
||||
@experimental
|
||||
class WebhooksServer:
|
||||
"""
|
||||
    The [`WebhooksServer`] class lets you create an instance of a Gradio app that can receive Hugging Face webhooks.
    These webhooks can be registered using the [`~WebhooksServer.add_webhook`] decorator. Webhook endpoints are added
    to the app as POST endpoints on the FastAPI router. Once all the webhooks are registered, the `launch` method has
    to be called to start the app.

    It is recommended to accept [`WebhookPayload`] as the first argument of the webhook function. It is a Pydantic
    model that contains all the information about the webhook event. The data will be parsed automatically for you.

    Check out the [webhooks guide](../guides/webhooks_server) for a step-by-step tutorial on how to set up your
    WebhooksServer and deploy it on a Space.

    <Tip warning={true}>

    `WebhooksServer` is experimental. Its API is subject to change in the future.

    </Tip>

    <Tip warning={true}>

    You must have `gradio` installed to use `WebhooksServer` (`pip install --upgrade gradio`).

    </Tip>

    Args:
        ui (`gradio.Blocks`, optional):
            A Gradio UI instance to be used as the Space landing page. If `None`, a UI displaying instructions
            about the configured webhooks is created.
        webhook_secret (`str`, optional):
            A secret key to verify incoming webhook requests. You can set this value to any secret you want as long
            as you also configure it in your [webhooks settings panel](https://huggingface.co/settings/webhooks).
            You can also set this value as the `WEBHOOK_SECRET` environment variable. If no secret is provided, the
            webhook endpoints are opened without any security.

    Example:

    ```python
    import gradio as gr
    from huggingface_hub import WebhooksServer, WebhookPayload

    with gr.Blocks() as ui:
        ...

    app = WebhooksServer(ui=ui, webhook_secret="my_secret_key")

    @app.add_webhook("/say_hello")
    async def hello(payload: WebhookPayload):
        return {"message": "hello"}

    app.launch()
    ```
    """

    def __new__(cls, *args, **kwargs) -> "WebhooksServer":
        if not is_gradio_available():
            raise ImportError(
                "You must have `gradio` installed to use `WebhooksServer`. Please run `pip install --upgrade gradio`"
                " first."
            )
        if not is_fastapi_available():
            raise ImportError(
                "You must have `fastapi` installed to use `WebhooksServer`. Please run `pip install --upgrade fastapi`"
                " first."
            )
        return super().__new__(cls)

    def __init__(
        self,
        ui: Optional["gr.Blocks"] = None,
        webhook_secret: Optional[str] = None,
    ) -> None:
        self._ui = ui

        self.webhook_secret = webhook_secret or os.getenv("WEBHOOK_SECRET")
        self.registered_webhooks: Dict[str, Callable] = {}
        _warn_on_empty_secret(self.webhook_secret)

    def add_webhook(self, path: Optional[str] = None) -> Callable:
        """
        Decorator to add a webhook to the [`WebhooksServer`] server.

        Args:
            path (`str`, optional):
                The URL path to register the webhook function. If not provided, the function name will be used as
                the path. In any case, all webhooks are registered under `/webhooks`.

        Raises:
            ValueError: If the provided path is already registered as a webhook.

        Example:
        ```python
        from huggingface_hub import WebhooksServer, WebhookPayload

        app = WebhooksServer()

        @app.add_webhook
        async def trigger_training(payload: WebhookPayload):
            if payload.repo.type == "dataset" and payload.event.action == "update":
                # Trigger a training job if a dataset is updated
                ...

        app.launch()
        ```
        """
        # Usage: directly as a decorator. Example: `@app.add_webhook`
        if callable(path):
            # If path is a function, it means the decorator was used without arguments
            return self.add_webhook()(path)

        # Usage: provide a path. Example: `@app.add_webhook(...)`
        @wraps(FastAPI.post)
        def _inner_post(*args, **kwargs):
            func = args[0]
            abs_path = f"/webhooks/{(path or func.__name__).strip('/')}"
            if abs_path in self.registered_webhooks:
                raise ValueError(f"Webhook {abs_path} already exists.")
            self.registered_webhooks[abs_path] = func

        return _inner_post

    def launch(self, prevent_thread_lock: bool = False, **launch_kwargs: Any) -> None:
        """Launch the Gradio app and register webhooks to the underlying FastAPI server.

        Input parameters are forwarded to Gradio when launching the app.
        """
        ui = self._ui or self._get_default_ui()

        # Start the Gradio app:
        #   - non-blocking, so that webhooks can be added afterwards
        #   - shared when launched locally (to debug webhooks)
        launch_kwargs.setdefault("share", _is_local)
        self.fastapi_app, _, _ = ui.launch(prevent_thread_lock=True, **launch_kwargs)

        # Register webhooks to the FastAPI app
        for path, func in self.registered_webhooks.items():
            # Add secret check if required
            if self.webhook_secret is not None:
                func = _wrap_webhook_to_check_secret(func, webhook_secret=self.webhook_secret)

            # Add route to the FastAPI app
            self.fastapi_app.post(path)(func)

        # Print instructions and block the main thread
        space_host = os.environ.get("SPACE_HOST")
        url = "https://" + space_host if space_host is not None else (ui.share_url or ui.local_url)
        if url is None:
            raise ValueError("Cannot find the URL of the app. Please provide a valid `ui` or update `gradio` version.")
        url = url.strip("/")
        message = "\nWebhooks are correctly set up and ready to use:"
        message += "\n" + "\n".join(f"  - POST {url}{webhook}" for webhook in self.registered_webhooks)
        message += "\nGo to https://huggingface.co/settings/webhooks to set up your webhooks."
        print(message)

        if not prevent_thread_lock:
            ui.block_thread()
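    # Illustrative note (not part of the original source): pass `prevent_thread_lock=True`
    # to keep the interpreter responsive, e.g. when launching from a notebook:
    #
    #     app = WebhooksServer(webhook_secret="my_secret_key")
    #
    #     @app.add_webhook("/ping")
    #     async def ping(payload: WebhookPayload):
    #         return {"message": "pong"}
    #
    #     app.launch(prevent_thread_lock=True)  # returns instead of blocking the main thread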
    def _get_default_ui(self) -> "gr.Blocks":
        """Default UI if not provided (lists webhooks and provides basic instructions)."""
        import gradio as gr

        with gr.Blocks() as ui:
            gr.Markdown("# This is an app to process 🤗 Webhooks")
            gr.Markdown(
                "Webhooks are a foundation for MLOps-related features. They allow you to listen for new changes on"
                " specific repos or to all repos belonging to a particular set of users/organizations (not just your"
                " repos, but any repo). Check out this [guide](https://huggingface.co/docs/hub/webhooks) to get to"
                " know more about webhooks on the Hugging Face Hub."
            )
            gr.Markdown(
                f"{len(self.registered_webhooks)} webhook(s) are registered:"
                + "\n\n"
                + "\n ".join(
                    f"- [{webhook_path}]({_get_webhook_doc_url(webhook.__name__, webhook_path)})"
                    for webhook_path, webhook in self.registered_webhooks.items()
                )
            )
            gr.Markdown(
                "Go to https://huggingface.co/settings/webhooks to set up your webhooks."
                + "\nYour app is running locally. Please look at the logs to check the full URL you need to set."
                if _is_local
                else (
                    "\nThis app is running on a Space. You can find the corresponding URL in the options menu"
                    " (top-right) > 'Embed the Space'. The URL looks like 'https://{username}-{repo_name}.hf.space'."
                )
            )
        return ui


@experimental
def webhook_endpoint(path: Optional[str] = None) -> Callable:
    """Decorator to start a [`WebhooksServer`] and register the decorated function as a webhook endpoint.

    This is a helper to get started quickly. If you need more flexibility (custom landing page or webhook secret),
    you can use [`WebhooksServer`] directly. You can register multiple webhook endpoints (to the same server) by
    using this decorator multiple times.

    Check out the [webhooks guide](../guides/webhooks_server) for a step-by-step tutorial on how to set up your
    server and deploy it on a Space.

    <Tip warning={true}>

    `webhook_endpoint` is experimental. Its API is subject to change in the future.

    </Tip>

    <Tip warning={true}>

    You must have `gradio` installed to use `webhook_endpoint` (`pip install --upgrade gradio`).

    </Tip>

    Args:
        path (`str`, optional):
            The URL path to register the webhook function. If not provided, the function name will be used as the
            path. In any case, all webhooks are registered under `/webhooks`.

    Examples:
        The default usage is to register a function as a webhook endpoint. The function name will be used as the
        path. The server will be started automatically at exit (i.e. at the end of the script).

        ```python
        from huggingface_hub import webhook_endpoint, WebhookPayload

        @webhook_endpoint
        async def trigger_training(payload: WebhookPayload):
            if payload.repo.type == "dataset" and payload.event.action == "update":
                # Trigger a training job if a dataset is updated
                ...

        # Server is automatically started at the end of the script.
        ```

        Advanced usage: register a function as a webhook endpoint and start the server manually. This is useful if
        you are running it in a notebook.

        ```python
        from huggingface_hub import webhook_endpoint, WebhookPayload

        @webhook_endpoint
        async def trigger_training(payload: WebhookPayload):
            if payload.repo.type == "dataset" and payload.event.action == "update":
                # Trigger a training job if a dataset is updated
                ...

        # Start the server manually
        trigger_training.launch()
        ```
    """
    if callable(path):
        # If path is a function, it means the decorator was used without arguments
        return webhook_endpoint()(path)

    @wraps(WebhooksServer.add_webhook)
    def _inner(func: Callable) -> Callable:
        app = _get_global_app()
        app.add_webhook(path)(func)
        if len(app.registered_webhooks) == 1:
            # Register `app.launch` to run at exit (only once)
            atexit.register(app.launch)

        @wraps(app.launch)
        def _launch_now():
            # Run the app directly (without waiting for atexit)
            atexit.unregister(app.launch)
            app.launch()

        func.launch = _launch_now  # type: ignore
        return func

    return _inner


def _get_global_app() -> WebhooksServer:
    global _global_app
    if _global_app is None:
        _global_app = WebhooksServer()
    return _global_app


def _warn_on_empty_secret(webhook_secret: Optional[str]) -> None:
    if webhook_secret is None:
        print("Webhook secret is not defined. This means your webhook endpoints will be open to everyone.")
        print(
            "To add a secret, set `WEBHOOK_SECRET` as an environment variable or pass it at initialization: "
            "\n\t`app = WebhooksServer(webhook_secret='my_secret', ...)`"
        )
        print(
            "For more details about webhook secrets, please refer to"
            " https://huggingface.co/docs/hub/webhooks#webhook-secret."
        )
    else:
        print("Webhook secret is correctly defined.")


def _get_webhook_doc_url(webhook_name: str, webhook_path: str) -> str:
    """Returns the anchor to a given webhook in the docs (experimental)"""
    return "/docs#/default/" + webhook_name + webhook_path.replace("/", "_") + "_post"


def _wrap_webhook_to_check_secret(func: Callable, webhook_secret: str) -> Callable:
    """Wraps a webhook function to check the webhook secret before calling the function.

    This is a hacky way to add the `request` parameter to the function signature. Since FastAPI relies on route
    parameters to inject values into the function, we need to hack the function signature to retrieve the `Request`
    object (and hence the headers). A far cleaner solution would be to use a middleware. However, since
    `fastapi==0.90.1`, a middleware cannot be added once the app has started. And since the FastAPI app is started by
    Gradio internals (and not by us), we cannot add a middleware.

    This method is called only when a secret has been defined by the user. If a request is sent without the
    "x-webhook-secret" header, the function will return a 401 error (unauthorized). If the header is sent but is
    incorrect, the function will return a 403 error (forbidden).

    Inspired by https://stackoverflow.com/a/33112180.
    """
    initial_sig = inspect.signature(func)

    @wraps(func)
    async def _protected_func(request: Request, **kwargs):
        request_secret = request.headers.get("x-webhook-secret")
        if request_secret is None:
            return JSONResponse({"error": "x-webhook-secret header not set."}, status_code=401)
        if request_secret != webhook_secret:
            return JSONResponse({"error": "Invalid webhook secret."}, status_code=403)

        # Inject `request` in kwargs if required
        if "request" in initial_sig.parameters:
            kwargs["request"] = request

        # Handle both sync and async routes
        if inspect.iscoroutinefunction(func):
            return await func(**kwargs)
        else:
            return func(**kwargs)

    # Update the signature to include `request`
    if "request" not in initial_sig.parameters:
        _protected_func.__signature__ = initial_sig.replace(  # type: ignore
            parameters=(
                inspect.Parameter(name="request", kind=inspect.Parameter.POSITIONAL_OR_KEYWORD, annotation=Request),
            )
            + tuple(initial_sig.parameters.values())
        )

    # Return the protected route
    return _protected_func
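
# Illustrative sketch (not part of the original source): how a client passes the secret
# check added by `_wrap_webhook_to_check_secret`. The URL and secret are placeholders.
#
#     import requests
#
#     response = requests.post(
#         "https://my-space.hf.space/webhooks/say_hello",
#         json={"event": {"action": "update"}},
#         headers={"x-webhook-secret": "my_secret_key"},  # missing -> 401, wrong -> 403
#     )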
@@ -0,0 +1,27 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod
from argparse import _SubParsersAction


class BaseHuggingfaceCLICommand(ABC):
    @staticmethod
    @abstractmethod
    def register_subcommand(parser: _SubParsersAction):
        raise NotImplementedError()

    @abstractmethod
    def run(self):
        raise NotImplementedError()
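
# Illustrative sketch (not part of the original file): a minimal subclass showing the
# contract expected by the CLI entry point. `HelloCommand` is a made-up example.
#
#     class HelloCommand(BaseHuggingfaceCLICommand):
#         @staticmethod
#         def register_subcommand(parser: _SubParsersAction):
#             hello_parser = parser.add_parser("hello", help="Say hello.")
#             # `func` is later called with the parsed args to instantiate the command
#             hello_parser.set_defaults(func=HelloCommand)
#
#         def __init__(self, args):
#             self.args = args
#
#         def run(self):
#             print("hello")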
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,69 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains a utility for good-looking prints."""

import os
from typing import List, Union


class ANSI:
    """
    Helper for en.wikipedia.org/wiki/ANSI_escape_code
    """

    _bold = "\u001b[1m"
    _gray = "\u001b[90m"
    _red = "\u001b[31m"
    _reset = "\u001b[0m"
    _yellow = "\u001b[33m"

    @classmethod
    def bold(cls, s: str) -> str:
        return cls._format(s, cls._bold)

    @classmethod
    def gray(cls, s: str) -> str:
        return cls._format(s, cls._gray)

    @classmethod
    def red(cls, s: str) -> str:
        return cls._format(s, cls._bold + cls._red)

    @classmethod
    def yellow(cls, s: str) -> str:
        return cls._format(s, cls._yellow)

    @classmethod
    def _format(cls, s: str, code: str) -> str:
        if os.environ.get("NO_COLOR"):
            # See https://no-color.org/
            return s
        return f"{code}{s}{cls._reset}"
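
# Illustrative usage (not part of the original file):
#
#     print(ANSI.red("2 warnings"))   # bold red, unless the NO_COLOR env variable is set
#     print(ANSI.gray("details..."))  # gray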


def tabulate(rows: List[List[Union[str, int]]], headers: List[str]) -> str:
    """
    Inspired by:

    - stackoverflow.com/a/8356620/593036
    - stackoverflow.com/questions/9535954/printing-lists-as-tabular-data
    """
    col_widths = [max(len(str(x)) for x in col) for col in zip(*rows, headers)]
    row_format = ("{{:{}}} " * len(headers)).format(*col_widths)
    lines = []
    lines.append(row_format.format(*headers))
    lines.append(row_format.format(*["-" * w for w in col_widths]))
    for row in rows:
        lines.append(row_format.format(*row))
    return "\n".join(lines)
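
# Illustrative usage (not part of the original file):
#
#     print(tabulate([["model-a", 3], ["model-b", 12]], headers=["REPO", "FILES"]))
#
# prints (trailing padding omitted):
#
#     REPO    FILES
#     ------- -----
#     model-a 3
#     model-b 12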
@@ -0,0 +1,474 @@
# coding=utf-8
# Copyright 2022-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains command to delete some revisions from the HF cache directory.

Usage:
    huggingface-cli delete-cache
    huggingface-cli delete-cache --disable-tui
    huggingface-cli delete-cache --dir ~/.cache/huggingface/hub
    huggingface-cli delete-cache --sort=size

NOTE:
    This command is based on `InquirerPy` to build the multiselect menu in the terminal.
    This dependency has to be installed with `pip install huggingface_hub[cli]`. Since
    we want to avoid cross-platform issues as much as possible, I chose a library that
    is built on top of `python-prompt-toolkit`, which seems to be a reference for
    terminal GUIs (actively maintained on both Unix and Windows, 7.9k stars).

    For the moment, the TUI feature is in beta.

    See:
    - https://github.com/kazhala/InquirerPy
    - https://inquirerpy.readthedocs.io/en/latest/
    - https://github.com/prompt-toolkit/python-prompt-toolkit

    Other solutions could have been:
    - `simple_term_menu`: would also fit our use case, but some issues suggest that
      Windows is less well supported.
      See: https://github.com/IngoMeyer441/simple-term-menu
    - `PyInquirer`: very similar to `InquirerPy` but older and not maintained anymore.
      In particular, no support for Python 3.10.
      See: https://github.com/CITGuru/PyInquirer
    - `pick` (or `pickpack`): easy to use and flexible, but built on top of Python's
      standard library `curses`, which is specific to Unix (not implemented on Windows).
      See https://github.com/wong2/pick and https://github.com/anafvana/pickpack.
    - `inquirer`: a lot of traction (700 stars) but explicitly states "experimental
      support of Windows". Not built on top of `python-prompt-toolkit`.
      See https://github.com/magmax/python-inquirer

TODO: add support for `huggingface-cli delete-cache aaaaaa bbbbbb cccccc (...)` ?
TODO: add "--keep-last" arg to delete revisions that are not on `main` ref
TODO: add "--filter" arg to filter repositories by name ?
TODO: add "--limit" arg to limit to X repos ?
TODO: add "-y" arg for immediate deletion ?
See discussions in https://github.com/huggingface/huggingface_hub/issues/1025.
"""

import os
from argparse import Namespace, _SubParsersAction
from functools import wraps
from tempfile import mkstemp
from typing import Any, Callable, Iterable, List, Literal, Optional, Union

from ..utils import CachedRepoInfo, CachedRevisionInfo, HFCacheInfo, scan_cache_dir
from . import BaseHuggingfaceCLICommand
from ._cli_utils import ANSI


try:
    from InquirerPy import inquirer
    from InquirerPy.base.control import Choice
    from InquirerPy.separator import Separator

    _inquirer_py_available = True
except ImportError:
    _inquirer_py_available = False

SortingOption_T = Literal["alphabetical", "lastUpdated", "lastUsed", "size"]


def require_inquirer_py(fn: Callable) -> Callable:
    """Decorator to flag methods that require `InquirerPy`."""

    # TODO: refactor this + imports in a unified pattern across the codebase
    @wraps(fn)
    def _inner(*args, **kwargs):
        if not _inquirer_py_available:
            raise ImportError(
                "The `delete-cache` command requires extra dependencies to work with"
                " the TUI.\nPlease run `pip install huggingface_hub[cli]` to install"
                " them.\nOtherwise, disable TUI using the `--disable-tui` flag."
            )

        return fn(*args, **kwargs)

    return _inner


# Possibility for the user to cancel deletion
_CANCEL_DELETION_STR = "CANCEL_DELETION"


class DeleteCacheCommand(BaseHuggingfaceCLICommand):
    @staticmethod
    def register_subcommand(parser: _SubParsersAction):
        delete_cache_parser = parser.add_parser("delete-cache", help="Delete revisions from the cache directory.")

        delete_cache_parser.add_argument(
            "--dir",
            type=str,
            default=None,
            help="cache directory (optional). Defaults to the default HuggingFace cache.",
        )

        delete_cache_parser.add_argument(
            "--disable-tui",
            action="store_true",
            help=(
                "Disable Terminal User Interface (TUI) mode. Useful if your"
                " platform/terminal doesn't support the multiselect menu."
            ),
        )

        delete_cache_parser.add_argument(
            "--sort",
            nargs="?",
            choices=["alphabetical", "lastUpdated", "lastUsed", "size"],
            help=(
                "Sort repositories by the specified criteria. Options: "
                "'alphabetical' (A-Z), "
                "'lastUpdated' (newest first), "
                "'lastUsed' (most recent first), "
                "'size' (largest first)."
            ),
        )

        delete_cache_parser.set_defaults(func=DeleteCacheCommand)

    def __init__(self, args: Namespace) -> None:
        self.cache_dir: Optional[str] = args.dir
        self.disable_tui: bool = args.disable_tui
        self.sort_by: Optional[SortingOption_T] = args.sort

    def run(self):
        """Run `delete-cache` command with or without TUI."""
        # Scan cache directory
        hf_cache_info = scan_cache_dir(self.cache_dir)

        # Manual review from the user
        if self.disable_tui:
            selected_hashes = _manual_review_no_tui(hf_cache_info, preselected=[], sort_by=self.sort_by)
        else:
            selected_hashes = _manual_review_tui(hf_cache_info, preselected=[], sort_by=self.sort_by)

        # If deletion is not cancelled
        if len(selected_hashes) > 0 and _CANCEL_DELETION_STR not in selected_hashes:
            confirm_message = _get_expectations_str(hf_cache_info, selected_hashes) + " Confirm deletion?"

            # Confirm deletion
            if self.disable_tui:
                confirmed = _ask_for_confirmation_no_tui(confirm_message)
            else:
                confirmed = _ask_for_confirmation_tui(confirm_message)

            # Deletion is confirmed
            if confirmed:
                strategy = hf_cache_info.delete_revisions(*selected_hashes)
                print("Start deletion.")
                strategy.execute()
                print(
                    f"Done. Deleted {len(strategy.repos)} repo(s) and"
                    f" {len(strategy.snapshots)} revision(s) for a total of"
                    f" {strategy.expected_freed_size_str}."
                )
                return

        # Deletion is cancelled
        print("Deletion is cancelled. Do nothing.")
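
# Illustrative sketch (not part of the original file): the same deletion flow is
# available programmatically through the cache API used above:
#
#     from huggingface_hub import scan_cache_dir
#
#     info = scan_cache_dir()
#     strategy = info.delete_revisions("<revision_hash_1>", "<revision_hash_2>")
#     print(f"Will free {strategy.expected_freed_size_str}.")
#     strategy.execute()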


def _get_repo_sorting_key(repo: CachedRepoInfo, sort_by: Optional[SortingOption_T] = None):
    if sort_by == "alphabetical":
        return (repo.repo_type, repo.repo_id.lower())  # by type, then name
    elif sort_by == "lastUpdated":
        return -max(rev.last_modified for rev in repo.revisions)  # newest first
    elif sort_by == "lastUsed":
        return -repo.last_accessed  # most recently used first
    elif sort_by == "size":
        return -repo.size_on_disk  # largest first
    else:
        return (repo.repo_type, repo.repo_id)  # default stable order


@require_inquirer_py
def _manual_review_tui(
    hf_cache_info: HFCacheInfo,
    preselected: List[str],
    sort_by: Optional[SortingOption_T] = None,
) -> List[str]:
    """Ask the user for a manual review of the revisions to delete.

    Displays a multi-select menu in the terminal (TUI).
    """
    # Define the multiselect list
    choices = _get_tui_choices_from_scan(
        repos=hf_cache_info.repos,
        preselected=preselected,
        sort_by=sort_by,
    )
    checkbox = inquirer.checkbox(
        message="Select revisions to delete:",
        choices=choices,  # List of revisions with some pre-selection
        cycle=False,  # No loop between top and bottom
        height=100,  # Large list if possible
        # We use the instruction to display the expected effect of the deletion
        # to the user.
        instruction=_get_expectations_str(
            hf_cache_info,
            selected_hashes=[c.value for c in choices if isinstance(c, Choice) and c.enabled],
        ),
        # We use the long instruction to show keybinding instructions to the user
        long_instruction="Press <space> to select, <enter> to validate and <ctrl+c> to quit without modification.",
        # Message that is displayed once the user validates their selection.
        transformer=lambda result: f"{len(result)} revision(s) selected.",
    )

    # Add a callback to update the information line when a revision is
    # selected/unselected
    def _update_expectations(_) -> None:
        # Hacky way to dynamically set an instruction message on the checkbox when
        # a revision hash is selected/unselected.
        checkbox._instruction = _get_expectations_str(
            hf_cache_info,
            selected_hashes=[choice["value"] for choice in checkbox.content_control.choices if choice["enabled"]],
        )

    checkbox.kb_func_lookup["toggle"].append({"func": _update_expectations})

    # Finally display the form to the user.
    try:
        return checkbox.execute()
    except KeyboardInterrupt:
        return []  # Quit without deletion


@require_inquirer_py
def _ask_for_confirmation_tui(message: str, default: bool = True) -> bool:
    """Ask for confirmation using Inquirer."""
    return inquirer.confirm(message, default=default).execute()


def _get_tui_choices_from_scan(
    repos: Iterable[CachedRepoInfo],
    preselected: List[str],
    sort_by: Optional[SortingOption_T] = None,
) -> List:
    """Build a list of choices from the scanned repos.

    Args:
        repos (*Iterable[`CachedRepoInfo`]*):
            List of scanned repos on which we want to delete revisions.
        preselected (*List[`str`]*):
            List of revision hashes that will be preselected.
        sort_by (*Optional[SortingOption_T]*):
            Sorting criterion. Choices: "alphabetical", "lastUpdated", "lastUsed", "size".

    Return:
        The list of choices to pass to `inquirer.checkbox`.
    """
    choices: List[Union[Choice, Separator]] = []

    # First choice is to cancel the deletion
    choices.append(
        Choice(
            _CANCEL_DELETION_STR,
            name="None of the following (if selected, nothing will be deleted).",
            enabled=False,
        )
    )

    # Sort repos based on the specified criteria
    sorted_repos = sorted(repos, key=lambda repo: _get_repo_sorting_key(repo, sort_by))

    for repo in sorted_repos:
        # Repo as separator
        choices.append(
            Separator(
                f"\n{repo.repo_type.capitalize()} {repo.repo_id} ({repo.size_on_disk_str},"
                f" used {repo.last_accessed_str})"
            )
        )
        for revision in sorted(repo.revisions, key=_revision_sorting_order):
            # Revision as choice
            choices.append(
                Choice(
                    revision.commit_hash,
                    name=(
                        f"{revision.commit_hash[:8]}:"
                        f" {', '.join(sorted(revision.refs)) or '(detached)'} #"
                        f" modified {revision.last_modified_str}"
                    ),
                    enabled=revision.commit_hash in preselected,
                )
            )

    # Return choices
    return choices


def _manual_review_no_tui(
    hf_cache_info: HFCacheInfo,
    preselected: List[str],
    sort_by: Optional[SortingOption_T] = None,
) -> List[str]:
    """Ask the user for a manual review of the revisions to delete.

    Used when the TUI is disabled. Manual review happens in a separate tmp file that
    the user can manually edit.
    """
    # 1. Generate a temporary file with delete commands.
    fd, tmp_path = mkstemp(suffix=".txt")  # suffix to make it easier to find by editors
    os.close(fd)

    lines = []

    sorted_repos = sorted(hf_cache_info.repos, key=lambda repo: _get_repo_sorting_key(repo, sort_by))

    for repo in sorted_repos:
        lines.append(
            f"\n# {repo.repo_type.capitalize()} {repo.repo_id} ({repo.size_on_disk_str},"
            f" used {repo.last_accessed_str})"
        )
        for revision in sorted(repo.revisions, key=_revision_sorting_order):
            lines.append(
                # Deselect by prepending a '#'
                f"{'' if revision.commit_hash in preselected else '#'} "
                f"    {revision.commit_hash} # Refs:"
                # Print `refs` as comment on the same line
                f" {', '.join(sorted(revision.refs)) or '(detached)'} # modified"
                # Print `last_modified` as comment on the same line
                f" {revision.last_modified_str}"
            )

    with open(tmp_path, "w") as f:
        f.write(_MANUAL_REVIEW_NO_TUI_INSTRUCTIONS)
        f.write("\n".join(lines))

    # 2. Prompt instructions to the user.
    instructions = f"""
    TUI is disabled. In order to select which revisions you want to delete, please edit
    the following file using the text editor of your choice. Instructions for manual
    editing are located at the beginning of the file. Edit the file, save it and confirm
    to continue.
    File to edit: {ANSI.bold(tmp_path)}
    """
    print("\n".join(line.strip() for line in instructions.strip().split("\n")))

    # 3. Wait for user confirmation.
    while True:
        selected_hashes = _read_manual_review_tmp_file(tmp_path)
        if _ask_for_confirmation_no_tui(
            _get_expectations_str(hf_cache_info, selected_hashes) + " Continue?",
            default=False,
        ):
            break

    # 4. Return selected_hashes sorted to maintain a stable order
    os.remove(tmp_path)
    return sorted(selected_hashes)


def _ask_for_confirmation_no_tui(message: str, default: bool = True) -> bool:
    """Ask for confirmation using pure Python."""
    YES = ("y", "yes", "1")
    NO = ("n", "no", "0")
    DEFAULT = ""
    ALL = YES + NO + (DEFAULT,)
    full_message = message + (" (Y/n) " if default else " (y/N) ")
    while True:
        answer = input(full_message).lower()
        if answer == DEFAULT:
            return default
        if answer in YES:
            return True
        if answer in NO:
            return False
        print(f"Invalid input. Must be one of {ALL}")


def _get_expectations_str(hf_cache_info: HFCacheInfo, selected_hashes: List[str]) -> str:
    """Format a string to display to the user how much space would be saved.

    Example:
    ```
    >>> _get_expectations_str(hf_cache_info, selected_hashes)
    '7 revisions selected counting for 4.3G.'
    ```
    """
    if _CANCEL_DELETION_STR in selected_hashes:
        return "Nothing will be deleted."
    strategy = hf_cache_info.delete_revisions(*selected_hashes)
    return f"{len(selected_hashes)} revisions selected counting for {strategy.expected_freed_size_str}."


def _read_manual_review_tmp_file(tmp_path: str) -> List[str]:
    """Read the manually reviewed instruction file and return a list of revision hashes.

    Example:
    ```txt
    # This is the tmp file content
    ###

    # Commented out line
    123456789 # revision hash

    # Something else
    # a_newer_hash # 2 days ago
    an_older_hash # 3 days ago
    ```

    ```py
    >>> _read_manual_review_tmp_file(tmp_path)
    ['123456789', 'an_older_hash']
    ```
    """
    with open(tmp_path) as f:
        content = f.read()

    # Split lines
    lines = [line.strip() for line in content.split("\n")]

    # Filter commented lines
    selected_lines = [line for line in lines if not line.startswith("#")]

    # Keep only the part before any inline comment
    selected_hashes = [line.split("#")[0].strip() for line in selected_lines]

    # Return non-empty revision hashes
    return [hash for hash in selected_hashes if len(hash) > 0]


_MANUAL_REVIEW_NO_TUI_INSTRUCTIONS = f"""
# INSTRUCTIONS
# ------------
# This is a temporary file created by running `huggingface-cli delete-cache` with the
# `--disable-tui` option. It contains a set of revisions that can be deleted from your
# local cache directory.
#
# Please manually review the revisions you want to delete:
#   - Revision hashes can be commented out with '#'.
#   - Only non-commented revisions in this file will be deleted.
#   - Revision hashes that are removed from this file are ignored as well.
#   - If the `{_CANCEL_DELETION_STR}` line is uncommented, the whole deletion is
#     cancelled and no changes will be applied.
#
# Once you've manually reviewed this file, please confirm deletion in the terminal. This
# file will be automatically removed once done.
# ------------

# KILL SWITCH
# ------------
# Un-comment the following line to completely cancel the deletion process
# {_CANCEL_DELETION_STR}
# ------------

# REVISIONS
# ------------
""".strip()


def _revision_sorting_order(revision: CachedRevisionInfo) -> Any:
    # Sort by last modified (oldest first)
    return revision.last_modified
@@ -0,0 +1,200 @@
# coding=utf-8
# Copyright 2023-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains command to download files from the Hub with the CLI.

Usage:
    huggingface-cli download --help

    # Download a single file
    huggingface-cli download gpt2 config.json

    # Download an entire repo
    huggingface-cli download fffiloni/zeroscope --repo-type=space --revision=refs/pr/78

    # Download a repo with filters
    huggingface-cli download gpt2 --include="*.safetensors"

    # Download with a token
    huggingface-cli download Wauplin/private-model --token=hf_***

    # Download quietly (no progress bar, no warnings, only the returned path)
    huggingface-cli download gpt2 config.json --quiet

    # Download to a local dir
    huggingface-cli download gpt2 --local-dir=./models/gpt2
"""

import warnings
from argparse import Namespace, _SubParsersAction
from typing import List, Optional

from huggingface_hub import logging
from huggingface_hub._snapshot_download import snapshot_download
from huggingface_hub.commands import BaseHuggingfaceCLICommand
from huggingface_hub.file_download import hf_hub_download
from huggingface_hub.utils import disable_progress_bars, enable_progress_bars


logger = logging.get_logger(__name__)


class DownloadCommand(BaseHuggingfaceCLICommand):
    @staticmethod
    def register_subcommand(parser: _SubParsersAction):
        download_parser = parser.add_parser("download", help="Download files from the Hub")
        download_parser.add_argument(
            "repo_id", type=str, help="ID of the repo to download from (e.g. `username/repo-name`)."
        )
        download_parser.add_argument(
            "filenames", type=str, nargs="*", help="Files to download (e.g. `config.json`, `data/metadata.jsonl`)."
        )
        download_parser.add_argument(
            "--repo-type",
            choices=["model", "dataset", "space"],
            default="model",
            help="Type of repo to download from (defaults to 'model').",
        )
        download_parser.add_argument(
            "--revision",
            type=str,
            help="An optional Git revision id which can be a branch name, a tag, or a commit hash.",
        )
        download_parser.add_argument(
            "--include", nargs="*", type=str, help="Glob patterns to match files to download."
        )
        download_parser.add_argument(
            "--exclude", nargs="*", type=str, help="Glob patterns to exclude from files to download."
        )
        download_parser.add_argument(
            "--cache-dir", type=str, help="Path to the directory where to save the downloaded files."
        )
        download_parser.add_argument(
            "--local-dir",
            type=str,
            help=(
                "If set, the downloaded file will be placed under this directory. Check out"
                " https://huggingface.co/docs/huggingface_hub/guides/download#download-files-to-local-folder for more"
                " details."
            ),
        )
        download_parser.add_argument(
            "--local-dir-use-symlinks",
            choices=["auto", "True", "False"],
            help="Deprecated and ignored. Downloading to a local directory does not use symlinks anymore.",
        )
        download_parser.add_argument(
            "--force-download",
            action="store_true",
            help="If True, the files will be downloaded even if they are already cached.",
        )
        download_parser.add_argument(
            "--resume-download",
            action="store_true",
            help=(
                "Deprecated and ignored. Downloading a file to a local dir always attempts to resume previously"
                " interrupted downloads (unless hf-transfer is enabled)."
            ),
        )
        download_parser.add_argument(
            "--token", type=str, help="A User Access Token generated from https://huggingface.co/settings/tokens"
        )
        download_parser.add_argument(
            "--quiet",
            action="store_true",
            help="If True, progress bars are disabled and only the path to the downloaded files is printed.",
        )
        download_parser.add_argument(
            "--max-workers",
            type=int,
            default=8,
            help="Maximum number of workers to use for downloading files. Default is 8.",
        )
        download_parser.set_defaults(func=DownloadCommand)

    def __init__(self, args: Namespace) -> None:
        self.token = args.token
        self.repo_id: str = args.repo_id
        self.filenames: List[str] = args.filenames
        self.repo_type: str = args.repo_type
        self.revision: Optional[str] = args.revision
        self.include: Optional[List[str]] = args.include
        self.exclude: Optional[List[str]] = args.exclude
        self.cache_dir: Optional[str] = args.cache_dir
        self.local_dir: Optional[str] = args.local_dir
        self.force_download: bool = args.force_download
        self.resume_download: Optional[bool] = args.resume_download or None
        self.quiet: bool = args.quiet
        self.max_workers: int = args.max_workers

        if args.local_dir_use_symlinks is not None:
            warnings.warn(
                "Ignoring --local-dir-use-symlinks. Downloading to a local directory does not use symlinks anymore.",
                FutureWarning,
            )

    def run(self) -> None:
        if self.quiet:
            disable_progress_bars()
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                print(self._download())  # Print path to downloaded files
            enable_progress_bars()
        else:
            logging.set_verbosity_info()
            print(self._download())  # Print path to downloaded files
            logging.set_verbosity_warning()

    def _download(self) -> str:
        # Warn the user if patterns are ignored
        if len(self.filenames) > 0:
            if self.include is not None and len(self.include) > 0:
                warnings.warn("Ignoring `--include` since filenames have been explicitly set.")
            if self.exclude is not None and len(self.exclude) > 0:
                warnings.warn("Ignoring `--exclude` since filenames have been explicitly set.")

        # Single file to download: use `hf_hub_download`
        if len(self.filenames) == 1:
            return hf_hub_download(
                repo_id=self.repo_id,
                repo_type=self.repo_type,
                revision=self.revision,
                filename=self.filenames[0],
                cache_dir=self.cache_dir,
                resume_download=self.resume_download,
                force_download=self.force_download,
                token=self.token,
                local_dir=self.local_dir,
                library_name="huggingface-cli",
            )

        # Otherwise: use `snapshot_download` to ensure all files come from the same revision
        elif len(self.filenames) == 0:
            allow_patterns = self.include
            ignore_patterns = self.exclude
        else:
            allow_patterns = self.filenames
            ignore_patterns = None

        return snapshot_download(
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            revision=self.revision,
            allow_patterns=allow_patterns,
            ignore_patterns=ignore_patterns,
            resume_download=self.resume_download,
            force_download=self.force_download,
            cache_dir=self.cache_dir,
            token=self.token,
            local_dir=self.local_dir,
            library_name="huggingface-cli",
            max_workers=self.max_workers,
        )
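
# Illustrative sketch (not part of the original file): the Python equivalents of the
# CLI calls above, using the same public helpers this command wraps:
#
#     from huggingface_hub import hf_hub_download, snapshot_download
#
#     path = hf_hub_download(repo_id="gpt2", filename="config.json")  # single file
#     folder = snapshot_download(repo_id="gpt2", allow_patterns=["*.safetensors"])  # filtered repo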
@@ -0,0 +1,36 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains command to print information about the environment.

Usage:
    huggingface-cli env
"""

from argparse import _SubParsersAction

from ..utils import dump_environment_info
from . import BaseHuggingfaceCLICommand


class EnvironmentCommand(BaseHuggingfaceCLICommand):
    def __init__(self, args):
        self.args = args

    @staticmethod
    def register_subcommand(parser: _SubParsersAction):
        env_parser = parser.add_parser("env", help="Print information about the environment.")
        env_parser.set_defaults(func=EnvironmentCommand)

    def run(self) -> None:
        dump_environment_info()
@@ -0,0 +1,61 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from argparse import ArgumentParser

from huggingface_hub.commands.delete_cache import DeleteCacheCommand
from huggingface_hub.commands.download import DownloadCommand
from huggingface_hub.commands.env import EnvironmentCommand
from huggingface_hub.commands.lfs import LfsCommands
from huggingface_hub.commands.repo_files import RepoFilesCommand
from huggingface_hub.commands.scan_cache import ScanCacheCommand
from huggingface_hub.commands.tag import TagCommands
from huggingface_hub.commands.upload import UploadCommand
from huggingface_hub.commands.upload_large_folder import UploadLargeFolderCommand
from huggingface_hub.commands.user import UserCommands
from huggingface_hub.commands.version import VersionCommand


def main():
    parser = ArgumentParser("huggingface-cli", usage="huggingface-cli <command> [<args>]")
    commands_parser = parser.add_subparsers(help="huggingface-cli command helpers")

    # Register commands
    DownloadCommand.register_subcommand(commands_parser)
    UploadCommand.register_subcommand(commands_parser)
    RepoFilesCommand.register_subcommand(commands_parser)
    EnvironmentCommand.register_subcommand(commands_parser)
    UserCommands.register_subcommand(commands_parser)
    LfsCommands.register_subcommand(commands_parser)
    ScanCacheCommand.register_subcommand(commands_parser)
    DeleteCacheCommand.register_subcommand(commands_parser)
    TagCommands.register_subcommand(commands_parser)
    VersionCommand.register_subcommand(commands_parser)

    # Experimental
    UploadLargeFolderCommand.register_subcommand(commands_parser)

    # Let's go
    args = parser.parse_args()
    if not hasattr(args, "func"):
        parser.print_help()
        exit(1)

    # Run
    service = args.func(args)
    service.run()


if __name__ == "__main__":
    main()
@@ -0,0 +1,200 @@
"""
Implementation of a custom transfer agent for the transfer type "multipart" for
git-lfs.

Inspired by:
github.com/cbartz/git-lfs-swift-transfer-agent/blob/master/git_lfs_swift_transfer.py

Spec is: github.com/git-lfs/git-lfs/blob/master/docs/custom-transfers.md


To launch a debugger while developing:

```
[lfs "customtransfer.multipart"]
path = /path/to/huggingface_hub/.env/bin/python
args = -m debugpy --listen 5678 --wait-for-client /path/to/huggingface_hub/src/huggingface_hub/commands/huggingface_cli.py lfs-multipart-upload
```
"""

import json
import os
import subprocess
import sys
from argparse import _SubParsersAction
from typing import Dict, List, Optional

from huggingface_hub.commands import BaseHuggingfaceCLICommand
from huggingface_hub.lfs import LFS_MULTIPART_UPLOAD_COMMAND

from ..utils import get_session, hf_raise_for_status, logging
from ..utils._lfs import SliceFileObj


logger = logging.get_logger(__name__)


class LfsCommands(BaseHuggingfaceCLICommand):
    """
    Implementation of a custom transfer agent for the transfer type "multipart"
    for git-lfs. This lets users upload large files >5GB 🔥. The spec for LFS custom
    transfer agents is:
    https://github.com/git-lfs/git-lfs/blob/master/docs/custom-transfers.md

    This introduces two commands to the CLI:

    1. $ huggingface-cli lfs-enable-largefiles

    This should be executed once for each model repo that contains a model file
    >5GB. It's documented in the error message you get if you just try to git
    push a 5GB file without having enabled it before.

    2. $ huggingface-cli lfs-multipart-upload

    This command is called by lfs directly and is not meant to be called by the
    user.
    """

    @staticmethod
    def register_subcommand(parser: _SubParsersAction):
        enable_parser = parser.add_parser(
            "lfs-enable-largefiles", help="Configure your repository to enable upload of files > 5GB."
        )
        enable_parser.add_argument("path", type=str, help="Local path to repository you want to configure.")
        enable_parser.set_defaults(func=lambda args: LfsEnableCommand(args))

        # This command will get called by git-lfs, do not call it directly.
        upload_parser = parser.add_parser(LFS_MULTIPART_UPLOAD_COMMAND, add_help=False)
        upload_parser.set_defaults(func=lambda args: LfsUploadCommand(args))


class LfsEnableCommand:
    def __init__(self, args):
        self.args = args

    def run(self):
        local_path = os.path.abspath(self.args.path)
        if not os.path.isdir(local_path):
            print("This does not look like a valid git repo.")
            exit(1)
        subprocess.run(
            "git config lfs.customtransfer.multipart.path huggingface-cli".split(),
            check=True,
            cwd=local_path,
        )
        subprocess.run(
            f"git config lfs.customtransfer.multipart.args {LFS_MULTIPART_UPLOAD_COMMAND}".split(),
            check=True,
            cwd=local_path,
        )
        print("Local repo set up for largefiles")
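
# Illustrative (not part of the original file): after `LfsEnableCommand.run()` succeeds,
# the repo's `.git/config` contains the custom transfer agent entry, roughly:
#
#     [lfs "customtransfer.multipart"]
#         path = huggingface-cli
#         args = lfs-multipart-upload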


def write_msg(msg: Dict):
    """Write out the message in Line delimited JSON."""
    msg_str = json.dumps(msg) + "\n"
    sys.stdout.write(msg_str)
    sys.stdout.flush()


def read_msg() -> Optional[Dict]:
    """Read Line delimited JSON from stdin."""
    msg = json.loads(sys.stdin.readline().strip())

    if "terminate" in (msg.get("type"), msg.get("event")):
        # terminate message received
        return None

    if msg.get("event") not in ("download", "upload"):
        logger.critical("Received unexpected message")
        sys.exit(1)

    return msg
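
# Illustrative (not part of the original file): the line-delimited JSON messages
# exchanged with git-lfs look roughly like this (see the custom-transfers spec linked
# in the module docstring):
#
#     {"event": "init", "operation": "upload", "concurrent": true}
#     {"event": "upload", "oid": "abc123...", "path": "/tmp/file.bin", "action": {...}}
#     {"event": "terminate"}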


class LfsUploadCommand:
    def __init__(self, args) -> None:
        self.args = args

    def run(self) -> None:
        # Immediately after invoking a custom transfer process, git-lfs
        # sends initiation data to the process over stdin.
        # This tells the process useful information about the configuration.
        init_msg = json.loads(sys.stdin.readline().strip())
        if not (init_msg.get("event") == "init" and init_msg.get("operation") == "upload"):
            write_msg({"error": {"code": 32, "message": "Wrong lfs init operation"}})
            sys.exit(1)

        # The transfer process should use the information it needs from the
        # initiation structure, and also perform any one-off setup tasks it
        # needs to do. It should then respond on stdout with a simple empty
        # confirmation structure, as follows:
        write_msg({})

        # After the initiation exchange, git-lfs will send any number of
        # transfer requests to the stdin of the transfer process, in a serial sequence.
        while True:
            msg = read_msg()
            if msg is None:
                # When all transfers have been processed, git-lfs will send
                # a terminate event to the stdin of the transfer process.
                # On receiving this message the transfer process should
                # clean up and terminate. No response is expected.
                sys.exit(0)

            oid = msg["oid"]
            filepath = msg["path"]
            completion_url = msg["action"]["href"]
            header = msg["action"]["header"]
            chunk_size = int(header.pop("chunk_size"))
            presigned_urls: List[str] = list(header.values())

            # Send a "started" progress event to allow other workers to start.
            # Otherwise they are delayed until the first "progress" event is reported,
            # i.e. after the first 5GB by default (!)
            write_msg(
                {
                    "event": "progress",
                    "oid": oid,
                    "bytesSoFar": 1,
                    "bytesSinceLast": 0,
                }
            )

            parts = []
            with open(filepath, "rb") as file:
                for i, presigned_url in enumerate(presigned_urls):
                    with SliceFileObj(
                        file,
                        seek_from=i * chunk_size,
                        read_limit=chunk_size,
                    ) as data:
                        r = get_session().put(presigned_url, data=data)
                        hf_raise_for_status(r)
                        parts.append(
                            {
                                "etag": r.headers.get("etag"),
                                "partNumber": i + 1,
                            }
                        )
                        # In order to support progress reporting while data is uploading/downloading,
                        # the transfer process should post messages to stdout
                        write_msg(
                            {
                                "event": "progress",
                                "oid": oid,
                                "bytesSoFar": (i + 1) * chunk_size,
                                "bytesSinceLast": chunk_size,
                            }
                        )
                        # Not precise, but that's ok.

            r = get_session().post(
                completion_url,
                json={
                    "oid": oid,
                    "parts": parts,
                },
            )
            hf_raise_for_status(r)

            write_msg({"event": "complete", "oid": oid})
@@ -0,0 +1,128 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present, the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Contains command to update or delete files in a repository using the CLI.
|
||||
|
||||
Usage:
|
||||
# delete all
|
||||
huggingface-cli repo-files <repo_id> delete "*"
|
||||
|
||||
# delete single file
|
||||
huggingface-cli repo-files <repo_id> delete file.txt
|
||||
|
||||
# delete single folder
|
||||
huggingface-cli repo-files <repo_id> delete folder/
|
||||
|
||||
# delete multiple
|
||||
huggingface-cli repo-files <repo_id> delete file.txt folder/ file2.txt
|
||||
|
||||
    # delete multiple patterns
    huggingface-cli repo-files <repo_id> delete file.txt "*.json" "folder/*.parquet"

    # delete from different revision / repo-type
    huggingface-cli repo-files <repo_id> delete file.txt --revision=refs/pr/1 --repo-type=dataset
"""

from argparse import _SubParsersAction
from typing import List, Optional

from huggingface_hub import logging
from huggingface_hub.commands import BaseHuggingfaceCLICommand
from huggingface_hub.hf_api import HfApi


logger = logging.get_logger(__name__)


class DeleteFilesSubCommand:
    def __init__(self, args) -> None:
        self.args = args
        self.repo_id: str = args.repo_id
        self.repo_type: Optional[str] = args.repo_type
        self.revision: Optional[str] = args.revision
        self.api: HfApi = HfApi(token=args.token, library_name="huggingface-cli")
        self.patterns: List[str] = args.patterns
        self.commit_message: Optional[str] = args.commit_message
        self.commit_description: Optional[str] = args.commit_description
        self.create_pr: bool = args.create_pr
        self.token: Optional[str] = args.token

    def run(self) -> None:
        logging.set_verbosity_info()
        url = self.api.delete_files(
            delete_patterns=self.patterns,
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            revision=self.revision,
            commit_message=self.commit_message,
            commit_description=self.commit_description,
            create_pr=self.create_pr,
        )
        print(f"Files correctly deleted from repo. Commit: {url}.")
        logging.set_verbosity_warning()


class RepoFilesCommand(BaseHuggingfaceCLICommand):
    @staticmethod
    def register_subcommand(parser: _SubParsersAction):
        repo_files_parser = parser.add_parser("repo-files", help="Manage files in a repo on the Hub")
        repo_files_parser.add_argument(
            "repo_id", type=str, help="The ID of the repo to manage (e.g. `username/repo-name`)."
        )
        repo_files_subparsers = repo_files_parser.add_subparsers(
            help="Action to execute against the files.",
            required=True,
        )
        delete_subparser = repo_files_subparsers.add_parser(
            "delete",
            help="Delete files from a repo on the Hub",
        )
        delete_subparser.set_defaults(func=lambda args: DeleteFilesSubCommand(args))
        delete_subparser.add_argument(
            "patterns",
            nargs="+",
            type=str,
            help="Glob patterns to match files to delete.",
        )
        delete_subparser.add_argument(
            "--repo-type",
            choices=["model", "dataset", "space"],
            default="model",
            help="Type of the repo to delete from (e.g. `dataset`).",
        )
        delete_subparser.add_argument(
            "--revision",
            type=str,
            help=(
                "An optional Git revision to push to. It can be a branch name "
                "or a PR reference. If revision does not"
                " exist and `--create-pr` is not set, a branch will be automatically created."
            ),
        )
        delete_subparser.add_argument(
            "--commit-message", type=str, help="The summary / title / first line of the generated commit."
        )
        delete_subparser.add_argument(
            "--commit-description", type=str, help="The description of the generated commit."
        )
        delete_subparser.add_argument(
            "--create-pr", action="store_true", help="Whether to create a new Pull Request for these changes."
        )
        repo_files_parser.add_argument(
            "--token",
            type=str,
            help="A User Access Token generated from https://huggingface.co/settings/tokens",
        )

        repo_files_parser.set_defaults(func=RepoFilesCommand)
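

# --- Usage sketch (illustrative, not part of the library) ---
# The `repo-files delete` command above is a thin wrapper around
# `HfApi.delete_files`; the same operation can be done programmatically.
# `user/my-model` is a placeholder repo id.
#
#     from huggingface_hub import HfApi
#
#     HfApi().delete_files(
#         repo_id="user/my-model",
#         delete_patterns=["*.json", "folder/*.parquet"],
#     )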
@@ -0,0 +1,181 @@
# coding=utf-8
# Copyright 2022-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains command to scan the HF cache directory.

Usage:
    huggingface-cli scan-cache
    huggingface-cli scan-cache -v
    huggingface-cli scan-cache -vvv
    huggingface-cli scan-cache --dir ~/.cache/huggingface/hub
"""

import time
from argparse import Namespace, _SubParsersAction
from typing import Optional

from ..utils import CacheNotFound, HFCacheInfo, scan_cache_dir
from . import BaseHuggingfaceCLICommand
from ._cli_utils import ANSI, tabulate


class ScanCacheCommand(BaseHuggingfaceCLICommand):
    @staticmethod
    def register_subcommand(parser: _SubParsersAction):
        scan_cache_parser = parser.add_parser("scan-cache", help="Scan cache directory.")

        scan_cache_parser.add_argument(
            "--dir",
            type=str,
            default=None,
            help="Cache directory to scan (optional). Defaults to the default Hugging Face cache.",
        )
        scan_cache_parser.add_argument(
            "-v",
            "--verbose",
            action="count",
            default=0,
            help="Show a more verbose output.",
        )
        scan_cache_parser.set_defaults(func=ScanCacheCommand)

    def __init__(self, args: Namespace) -> None:
        self.verbosity: int = args.verbose
        self.cache_dir: Optional[str] = args.dir

    def run(self):
        try:
            t0 = time.time()
            hf_cache_info = scan_cache_dir(self.cache_dir)
            t1 = time.time()
        except CacheNotFound as exc:
            cache_dir = exc.cache_dir
            print(f"Cache directory not found: {cache_dir}")
            return

        self._print_hf_cache_info_as_table(hf_cache_info)

        print(
            f"\nDone in {round(t1 - t0, 1)}s. Scanned {len(hf_cache_info.repos)} repo(s)"
            f" for a total of {ANSI.red(hf_cache_info.size_on_disk_str)}."
        )
        if len(hf_cache_info.warnings) > 0:
            message = f"Got {len(hf_cache_info.warnings)} warning(s) while scanning."
            if self.verbosity >= 3:
                print(ANSI.gray(message))
                for warning in hf_cache_info.warnings:
                    print(ANSI.gray(warning))
            else:
                print(ANSI.gray(message + " Use -vvv to print details."))

    def _print_hf_cache_info_as_table(self, hf_cache_info: HFCacheInfo) -> None:
        print(get_table(hf_cache_info, verbosity=self.verbosity))


def get_table(hf_cache_info: HFCacheInfo, *, verbosity: int = 0) -> str:
    """Generate a table from the [`HFCacheInfo`] object.

    Pass `verbosity=0` to get a table with a single row per repo, with columns
    "repo_id", "repo_type", "size_on_disk", "nb_files", "last_accessed", "last_modified", "refs", "local_path".

    Pass `verbosity=1` to get a table with a row per repo and revision (thus multiple rows can appear for a single repo), with columns
    "repo_id", "repo_type", "revision", "size_on_disk", "nb_files", "last_modified", "refs", "local_path".

    Example:
    ```py
    >>> from huggingface_hub.utils import scan_cache_dir
    >>> from huggingface_hub.commands.scan_cache import get_table

    >>> hf_cache_info = scan_cache_dir()
    HFCacheInfo(...)

    >>> print(get_table(hf_cache_info, verbosity=0))
    REPO ID      REPO TYPE SIZE ON DISK NB FILES LAST_ACCESSED LAST_MODIFIED REFS LOCAL PATH
    ------------ --------- ------------ -------- ------------- ------------- ---- ------------------------------------------------------------
    roberta-base model             2.7M        5 1 day ago     1 week ago    main C:\\Users\\admin\\.cache\\huggingface\\hub\\models--roberta-base
    suno/bark    model             8.8K        1 1 week ago    1 week ago    main C:\\Users\\admin\\.cache\\huggingface\\hub\\models--suno--bark
    t5-base      model           893.8M        4 4 days ago    7 months ago  main C:\\Users\\admin\\.cache\\huggingface\\hub\\models--t5-base
    t5-large     model             3.0G        4 5 weeks ago   5 months ago  main C:\\Users\\admin\\.cache\\huggingface\\hub\\models--t5-large

    >>> print(get_table(hf_cache_info, verbosity=1))
    REPO ID      REPO TYPE REVISION                                 SIZE ON DISK NB FILES LAST_MODIFIED REFS LOCAL PATH
    ------------ --------- ---------------------------------------- ------------ -------- ------------- ---- --------------------------------------------------------------------------------------------------------
    roberta-base model     e2da8e2f811d1448a5b465c236feacd80ffbac7b         2.7M        5 1 week ago    main C:\\Users\\admin\\.cache\\huggingface\\hub\\models--roberta-base\\snapshots\\e2da8e2f811d1448a5b465c236feacd80ffbac7b
    suno/bark    model     70a8a7d34168586dc5d028fa9666aceade177992         8.8K        1 1 week ago    main C:\\Users\\admin\\.cache\\huggingface\\hub\\models--suno--bark\\snapshots\\70a8a7d34168586dc5d028fa9666aceade177992
    t5-base      model     a9723ea7f1b39c1eae772870f3b547bf6ef7e6c1       893.8M        4 7 months ago  main C:\\Users\\admin\\.cache\\huggingface\\hub\\models--t5-base\\snapshots\\a9723ea7f1b39c1eae772870f3b547bf6ef7e6c1
    t5-large     model     150ebc2c4b72291e770f58e6057481c8d2ed331a         3.0G        4 5 months ago  main C:\\Users\\admin\\.cache\\huggingface\\hub\\models--t5-large\\snapshots\\150ebc2c4b72291e770f58e6057481c8d2ed331a
    ```

    Args:
        hf_cache_info ([`HFCacheInfo`]):
            The HFCacheInfo object to print.
        verbosity (`int`, *optional*):
            The verbosity level. Defaults to 0.

    Returns:
        `str`: The table as a string.
    """
    if verbosity == 0:
        return tabulate(
            rows=[
                [
                    repo.repo_id,
                    repo.repo_type,
                    "{:>12}".format(repo.size_on_disk_str),
                    repo.nb_files,
                    repo.last_accessed_str,
                    repo.last_modified_str,
                    ", ".join(sorted(repo.refs)),
                    str(repo.repo_path),
                ]
                for repo in sorted(hf_cache_info.repos, key=lambda repo: repo.repo_path)
            ],
            headers=[
                "REPO ID",
                "REPO TYPE",
                "SIZE ON DISK",
                "NB FILES",
                "LAST_ACCESSED",
                "LAST_MODIFIED",
                "REFS",
                "LOCAL PATH",
            ],
        )
    else:
        return tabulate(
            rows=[
                [
                    repo.repo_id,
                    repo.repo_type,
                    revision.commit_hash,
                    "{:>12}".format(revision.size_on_disk_str),
                    revision.nb_files,
                    revision.last_modified_str,
                    ", ".join(sorted(revision.refs)),
                    str(revision.snapshot_path),
                ]
                for repo in sorted(hf_cache_info.repos, key=lambda repo: repo.repo_path)
                for revision in sorted(repo.revisions, key=lambda revision: revision.commit_hash)
            ],
            headers=[
                "REPO ID",
                "REPO TYPE",
                "REVISION",
                "SIZE ON DISK",
                "NB FILES",
                "LAST_MODIFIED",
                "REFS",
                "LOCAL PATH",
            ],
        )
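

# --- Usage sketch (illustrative, not part of the library) ---
# `get_table` can be reused outside the CLI to render a cache report:
#
#     from huggingface_hub.utils import scan_cache_dir
#     from huggingface_hub.commands.scan_cache import get_table
#
#     report = scan_cache_dir()
#     print(get_table(report, verbosity=1))  # one row per cached revision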
@@ -0,0 +1,159 @@
# coding=utf-8
# Copyright 2024-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Contains commands to perform tag management with the CLI.

Usage Examples:
    - Create a tag:
        $ huggingface-cli tag user/my-model 1.0 --message "First release"
        $ huggingface-cli tag user/my-model 1.0 -m "First release" --revision develop
        $ huggingface-cli tag user/my-dataset 1.0 -m "First release" --repo-type dataset
        $ huggingface-cli tag user/my-space 1.0
    - List all tags:
        $ huggingface-cli tag -l user/my-model
        $ huggingface-cli tag --list user/my-dataset --repo-type dataset
    - Delete a tag:
        $ huggingface-cli tag -d user/my-model 1.0
        $ huggingface-cli tag --delete user/my-dataset 1.0 --repo-type dataset
        $ huggingface-cli tag -d user/my-space 1.0 -y
"""

from argparse import Namespace, _SubParsersAction

from requests.exceptions import HTTPError

from huggingface_hub.commands import BaseHuggingfaceCLICommand
from huggingface_hub.constants import (
    REPO_TYPES,
)
from huggingface_hub.hf_api import HfApi

from ..errors import HfHubHTTPError, RepositoryNotFoundError, RevisionNotFoundError
from ._cli_utils import ANSI


class TagCommands(BaseHuggingfaceCLICommand):
    @staticmethod
    def register_subcommand(parser: _SubParsersAction):
        tag_parser = parser.add_parser("tag", help="(create, list, delete) tags for a repo on the Hub")

        tag_parser.add_argument("repo_id", type=str, help="The ID of the repo to tag (e.g. `username/repo-name`).")
        tag_parser.add_argument("tag", nargs="?", type=str, help="The name of the tag for creation or deletion.")
        tag_parser.add_argument("-m", "--message", type=str, help="The description of the tag to create.")
        tag_parser.add_argument("--revision", type=str, help="The git revision to tag.")
        tag_parser.add_argument(
            "--token", type=str, help="A User Access Token generated from https://huggingface.co/settings/tokens."
        )
        tag_parser.add_argument(
            "--repo-type",
            choices=["model", "dataset", "space"],
            default="model",
            help="Set the type of repository (model, dataset, or space).",
        )
        tag_parser.add_argument("-y", "--yes", action="store_true", help="Answer Yes to prompts automatically.")

        tag_parser.add_argument("-l", "--list", action="store_true", help="List tags for a repository.")
        tag_parser.add_argument("-d", "--delete", action="store_true", help="Delete a tag for a repository.")

        tag_parser.set_defaults(func=lambda args: handle_commands(args))


def handle_commands(args: Namespace):
    if args.list:
        return TagListCommand(args)
    elif args.delete:
        return TagDeleteCommand(args)
    else:
        return TagCreateCommand(args)


class TagCommand:
    def __init__(self, args: Namespace):
        self.args = args
        self.api = HfApi(token=self.args.token)
        self.repo_id = self.args.repo_id
        self.repo_type = self.args.repo_type
        if self.repo_type not in REPO_TYPES:
            print("Invalid --repo-type value.")
            exit(1)


class TagCreateCommand(TagCommand):
    def run(self):
        print(f"You are about to create tag {ANSI.bold(self.args.tag)} on {self.repo_type} {ANSI.bold(self.repo_id)}")

        try:
            self.api.create_tag(
                repo_id=self.repo_id,
                tag=self.args.tag,
                tag_message=self.args.message,
                revision=self.args.revision,
                repo_type=self.repo_type,
            )
        except RepositoryNotFoundError:
            print(f"{self.repo_type.capitalize()} {ANSI.bold(self.repo_id)} not found.")
            exit(1)
        except RevisionNotFoundError:
            print(f"Revision {ANSI.bold(self.args.revision)} not found.")
            exit(1)
        except HfHubHTTPError as e:
            if e.response.status_code == 409:
                print(f"Tag {ANSI.bold(self.args.tag)} already exists on {ANSI.bold(self.repo_id)}")
                exit(1)
            raise e

        print(f"Tag {ANSI.bold(self.args.tag)} created on {ANSI.bold(self.repo_id)}")


class TagListCommand(TagCommand):
    def run(self):
        try:
            refs = self.api.list_repo_refs(
                repo_id=self.repo_id,
                repo_type=self.repo_type,
            )
        except RepositoryNotFoundError:
            print(f"{self.repo_type.capitalize()} {ANSI.bold(self.repo_id)} not found.")
            exit(1)
        except HTTPError as e:
            print(e)
            print(ANSI.red(e.response.text))
            exit(1)
        if len(refs.tags) == 0:
            print("No tags found")
            exit(0)
        print(f"Tags for {self.repo_type} {ANSI.bold(self.repo_id)}:")
        for tag in refs.tags:
            print(tag.name)


class TagDeleteCommand(TagCommand):
    def run(self):
        print(f"You are about to delete tag {ANSI.bold(self.args.tag)} on {self.repo_type} {ANSI.bold(self.repo_id)}")

        if not self.args.yes:
            choice = input("Proceed? [Y/n] ").lower()
            if choice not in ("", "y", "yes"):
                print("Abort")
                exit()
        try:
            self.api.delete_tag(repo_id=self.repo_id, tag=self.args.tag, repo_type=self.repo_type)
        except RepositoryNotFoundError:
            print(f"{self.repo_type.capitalize()} {ANSI.bold(self.repo_id)} not found.")
            exit(1)
        except RevisionNotFoundError:
            print(f"Tag {ANSI.bold(self.args.tag)} not found on {ANSI.bold(self.repo_id)}")
            exit(1)
        print(f"Tag {ANSI.bold(self.args.tag)} deleted on {ANSI.bold(self.repo_id)}")
@@ -0,0 +1,313 @@
# coding=utf-8
# Copyright 2023-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains command to upload a repo or file with the CLI.

Usage:
    # Upload file (implicit)
    huggingface-cli upload my-cool-model ./my-cool-model.safetensors

    # Upload file (explicit)
    huggingface-cli upload my-cool-model ./my-cool-model.safetensors model.safetensors

    # Upload directory (implicit). If `my-cool-model/` is a directory it will be uploaded, otherwise an exception is raised.
    huggingface-cli upload my-cool-model

    # Upload directory (explicit)
    huggingface-cli upload my-cool-model ./models/my-cool-model .

    # Upload filtered directory (example: tensorboard logs except for the last run)
    huggingface-cli upload my-cool-model ./model/training/logs --include "*.tfevents.*" --exclude "*20230905*"

    # Upload with wildcard
    huggingface-cli upload my-cool-model "./model/training/*.safetensors"

    # Upload private dataset
    huggingface-cli upload Wauplin/my-cool-dataset ./data . --repo-type=dataset --private

    # Upload with token
    huggingface-cli upload Wauplin/my-cool-model --token=hf_****

    # Sync local Space with Hub (upload new files, delete removed files)
    huggingface-cli upload Wauplin/space-example --repo-type=space --exclude="/logs/*" --delete="*" --commit-message="Sync local Space with Hub"

    # Schedule commits every 30 minutes
    huggingface-cli upload Wauplin/my-cool-model --every=30
"""

import os
import time
import warnings
from argparse import Namespace, _SubParsersAction
from typing import List, Optional

from huggingface_hub import logging
from huggingface_hub._commit_scheduler import CommitScheduler
from huggingface_hub.commands import BaseHuggingfaceCLICommand
from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
from huggingface_hub.errors import RevisionNotFoundError
from huggingface_hub.hf_api import HfApi
from huggingface_hub.utils import disable_progress_bars, enable_progress_bars


logger = logging.get_logger(__name__)


class UploadCommand(BaseHuggingfaceCLICommand):
    @staticmethod
    def register_subcommand(parser: _SubParsersAction):
        upload_parser = parser.add_parser("upload", help="Upload a file or a folder to a repo on the Hub")
        upload_parser.add_argument(
            "repo_id", type=str, help="The ID of the repo to upload to (e.g. `username/repo-name`)."
        )
        upload_parser.add_argument(
            "local_path",
            nargs="?",
            help="Local path to the file or folder to upload. Wildcard patterns are supported. Defaults to current directory.",
        )
        upload_parser.add_argument(
            "path_in_repo",
            nargs="?",
            help="Path of the file or folder in the repo. Defaults to the relative path of the file or folder.",
        )
        upload_parser.add_argument(
            "--repo-type",
            choices=["model", "dataset", "space"],
            default="model",
            help="Type of the repo to upload to (e.g. `dataset`).",
        )
        upload_parser.add_argument(
            "--revision",
            type=str,
            help=(
                "An optional Git revision to push to. It can be a branch name or a PR reference. If revision does not"
                " exist and `--create-pr` is not set, a branch will be automatically created."
            ),
        )
        upload_parser.add_argument(
            "--private",
            action="store_true",
            help=(
                "Whether to create a private repo if repo doesn't exist on the Hub. Ignored if the repo already"
                " exists."
            ),
        )
        upload_parser.add_argument("--include", nargs="*", type=str, help="Glob patterns to match files to upload.")
        upload_parser.add_argument(
            "--exclude", nargs="*", type=str, help="Glob patterns to exclude from files to upload."
        )
        upload_parser.add_argument(
            "--delete",
            nargs="*",
            type=str,
            help="Glob patterns for file to be deleted from the repo while committing.",
        )
        upload_parser.add_argument(
            "--commit-message", type=str, help="The summary / title / first line of the generated commit."
        )
        upload_parser.add_argument("--commit-description", type=str, help="The description of the generated commit.")
        upload_parser.add_argument(
            "--create-pr", action="store_true", help="Whether to upload content as a new Pull Request."
        )
        upload_parser.add_argument(
            "--every",
            type=float,
            help="If set, a background job is scheduled to create commits every `every` minutes.",
        )
        upload_parser.add_argument(
            "--token", type=str, help="A User Access Token generated from https://huggingface.co/settings/tokens"
        )
        upload_parser.add_argument(
            "--quiet",
            action="store_true",
            help="If True, progress bars are disabled and only the path to the uploaded files is printed.",
        )
        upload_parser.set_defaults(func=UploadCommand)

    def __init__(self, args: Namespace) -> None:
        self.repo_id: str = args.repo_id
        self.repo_type: Optional[str] = args.repo_type
        self.revision: Optional[str] = args.revision
        self.private: bool = args.private

        self.include: Optional[List[str]] = args.include
        self.exclude: Optional[List[str]] = args.exclude
        self.delete: Optional[List[str]] = args.delete

        self.commit_message: Optional[str] = args.commit_message
        self.commit_description: Optional[str] = args.commit_description
        self.create_pr: bool = args.create_pr
        self.api: HfApi = HfApi(token=args.token, library_name="huggingface-cli")
        self.quiet: bool = args.quiet  # disable warnings and progress bars

        # Check `--every` is valid
        if args.every is not None and args.every <= 0:
            raise ValueError(f"`every` must be a positive value (got '{args.every}')")
        self.every: Optional[float] = args.every

        # Resolve `local_path` and `path_in_repo`
        repo_name: str = args.repo_id.split("/")[-1]  # e.g. "Wauplin/my-cool-model" => "my-cool-model"
        self.local_path: str
        self.path_in_repo: str

        if args.local_path is not None and any(c in args.local_path for c in ["*", "?", "["]):
            if args.include is not None:
                raise ValueError("Cannot set `--include` when passing a `local_path` containing a wildcard.")
            if args.path_in_repo is not None and args.path_in_repo != ".":
                raise ValueError("Cannot set `path_in_repo` when passing a `local_path` containing a wildcard.")
            self.local_path = "."
            self.include = args.local_path
            self.path_in_repo = "."
        elif args.local_path is None and os.path.isfile(repo_name):
            # Implicit case 1: user provided only a repo_id which happens to be a local file as well => upload it with same name
            self.local_path = repo_name
            self.path_in_repo = repo_name
        elif args.local_path is None and os.path.isdir(repo_name):
            # Implicit case 2: user provided only a repo_id which happens to be a local folder as well => upload it at root
            self.local_path = repo_name
            self.path_in_repo = "."
        elif args.local_path is None:
            # Implicit case 3: user provided only a repo_id that does not match a local file or folder
            # => the user must explicitly provide a local_path => raise exception
            raise ValueError(f"'{repo_name}' is not a local file or folder. Please set `local_path` explicitly.")
        elif args.path_in_repo is None and os.path.isfile(args.local_path):
            # Explicit local path to file, no path in repo => upload it at root with same name
            self.local_path = args.local_path
            self.path_in_repo = os.path.basename(args.local_path)
        elif args.path_in_repo is None:
            # Explicit local path to folder, no path in repo => upload at root
            self.local_path = args.local_path
            self.path_in_repo = "."
        else:
            # Finally, if both paths are explicit
            self.local_path = args.local_path
            self.path_in_repo = args.path_in_repo

    def run(self) -> None:
        if self.quiet:
            disable_progress_bars()
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                print(self._upload())
            enable_progress_bars()
        else:
            logging.set_verbosity_info()
            print(self._upload())
            logging.set_verbosity_warning()

    def _upload(self) -> str:
        if os.path.isfile(self.local_path):
            if self.include is not None and len(self.include) > 0:
                warnings.warn("Ignoring `--include` since a single file is uploaded.")
            if self.exclude is not None and len(self.exclude) > 0:
                warnings.warn("Ignoring `--exclude` since a single file is uploaded.")
            if self.delete is not None and len(self.delete) > 0:
                warnings.warn("Ignoring `--delete` since a single file is uploaded.")

        if not HF_HUB_ENABLE_HF_TRANSFER:
            logger.info(
                "Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See"
                " https://huggingface.co/docs/huggingface_hub/hf_transfer for more details."
            )

        # Schedule commits if `every` is set
        if self.every is not None:
            if os.path.isfile(self.local_path):
                # If file => watch entire folder + use allow_patterns
                folder_path = os.path.dirname(self.local_path)
                path_in_repo = (
                    self.path_in_repo[: -len(self.local_path)]  # remove filename from path_in_repo
                    if self.path_in_repo.endswith(self.local_path)
                    else self.path_in_repo
                )
                allow_patterns = [self.local_path]
                ignore_patterns = []
            else:
                folder_path = self.local_path
                path_in_repo = self.path_in_repo
                allow_patterns = self.include or []
                ignore_patterns = self.exclude or []
                if self.delete is not None and len(self.delete) > 0:
                    warnings.warn("Ignoring `--delete` when uploading with scheduled commits.")

            scheduler = CommitScheduler(
                folder_path=folder_path,
                repo_id=self.repo_id,
                repo_type=self.repo_type,
                revision=self.revision,
                allow_patterns=allow_patterns,
                ignore_patterns=ignore_patterns,
                path_in_repo=path_in_repo,
                private=self.private,
                every=self.every,
                hf_api=self.api,
            )
            print(f"Scheduling commits every {self.every} minutes to {scheduler.repo_id}.")
            try:  # Block main thread until KeyboardInterrupt
                while True:
                    time.sleep(100)
            except KeyboardInterrupt:
                scheduler.stop()
                return "Stopped scheduled commits."

        # Otherwise, create repo and proceed with the upload
        if not os.path.isfile(self.local_path) and not os.path.isdir(self.local_path):
            raise FileNotFoundError(f"No such file or directory: '{self.local_path}'.")
        repo_id = self.api.create_repo(
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            exist_ok=True,
            private=self.private,
            space_sdk="gradio" if self.repo_type == "space" else None,
            # ^ We don't want it to fail when uploading to a Space => let's set Gradio by default.
            # ^ I'd rather not add CLI args to set it explicitly as we already have `huggingface-cli repo create` for that.
        ).repo_id

        # Check if branch already exists and if not, create it
        if self.revision is not None and not self.create_pr:
            try:
                self.api.repo_info(repo_id=repo_id, repo_type=self.repo_type, revision=self.revision)
            except RevisionNotFoundError:
                logger.info(f"Branch '{self.revision}' not found. Creating it...")
                self.api.create_branch(repo_id=repo_id, repo_type=self.repo_type, branch=self.revision, exist_ok=True)
                # ^ `exist_ok=True` to avoid race concurrency issues

        # File-based upload
        if os.path.isfile(self.local_path):
            return self.api.upload_file(
                path_or_fileobj=self.local_path,
                path_in_repo=self.path_in_repo,
                repo_id=repo_id,
                repo_type=self.repo_type,
                revision=self.revision,
                commit_message=self.commit_message,
                commit_description=self.commit_description,
                create_pr=self.create_pr,
            )

        # Folder-based upload
        else:
            return self.api.upload_folder(
                folder_path=self.local_path,
                path_in_repo=self.path_in_repo,
                repo_id=repo_id,
                repo_type=self.repo_type,
                revision=self.revision,
                commit_message=self.commit_message,
                commit_description=self.commit_description,
                create_pr=self.create_pr,
                allow_patterns=self.include,
                ignore_patterns=self.exclude,
                delete_patterns=self.delete,
            )
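

# --- Path-resolution sketch (illustrative, not part of the library) ---
# How the implicit rules in `UploadCommand.__init__` map invocations to
# (local_path, path_in_repo), assuming the given local paths exist:
#
#     huggingface-cli upload my-model                   # "my-model" is a local folder
#         -> local_path="my-model", path_in_repo="."
#     huggingface-cli upload my-model ./weights.bin     # explicit file, implicit destination
#         -> local_path="./weights.bin", path_in_repo="weights.bin"
#     huggingface-cli upload my-model "./ckpt/*.bin"    # wildcard
#         -> local_path=".", path_in_repo=".", include="./ckpt/*.bin"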
@@ -0,0 +1,129 @@
# coding=utf-8
# Copyright 2023-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains command to upload a large folder with the CLI."""

import os
from argparse import Namespace, _SubParsersAction
from typing import List, Optional

from huggingface_hub import logging
from huggingface_hub.commands import BaseHuggingfaceCLICommand
from huggingface_hub.hf_api import HfApi
from huggingface_hub.utils import disable_progress_bars

from ._cli_utils import ANSI


logger = logging.get_logger(__name__)


class UploadLargeFolderCommand(BaseHuggingfaceCLICommand):
    @staticmethod
    def register_subcommand(parser: _SubParsersAction):
        subparser = parser.add_parser("upload-large-folder", help="Upload a large folder to a repo on the Hub")
        subparser.add_argument(
            "repo_id", type=str, help="The ID of the repo to upload to (e.g. `username/repo-name`)."
        )
        subparser.add_argument("local_path", type=str, help="Local path to the file or folder to upload.")
        subparser.add_argument(
            "--repo-type",
            choices=["model", "dataset", "space"],
            help="Type of the repo to upload to (e.g. `dataset`).",
        )
        subparser.add_argument(
            "--revision",
            type=str,
            help=("An optional Git revision to push to. It can be a branch name or a PR reference."),
        )
        subparser.add_argument(
            "--private",
            action="store_true",
            help=(
                "Whether to create a private repo if repo doesn't exist on the Hub. Ignored if the repo already exists."
            ),
        )
        subparser.add_argument("--include", nargs="*", type=str, help="Glob patterns to match files to upload.")
        subparser.add_argument("--exclude", nargs="*", type=str, help="Glob patterns to exclude from files to upload.")
        subparser.add_argument(
            "--token", type=str, help="A User Access Token generated from https://huggingface.co/settings/tokens"
        )
        subparser.add_argument(
            "--num-workers", type=int, help="Number of workers to use to hash, upload and commit files."
        )
        subparser.add_argument("--no-report", action="store_true", help="Whether to disable regular status report.")
        subparser.add_argument("--no-bars", action="store_true", help="Whether to disable progress bars.")
        subparser.set_defaults(func=UploadLargeFolderCommand)

    def __init__(self, args: Namespace) -> None:
        self.repo_id: str = args.repo_id
        self.local_path: str = args.local_path
        self.repo_type: str = args.repo_type
        self.revision: Optional[str] = args.revision
        self.private: bool = args.private

        self.include: Optional[List[str]] = args.include
        self.exclude: Optional[List[str]] = args.exclude

        self.api: HfApi = HfApi(token=args.token, library_name="huggingface-cli")

        self.num_workers: Optional[int] = args.num_workers
        self.no_report: bool = args.no_report
        self.no_bars: bool = args.no_bars

        if not os.path.isdir(self.local_path):
            raise ValueError("Large upload is only supported for folders.")

    def run(self) -> None:
        logging.set_verbosity_info()

        print(
            ANSI.yellow(
                "You are about to upload a large folder to the Hub using `huggingface-cli upload-large-folder`. "
                "This is a new feature so feedback is very welcome!\n"
                "\n"
                "A few things to keep in mind:\n"
                " - Repository limits still apply: https://huggingface.co/docs/hub/repositories-recommendations\n"
                " - Do not start several processes in parallel.\n"
                " - You can interrupt and resume the process at any time. "
                "The script will pick up where it left off except for partially uploaded files that would have to be entirely reuploaded.\n"
                " - Do not upload the same folder to several repositories. If you need to do so, you must delete the `./.cache/huggingface/` folder first.\n"
                "\n"
                f"Some temporary metadata will be stored under `{self.local_path}/.cache/huggingface`.\n"
                " - You must not modify those files manually.\n"
                " - You must not delete the `./.cache/huggingface/` folder while a process is running.\n"
                " - You can delete the `./.cache/huggingface/` folder to reinitialize the upload state when process is not running. Files will have to be hashed and preuploaded again, except for already committed files.\n"
                "\n"
                "If the process output is too verbose, you can disable the progress bars with `--no-bars`. "
                "You can also entirely disable the status report with `--no-report`.\n"
                "\n"
                "For more details, run `huggingface-cli upload-large-folder --help` or check the documentation at "
                "https://huggingface.co/docs/huggingface_hub/guides/upload#upload-a-large-folder."
            )
        )

        if self.no_bars:
            disable_progress_bars()

        self.api.upload_large_folder(
            repo_id=self.repo_id,
            folder_path=self.local_path,
            repo_type=self.repo_type,
            revision=self.revision,
            private=self.private,
            allow_patterns=self.include,
            ignore_patterns=self.exclude,
            num_workers=self.num_workers,
            print_report=not self.no_report,
        )
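

# --- Usage sketch (illustrative, not part of the library) ---
# Programmatic equivalent of the command above; note that `repo_type` is a
# required argument of `HfApi.upload_large_folder` ("user/big-dataset" and
# "./data" are placeholder values):
#
#     from huggingface_hub import HfApi
#
#     HfApi().upload_large_folder(
#         repo_id="user/big-dataset",
#         folder_path="./data",
#         repo_type="dataset",
#     )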
@@ -0,0 +1,304 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains commands to authenticate to the Hugging Face Hub and interact with your repositories.

Usage:
    # login and save token locally.
    huggingface-cli login --token=hf_*** --add-to-git-credential

    # switch between tokens
    huggingface-cli auth switch

    # list all tokens
    huggingface-cli auth list

    # logout from a specific token; if no token-name is provided, all tokens will be deleted from your machine.
    huggingface-cli logout --token-name=your_token_name

    # find out which huggingface.co account you are logged in as
    huggingface-cli whoami

    # create a new dataset repo on the Hub
    huggingface-cli repo create mydataset --type=dataset

"""

import subprocess
from argparse import _SubParsersAction
from typing import List, Optional

from requests.exceptions import HTTPError

from huggingface_hub.commands import BaseHuggingfaceCLICommand
from huggingface_hub.constants import ENDPOINT, REPO_TYPES, REPO_TYPES_URL_PREFIXES, SPACES_SDK_TYPES
from huggingface_hub.hf_api import HfApi

from .._login import (  # noqa: F401 # for backward compatibility
    NOTEBOOK_LOGIN_PASSWORD_HTML,
    NOTEBOOK_LOGIN_TOKEN_HTML_END,
    NOTEBOOK_LOGIN_TOKEN_HTML_START,
    auth_list,
    auth_switch,
    login,
    logout,
    notebook_login,
)
from ..utils import get_stored_tokens, get_token, logging
from ._cli_utils import ANSI


logger = logging.get_logger(__name__)

try:
    from InquirerPy import inquirer
    from InquirerPy.base.control import Choice

    _inquirer_py_available = True
except ImportError:
    _inquirer_py_available = False


class UserCommands(BaseHuggingfaceCLICommand):
    @staticmethod
    def register_subcommand(parser: _SubParsersAction):
        login_parser = parser.add_parser("login", help="Log in using a token from huggingface.co/settings/tokens")
        login_parser.add_argument(
            "--token",
            type=str,
            help="Token generated from https://huggingface.co/settings/tokens",
        )
        login_parser.add_argument(
            "--add-to-git-credential",
            action="store_true",
            help="Optional: Save token to git credential helper.",
        )
        login_parser.set_defaults(func=lambda args: LoginCommand(args))
        whoami_parser = parser.add_parser("whoami", help="Find out which huggingface.co account you are logged in as.")
        whoami_parser.set_defaults(func=lambda args: WhoamiCommand(args))

        logout_parser = parser.add_parser("logout", help="Log out")
        logout_parser.add_argument(
            "--token-name",
            type=str,
            help="Optional: Name of the access token to log out from.",
        )
        logout_parser.set_defaults(func=lambda args: LogoutCommand(args))

        auth_parser = parser.add_parser("auth", help="Other authentication related commands")
        auth_subparsers = auth_parser.add_subparsers(help="Authentication subcommands")
        auth_switch_parser = auth_subparsers.add_parser("switch", help="Switch between access tokens")
        auth_switch_parser.add_argument(
            "--token-name",
            type=str,
            help="Optional: Name of the access token to switch to.",
        )
        auth_switch_parser.add_argument(
            "--add-to-git-credential",
            action="store_true",
            help="Optional: Save token to git credential helper.",
        )
        auth_switch_parser.set_defaults(func=lambda args: AuthSwitchCommand(args))
        auth_list_parser = auth_subparsers.add_parser("list", help="List all stored access tokens")
        auth_list_parser.set_defaults(func=lambda args: AuthListCommand(args))
        # new system: git-based repo system
        repo_parser = parser.add_parser("repo", help="{create} Commands to interact with your huggingface.co repos.")
        repo_subparsers = repo_parser.add_subparsers(help="huggingface.co repos related commands")
        repo_create_parser = repo_subparsers.add_parser("create", help="Create a new repo on huggingface.co")
        repo_create_parser.add_argument(
            "name",
            type=str,
            help="Name for your repo. Will be namespaced under your username to build the repo id.",
        )
        repo_create_parser.add_argument(
            "--type",
            type=str,
            help='Optional: repo_type: set to "dataset" or "space" if creating a dataset or space, default is model.',
        )
        repo_create_parser.add_argument("--organization", type=str, help="Optional: organization namespace.")
        repo_create_parser.add_argument(
            "--space_sdk",
            type=str,
            help='Optional: Hugging Face Spaces SDK type. Required when --type is set to "space".',
            choices=SPACES_SDK_TYPES,
        )
        repo_create_parser.add_argument(
            "-y",
            "--yes",
            action="store_true",
            help="Optional: answer Yes to the prompt",
        )
        repo_create_parser.set_defaults(func=lambda args: RepoCreateCommand(args))


class BaseUserCommand:
    def __init__(self, args):
        self.args = args
        self._api = HfApi()


class LoginCommand(BaseUserCommand):
    def run(self):
        logging.set_verbosity_info()
        login(
            token=self.args.token,
            add_to_git_credential=self.args.add_to_git_credential,
        )


class LogoutCommand(BaseUserCommand):
    def run(self):
        logging.set_verbosity_info()
        logout(token_name=self.args.token_name)


class AuthSwitchCommand(BaseUserCommand):
    def run(self):
        logging.set_verbosity_info()
        token_name = self.args.token_name
        if token_name is None:
            token_name = self._select_token_name()

        if token_name is None:
            print("No token name provided. Aborting.")
            exit()
        auth_switch(token_name, add_to_git_credential=self.args.add_to_git_credential)

    def _select_token_name(self) -> Optional[str]:
        token_names = list(get_stored_tokens().keys())

        if not token_names:
            logger.error("No stored tokens found. Please login first.")
            return None

        if _inquirer_py_available:
            return self._select_token_name_tui(token_names)
        # if inquirer is not available, use a simpler terminal UI
        print("Available stored tokens:")
        for i, token_name in enumerate(token_names, 1):
            print(f"{i}. {token_name}")
        while True:
            try:
                choice = input("Enter the number of the token to switch to (or 'q' to quit): ")
                if choice.lower() == "q":
                    return None
                index = int(choice) - 1
                if 0 <= index < len(token_names):
                    return token_names[index]
                else:
                    print("Invalid selection. Please try again.")
            except ValueError:
                print("Invalid input. Please enter a number or 'q' to quit.")

    def _select_token_name_tui(self, token_names: List[str]) -> Optional[str]:
        choices = [Choice(token_name, name=token_name) for token_name in token_names]
        try:
            return inquirer.select(
                message="Select a token to switch to:",
                choices=choices,
                default=None,
            ).execute()
        except KeyboardInterrupt:
            logger.info("Token selection cancelled.")
            return None


class AuthListCommand(BaseUserCommand):
    def run(self):
        logging.set_verbosity_info()
        auth_list()


class WhoamiCommand(BaseUserCommand):
    def run(self):
        token = get_token()
        if token is None:
            print("Not logged in")
            exit()
        try:
            info = self._api.whoami(token)
            print(info["name"])
            orgs = [org["name"] for org in info["orgs"]]
            if orgs:
                print(ANSI.bold("orgs: "), ",".join(orgs))

            if ENDPOINT != "https://huggingface.co":
                print(f"Authenticated through private endpoint: {ENDPOINT}")
        except HTTPError as e:
            print(e)
            print(ANSI.red(e.response.text))
            exit(1)


class RepoCreateCommand(BaseUserCommand):
    def run(self):
        token = get_token()
        if token is None:
            print("Not logged in")
            exit(1)
        try:
            stdout = subprocess.check_output(["git", "--version"]).decode("utf-8")
            print(ANSI.gray(stdout.strip()))
        except FileNotFoundError:
            print("Looks like you do not have git installed, please install.")

        try:
            stdout = subprocess.check_output(["git-lfs", "--version"]).decode("utf-8")
            print(ANSI.gray(stdout.strip()))
        except FileNotFoundError:
            print(
                ANSI.red(
                    "Looks like you do not have git-lfs installed, please install."
                    " You can install from https://git-lfs.github.com/."
                    " Then run `git lfs install` (you only have to do this once)."
                )
            )
        print("")

        user = self._api.whoami(token)["name"]
        namespace = self.args.organization if self.args.organization is not None else user

        repo_id = f"{namespace}/{self.args.name}"

        if self.args.type not in REPO_TYPES:
            print("Invalid --type value.")
            exit(1)

        if self.args.type in REPO_TYPES_URL_PREFIXES:
            prefixed_repo_id = REPO_TYPES_URL_PREFIXES[self.args.type] + repo_id
        else:
            prefixed_repo_id = repo_id

        print(f"You are about to create {ANSI.bold(prefixed_repo_id)}")

        if not self.args.yes:
            choice = input("Proceed? [Y/n] ").lower()
            if choice not in ("", "y", "yes"):
                print("Abort")
                exit()
        try:
            url = self._api.create_repo(
                repo_id=repo_id,
                token=token,
                repo_type=self.args.type,
                space_sdk=self.args.space_sdk,
            )
        except HTTPError as e:
            print(e)
            print(ANSI.red(e.response.text))
            exit(1)
        print("\nYour repo now lives at:")
        print(f"  {ANSI.bold(url)}")
        print("\nYou can clone it locally with the command below, and commit/push as usual.")
        print(f"\n  git clone {url}")
        print("")
@@ -0,0 +1,37 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains command to print information about the version.

Usage:
    huggingface-cli version
"""

from argparse import _SubParsersAction

from huggingface_hub import __version__

from . import BaseHuggingfaceCLICommand


class VersionCommand(BaseHuggingfaceCLICommand):
    def __init__(self, args):
        self.args = args

    @staticmethod
    def register_subcommand(parser: _SubParsersAction):
        version_parser = parser.add_parser("version", help="Print information about the huggingface-cli version.")
        version_parser.set_defaults(func=VersionCommand)

    def run(self) -> None:
        print(f"huggingface_hub version: {__version__}")
355
.venv/lib/python3.10/site-packages/huggingface_hub/community.py
Normal file
@@ -0,0 +1,355 @@
"""
Data structures to interact with Discussions and Pull Requests on the Hub.

See [the Discussions and Pull Requests guide](https://huggingface.co/docs/hub/repositories-pull-requests-discussions)
for more information on Pull Requests, Discussions, and the community tab.
"""

from dataclasses import dataclass
from datetime import datetime
from typing import List, Literal, Optional, Union

from . import constants
from .utils import parse_datetime


DiscussionStatus = Literal["open", "closed", "merged", "draft"]


@dataclass
class Discussion:
    """
    A Discussion or Pull Request on the Hub.

    This dataclass is not intended to be instantiated directly.

    Attributes:
        title (`str`):
            The title of the Discussion / Pull Request.
        status (`str`):
            The status of the Discussion / Pull Request.
            It must be one of:
                * `"open"`
                * `"closed"`
                * `"merged"` (only for Pull Requests)
                * `"draft"` (only for Pull Requests)
        num (`int`):
            The number of the Discussion / Pull Request.
        repo_id (`str`):
            The id (`"{namespace}/{repo_name}"`) of the repo on which
            the Discussion / Pull Request was opened.
        repo_type (`str`):
            The type of the repo on which the Discussion / Pull Request was opened.
            Possible values are: `"model"`, `"dataset"`, `"space"`.
        author (`str`):
            The username of the Discussion / Pull Request author.
            Can be `"deleted"` if the user has been deleted since.
        is_pull_request (`bool`):
            Whether or not this is a Pull Request.
        created_at (`datetime`):
            The `datetime` of creation of the Discussion / Pull Request.
        endpoint (`str`):
            Endpoint of the Hub. Default is https://huggingface.co.
        git_reference (`str`, *optional*):
            (property) Git reference to which changes can be pushed if this is a Pull Request, `None` otherwise.
        url (`str`):
            (property) URL of the discussion on the Hub.
    """

    title: str
    status: DiscussionStatus
    num: int
    repo_id: str
    repo_type: str
    author: str
    is_pull_request: bool
    created_at: datetime
    endpoint: str

    @property
    def git_reference(self) -> Optional[str]:
        """
        If this is a Pull Request, returns the git reference to which changes can be pushed.
        Returns `None` otherwise.
        """
        if self.is_pull_request:
            return f"refs/pr/{self.num}"
        return None

    @property
    def url(self) -> str:
        """Returns the URL of the discussion on the Hub."""
        if self.repo_type is None or self.repo_type == constants.REPO_TYPE_MODEL:
            return f"{self.endpoint}/{self.repo_id}/discussions/{self.num}"
        return f"{self.endpoint}/{self.repo_type}s/{self.repo_id}/discussions/{self.num}"


@dataclass
class DiscussionWithDetails(Discussion):
    """
    Subclass of [`Discussion`].

    Attributes:
        title (`str`):
            The title of the Discussion / Pull Request.
        status (`str`):
            The status of the Discussion / Pull Request.
            It can be one of:
                * `"open"`
                * `"closed"`
                * `"merged"` (only for Pull Requests)
                * `"draft"` (only for Pull Requests)
        num (`int`):
            The number of the Discussion / Pull Request.
        repo_id (`str`):
            The id (`"{namespace}/{repo_name}"`) of the repo on which
            the Discussion / Pull Request was opened.
        repo_type (`str`):
            The type of the repo on which the Discussion / Pull Request was opened.
            Possible values are: `"model"`, `"dataset"`, `"space"`.
        author (`str`):
            The username of the Discussion / Pull Request author.
            Can be `"deleted"` if the user has been deleted since.
        is_pull_request (`bool`):
            Whether or not this is a Pull Request.
        created_at (`datetime`):
            The `datetime` of creation of the Discussion / Pull Request.
        events (`list` of [`DiscussionEvent`]):
            The list of [`DiscussionEvent`]s in this Discussion or Pull Request.
        conflicting_files (`Union[List[str], bool, None]`, *optional*):
            A list of conflicting files if this is a Pull Request.
            `None` if `self.is_pull_request` is `False`.
            `True` if there are conflicting files but the list can't be retrieved.
        target_branch (`str`, *optional*):
            The branch into which changes are to be merged if this is a
            Pull Request. `None` if `self.is_pull_request` is `False`.
        merge_commit_oid (`str`, *optional*):
            If this is a merged Pull Request, this is set to the OID / SHA of
            the merge commit, `None` otherwise.
        diff (`str`, *optional*):
            The git diff if this is a Pull Request, `None` otherwise.
        endpoint (`str`):
            Endpoint of the Hub. Default is https://huggingface.co.
        git_reference (`str`, *optional*):
            (property) Git reference to which changes can be pushed if this is a Pull Request, `None` otherwise.
        url (`str`):
            (property) URL of the discussion on the Hub.
    """

    events: List["DiscussionEvent"]
    conflicting_files: Union[List[str], bool, None]
    target_branch: Optional[str]
    merge_commit_oid: Optional[str]
    diff: Optional[str]


@dataclass
class DiscussionEvent:
    """
    An event in a Discussion or Pull Request.

    Use concrete classes:
        * [`DiscussionComment`]
        * [`DiscussionStatusChange`]
        * [`DiscussionCommit`]
        * [`DiscussionTitleChange`]

    Attributes:
        id (`str`):
            The ID of the event. A hexadecimal string.
        type (`str`):
            The type of the event.
        created_at (`datetime`):
            A [`datetime`](https://docs.python.org/3/library/datetime.html?highlight=datetime#datetime.datetime)
            object holding the creation timestamp for the event.
        author (`str`):
            The username of the Discussion / Pull Request author.
            Can be `"deleted"` if the user has been deleted since.
    """

    id: str
    type: str
    created_at: datetime
    author: str

    _event: dict
    """Stores the original event data, in case we need to access it later."""


@dataclass
class DiscussionComment(DiscussionEvent):
    """A comment in a Discussion / Pull Request.

    Subclass of [`DiscussionEvent`].

    Attributes:
        id (`str`):
            The ID of the event. A hexadecimal string.
        type (`str`):
            The type of the event.
        created_at (`datetime`):
            A [`datetime`](https://docs.python.org/3/library/datetime.html?highlight=datetime#datetime.datetime)
            object holding the creation timestamp for the event.
        author (`str`):
            The username of the Discussion / Pull Request author.
            Can be `"deleted"` if the user has been deleted since.
        content (`str`):
            The raw markdown content of the comment. Mentions, links and images are not rendered.
        edited (`bool`):
            Whether or not this comment has been edited.
        hidden (`bool`):
            Whether or not this comment has been hidden.
    """

    content: str
    edited: bool
    hidden: bool

    @property
    def rendered(self) -> str:
        """The rendered comment, as an HTML string"""
        return self._event["data"]["latest"]["html"]

    @property
    def last_edited_at(self) -> datetime:
        """The last edit time, as a `datetime` object."""
        return parse_datetime(self._event["data"]["latest"]["updatedAt"])

    @property
    def last_edited_by(self) -> str:
        """The username of the last editor. Can be `"deleted"` if the user has been deleted since."""
        return self._event["data"]["latest"].get("author", {}).get("name", "deleted")

    @property
    def edit_history(self) -> List[dict]:
        """The edit history of the comment"""
        return self._event["data"]["history"]

    @property
    def number_of_edits(self) -> int:
        return len(self.edit_history)


@dataclass
class DiscussionStatusChange(DiscussionEvent):
    """A change of status in a Discussion / Pull Request.

    Subclass of [`DiscussionEvent`].

    Attributes:
        id (`str`):
            The ID of the event. A hexadecimal string.
        type (`str`):
            The type of the event.
        created_at (`datetime`):
            A [`datetime`](https://docs.python.org/3/library/datetime.html?highlight=datetime#datetime.datetime)
            object holding the creation timestamp for the event.
        author (`str`):
            The username of the Discussion / Pull Request author.
            Can be `"deleted"` if the user has been deleted since.
        new_status (`str`):
            The status of the Discussion / Pull Request after the change.
            It can be one of:
                * `"open"`
                * `"closed"`
                * `"merged"` (only for Pull Requests)
    """

    new_status: str


@dataclass
class DiscussionCommit(DiscussionEvent):
    """A commit in a Pull Request.

    Subclass of [`DiscussionEvent`].

    Attributes:
        id (`str`):
            The ID of the event. A hexadecimal string.
        type (`str`):
            The type of the event.
        created_at (`datetime`):
            A [`datetime`](https://docs.python.org/3/library/datetime.html?highlight=datetime#datetime.datetime)
            object holding the creation timestamp for the event.
        author (`str`):
            The username of the Discussion / Pull Request author.
            Can be `"deleted"` if the user has been deleted since.
        summary (`str`):
            The summary of the commit.
        oid (`str`):
            The OID / SHA of the commit, as a hexadecimal string.
    """

    summary: str
    oid: str


@dataclass
class DiscussionTitleChange(DiscussionEvent):
    """A rename event in a Discussion / Pull Request.

    Subclass of [`DiscussionEvent`].

    Attributes:
        id (`str`):
            The ID of the event. A hexadecimal string.
        type (`str`):
            The type of the event.
        created_at (`datetime`):
            A [`datetime`](https://docs.python.org/3/library/datetime.html?highlight=datetime#datetime.datetime)
            object holding the creation timestamp for the event.
        author (`str`):
            The username of the Discussion / Pull Request author.
            Can be `"deleted"` if the user has been deleted since.
        old_title (`str`):
            The previous title for the Discussion / Pull Request.
        new_title (`str`):
            The new title.
    """

    old_title: str
    new_title: str


def deserialize_event(event: dict) -> DiscussionEvent:
    """Instantiates a [`DiscussionEvent`] from a dict"""
    event_id: str = event["id"]
    event_type: str = event["type"]
    created_at = parse_datetime(event["createdAt"])

    common_args = dict(
        id=event_id,
        type=event_type,
        created_at=created_at,
        author=event.get("author", {}).get("name", "deleted"),
        _event=event,
    )

    if event_type == "comment":
        return DiscussionComment(
            **common_args,
            edited=event["data"]["edited"],
            hidden=event["data"]["hidden"],
            content=event["data"]["latest"]["raw"],
        )
    if event_type == "status-change":
        return DiscussionStatusChange(
            **common_args,
            new_status=event["data"]["status"],
        )
    if event_type == "commit":
        return DiscussionCommit(
            **common_args,
            summary=event["data"]["subject"],
            oid=event["data"]["oid"],
        )
    if event_type == "title-change":
        return DiscussionTitleChange(
            **common_args,
            old_title=event["data"]["from"],
            new_title=event["data"]["to"],
        )

    return DiscussionEvent(**common_args)
|
||||
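For illustration, a minimal sketch of how `deserialize_event` dispatches on the event `type` (assuming the usual module path `huggingface_hub.community`). The payload is hand-written to mirror the keys the function reads; it is not actual server output:

```py
from huggingface_hub.community import DiscussionComment, deserialize_event

# Illustrative payload: values are made up, keys mirror those accessed above.
event = deserialize_event(
    {
        "id": "63f6e3f7d4f2a1b2c3d4e5f6",
        "type": "comment",
        "createdAt": "2023-02-23T10:00:00.000Z",
        "author": {"name": "some-user"},
        "data": {
            "edited": False,
            "hidden": False,
            "latest": {
                "raw": "Hello!",
                "html": "<p>Hello!</p>",
                "updatedAt": "2023-02-23T10:00:00.000Z",
            },
            "history": [],
        },
    }
)
assert isinstance(event, DiscussionComment)
print(event.author, event.content)  # some-user Hello!
```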
273
.venv/lib/python3.10/site-packages/huggingface_hub/constants.py
Normal file
@@ -0,0 +1,273 @@
import os
import re
import typing
from typing import Literal, Optional, Tuple


# Possible values for env variables

ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})


def _is_true(value: Optional[str]) -> bool:
    if value is None:
        return False
    return value.upper() in ENV_VARS_TRUE_VALUES


def _as_int(value: Optional[str]) -> Optional[int]:
    if value is None:
        return None
    return int(value)


# Constants for file downloads

PYTORCH_WEIGHTS_NAME = "pytorch_model.bin"
TF2_WEIGHTS_NAME = "tf_model.h5"
TF_WEIGHTS_NAME = "model.ckpt"
FLAX_WEIGHTS_NAME = "flax_model.msgpack"
CONFIG_NAME = "config.json"
REPOCARD_NAME = "README.md"
DEFAULT_ETAG_TIMEOUT = 10
DEFAULT_DOWNLOAD_TIMEOUT = 10
DEFAULT_REQUEST_TIMEOUT = 10
DOWNLOAD_CHUNK_SIZE = 10 * 1024 * 1024
HF_TRANSFER_CONCURRENCY = 100

# Constants for serialization

PYTORCH_WEIGHTS_FILE_PATTERN = "pytorch_model{suffix}.bin"  # Unsafe pickle: use safetensors instead
SAFETENSORS_WEIGHTS_FILE_PATTERN = "model{suffix}.safetensors"
TF2_WEIGHTS_FILE_PATTERN = "tf_model{suffix}.h5"

# Constants for safetensors repos

SAFETENSORS_SINGLE_FILE = "model.safetensors"
SAFETENSORS_INDEX_FILE = "model.safetensors.index.json"
SAFETENSORS_MAX_HEADER_LENGTH = 25_000_000

# Timeout for acquiring the file lock, and interval at which to log the attempt
FILELOCK_LOG_EVERY_SECONDS = 10

# Git-related constants

DEFAULT_REVISION = "main"
REGEX_COMMIT_OID = re.compile(r"[A-Fa-f0-9]{5,40}")

HUGGINGFACE_CO_URL_HOME = "https://huggingface.co/"

_staging_mode = _is_true(os.environ.get("HUGGINGFACE_CO_STAGING"))

_HF_DEFAULT_ENDPOINT = "https://huggingface.co"
_HF_DEFAULT_STAGING_ENDPOINT = "https://hub-ci.huggingface.co"
ENDPOINT = os.getenv("HF_ENDPOINT", _HF_DEFAULT_ENDPOINT).rstrip("/")
HUGGINGFACE_CO_URL_TEMPLATE = ENDPOINT + "/{repo_id}/resolve/{revision}/{filename}"

if _staging_mode:
    ENDPOINT = _HF_DEFAULT_STAGING_ENDPOINT
    HUGGINGFACE_CO_URL_TEMPLATE = _HF_DEFAULT_STAGING_ENDPOINT + "/{repo_id}/resolve/{revision}/{filename}"

HUGGINGFACE_HEADER_X_REPO_COMMIT = "X-Repo-Commit"
HUGGINGFACE_HEADER_X_LINKED_ETAG = "X-Linked-Etag"
HUGGINGFACE_HEADER_X_LINKED_SIZE = "X-Linked-Size"
HUGGINGFACE_HEADER_X_BILL_TO = "X-HF-Bill-To"

INFERENCE_ENDPOINT = os.environ.get("HF_INFERENCE_ENDPOINT", "https://api-inference.huggingface.co")

# See https://huggingface.co/docs/inference-endpoints/index
INFERENCE_ENDPOINTS_ENDPOINT = "https://api.endpoints.huggingface.cloud/v2"
INFERENCE_CATALOG_ENDPOINT = "https://endpoints.huggingface.co/api/catalog"

# Proxy for third-party providers
INFERENCE_PROXY_TEMPLATE = "https://router.huggingface.co/{provider}"

REPO_ID_SEPARATOR = "--"
# ^ this substring is not allowed in repo_ids on hf.co
# and is the canonical one we use for serialization of repo ids elsewhere.


REPO_TYPE_DATASET = "dataset"
REPO_TYPE_SPACE = "space"
REPO_TYPE_MODEL = "model"
REPO_TYPES = [None, REPO_TYPE_MODEL, REPO_TYPE_DATASET, REPO_TYPE_SPACE]
SPACES_SDK_TYPES = ["gradio", "streamlit", "docker", "static"]

REPO_TYPES_URL_PREFIXES = {
    REPO_TYPE_DATASET: "datasets/",
    REPO_TYPE_SPACE: "spaces/",
}
REPO_TYPES_MAPPING = {
    "datasets": REPO_TYPE_DATASET,
    "spaces": REPO_TYPE_SPACE,
    "models": REPO_TYPE_MODEL,
}

DiscussionTypeFilter = Literal["all", "discussion", "pull_request"]
DISCUSSION_TYPES: Tuple[DiscussionTypeFilter, ...] = typing.get_args(DiscussionTypeFilter)
DiscussionStatusFilter = Literal["all", "open", "closed"]
DISCUSSION_STATUS: Tuple[DiscussionStatusFilter, ...] = typing.get_args(DiscussionStatusFilter)

# Webhook subscription types
WEBHOOK_DOMAIN_T = Literal["repo", "discussions"]

# default cache
default_home = os.path.join(os.path.expanduser("~"), ".cache")
HF_HOME = os.path.expandvars(
    os.path.expanduser(
        os.getenv(
            "HF_HOME",
            os.path.join(os.getenv("XDG_CACHE_HOME", default_home), "huggingface"),
        )
    )
)
hf_cache_home = HF_HOME  # for backward compatibility. TODO: remove this in 1.0.0

default_cache_path = os.path.join(HF_HOME, "hub")
default_assets_cache_path = os.path.join(HF_HOME, "assets")

# Legacy env variables
HUGGINGFACE_HUB_CACHE = os.getenv("HUGGINGFACE_HUB_CACHE", default_cache_path)
HUGGINGFACE_ASSETS_CACHE = os.getenv("HUGGINGFACE_ASSETS_CACHE", default_assets_cache_path)

# New env variables
HF_HUB_CACHE = os.path.expandvars(
    os.path.expanduser(
        os.getenv(
            "HF_HUB_CACHE",
            HUGGINGFACE_HUB_CACHE,
        )
    )
)
HF_ASSETS_CACHE = os.path.expandvars(
    os.path.expanduser(
        os.getenv(
            "HF_ASSETS_CACHE",
            HUGGINGFACE_ASSETS_CACHE,
        )
    )
)

HF_HUB_OFFLINE = _is_true(os.environ.get("HF_HUB_OFFLINE") or os.environ.get("TRANSFORMERS_OFFLINE"))

# If set, log level will be set to DEBUG and all requests made to the Hub will be logged
# as curl commands for reproducibility.
HF_DEBUG = _is_true(os.environ.get("HF_DEBUG"))

# Opt-out from telemetry requests
HF_HUB_DISABLE_TELEMETRY = (
    _is_true(os.environ.get("HF_HUB_DISABLE_TELEMETRY"))  # HF-specific env variable
    or _is_true(os.environ.get("DISABLE_TELEMETRY"))
    or _is_true(os.environ.get("DO_NOT_TRACK"))  # https://consoledonottrack.com/
)

HF_TOKEN_PATH = os.path.expandvars(
    os.path.expanduser(
        os.getenv(
            "HF_TOKEN_PATH",
            os.path.join(HF_HOME, "token"),
        )
    )
)
HF_STORED_TOKENS_PATH = os.path.join(os.path.dirname(HF_TOKEN_PATH), "stored_tokens")

if _staging_mode:
    # In staging mode, we use a different cache to ensure we don't mix up production and staging data or tokens
    # In practice in `huggingface_hub` tests, we monkeypatch these values with temporary directories. The following
    # lines are only used in third-party libraries tests (e.g. `transformers`, `diffusers`, etc.).
    _staging_home = os.path.join(os.path.expanduser("~"), ".cache", "huggingface_staging")
    HUGGINGFACE_HUB_CACHE = os.path.join(_staging_home, "hub")
    HF_TOKEN_PATH = os.path.join(_staging_home, "token")

# Here, `True` will disable progress bars globally without the possibility of enabling them
# programmatically. `False` will enable them without the possibility of disabling them.
# If the environment variable is not set (None), then the user is free to enable/disable
# them programmatically.
# TL;DR: env variable has priority over code
__HF_HUB_DISABLE_PROGRESS_BARS = os.environ.get("HF_HUB_DISABLE_PROGRESS_BARS")
HF_HUB_DISABLE_PROGRESS_BARS: Optional[bool] = (
    _is_true(__HF_HUB_DISABLE_PROGRESS_BARS) if __HF_HUB_DISABLE_PROGRESS_BARS is not None else None
)

# Disable warning on machines that do not support symlinks (e.g. Windows non-developer)
HF_HUB_DISABLE_SYMLINKS_WARNING: bool = _is_true(os.environ.get("HF_HUB_DISABLE_SYMLINKS_WARNING"))

# Disable warning when using experimental features
HF_HUB_DISABLE_EXPERIMENTAL_WARNING: bool = _is_true(os.environ.get("HF_HUB_DISABLE_EXPERIMENTAL_WARNING"))

# Disable sending the cached token by default in all HTTP requests to the Hub
HF_HUB_DISABLE_IMPLICIT_TOKEN: bool = _is_true(os.environ.get("HF_HUB_DISABLE_IMPLICIT_TOKEN"))

# Enable fast-download using external dependency "hf_transfer"
# See:
# - https://pypi.org/project/hf-transfer/
# - https://github.com/huggingface/hf_transfer (private)
HF_HUB_ENABLE_HF_TRANSFER: bool = _is_true(os.environ.get("HF_HUB_ENABLE_HF_TRANSFER"))


# UNUSED
# We don't use symlinks in local dir anymore.
HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD: int = (
    _as_int(os.environ.get("HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD")) or 5 * 1024 * 1024
)

# Used to override the etag timeout on a system level
HF_HUB_ETAG_TIMEOUT: int = _as_int(os.environ.get("HF_HUB_ETAG_TIMEOUT")) or DEFAULT_ETAG_TIMEOUT

# Used to override the get request timeout on a system level
HF_HUB_DOWNLOAD_TIMEOUT: int = _as_int(os.environ.get("HF_HUB_DOWNLOAD_TIMEOUT")) or DEFAULT_DOWNLOAD_TIMEOUT

# Allows adding information about the requester in the user-agent (e.g. a partner name)
HF_HUB_USER_AGENT_ORIGIN: Optional[str] = os.environ.get("HF_HUB_USER_AGENT_ORIGIN")

# List frameworks that are handled by the InferenceAPI service. Useful to scan endpoints and check which models are
# deployed and running. Since 95% of the models are using the top 4 frameworks listed below, we scan only those by
# default. We still keep the full list of supported frameworks in case we want to scan all of them.
MAIN_INFERENCE_API_FRAMEWORKS = [
    "diffusers",
    "sentence-transformers",
    "text-generation-inference",
    "transformers",
]

ALL_INFERENCE_API_FRAMEWORKS = MAIN_INFERENCE_API_FRAMEWORKS + [
    "adapter-transformers",
    "allennlp",
    "asteroid",
    "bertopic",
    "doctr",
    "espnet",
    "fairseq",
    "fastai",
    "fasttext",
    "flair",
    "k2",
    "keras",
    "mindspore",
    "nemo",
    "open_clip",
    "paddlenlp",
    "peft",
    "pyannote-audio",
    "sklearn",
    "spacy",
    "span-marker",
    "speechbrain",
    "stanza",
    "timm",
]

# Xet constants

HUGGINGFACE_HEADER_X_XET_ENDPOINT = "X-Xet-Cas-Url"
HUGGINGFACE_HEADER_X_XET_ACCESS_TOKEN = "X-Xet-Access-Token"
HUGGINGFACE_HEADER_X_XET_EXPIRATION = "X-Xet-Token-Expiration"
HUGGINGFACE_HEADER_X_XET_HASH = "X-Xet-Hash"
HUGGINGFACE_HEADER_X_XET_REFRESH_ROUTE = "X-Xet-Refresh-Route"
HUGGINGFACE_HEADER_LINK_XET_AUTH_KEY = "xet-auth"

default_xet_cache_path = os.path.join(HF_HOME, "xet")
HF_XET_CACHE = os.getenv("HF_XET_CACHE", default_xet_cache_path)
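The two private helpers at the top of this module define how every boolean/integer environment variable above is parsed. A quick sanity-check sketch (it imports private members, so for illustration only):

```py
from huggingface_hub.constants import _as_int, _is_true

# Truthiness is case-insensitive and restricted to {"1", "ON", "YES", "TRUE"}.
assert _is_true("yes") and _is_true("TRUE") and _is_true("1")
assert not _is_true("0") and not _is_true(None) and not _is_true("enabled")

# _as_int passes None through; anything else is fed to int(), so "10s" would raise ValueError.
assert _as_int(None) is None
assert _as_int("512") == 512
```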
348
.venv/lib/python3.10/site-packages/huggingface_hub/errors.py
Normal file
@@ -0,0 +1,348 @@
"""Contains all custom errors."""

from pathlib import Path
from typing import Optional, Union

from requests import HTTPError, Response


# CACHE ERRORS


class CacheNotFound(Exception):
    """Exception thrown when the Huggingface cache is not found."""

    cache_dir: Union[str, Path]

    def __init__(self, msg: str, cache_dir: Union[str, Path], *args, **kwargs):
        super().__init__(msg, *args, **kwargs)
        self.cache_dir = cache_dir


class CorruptedCacheException(Exception):
    """Exception for any unexpected structure in the Huggingface cache-system."""


# HEADERS ERRORS


class LocalTokenNotFoundError(EnvironmentError):
    """Raised if a local token is required but not found."""


# HTTP ERRORS


class OfflineModeIsEnabled(ConnectionError):
    """Raised when a request is made but `HF_HUB_OFFLINE=1` is set as environment variable."""


class HfHubHTTPError(HTTPError):
    """
    HTTPError to inherit from for any custom HTTP Error raised in HF Hub.

    Any HTTPError is converted at least into a `HfHubHTTPError`. If some information is
    sent back by the server, it will be added to the error message.

    Added details:
    - Request id from the "X-Request-Id" header if it exists, falling back to the "X-Amzn-Trace-Id" header otherwise.
    - Server error message from the "X-Error-Message" header.
    - Server error message if we can find one in the response body.

    Example:
    ```py
        import requests
        from huggingface_hub.utils import get_session, hf_raise_for_status, HfHubHTTPError

        response = get_session().post(...)
        try:
            hf_raise_for_status(response)
        except HfHubHTTPError as e:
            print(str(e))  # formatted message
            e.request_id, e.server_message  # details returned by server

            # Complete the error message with additional information once it's raised
            e.append_to_message("\n`create_commit` expects the repository to exist.")
            raise
    ```
    """

    def __init__(self, message: str, response: Optional[Response] = None, *, server_message: Optional[str] = None):
        self.request_id = (
            response.headers.get("x-request-id") or response.headers.get("X-Amzn-Trace-Id")
            if response is not None
            else None
        )
        self.server_message = server_message

        super().__init__(
            message,
            response=response,  # type: ignore [arg-type]
            request=response.request if response is not None else None,  # type: ignore [arg-type]
        )

    def append_to_message(self, additional_message: str) -> None:
        """Append additional information to the `HfHubHTTPError` initial message."""
        self.args = (self.args[0] + additional_message,) + self.args[1:]


# INFERENCE CLIENT ERRORS


class InferenceTimeoutError(HTTPError, TimeoutError):
    """Error raised when a model is unavailable or the request times out."""


# INFERENCE ENDPOINT ERRORS


class InferenceEndpointError(Exception):
    """Generic exception when dealing with Inference Endpoints."""


class InferenceEndpointTimeoutError(InferenceEndpointError, TimeoutError):
    """Exception for timeouts while waiting for an Inference Endpoint."""


# SAFETENSORS ERRORS


class SafetensorsParsingError(Exception):
    """Raised when failing to parse a safetensors file metadata.

    This can be the case if the file is not a safetensors file or does not respect the specification.
    """


class NotASafetensorsRepoError(Exception):
    """Raised when a repo is not a Safetensors repo, i.e. doesn't have either a `model.safetensors` or a
    `model.safetensors.index.json` file.
    """


# TEXT GENERATION ERRORS


class TextGenerationError(HTTPError):
    """Generic error raised if text-generation went wrong."""


# Text Generation Inference Errors
class ValidationError(TextGenerationError):
    """Server-side validation error."""


class GenerationError(TextGenerationError):
    pass


class OverloadedError(TextGenerationError):
    pass


class IncompleteGenerationError(TextGenerationError):
    pass


class UnknownError(TextGenerationError):
    pass


# VALIDATION ERRORS


class HFValidationError(ValueError):
    """Generic exception thrown by `huggingface_hub` validators.

    Inherits from [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError).
    """


# FILE METADATA ERRORS


class FileMetadataError(OSError):
    """Error triggered when the metadata of a file on the Hub cannot be retrieved (missing ETag or commit_hash).

    Inherits from `OSError` for backward compatibility.
    """


# REPOSITORY ERRORS


class RepositoryNotFoundError(HfHubHTTPError):
    """
    Raised when trying to access a hf.co URL with an invalid repository name, or
    with a private repo name the user does not have access to.

    Example:

    ```py
    >>> from huggingface_hub import model_info
    >>> model_info("<non_existent_repository>")
    (...)
    huggingface_hub.utils._errors.RepositoryNotFoundError: 401 Client Error. (Request ID: PvMw_VjBMjVdMz53WKIzP)

    Repository Not Found for url: https://huggingface.co/api/models/%3Cnon_existent_repository%3E.
    Please make sure you specified the correct `repo_id` and `repo_type`.
    If the repo is private, make sure you are authenticated.
    Invalid username or password.
    ```
    """


class GatedRepoError(RepositoryNotFoundError):
    """
    Raised when trying to access a gated repository for which the user is not on the
    authorized list.

    Note: derives from `RepositoryNotFoundError` to ensure backward compatibility.

    Example:

    ```py
    >>> from huggingface_hub import model_info
    >>> model_info("<gated_repository>")
    (...)
    huggingface_hub.utils._errors.GatedRepoError: 403 Client Error. (Request ID: ViT1Bf7O_026LGSQuVqfa)

    Cannot access gated repo for url https://huggingface.co/api/models/ardent-figment/gated-model.
    Access to model ardent-figment/gated-model is restricted and you are not in the authorized list.
    Visit https://huggingface.co/ardent-figment/gated-model to ask for access.
    ```
    """


class DisabledRepoError(HfHubHTTPError):
    """
    Raised when trying to access a repository that has been disabled by its author.

    Example:

    ```py
    >>> from huggingface_hub import dataset_info
    >>> dataset_info("laion/laion-art")
    (...)
    huggingface_hub.utils._errors.DisabledRepoError: 403 Client Error. (Request ID: Root=1-659fc3fa-3031673e0f92c71a2260dbe2;bc6f4dfb-b30a-4862-af0a-5cfe827610d8)

    Cannot access repository for url https://huggingface.co/api/datasets/laion/laion-art.
    Access to this resource is disabled.
    ```
    """


# REVISION ERROR


class RevisionNotFoundError(HfHubHTTPError):
    """
    Raised when trying to access a hf.co URL with a valid repository but an invalid
    revision.

    Example:

    ```py
    >>> from huggingface_hub import hf_hub_download
    >>> hf_hub_download('bert-base-cased', 'config.json', revision='<non-existent-revision>')
    (...)
    huggingface_hub.utils._errors.RevisionNotFoundError: 404 Client Error. (Request ID: Mwhe_c3Kt650GcdKEFomX)

    Revision Not Found for url: https://huggingface.co/bert-base-cased/resolve/%3Cnon-existent-revision%3E/config.json.
    ```
    """


# ENTRY ERRORS
class EntryNotFoundError(HfHubHTTPError):
    """
    Raised when trying to access a hf.co URL with a valid repository and revision
    but an invalid filename.

    Example:

    ```py
    >>> from huggingface_hub import hf_hub_download
    >>> hf_hub_download('bert-base-cased', '<non-existent-file>')
    (...)
    huggingface_hub.utils._errors.EntryNotFoundError: 404 Client Error. (Request ID: 53pNl6M0MxsnG5Sw8JA6x)

    Entry Not Found for url: https://huggingface.co/bert-base-cased/resolve/main/%3Cnon-existent-file%3E.
    ```
    """


class LocalEntryNotFoundError(EntryNotFoundError, FileNotFoundError, ValueError):
    """
    Raised when trying to access a file or snapshot that is not on the disk when network is
    disabled or unavailable (connection issue). The entry may exist on the Hub.

    Note: the `ValueError` base is there to ensure backward compatibility.
    Note: `LocalEntryNotFoundError` derives from `HTTPError` because of `EntryNotFoundError`,
    even when the failure is not a network issue.

    Example:

    ```py
    >>> from huggingface_hub import hf_hub_download
    >>> hf_hub_download('bert-base-cased', '<non-cached-file>', local_files_only=True)
    (...)
    huggingface_hub.utils._errors.LocalEntryNotFoundError: Cannot find the requested files in the disk cache and outgoing traffic has been disabled. To enable hf.co look-ups and downloads online, set 'local_files_only' to False.
    ```
    """

    def __init__(self, message: str):
        super().__init__(message, response=None)


# REQUEST ERROR
class BadRequestError(HfHubHTTPError, ValueError):
    """
    Raised by `hf_raise_for_status` when the server returns an HTTP 400 error.

    Example:

    ```py
    >>> resp = requests.post("hf.co/api/check", ...)
    >>> hf_raise_for_status(resp, endpoint_name="check")
    huggingface_hub.utils._errors.BadRequestError: Bad request for check endpoint: {details} (Request ID: XXX)
    ```
    """


# DDUF file format ERRORS


class DDUFError(Exception):
    """Base exception for errors related to the DDUF format."""


class DDUFCorruptedFileError(DDUFError):
    """Exception thrown when the DDUF file is corrupted."""


class DDUFExportError(DDUFError):
    """Base exception for errors during DDUF export."""


class DDUFInvalidEntryNameError(DDUFExportError):
    """Exception thrown when the entry name is invalid."""


# XET ERRORS


class XetError(Exception):
    """Base exception for errors related to Xet Storage."""


class XetAuthorizationError(XetError):
    """Exception thrown when the user does not have the right authorization to use Xet Storage."""


class XetRefreshTokenError(XetError):
    """Exception thrown when the refresh token is invalid."""


class XetDownloadError(Exception):
    """Exception thrown when the download from Xet Storage fails."""
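Since the specific exceptions above all derive from `HfHubHTTPError` (and `GatedRepoError` from `RepositoryNotFoundError`), callers should catch from most to least specific. A sketch, with a placeholder repo id:

```py
from huggingface_hub import model_info
from huggingface_hub.errors import (
    GatedRepoError,
    HfHubHTTPError,
    RepositoryNotFoundError,
    RevisionNotFoundError,
)

try:
    info = model_info("some-org/some-model", revision="main")  # placeholder repo id
except GatedRepoError:
    print("Repo exists but is gated: request access first.")
except RepositoryNotFoundError:
    print("Repo does not exist, or it is private and you are not authenticated.")
except RevisionNotFoundError:
    print("Repo exists but the requested revision does not.")
except HfHubHTTPError as e:
    print(f"Other Hub HTTP error: request_id={e.request_id}, server_message={e.server_message}")
```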
425
.venv/lib/python3.10/site-packages/huggingface_hub/fastai.py
Normal file
@@ -0,0 +1,425 @@
import json
import os
from pathlib import Path
from pickle import DEFAULT_PROTOCOL, PicklingError
from typing import Any, Dict, List, Optional, Union

from packaging import version

from huggingface_hub import constants, snapshot_download
from huggingface_hub.hf_api import HfApi
from huggingface_hub.utils import (
    SoftTemporaryDirectory,
    get_fastai_version,
    get_fastcore_version,
    get_python_version,
)

from .utils import logging, validate_hf_hub_args
from .utils._runtime import _PY_VERSION  # noqa: F401 # for backward compatibility...


logger = logging.get_logger(__name__)


def _check_fastai_fastcore_versions(
    fastai_min_version: str = "2.4",
    fastcore_min_version: str = "1.3.27",
):
    """
    Checks that the installed fastai and fastcore versions are compatible for pickle serialization.

    Args:
        fastai_min_version (`str`, *optional*):
            The minimum fastai version supported.
        fastcore_min_version (`str`, *optional*):
            The minimum fastcore version supported.

    <Tip>
    Raises the following error:

        - [`ImportError`](https://docs.python.org/3/library/exceptions.html#ImportError)
          if the fastai or fastcore libraries are not available or are of an invalid version.

    </Tip>
    """

    if (get_fastcore_version() or get_fastai_version()) == "N/A":
        raise ImportError(
            f"fastai>={fastai_min_version} and fastcore>={fastcore_min_version} are"
            f" required. Currently using fastai=={get_fastai_version()} and"
            f" fastcore=={get_fastcore_version()}."
        )

    current_fastai_version = version.Version(get_fastai_version())
    current_fastcore_version = version.Version(get_fastcore_version())

    if current_fastai_version < version.Version(fastai_min_version):
        raise ImportError(
            "`push_to_hub_fastai` and `from_pretrained_fastai` require a"
            f" fastai>={fastai_min_version} version, but you are using fastai version"
            f" {get_fastai_version()} which is incompatible. Upgrade with `pip install"
            " fastai==2.5.6`."
        )

    if current_fastcore_version < version.Version(fastcore_min_version):
        raise ImportError(
            "`push_to_hub_fastai` and `from_pretrained_fastai` require a"
            f" fastcore>={fastcore_min_version} version, but you are using fastcore"
            f" version {get_fastcore_version()} which is incompatible. Upgrade with"
            " `pip install fastcore==1.3.27`."
        )


def _check_fastai_fastcore_pyproject_versions(
    storage_folder: str,
    fastai_min_version: str = "2.4",
    fastcore_min_version: str = "1.3.27",
):
    """
    Checks that the `pyproject.toml` file in the directory `storage_folder` has fastai and fastcore versions
    that are compatible with `from_pretrained_fastai` and `push_to_hub_fastai`. If `pyproject.toml` does not exist
    or does not contain versions for fastai and fastcore, then it logs a warning.

    Args:
        storage_folder (`str`):
            Folder to look for the `pyproject.toml` file.
        fastai_min_version (`str`, *optional*):
            The minimum fastai version supported.
        fastcore_min_version (`str`, *optional*):
            The minimum fastcore version supported.

    <Tip>
    Raises the following errors:

        - [`ImportError`](https://docs.python.org/3/library/exceptions.html#ImportError)
          if the `toml` module is not installed.
        - [`ImportError`](https://docs.python.org/3/library/exceptions.html#ImportError)
          if the `pyproject.toml` indicates a lower than minimum supported version of fastai or fastcore.

    </Tip>
    """

    try:
        import toml
    except ModuleNotFoundError:
        raise ImportError(
            "`push_to_hub_fastai` and `from_pretrained_fastai` require the toml module."
            " Install it with `pip install toml`."
        )

    # Checks that a `pyproject.toml`, with `build-system` and `requires` sections, exists in the repository. If so, get a list of required packages.
    if not os.path.isfile(f"{storage_folder}/pyproject.toml"):
        logger.warning(
            "There is no `pyproject.toml` in the repository that contains the fastai"
            " `Learner`. The `pyproject.toml` would allow us to verify that your fastai"
            " and fastcore versions are compatible with those of the model you want to"
            " load."
        )
        return
    pyproject_toml = toml.load(f"{storage_folder}/pyproject.toml")

    if "build-system" not in pyproject_toml.keys():
        logger.warning(
            "There is no `build-system` section in the pyproject.toml of the repository"
            " that contains the fastai `Learner`. The `build-system` would allow us to"
            " verify that your fastai and fastcore versions are compatible with those"
            " of the model you want to load."
        )
        return
    build_system_toml = pyproject_toml["build-system"]

    if "requires" not in build_system_toml.keys():
        logger.warning(
            "There is no `requires` section in the pyproject.toml of the repository"
            " that contains the fastai `Learner`. The `requires` would allow us to"
            " verify that your fastai and fastcore versions are compatible with those"
            " of the model you want to load."
        )
        return
    package_versions = build_system_toml["requires"]

    # Extracts the fastai and fastcore versions from `pyproject.toml` if available.
    # If a package is specified without a version (e.g. "fastai" instead of "fastai=2.4"), the version check is skipped.
    fastai_packages = [pck for pck in package_versions if pck.startswith("fastai")]
    if len(fastai_packages) == 0:
        logger.warning("The repository does not have a fastai version specified in the `pyproject.toml`.")
    # fastai_version is an empty string if not specified
    else:
        fastai_version = str(fastai_packages[0]).partition("=")[2]
        if fastai_version != "" and version.Version(fastai_version) < version.Version(fastai_min_version):
            raise ImportError(
                "`from_pretrained_fastai` requires"
                f" fastai>={fastai_min_version} version but the model to load uses"
                f" {fastai_version} which is incompatible."
            )

    fastcore_packages = [pck for pck in package_versions if pck.startswith("fastcore")]
    if len(fastcore_packages) == 0:
        logger.warning("The repository does not have a fastcore version specified in the `pyproject.toml`.")
    # fastcore_version is an empty string if not specified
    else:
        fastcore_version = str(fastcore_packages[0]).partition("=")[2]
        if fastcore_version != "" and version.Version(fastcore_version) < version.Version(fastcore_min_version):
            raise ImportError(
                "`from_pretrained_fastai` requires"
                f" fastcore>={fastcore_min_version} version, but you are using fastcore"
                f" version {fastcore_version} which is incompatible."
            )

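# For illustration: the check above expects a `pyproject.toml` shaped like the one
# generated by PYPROJECT_TEMPLATE below, e.g. (version numbers are hypothetical):
#
#     [build-system]
#     requires = ["setuptools>=40.8.0", "wheel", "python=3.10.12", "fastai=2.7.12", "fastcore=1.5.29"]
#     build-backend = "setuptools.build_meta:__legacy__"
#
# Version pins are read with `str(pkg).partition("=")[2]`, so a bare "fastai" entry
# (no version) yields an empty string and the version comparison is skipped.
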
README_TEMPLATE = """---
tags:
- fastai
---

# Amazing!

🥳 Congratulations on hosting your fastai model on the Hugging Face Hub!

# Some next steps
1. Fill out this model card with more information (see the template below and the [documentation here](https://huggingface.co/docs/hub/model-repos))!

2. Create a demo in Gradio or Streamlit using 🤗 Spaces ([documentation here](https://huggingface.co/docs/hub/spaces)).

3. Join the fastai community on the [Fastai Discord](https://discord.com/invite/YKrxeNn)!

Greetings fellow fastlearner 🤝! Don't forget to delete this content from your model card.


---


# Model card

## Model description
More information needed

## Intended uses & limitations
More information needed

## Training and evaluation data
More information needed
"""

PYPROJECT_TEMPLATE = f"""[build-system]
requires = ["setuptools>=40.8.0", "wheel", "python={get_python_version()}", "fastai={get_fastai_version()}", "fastcore={get_fastcore_version()}"]
build-backend = "setuptools.build_meta:__legacy__"
"""


def _create_model_card(repo_dir: Path):
    """
    Creates a model card for the repository.

    Args:
        repo_dir (`Path`):
            Directory where the model card is created.
    """
    readme_path = repo_dir / "README.md"

    if not readme_path.exists():
        with readme_path.open("w", encoding="utf-8") as f:
            f.write(README_TEMPLATE)


def _create_model_pyproject(repo_dir: Path):
    """
    Creates a `pyproject.toml` for the repository.

    Args:
        repo_dir (`Path`):
            Directory where `pyproject.toml` is created.
    """
    pyproject_path = repo_dir / "pyproject.toml"

    if not pyproject_path.exists():
        with pyproject_path.open("w", encoding="utf-8") as f:
            f.write(PYPROJECT_TEMPLATE)


def _save_pretrained_fastai(
    learner,
    save_directory: Union[str, Path],
    config: Optional[Dict[str, Any]] = None,
):
    """
    Saves a fastai learner to `save_directory` in pickle format using the default pickle protocol for the version of python used.

    Args:
        learner (`Learner`):
            The `fastai.Learner` you'd like to save.
        save_directory (`str` or `Path`):
            Specific directory in which you want to save the fastai learner.
        config (`dict`, *optional*):
            Configuration object. Will be uploaded as a .json file. Example: 'https://huggingface.co/espejelomar/fastai-pet-breeds-classification/blob/main/config.json'.

    <Tip>

    Raises the following error:

        - [`RuntimeError`](https://docs.python.org/3/library/exceptions.html#RuntimeError)
          if the config file provided is not a dictionary.

    </Tip>
    """
    _check_fastai_fastcore_versions()

    os.makedirs(save_directory, exist_ok=True)

    # if the user provides config then we update it with the fastai and fastcore versions in CONFIG_TEMPLATE.
    if config is not None:
        if not isinstance(config, dict):
            raise RuntimeError(f"Provided config should be a dict. Got: '{type(config)}'")
        path = os.path.join(save_directory, constants.CONFIG_NAME)
        with open(path, "w") as f:
            json.dump(config, f)

    _create_model_card(Path(save_directory))
    _create_model_pyproject(Path(save_directory))

    # learner.export saves the model in `self.path`.
    learner.path = Path(save_directory)
    os.makedirs(save_directory, exist_ok=True)
    try:
        learner.export(
            fname="model.pkl",
            pickle_protocol=DEFAULT_PROTOCOL,
        )
    except PicklingError:
        raise PicklingError(
            "You are using a lambda function, i.e., an anonymous function. `pickle`"
            " cannot pickle function objects and requires that all functions have"
            " names. One possible solution is to name the function."
        )


@validate_hf_hub_args
def from_pretrained_fastai(
    repo_id: str,
    revision: Optional[str] = None,
):
    """
    Load a pretrained fastai model from the Hub or from a local directory.

    Args:
        repo_id (`str`):
            The location where the pickled fastai.Learner is. It can be either of the two:
                - Hosted on the Hugging Face Hub. E.g.: 'espejelomar/fastai-pet-breeds-classification' or 'distilgpt2'.
                  You can add a `revision` by appending `@` at the end of `repo_id`. E.g.: `dbmdz/bert-base-german-cased@main`.
                  Revision is the specific model version to use. Since we use a git-based system for storing models and other
                  artifacts on the Hugging Face Hub, it can be a branch name, a tag name, or a commit id.
                - Hosted locally. `repo_id` would be a directory containing the pickle and a pyproject.toml
                  indicating the fastai and fastcore versions used to build the `fastai.Learner`. E.g.: `./my_model_directory/`.
        revision (`str`, *optional*):
            Revision at which the repo's files are downloaded. See documentation of `snapshot_download`.

    Returns:
        The `fastai.Learner` model in the `repo_id` repo.
    """
    _check_fastai_fastcore_versions()

    # Load the `repo_id` repo.
    # `snapshot_download` returns the folder where the model was stored.
    # `cache_dir` will be the default '/root/.cache/huggingface/hub'
    if not os.path.isdir(repo_id):
        storage_folder = snapshot_download(
            repo_id=repo_id,
            revision=revision,
            library_name="fastai",
            library_version=get_fastai_version(),
        )
    else:
        storage_folder = repo_id

    _check_fastai_fastcore_pyproject_versions(storage_folder)

    from fastai.learner import load_learner  # type: ignore

    return load_learner(os.path.join(storage_folder, "model.pkl"))


@validate_hf_hub_args
def push_to_hub_fastai(
    learner,
    *,
    repo_id: str,
    commit_message: str = "Push FastAI model using huggingface_hub.",
    private: Optional[bool] = None,
    token: Optional[str] = None,
    config: Optional[dict] = None,
    branch: Optional[str] = None,
    create_pr: Optional[bool] = None,
    allow_patterns: Optional[Union[List[str], str]] = None,
    ignore_patterns: Optional[Union[List[str], str]] = None,
    delete_patterns: Optional[Union[List[str], str]] = None,
    api_endpoint: Optional[str] = None,
):
    """
    Upload learner checkpoint files to the Hub.

    Use `allow_patterns` and `ignore_patterns` to precisely filter which files should be pushed to the hub. Use
    `delete_patterns` to delete existing remote files in the same commit. See [`upload_folder`] reference for more
    details.

    Args:
        learner (`Learner`):
            The `fastai.Learner` you'd like to push to the Hub.
        repo_id (`str`):
            The repository id for your model in Hub in the format of "namespace/repo_name". The namespace can be your individual account or an organization to which you have write access (for example, 'stanfordnlp/stanza-de').
        commit_message (`str`, *optional*):
            Message to commit while pushing. Defaults to `"Push FastAI model using huggingface_hub."`.
        private (`bool`, *optional*):
            Whether or not the repository created should be private.
            If `None` (default), the repo will be public unless the organization's default is private.
        token (`str`, *optional*):
            The Hugging Face account token to use as HTTP bearer authorization for remote files. If `None`, you will be prompted for a token.
        config (`dict`, *optional*):
            Configuration object to be saved alongside the model weights.
        branch (`str`, *optional*):
            The git branch on which to push the model. This defaults to
            the default branch as specified in your repository, which
            defaults to `"main"`.
        create_pr (`boolean`, *optional*):
            Whether or not to create a Pull Request from `branch` with that commit.
            Defaults to `False`.
        api_endpoint (`str`, *optional*):
            The API endpoint to use when pushing the model to the hub.
        allow_patterns (`List[str]` or `str`, *optional*):
            If provided, only files matching at least one pattern are pushed.
        ignore_patterns (`List[str]` or `str`, *optional*):
            If provided, files matching any of the patterns are not pushed.
        delete_patterns (`List[str]` or `str`, *optional*):
            If provided, remote files matching any of the patterns will be deleted from the repo.

    Returns:
        The url of the commit of your model in the given repository.

    <Tip>

    Raises the following error:

        - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
          if the user is not logged in to the Hugging Face Hub.

    </Tip>
    """
    _check_fastai_fastcore_versions()
    api = HfApi(endpoint=api_endpoint)
    repo_id = api.create_repo(repo_id=repo_id, token=token, private=private, exist_ok=True).repo_id

    # Push the files to the repo in a single commit
    with SoftTemporaryDirectory() as tmp:
        saved_path = Path(tmp) / repo_id
        _save_pretrained_fastai(learner, saved_path, config=config)
        return api.upload_folder(
            repo_id=repo_id,
            token=token,
            folder_path=saved_path,
            commit_message=commit_message,
            revision=branch,
            create_pr=create_pr,
            allow_patterns=allow_patterns,
            ignore_patterns=ignore_patterns,
            delete_patterns=delete_patterns,
        )
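A minimal end-to-end sketch of the two public entry points above. It assumes fastai is installed and that you are authenticated (e.g. via `huggingface-cli login`); the destination repo id is a placeholder, and the source repo (referenced in the docstrings above) is assumed to still be available:

```py
from huggingface_hub import from_pretrained_fastai, push_to_hub_fastai

# Download and unpickle a Learner from the Hub.
learner = from_pretrained_fastai("espejelomar/fastai-pet-breeds-classification")

# Push it back to your own namespace. Returns the commit URL.
commit_url = push_to_hub_fastai(learner, repo_id="my-username/my-fastai-model")
print(commit_url)
```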
1753
.venv/lib/python3.10/site-packages/huggingface_hub/file_download.py
Normal file
File diff suppressed because it is too large
Load Diff
9966
.venv/lib/python3.10/site-packages/huggingface_hub/hf_api.py
Normal file
File diff suppressed because it is too large
Load Diff
1140
.venv/lib/python3.10/site-packages/huggingface_hub/hf_file_system.py
Normal file
File diff suppressed because it is too large
Load Diff
851
.venv/lib/python3.10/site-packages/huggingface_hub/hub_mixin.py
Normal file
@@ -0,0 +1,851 @@
|
||||
import inspect
|
||||
import json
|
||||
import os
|
||||
from dataclasses import Field, asdict, dataclass, is_dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, ClassVar, Dict, List, Optional, Protocol, Tuple, Type, TypeVar, Union
|
||||
|
||||
import packaging.version
|
||||
|
||||
from . import constants
|
||||
from .errors import EntryNotFoundError, HfHubHTTPError
|
||||
from .file_download import hf_hub_download
|
||||
from .hf_api import HfApi
|
||||
from .repocard import ModelCard, ModelCardData
|
||||
from .utils import (
|
||||
SoftTemporaryDirectory,
|
||||
is_jsonable,
|
||||
is_safetensors_available,
|
||||
is_simple_optional_type,
|
||||
is_torch_available,
|
||||
logging,
|
||||
unwrap_simple_optional_type,
|
||||
validate_hf_hub_args,
|
||||
)
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch # type: ignore
|
||||
|
||||
if is_safetensors_available():
|
||||
import safetensors
|
||||
from safetensors.torch import load_model as load_model_as_safetensor
|
||||
from safetensors.torch import save_model as save_model_as_safetensor
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
# Type alias for dataclass instances, copied from https://github.com/python/typeshed/blob/9f28171658b9ca6c32a7cb93fbb99fc92b17858b/stdlib/_typeshed/__init__.pyi#L349
|
||||
class DataclassInstance(Protocol):
|
||||
__dataclass_fields__: ClassVar[Dict[str, Field]]
|
||||
|
||||
|
||||
# Generic variable that is either ModelHubMixin or a subclass thereof
|
||||
T = TypeVar("T", bound="ModelHubMixin")
|
||||
# Generic variable to represent an args type
|
||||
ARGS_T = TypeVar("ARGS_T")
|
||||
ENCODER_T = Callable[[ARGS_T], Any]
|
||||
DECODER_T = Callable[[Any], ARGS_T]
|
||||
CODER_T = Tuple[ENCODER_T, DECODER_T]
|
||||
|
||||
|
||||
DEFAULT_MODEL_CARD = """
|
||||
---
|
||||
# For reference on model card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1
|
||||
# Doc / guide: https://huggingface.co/docs/hub/model-cards
|
||||
{{ card_data }}
|
||||
---
|
||||
|
||||
This model has been pushed to the Hub using the [PytorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration:
|
||||
- Code: {{ repo_url | default("[More Information Needed]", true) }}
|
||||
- Paper: {{ paper_url | default("[More Information Needed]", true) }}
|
||||
- Docs: {{ docs_url | default("[More Information Needed]", true) }}
|
||||
"""
|
||||
|
||||
|
||||
@dataclass
|
||||
class MixinInfo:
|
||||
model_card_template: str
|
||||
model_card_data: ModelCardData
|
||||
docs_url: Optional[str] = None
|
||||
paper_url: Optional[str] = None
|
||||
repo_url: Optional[str] = None
|
||||
|
||||
|
||||
class ModelHubMixin:
|
||||
"""
|
||||
A generic mixin to integrate ANY machine learning framework with the Hub.
|
||||
|
||||
To integrate your framework, your model class must inherit from this class. Custom logic for saving/loading models
|
||||
have to be overwritten in [`_from_pretrained`] and [`_save_pretrained`]. [`PyTorchModelHubMixin`] is a good example
|
||||
of mixin integration with the Hub. Check out our [integration guide](../guides/integrations) for more instructions.
|
||||
|
||||
When inheriting from [`ModelHubMixin`], you can define class-level attributes. These attributes are not passed to
|
||||
`__init__` but to the class definition itself. This is useful to define metadata about the library integrating
|
||||
[`ModelHubMixin`].
|
||||
|
||||
For more details on how to integrate the mixin with your library, checkout the [integration guide](../guides/integrations).
|
||||
|
||||
Args:
|
||||
repo_url (`str`, *optional*):
|
||||
URL of the library repository. Used to generate model card.
|
||||
paper_url (`str`, *optional*):
|
||||
URL of the library paper. Used to generate model card.
|
||||
docs_url (`str`, *optional*):
|
||||
URL of the library documentation. Used to generate model card.
|
||||
model_card_template (`str`, *optional*):
|
||||
Template of the model card. Used to generate model card. Defaults to a generic template.
|
||||
language (`str` or `List[str]`, *optional*):
|
||||
Language supported by the library. Used to generate model card.
|
||||
library_name (`str`, *optional*):
|
||||
Name of the library integrating ModelHubMixin. Used to generate model card.
|
||||
license (`str`, *optional*):
|
||||
License of the library integrating ModelHubMixin. Used to generate model card.
|
||||
E.g: "apache-2.0"
|
||||
license_name (`str`, *optional*):
|
||||
Name of the library integrating ModelHubMixin. Used to generate model card.
|
||||
Only used if `license` is set to `other`.
|
||||
E.g: "coqui-public-model-license".
|
||||
license_link (`str`, *optional*):
|
||||
URL to the license of the library integrating ModelHubMixin. Used to generate model card.
|
||||
Only used if `license` is set to `other` and `license_name` is set.
|
||||
E.g: "https://coqui.ai/cpml".
|
||||
pipeline_tag (`str`, *optional*):
|
||||
Tag of the pipeline. Used to generate model card. E.g. "text-classification".
|
||||
tags (`List[str]`, *optional*):
|
||||
Tags to be added to the model card. Used to generate model card. E.g. ["computer-vision"]
|
||||
coders (`Dict[Type, Tuple[Callable, Callable]]`, *optional*):
|
||||
Dictionary of custom types and their encoders/decoders. Used to encode/decode arguments that are not
|
||||
jsonable by default. E.g dataclasses, argparse.Namespace, OmegaConf, etc.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
>>> from huggingface_hub import ModelHubMixin
|
||||
|
||||
# Inherit from ModelHubMixin
|
||||
>>> class MyCustomModel(
|
||||
... ModelHubMixin,
|
||||
... library_name="my-library",
|
||||
... tags=["computer-vision"],
|
||||
... repo_url="https://github.com/huggingface/my-cool-library",
|
||||
... paper_url="https://arxiv.org/abs/2304.12244",
|
||||
... docs_url="https://huggingface.co/docs/my-cool-library",
|
||||
... # ^ optional metadata to generate model card
|
||||
... ):
|
||||
... def __init__(self, size: int = 512, device: str = "cpu"):
|
||||
... # define how to initialize your model
|
||||
... super().__init__()
|
||||
... ...
|
||||
...
|
||||
... def _save_pretrained(self, save_directory: Path) -> None:
|
||||
... # define how to serialize your model
|
||||
... ...
|
||||
...
|
||||
... @classmethod
|
||||
... def from_pretrained(
|
||||
... cls: Type[T],
|
||||
... pretrained_model_name_or_path: Union[str, Path],
|
||||
... *,
|
||||
... force_download: bool = False,
|
||||
... resume_download: Optional[bool] = None,
|
||||
... proxies: Optional[Dict] = None,
|
||||
... token: Optional[Union[str, bool]] = None,
|
||||
... cache_dir: Optional[Union[str, Path]] = None,
|
||||
... local_files_only: bool = False,
|
||||
... revision: Optional[str] = None,
|
||||
... **model_kwargs,
|
||||
... ) -> T:
|
||||
... # define how to deserialize your model
|
||||
... ...
|
||||
|
||||
>>> model = MyCustomModel(size=256, device="gpu")
|
||||
|
||||
# Save model weights to local directory
|
||||
>>> model.save_pretrained("my-awesome-model")
|
||||
|
||||
# Push model weights to the Hub
|
||||
>>> model.push_to_hub("my-awesome-model")
|
||||
|
||||
# Download and initialize weights from the Hub
|
||||
>>> reloaded_model = MyCustomModel.from_pretrained("username/my-awesome-model")
|
||||
>>> reloaded_model.size
|
||||
256
|
||||
|
||||
# Model card has been correctly populated
|
||||
>>> from huggingface_hub import ModelCard
|
||||
>>> card = ModelCard.load("username/my-awesome-model")
|
||||
>>> card.data.tags
|
||||
["x-custom-tag", "pytorch_model_hub_mixin", "model_hub_mixin"]
|
||||
>>> card.data.library_name
|
||||
"my-library"
|
||||
```
|
||||
"""
|
||||
|
||||
_hub_mixin_config: Optional[Union[dict, DataclassInstance]] = None
|
||||
# ^ optional config attribute automatically set in `from_pretrained`
|
||||
_hub_mixin_info: MixinInfo
|
||||
# ^ information about the library integrating ModelHubMixin (used to generate model card)
|
||||
_hub_mixin_inject_config: bool # whether `_from_pretrained` expects `config` or not
|
||||
_hub_mixin_init_parameters: Dict[str, inspect.Parameter] # __init__ parameters
|
||||
_hub_mixin_jsonable_default_values: Dict[str, Any] # default values for __init__ parameters
|
||||
_hub_mixin_jsonable_custom_types: Tuple[Type, ...] # custom types that can be encoded/decoded
|
||||
_hub_mixin_coders: Dict[Type, CODER_T] # encoders/decoders for custom types
|
||||
# ^ internal values to handle config
|
||||
|
||||
def __init_subclass__(
|
||||
cls,
|
||||
*,
|
||||
# Generic info for model card
|
||||
repo_url: Optional[str] = None,
|
||||
paper_url: Optional[str] = None,
|
||||
docs_url: Optional[str] = None,
|
||||
# Model card template
|
||||
model_card_template: str = DEFAULT_MODEL_CARD,
|
||||
# Model card metadata
|
||||
language: Optional[List[str]] = None,
|
||||
library_name: Optional[str] = None,
|
||||
license: Optional[str] = None,
|
||||
license_name: Optional[str] = None,
|
||||
license_link: Optional[str] = None,
|
||||
pipeline_tag: Optional[str] = None,
|
||||
tags: Optional[List[str]] = None,
|
||||
# How to encode/decode arguments with custom type into a JSON config?
|
||||
coders: Optional[
|
||||
Dict[Type, CODER_T]
|
||||
# Key is a type.
|
||||
# Value is a tuple (encoder, decoder).
|
||||
# Example: {MyCustomType: (lambda x: x.value, lambda data: MyCustomType(data))}
|
||||
] = None,
|
||||
) -> None:
|
||||
"""Inspect __init__ signature only once when subclassing + handle modelcard."""
|
||||
super().__init_subclass__()
|
||||
|
||||
# Will be reused when creating modelcard
|
||||
tags = tags or []
|
||||
tags.append("model_hub_mixin")
|
||||
|
||||
# Initialize MixinInfo if not existent
|
||||
info = MixinInfo(model_card_template=model_card_template, model_card_data=ModelCardData())
|
||||
|
||||
# If parent class has a MixinInfo, inherit from it as a copy
|
||||
if hasattr(cls, "_hub_mixin_info"):
|
||||
# Inherit model card template from parent class if not explicitly set
|
||||
if model_card_template == DEFAULT_MODEL_CARD:
|
||||
info.model_card_template = cls._hub_mixin_info.model_card_template
|
||||
|
||||
# Inherit from parent model card data
|
||||
info.model_card_data = ModelCardData(**cls._hub_mixin_info.model_card_data.to_dict())
|
||||
|
||||
# Inherit other info
|
||||
info.docs_url = cls._hub_mixin_info.docs_url
|
||||
info.paper_url = cls._hub_mixin_info.paper_url
|
||||
info.repo_url = cls._hub_mixin_info.repo_url
|
||||
cls._hub_mixin_info = info
|
||||
|
||||
# Update MixinInfo with metadata
|
||||
if model_card_template is not None and model_card_template != DEFAULT_MODEL_CARD:
|
||||
info.model_card_template = model_card_template
|
||||
if repo_url is not None:
|
||||
info.repo_url = repo_url
|
||||
if paper_url is not None:
|
||||
info.paper_url = paper_url
|
||||
if docs_url is not None:
|
||||
info.docs_url = docs_url
|
||||
if language is not None:
|
||||
info.model_card_data.language = language
|
||||
if library_name is not None:
|
||||
info.model_card_data.library_name = library_name
|
||||
if license is not None:
|
||||
info.model_card_data.license = license
|
||||
if license_name is not None:
|
||||
info.model_card_data.license_name = license_name
|
||||
if license_link is not None:
|
||||
info.model_card_data.license_link = license_link
|
||||
if pipeline_tag is not None:
|
||||
info.model_card_data.pipeline_tag = pipeline_tag
|
||||
if tags is not None:
|
||||
if info.model_card_data.tags is not None:
|
||||
info.model_card_data.tags.extend(tags)
|
||||
else:
|
||||
info.model_card_data.tags = tags
|
||||
|
||||
info.model_card_data.tags = sorted(set(info.model_card_data.tags))
|
||||
|
||||
# Handle encoders/decoders for args
|
||||
cls._hub_mixin_coders = coders or {}
|
||||
cls._hub_mixin_jsonable_custom_types = tuple(cls._hub_mixin_coders.keys())
|
||||
|
||||
# Inspect __init__ signature to handle config
|
||||
cls._hub_mixin_init_parameters = dict(inspect.signature(cls.__init__).parameters)
|
||||
cls._hub_mixin_jsonable_default_values = {
|
||||
param.name: cls._encode_arg(param.default)
|
||||
for param in cls._hub_mixin_init_parameters.values()
|
||||
if param.default is not inspect.Parameter.empty and cls._is_jsonable(param.default)
|
||||
}
|
||||
cls._hub_mixin_inject_config = "config" in inspect.signature(cls._from_pretrained).parameters
|
||||
|
||||
    def __new__(cls: Type[T], *args, **kwargs) -> T:
        """Create a new instance of the class and handle config.

        3 cases:
        - If `self._hub_mixin_config` is already set, do nothing.
        - If `config` is passed as a dataclass, set it as `self._hub_mixin_config`.
        - Otherwise, build `self._hub_mixin_config` from default values and passed values.
        """
        instance = super().__new__(cls)

        # If `config` is already set, return early
        if instance._hub_mixin_config is not None:
            return instance

        # Infer passed values
        passed_values = {
            **{
                key: value
                for key, value in zip(
                    # [1:] to skip `self` parameter
                    list(cls._hub_mixin_init_parameters)[1:],
                    args,
                )
            },
            **kwargs,
        }

        # If config passed as dataclass => set it and return early
        if is_dataclass(passed_values.get("config")):
            instance._hub_mixin_config = passed_values["config"]
            return instance

        # Otherwise, build config from default + passed values
        init_config = {
            # default values
            **cls._hub_mixin_jsonable_default_values,
            # passed values
            **{
                key: cls._encode_arg(value)  # Encode custom types as jsonable value
                for key, value in passed_values.items()
                if instance._is_jsonable(value)  # Only if jsonable or we have a custom encoder
            },
        }
        passed_config = init_config.pop("config", {})

        # Populate `init_config` with provided config
        if isinstance(passed_config, dict):
            init_config.update(passed_config)

        # Set `config` attribute and return
        if init_config != {}:
            instance._hub_mixin_config = init_config
        return instance
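
    # --- Editor's note: illustrative sketch, not part of the library source. ---
    # Given the `__new__` logic above, instantiating a mixin subclass with
    # JSON-able arguments auto-populates `_hub_mixin_config` (hypothetical
    # `MyModel` with `hidden_size: int = 512` in its __init__):
    #
    #     model = MyModel(hidden_size=128)
    #     model._hub_mixin_config  # -> {"hidden_size": 128, ...other jsonable defaults}
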
    @classmethod
    def _is_jsonable(cls, value: Any) -> bool:
        """Check if a value is JSON serializable."""
        if is_dataclass(value):
            return True
        if isinstance(value, cls._hub_mixin_jsonable_custom_types):
            return True
        return is_jsonable(value)

    @classmethod
    def _encode_arg(cls, arg: Any) -> Any:
        """Encode an argument into a JSON serializable format."""
        if is_dataclass(arg):
            return asdict(arg)
        for type_, (encoder, _) in cls._hub_mixin_coders.items():
            if isinstance(arg, type_):
                if arg is None:
                    return None
                return encoder(arg)
        return arg

    @classmethod
    def _decode_arg(cls, expected_type: Type[ARGS_T], value: Any) -> Optional[ARGS_T]:
        """Decode a JSON serializable value into an argument."""
        if is_simple_optional_type(expected_type):
            if value is None:
                return None
            expected_type = unwrap_simple_optional_type(expected_type)
        # Dataclass => handle it
        if is_dataclass(expected_type):
            return _load_dataclass(expected_type, value)  # type: ignore[return-value]
        # Otherwise => check custom decoders
        for type_, (_, decoder) in cls._hub_mixin_coders.items():
            if inspect.isclass(expected_type) and issubclass(expected_type, type_):
                return decoder(value)
        # Otherwise => don't decode
        return value
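
    # --- Editor's note: illustrative sketch, not part of the library source. ---
    # Encode/decode round-trip under the defaults (no custom coders registered),
    # assuming a hypothetical user-defined dataclass `Config`:
    #
    #     @dataclass
    #     class Config:
    #         lr: float = 1e-3
    #
    #     MyModel._encode_arg(Config())             # -> {"lr": 0.001}
    #     MyModel._decode_arg(Config, {"lr": 0.1})  # -> Config(lr=0.1)
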
    def save_pretrained(
        self,
        save_directory: Union[str, Path],
        *,
        config: Optional[Union[dict, "DataclassInstance"]] = None,
        repo_id: Optional[str] = None,
        push_to_hub: bool = False,
        model_card_kwargs: Optional[Dict[str, Any]] = None,
        **push_to_hub_kwargs,
    ) -> Optional[str]:
        """
        Save weights in local directory.

        Args:
            save_directory (`str` or `Path`):
                Path to directory in which the model weights and configuration will be saved.
            config (`dict` or `DataclassInstance`, *optional*):
                Model configuration specified as a key/value dictionary or a dataclass instance.
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Huggingface Hub after saving it.
            repo_id (`str`, *optional*):
                ID of your repository on the Hub. Used only if `push_to_hub=True`. Will default to the folder name if
                not provided.
            model_card_kwargs (`Dict[str, Any]`, *optional*):
                Additional arguments passed to the model card template to customize the model card.
            push_to_hub_kwargs:
                Additional keyword arguments passed along to the [`~ModelHubMixin.push_to_hub`] method.
        Returns:
            `str` or `None`: url of the commit on the Hub if `push_to_hub=True`, `None` otherwise.
        """
        save_directory = Path(save_directory)
        save_directory.mkdir(parents=True, exist_ok=True)

        # Remove config.json if already exists. After `_save_pretrained` we don't want to overwrite config.json
        # as it might have been saved by the custom `_save_pretrained` already. However we do want to overwrite
        # an existing config.json if it was not saved by `_save_pretrained`.
        config_path = save_directory / constants.CONFIG_NAME
        config_path.unlink(missing_ok=True)

        # save model weights/files (framework-specific)
        self._save_pretrained(save_directory)

        # save config (if provided and if not serialized yet in `_save_pretrained`)
        if config is None:
            config = self._hub_mixin_config
        if config is not None:
            if is_dataclass(config):
                config = asdict(config)  # type: ignore[arg-type]
            if not config_path.exists():
                config_str = json.dumps(config, sort_keys=True, indent=2)
                config_path.write_text(config_str)

        # save model card
        model_card_path = save_directory / "README.md"
        model_card_kwargs = model_card_kwargs if model_card_kwargs is not None else {}
        if not model_card_path.exists():  # do not overwrite if already exists
            self.generate_model_card(**model_card_kwargs).save(save_directory / "README.md")

        # push to the Hub if required
        if push_to_hub:
            kwargs = push_to_hub_kwargs.copy()  # shallow copy to avoid mutating input
            if config is not None:  # kwarg for `push_to_hub`
                kwargs["config"] = config
            if repo_id is None:
                repo_id = save_directory.name  # Defaults to `save_directory` name
            return self.push_to_hub(repo_id=repo_id, model_card_kwargs=model_card_kwargs, **kwargs)
        return None
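
    # --- Editor's note: illustrative sketch, not part of the library source. ---
    # Typical calls, assuming `model` is built from a ModelHubMixin subclass
    # (paths and repo id below are hypothetical):
    #
    #     model.save_pretrained("./my-model")                # local only
    #     model.save_pretrained(
    #         "./my-model", push_to_hub=True, repo_id="username/my-model"
    #     )                                                  # save + upload in one call
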
    def _save_pretrained(self, save_directory: Path) -> None:
        """
        Overwrite this method in subclass to define how to save your model.
        Check out our [integration guide](../guides/integrations) for instructions.

        Args:
            save_directory (`str` or `Path`):
                Path to directory in which the model weights and configuration will be saved.
        """
        raise NotImplementedError

    @classmethod
    @validate_hf_hub_args
    def from_pretrained(
        cls: Type[T],
        pretrained_model_name_or_path: Union[str, Path],
        *,
        force_download: bool = False,
        resume_download: Optional[bool] = None,
        proxies: Optional[Dict] = None,
        token: Optional[Union[str, bool]] = None,
        cache_dir: Optional[Union[str, Path]] = None,
        local_files_only: bool = False,
        revision: Optional[str] = None,
        **model_kwargs,
    ) -> T:
        """
        Download a model from the Huggingface Hub and instantiate it.

        Args:
            pretrained_model_name_or_path (`str`, `Path`):
                - Either the `model_id` (string) of a model hosted on the Hub, e.g. `bigscience/bloom`.
                - Or a path to a `directory` containing model weights saved using
                    [`~transformers.PreTrainedModel.save_pretrained`], e.g., `../path/to/my_model_directory/`.
            revision (`str`, *optional*):
                Revision of the model on the Hub. Can be a branch name, a git tag or any commit id.
                Defaults to the latest commit on `main` branch.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether to force (re-)downloading the model weights and configuration files from the Hub, overriding
                the existing cache.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on every request.
            token (`str` or `bool`, *optional*):
                The token to use as HTTP bearer authorization for remote files. By default, it will use the token
                cached when running `huggingface-cli login`.
            cache_dir (`str`, `Path`, *optional*):
                Path to the folder where cached files are stored.
            local_files_only (`bool`, *optional*, defaults to `False`):
                If `True`, avoid downloading the file and return the path to the local cached file if it exists.
            model_kwargs (`Dict`, *optional*):
                Additional kwargs to pass to the model during initialization.
        """
        model_id = str(pretrained_model_name_or_path)
        config_file: Optional[str] = None
        if os.path.isdir(model_id):
            if constants.CONFIG_NAME in os.listdir(model_id):
                config_file = os.path.join(model_id, constants.CONFIG_NAME)
            else:
                logger.warning(f"{constants.CONFIG_NAME} not found in {Path(model_id).resolve()}")
        else:
            try:
                config_file = hf_hub_download(
                    repo_id=model_id,
                    filename=constants.CONFIG_NAME,
                    revision=revision,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    token=token,
                    local_files_only=local_files_only,
                )
            except HfHubHTTPError as e:
                logger.info(f"{constants.CONFIG_NAME} not found on the HuggingFace Hub: {str(e)}")

        # Read config
        config = None
        if config_file is not None:
            with open(config_file, "r", encoding="utf-8") as f:
                config = json.load(f)

            # Decode custom types in config
            for key, value in config.items():
                if key in cls._hub_mixin_init_parameters:
                    expected_type = cls._hub_mixin_init_parameters[key].annotation
                    if expected_type is not inspect.Parameter.empty:
                        config[key] = cls._decode_arg(expected_type, value)

            # Populate model_kwargs from config
            for param in cls._hub_mixin_init_parameters.values():
                if param.name not in model_kwargs and param.name in config:
                    model_kwargs[param.name] = config[param.name]

            # Check if `config` argument was passed at init
            if "config" in cls._hub_mixin_init_parameters and "config" not in model_kwargs:
                # Decode `config` argument if it was passed
                config_annotation = cls._hub_mixin_init_parameters["config"].annotation
                config = cls._decode_arg(config_annotation, config)

                # Forward config to model initialization
                model_kwargs["config"] = config

            # Inject config if `**kwargs` are expected
            if is_dataclass(cls):
                for key in cls.__dataclass_fields__:
                    if key not in model_kwargs and key in config:
                        model_kwargs[key] = config[key]
            elif any(param.kind == inspect.Parameter.VAR_KEYWORD for param in cls._hub_mixin_init_parameters.values()):
                for key, value in config.items():
                    if key not in model_kwargs:
                        model_kwargs[key] = value

            # Finally, also inject if `_from_pretrained` expects it
            if cls._hub_mixin_inject_config and "config" not in model_kwargs:
                model_kwargs["config"] = config

        instance = cls._from_pretrained(
            model_id=str(model_id),
            revision=revision,
            cache_dir=cache_dir,
            force_download=force_download,
            proxies=proxies,
            resume_download=resume_download,
            local_files_only=local_files_only,
            token=token,
            **model_kwargs,
        )

        # Implicitly set the config as instance attribute if not already set by the class
        # This way `config` will be available when calling `save_pretrained` or `push_to_hub`.
        if config is not None and (getattr(instance, "_hub_mixin_config", None) in (None, {})):
            instance._hub_mixin_config = config

        return instance
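
    # --- Editor's note: illustrative sketch, not part of the library source. ---
    # `from_pretrained` accepts either a Hub repo id or a local directory
    # (hypothetical names below):
    #
    #     model = MyModel.from_pretrained("username/my-model")  # downloads config + weights
    #     model = MyModel.from_pretrained("./my-model")         # reads the local folder
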
    @classmethod
    def _from_pretrained(
        cls: Type[T],
        *,
        model_id: str,
        revision: Optional[str],
        cache_dir: Optional[Union[str, Path]],
        force_download: bool,
        proxies: Optional[Dict],
        resume_download: Optional[bool],
        local_files_only: bool,
        token: Optional[Union[str, bool]],
        **model_kwargs,
    ) -> T:
        """Overwrite this method in subclass to define how to load your model from pretrained.

        Use [`hf_hub_download`] or [`snapshot_download`] to download files from the Hub before loading them. Most
        args taken as input can be directly passed to those 2 methods. If needed, you can add more arguments to this
        method using "model_kwargs". For example [`PyTorchModelHubMixin._from_pretrained`] takes as input a `map_location`
        parameter to set on which device the model should be loaded.

        Check out our [integration guide](../guides/integrations) for more instructions.

        Args:
            model_id (`str`):
                ID of the model to load from the Huggingface Hub (e.g. `bigscience/bloom`).
            revision (`str`, *optional*):
                Revision of the model on the Hub. Can be a branch name, a git tag or any commit id. Defaults to the
                latest commit on `main` branch.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether to force (re-)downloading the model weights and configuration files from the Hub, overriding
                the existing cache.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint (e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`).
            token (`str` or `bool`, *optional*):
                The token to use as HTTP bearer authorization for remote files. By default, it will use the token
                cached when running `huggingface-cli login`.
            cache_dir (`str`, `Path`, *optional*):
                Path to the folder where cached files are stored.
            local_files_only (`bool`, *optional*, defaults to `False`):
                If `True`, avoid downloading the file and return the path to the local cached file if it exists.
            model_kwargs:
                Additional keyword arguments passed along to the [`~ModelHubMixin._from_pretrained`] method.
        """
        raise NotImplementedError

    @validate_hf_hub_args
    def push_to_hub(
        self,
        repo_id: str,
        *,
        config: Optional[Union[dict, "DataclassInstance"]] = None,
        commit_message: str = "Push model using huggingface_hub.",
        private: Optional[bool] = None,
        token: Optional[str] = None,
        branch: Optional[str] = None,
        create_pr: Optional[bool] = None,
        allow_patterns: Optional[Union[List[str], str]] = None,
        ignore_patterns: Optional[Union[List[str], str]] = None,
        delete_patterns: Optional[Union[List[str], str]] = None,
        model_card_kwargs: Optional[Dict[str, Any]] = None,
    ) -> str:
        """
        Upload model checkpoint to the Hub.

        Use `allow_patterns` and `ignore_patterns` to precisely filter which files should be pushed to the hub. Use
        `delete_patterns` to delete existing remote files in the same commit. See [`upload_folder`] reference for more
        details.

        Args:
            repo_id (`str`):
                ID of the repository to push to (example: `"username/my-model"`).
            config (`dict` or `DataclassInstance`, *optional*):
                Model configuration specified as a key/value dictionary or a dataclass instance.
            commit_message (`str`, *optional*):
                Message to commit while pushing.
            private (`bool`, *optional*):
                Whether the repository created should be private.
                If `None` (default), the repo will be public unless the organization's default is private.
            token (`str`, *optional*):
                The token to use as HTTP bearer authorization for remote files. By default, it will use the token
                cached when running `huggingface-cli login`.
            branch (`str`, *optional*):
                The git branch on which to push the model. This defaults to `"main"`.
            create_pr (`boolean`, *optional*):
                Whether or not to create a Pull Request from `branch` with that commit. Defaults to `False`.
            allow_patterns (`List[str]` or `str`, *optional*):
                If provided, only files matching at least one pattern are pushed.
            ignore_patterns (`List[str]` or `str`, *optional*):
                If provided, files matching any of the patterns are not pushed.
            delete_patterns (`List[str]` or `str`, *optional*):
                If provided, remote files matching any of the patterns will be deleted from the repo.
            model_card_kwargs (`Dict[str, Any]`, *optional*):
                Additional arguments passed to the model card template to customize the model card.

        Returns:
            The url of the commit of your model in the given repository.
        """
        api = HfApi(token=token)
        repo_id = api.create_repo(repo_id=repo_id, private=private, exist_ok=True).repo_id

        # Push the files to the repo in a single commit
        with SoftTemporaryDirectory() as tmp:
            saved_path = Path(tmp) / repo_id
            self.save_pretrained(saved_path, config=config, model_card_kwargs=model_card_kwargs)
            return api.upload_folder(
                repo_id=repo_id,
                repo_type="model",
                folder_path=saved_path,
                commit_message=commit_message,
                revision=branch,
                create_pr=create_pr,
                allow_patterns=allow_patterns,
                ignore_patterns=ignore_patterns,
                delete_patterns=delete_patterns,
            )

    def generate_model_card(self, *args, **kwargs) -> ModelCard:
        card = ModelCard.from_template(
            card_data=self._hub_mixin_info.model_card_data,
            template_str=self._hub_mixin_info.model_card_template,
            repo_url=self._hub_mixin_info.repo_url,
            paper_url=self._hub_mixin_info.paper_url,
            docs_url=self._hub_mixin_info.docs_url,
            **kwargs,
        )
        return card
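
# --- Editor's note: illustrative sketch, not part of the library source. ---
# `push_to_hub` saves into a temporary directory and uploads everything in a
# single commit. Pattern filters can trim the payload (hypothetical repo id):
#
#     url = model.push_to_hub(
#         "username/my-model",
#         commit_message="Upload fine-tuned weights",
#         ignore_patterns=["*.tmp"],
#     )
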
class PyTorchModelHubMixin(ModelHubMixin):
    """
    Implementation of [`ModelHubMixin`] to provide model Hub upload/download capabilities to PyTorch models. The model
    is set in evaluation mode by default using `model.eval()` (dropout modules are deactivated). To train the model,
    you should first set it back in training mode with `model.train()`.

    See [`ModelHubMixin`] for more details on how to use the mixin.

    Example:

    ```python
    >>> import torch
    >>> import torch.nn as nn
    >>> from huggingface_hub import PyTorchModelHubMixin

    >>> class MyModel(
    ...         nn.Module,
    ...         PyTorchModelHubMixin,
    ...         library_name="keras-nlp",
    ...         repo_url="https://github.com/keras-team/keras-nlp",
    ...         paper_url="https://arxiv.org/abs/2304.12244",
    ...         docs_url="https://keras.io/keras_nlp/",
    ...         # ^ optional metadata to generate model card
    ...     ):
    ...     def __init__(self, hidden_size: int = 512, vocab_size: int = 30000, output_size: int = 4):
    ...         super().__init__()
    ...         self.param = nn.Parameter(torch.rand(hidden_size, vocab_size))
    ...         self.linear = nn.Linear(output_size, vocab_size)

    ...     def forward(self, x):
    ...         return self.linear(x + self.param)

    >>> model = MyModel(hidden_size=256)

    # Save model weights to local directory
    >>> model.save_pretrained("my-awesome-model")

    # Push model weights to the Hub
    >>> model.push_to_hub("my-awesome-model")

    # Download and initialize weights from the Hub
    >>> model = MyModel.from_pretrained("username/my-awesome-model")
    >>> model.hidden_size
    256
    ```
    """

    def __init_subclass__(cls, *args, tags: Optional[List[str]] = None, **kwargs) -> None:
        tags = tags or []
        tags.append("pytorch_model_hub_mixin")
        kwargs["tags"] = tags
        return super().__init_subclass__(*args, **kwargs)

    def _save_pretrained(self, save_directory: Path) -> None:
        """Save weights from a Pytorch model to a local directory."""
        model_to_save = self.module if hasattr(self, "module") else self  # type: ignore
        save_model_as_safetensor(model_to_save, str(save_directory / constants.SAFETENSORS_SINGLE_FILE))

    @classmethod
    def _from_pretrained(
        cls,
        *,
        model_id: str,
        revision: Optional[str],
        cache_dir: Optional[Union[str, Path]],
        force_download: bool,
        proxies: Optional[Dict],
        resume_download: Optional[bool],
        local_files_only: bool,
        token: Union[str, bool, None],
        map_location: str = "cpu",
        strict: bool = False,
        **model_kwargs,
    ):
        """Load Pytorch pretrained weights and return the loaded model."""
        model = cls(**model_kwargs)
        if os.path.isdir(model_id):
            print("Loading weights from local directory")
            model_file = os.path.join(model_id, constants.SAFETENSORS_SINGLE_FILE)
            return cls._load_as_safetensor(model, model_file, map_location, strict)
        else:
            try:
                model_file = hf_hub_download(
                    repo_id=model_id,
                    filename=constants.SAFETENSORS_SINGLE_FILE,
                    revision=revision,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    token=token,
                    local_files_only=local_files_only,
                )
                return cls._load_as_safetensor(model, model_file, map_location, strict)
            except EntryNotFoundError:
                model_file = hf_hub_download(
                    repo_id=model_id,
                    filename=constants.PYTORCH_WEIGHTS_NAME,
                    revision=revision,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    token=token,
                    local_files_only=local_files_only,
                )
                return cls._load_as_pickle(model, model_file, map_location, strict)

    @classmethod
    def _load_as_pickle(cls, model: T, model_file: str, map_location: str, strict: bool) -> T:
        state_dict = torch.load(model_file, map_location=torch.device(map_location), weights_only=True)
        model.load_state_dict(state_dict, strict=strict)  # type: ignore
        model.eval()  # type: ignore
        return model

    @classmethod
    def _load_as_safetensor(cls, model: T, model_file: str, map_location: str, strict: bool) -> T:
        if packaging.version.parse(safetensors.__version__) < packaging.version.parse("0.4.3"):  # type: ignore [attr-defined]
            load_model_as_safetensor(model, model_file, strict=strict)  # type: ignore [arg-type]
            if map_location != "cpu":
                logger.warning(
                    "Loading model weights on other devices than 'cpu' is not supported natively in your version of safetensors."
                    " This means that the model is loaded on 'cpu' first and then copied to the device."
                    " This leads to a slower loading time."
                    " Please update safetensors to version 0.4.3 or above for improved performance."
                )
                model.to(map_location)  # type: ignore [attr-defined]
        else:
            safetensors.torch.load_model(model, model_file, strict=strict, device=map_location)  # type: ignore [arg-type]
        return model


def _load_dataclass(datacls: Type["DataclassInstance"], data: dict) -> "DataclassInstance":
    """Load a dataclass instance from a dictionary.

    Fields not expected by the dataclass are ignored.
    """
    return datacls(**{k: v for k, v in data.items() if k in datacls.__dataclass_fields__})
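
# --- Editor's note: illustrative sketch, not part of the library source. ---
# `_load_dataclass` silently drops unknown keys, which keeps older configs
# loadable after fields are removed (hypothetical `TrainCfg` dataclass):
#
#     @dataclass
#     class TrainCfg:
#         lr: float = 1e-3
#
#     _load_dataclass(TrainCfg, {"lr": 0.1, "removed_field": 1})  # -> TrainCfg(lr=0.1)
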
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,422 @@
# coding=utf-8
# Copyright 2023-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains utilities used by both the sync and async inference clients."""

import base64
import io
import json
import logging
from contextlib import contextmanager
from dataclasses import dataclass
from pathlib import Path
from typing import (
    TYPE_CHECKING,
    Any,
    AsyncIterable,
    BinaryIO,
    ContextManager,
    Dict,
    Generator,
    Iterable,
    List,
    Literal,
    NoReturn,
    Optional,
    Union,
    overload,
)

from requests import HTTPError

from huggingface_hub.errors import (
    GenerationError,
    IncompleteGenerationError,
    OverloadedError,
    TextGenerationError,
    UnknownError,
    ValidationError,
)

from ..utils import get_session, is_aiohttp_available, is_numpy_available, is_pillow_available
from ._generated.types import ChatCompletionStreamOutput, TextGenerationStreamOutput


if TYPE_CHECKING:
    from aiohttp import ClientResponse, ClientSession
    from PIL.Image import Image

# TYPES
UrlT = str
PathT = Union[str, Path]
BinaryT = Union[bytes, BinaryIO]
ContentT = Union[BinaryT, PathT, UrlT]

# Used to set an `Accept: image/png` header
TASKS_EXPECTING_IMAGES = {"text-to-image", "image-to-image"}

logger = logging.getLogger(__name__)


@dataclass
class RequestParameters:
    url: str
    task: str
    model: Optional[str]
    json: Optional[Union[str, Dict, List]]
    data: Optional[ContentT]
    headers: Dict[str, Any]


# Add dataclass for ModelStatus. We use this dataclass in the get_model_status function.
@dataclass
class ModelStatus:
    """
    This Dataclass represents the model status in the HF Inference API.

    Args:
        loaded (`bool`):
            If the model is currently loaded into HF's Inference API. Models
            are loaded on-demand, leading to the user's first request taking longer.
            If a model is loaded, you can be assured that it is in a healthy state.
        state (`str`):
            The current state of the model. This can be 'Loaded', 'Loadable', 'TooBig'.
            If a model's state is 'Loadable', it's not too big and has a supported
            backend. Loadable models are automatically loaded when the user first
            requests inference on the endpoint. This means it is transparent for the
            user to load a model, except that the first call takes longer to complete.
        compute_type (`Dict`):
            Information about the compute resource the model is using or will use, such as 'gpu' type and number of
            replicas.
        framework (`str`):
            The name of the framework that the model was built with, such as 'transformers'
            or 'text-generation-inference'.
    """

    loaded: bool
    state: str
    compute_type: Dict
    framework: str


## IMPORT UTILS


def _import_aiohttp():
    # Make sure `aiohttp` is installed on the machine.
    if not is_aiohttp_available():
        raise ImportError("Please install aiohttp to use `AsyncInferenceClient` (`pip install aiohttp`).")
    import aiohttp

    return aiohttp


def _import_numpy():
    """Make sure `numpy` is installed on the machine."""
    if not is_numpy_available():
        raise ImportError("Please install numpy to deal with embeddings (`pip install numpy`).")
    import numpy

    return numpy


def _import_pil_image():
    """Make sure `PIL` is installed on the machine."""
    if not is_pillow_available():
        raise ImportError(
            "Please install Pillow to deal with images (`pip install Pillow`). If you don't want the image to be"
            " post-processed, use `client.post(...)` and get the raw response from the server."
        )
    from PIL import Image

    return Image


## ENCODING / DECODING UTILS


@overload
def _open_as_binary(
    content: ContentT,
) -> ContextManager[BinaryT]: ...  # means "if input is not None, output is not None"


@overload
def _open_as_binary(
    content: Literal[None],
) -> ContextManager[Literal[None]]: ...  # means "if input is None, output is None"


@contextmanager  # type: ignore
def _open_as_binary(content: Optional[ContentT]) -> Generator[Optional[BinaryT], None, None]:
    """Open `content` as a binary file, either from a URL, a local path, or raw bytes.

    Do nothing if `content` is None.

    TODO: handle a PIL.Image as input
    TODO: handle base64 as input
    """
    # If content is a string => must be either a URL or a path
    if isinstance(content, str):
        if content.startswith("https://") or content.startswith("http://"):
            logger.debug(f"Downloading content from {content}")
            yield get_session().get(content).content  # TODO: retrieve as stream and pipe to post request?
            return
        content = Path(content)
        if not content.exists():
            raise FileNotFoundError(
                f"File not found at {content}. If `data` is a string, it must either be a URL or a path to a local"
                " file. To pass raw content, please encode it as bytes first."
            )

    # If content is a Path => open it
    if isinstance(content, Path):
        logger.debug(f"Opening content from {content}")
        with content.open("rb") as f:
            yield f
    else:
        # Otherwise: already a file-like object or None
        yield content
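
# --- Editor's note: illustrative sketch, not part of the library source. ---
# `_open_as_binary` normalizes all accepted content forms to something binary
# (URL and path below are hypothetical):
#
#     with _open_as_binary("https://example.com/cat.png") as data: ...  # bytes (downloaded)
#     with _open_as_binary("./cat.png") as data: ...                    # opened file object
#     with _open_as_binary(b"\x89PNG...") as data: ...                  # passed through as-is
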
def _b64_encode(content: ContentT) -> str:
    """Encode a raw file (image, audio) into base64. Can be bytes, an opened file, a path or a URL."""
    with _open_as_binary(content) as data:
        data_as_bytes = data if isinstance(data, bytes) else data.read()
        return base64.b64encode(data_as_bytes).decode()


def _b64_to_image(encoded_image: str) -> "Image":
    """Parse a base64-encoded string into a PIL Image."""
    Image = _import_pil_image()
    return Image.open(io.BytesIO(base64.b64decode(encoded_image)))


def _bytes_to_list(content: bytes) -> List:
    """Parse bytes from a Response object into a Python list.

    Expects the response body to be JSON-encoded data.

    NOTE: This is exactly the same implementation as `_bytes_to_dict` and will not complain if the returned data is a
    dictionary. The only advantage of having both is to help the user (and mypy) understand what kind of data to expect.
    """
    return json.loads(content.decode())


def _bytes_to_dict(content: bytes) -> Dict:
    """Parse bytes from a Response object into a Python dictionary.

    Expects the response body to be JSON-encoded data.

    NOTE: This is exactly the same implementation as `_bytes_to_list` and will not complain if the returned data is a
    list. The only advantage of having both is to help the user (and mypy) understand what kind of data to expect.
    """
    return json.loads(content.decode())


def _bytes_to_image(content: bytes) -> "Image":
    """Parse bytes from a Response object into a PIL Image.

    Expects the response body to be raw bytes. To deal with b64 encoded images, use `_b64_to_image` instead.
    """
    Image = _import_pil_image()
    return Image.open(io.BytesIO(content))


def _as_dict(response: Union[bytes, Dict]) -> Dict:
    return json.loads(response) if isinstance(response, bytes) else response


## PAYLOAD UTILS


## STREAMING UTILS


def _stream_text_generation_response(
    bytes_output_as_lines: Iterable[bytes], details: bool
) -> Union[Iterable[str], Iterable[TextGenerationStreamOutput]]:
    """Used in `InferenceClient.text_generation`."""
    # Parse ServerSentEvents
    for byte_payload in bytes_output_as_lines:
        try:
            output = _format_text_generation_stream_output(byte_payload, details)
        except StopIteration:
            break
        if output is not None:
            yield output


async def _async_stream_text_generation_response(
    bytes_output_as_lines: AsyncIterable[bytes], details: bool
) -> Union[AsyncIterable[str], AsyncIterable[TextGenerationStreamOutput]]:
    """Used in `AsyncInferenceClient.text_generation`."""
    # Parse ServerSentEvents
    async for byte_payload in bytes_output_as_lines:
        try:
            output = _format_text_generation_stream_output(byte_payload, details)
        except StopIteration:
            break
        if output is not None:
            yield output


def _format_text_generation_stream_output(
    byte_payload: bytes, details: bool
) -> Optional[Union[str, TextGenerationStreamOutput]]:
    if not byte_payload.startswith(b"data:"):
        return None  # empty line

    if byte_payload.strip() == b"data: [DONE]":
        raise StopIteration("[DONE] signal received.")

    # Decode payload
    payload = byte_payload.decode("utf-8")
    json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))

    # Either an error is returned...
    if json_payload.get("error") is not None:
        raise _parse_text_generation_error(json_payload["error"], json_payload.get("error_type"))

    # ...or parse the token payload
    output = TextGenerationStreamOutput.parse_obj_as_instance(json_payload)
    return output.token.text if not details else output
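
# --- Editor's note: illustrative sketch, not part of the library source. ---
# Each server-sent event line looks like `data: {...json...}`; the formatter
# above strips the prefix, JSON-decodes the rest, and surfaces errors or tokens.
# Roughly (payload values are hypothetical):
#
#     line = b'data: {"index": 0, "token": {"id": 42, "logprob": -0.1, "special": false, "text": "Hello"}, "details": null}'
#     _format_text_generation_stream_output(line, details=False)  # -> "Hello"
#     _format_text_generation_stream_output(b"", details=False)   # -> None (keep-alive line)
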
def _stream_chat_completion_response(
    bytes_lines: Iterable[bytes],
) -> Iterable[ChatCompletionStreamOutput]:
    """Used in `InferenceClient.chat_completion` if model is served with TGI."""
    for item in bytes_lines:
        try:
            output = _format_chat_completion_stream_output(item)
        except StopIteration:
            break
        if output is not None:
            yield output


async def _async_stream_chat_completion_response(
    bytes_lines: AsyncIterable[bytes],
) -> AsyncIterable[ChatCompletionStreamOutput]:
    """Used in `AsyncInferenceClient.chat_completion`."""
    async for item in bytes_lines:
        try:
            output = _format_chat_completion_stream_output(item)
        except StopIteration:
            break
        if output is not None:
            yield output


def _format_chat_completion_stream_output(
    byte_payload: bytes,
) -> Optional[ChatCompletionStreamOutput]:
    if not byte_payload.startswith(b"data:"):
        return None  # empty line

    if byte_payload.strip() == b"data: [DONE]":
        raise StopIteration("[DONE] signal received.")

    # Decode payload
    payload = byte_payload.decode("utf-8")
    json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))

    # Either an error is returned...
    if json_payload.get("error") is not None:
        raise _parse_text_generation_error(json_payload["error"], json_payload.get("error_type"))

    # ...or parse the token payload
    return ChatCompletionStreamOutput.parse_obj_as_instance(json_payload)


async def _async_yield_from(client: "ClientSession", response: "ClientResponse") -> AsyncIterable[bytes]:
    async for byte_payload in response.content:
        yield byte_payload.strip()
    await client.close()


# "TGI servers" are servers running with the `text-generation-inference` backend.
# This backend is the go-to solution to run large language models at scale. However,
# for some smaller models (e.g. "gpt2") the default `transformers` + `api-inference`
# solution is still in use.
#
# Both approaches have very similar APIs, but not exactly the same. What we do first in
# the `text_generation` method is to assume the model is served via TGI. If we realize
# it's not the case (i.e. we receive an HTTP 400 Bad Request), we fall back to the
# default API with a warning message. When that's the case, we remember the unsupported
# attributes for this model in the `_UNSUPPORTED_TEXT_GENERATION_KWARGS` global variable.
#
# In addition, TGI servers have a built-in API route for chat-completion, which is not
# available on the default API. We use this route to provide a more consistent behavior
# when available.
#
# For more details, see https://github.com/huggingface/text-generation-inference and
# https://huggingface.co/docs/api-inference/detailed_parameters#text-generation-task.

_UNSUPPORTED_TEXT_GENERATION_KWARGS: Dict[Optional[str], List[str]] = {}


def _set_unsupported_text_generation_kwargs(model: Optional[str], unsupported_kwargs: List[str]) -> None:
    _UNSUPPORTED_TEXT_GENERATION_KWARGS.setdefault(model, []).extend(unsupported_kwargs)


def _get_unsupported_text_generation_kwargs(model: Optional[str]) -> List[str]:
    return _UNSUPPORTED_TEXT_GENERATION_KWARGS.get(model, [])


# TEXT GENERATION ERRORS
# ----------------------
# Text-generation errors are parsed separately to handle as much as possible the errors returned by the text generation
# inference project (https://github.com/huggingface/text-generation-inference).
# ----------------------


def raise_text_generation_error(http_error: HTTPError) -> NoReturn:
    """
    Try to parse a text-generation-inference error message and raise an HTTPError in any case.

    Args:
        http_error (`HTTPError`):
            The HTTPError that has been raised.
    """
    # Try to parse a Text Generation Inference error

    try:
        # Hacky way to retrieve payload in case of aiohttp error
        payload = getattr(http_error, "response_error_payload", None) or http_error.response.json()
        error = payload.get("error")
        error_type = payload.get("error_type")
    except Exception:  # no payload
        raise http_error

    # If error_type => more information than `hf_raise_for_status`
    if error_type is not None:
        exception = _parse_text_generation_error(error, error_type)
        raise exception from http_error

    # Otherwise, fallback to default error
    raise http_error


def _parse_text_generation_error(error: Optional[str], error_type: Optional[str]) -> TextGenerationError:
    if error_type == "generation":
        return GenerationError(error)  # type: ignore
    if error_type == "incomplete_generation":
        return IncompleteGenerationError(error)  # type: ignore
    if error_type == "overloaded":
        return OverloadedError(error)  # type: ignore
    if error_type == "validation":
        return ValidationError(error)  # type: ignore
    return UnknownError(error)  # type: ignore
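
# --- Editor's note: illustrative sketch, not part of the library source. ---
# The error_type -> exception mapping above, at a glance (note the parser
# returns exception instances; the caller decides when to raise):
#
#     _parse_text_generation_error("boom", "overloaded")  # -> OverloadedError("boom")
#     _parse_text_generation_error("boom", "validation")  # -> ValidationError("boom")
#     _parse_text_generation_error("boom", None)          # -> UnknownError("boom")
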
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,188 @@
# This file is auto-generated by `utils/generate_inference_types.py`.
# Do not modify it manually.
#
# ruff: noqa: F401

from .audio_classification import (
    AudioClassificationInput,
    AudioClassificationOutputElement,
    AudioClassificationOutputTransform,
    AudioClassificationParameters,
)
from .audio_to_audio import AudioToAudioInput, AudioToAudioOutputElement
from .automatic_speech_recognition import (
    AutomaticSpeechRecognitionEarlyStoppingEnum,
    AutomaticSpeechRecognitionGenerationParameters,
    AutomaticSpeechRecognitionInput,
    AutomaticSpeechRecognitionOutput,
    AutomaticSpeechRecognitionOutputChunk,
    AutomaticSpeechRecognitionParameters,
)
from .base import BaseInferenceType
from .chat_completion import (
    ChatCompletionInput,
    ChatCompletionInputFunctionDefinition,
    ChatCompletionInputFunctionName,
    ChatCompletionInputGrammarType,
    ChatCompletionInputGrammarTypeType,
    ChatCompletionInputMessage,
    ChatCompletionInputMessageChunk,
    ChatCompletionInputMessageChunkType,
    ChatCompletionInputStreamOptions,
    ChatCompletionInputTool,
    ChatCompletionInputToolCall,
    ChatCompletionInputToolChoiceClass,
    ChatCompletionInputToolChoiceEnum,
    ChatCompletionInputURL,
    ChatCompletionOutput,
    ChatCompletionOutputComplete,
    ChatCompletionOutputFunctionDefinition,
    ChatCompletionOutputLogprob,
    ChatCompletionOutputLogprobs,
    ChatCompletionOutputMessage,
    ChatCompletionOutputToolCall,
    ChatCompletionOutputTopLogprob,
    ChatCompletionOutputUsage,
    ChatCompletionStreamOutput,
    ChatCompletionStreamOutputChoice,
    ChatCompletionStreamOutputDelta,
    ChatCompletionStreamOutputDeltaToolCall,
    ChatCompletionStreamOutputFunction,
    ChatCompletionStreamOutputLogprob,
    ChatCompletionStreamOutputLogprobs,
    ChatCompletionStreamOutputTopLogprob,
    ChatCompletionStreamOutputUsage,
)
from .depth_estimation import DepthEstimationInput, DepthEstimationOutput
from .document_question_answering import (
    DocumentQuestionAnsweringInput,
    DocumentQuestionAnsweringInputData,
    DocumentQuestionAnsweringOutputElement,
    DocumentQuestionAnsweringParameters,
)
from .feature_extraction import FeatureExtractionInput, FeatureExtractionInputTruncationDirection
from .fill_mask import FillMaskInput, FillMaskOutputElement, FillMaskParameters
from .image_classification import (
    ImageClassificationInput,
    ImageClassificationOutputElement,
    ImageClassificationOutputTransform,
    ImageClassificationParameters,
)
from .image_segmentation import (
    ImageSegmentationInput,
    ImageSegmentationOutputElement,
    ImageSegmentationParameters,
    ImageSegmentationSubtask,
)
from .image_to_image import ImageToImageInput, ImageToImageOutput, ImageToImageParameters, ImageToImageTargetSize
from .image_to_text import (
    ImageToTextEarlyStoppingEnum,
    ImageToTextGenerationParameters,
    ImageToTextInput,
    ImageToTextOutput,
    ImageToTextParameters,
)
from .object_detection import (
    ObjectDetectionBoundingBox,
    ObjectDetectionInput,
    ObjectDetectionOutputElement,
    ObjectDetectionParameters,
)
from .question_answering import (
    QuestionAnsweringInput,
    QuestionAnsweringInputData,
    QuestionAnsweringOutputElement,
    QuestionAnsweringParameters,
)
from .sentence_similarity import SentenceSimilarityInput, SentenceSimilarityInputData
from .summarization import (
    SummarizationInput,
    SummarizationOutput,
    SummarizationParameters,
    SummarizationTruncationStrategy,
)
from .table_question_answering import (
    Padding,
    TableQuestionAnsweringInput,
    TableQuestionAnsweringInputData,
    TableQuestionAnsweringOutputElement,
    TableQuestionAnsweringParameters,
)
from .text2text_generation import (
    Text2TextGenerationInput,
    Text2TextGenerationOutput,
    Text2TextGenerationParameters,
    Text2TextGenerationTruncationStrategy,
)
from .text_classification import (
    TextClassificationInput,
    TextClassificationOutputElement,
    TextClassificationOutputTransform,
    TextClassificationParameters,
)
from .text_generation import (
    TextGenerationInput,
    TextGenerationInputGenerateParameters,
    TextGenerationInputGrammarType,
    TextGenerationOutput,
    TextGenerationOutputBestOfSequence,
    TextGenerationOutputDetails,
    TextGenerationOutputFinishReason,
    TextGenerationOutputPrefillToken,
    TextGenerationOutputToken,
    TextGenerationStreamOutput,
    TextGenerationStreamOutputStreamDetails,
    TextGenerationStreamOutputToken,
    TypeEnum,
)
from .text_to_audio import (
    TextToAudioEarlyStoppingEnum,
    TextToAudioGenerationParameters,
    TextToAudioInput,
    TextToAudioOutput,
    TextToAudioParameters,
)
from .text_to_image import TextToImageInput, TextToImageOutput, TextToImageParameters
from .text_to_speech import (
    TextToSpeechEarlyStoppingEnum,
    TextToSpeechGenerationParameters,
    TextToSpeechInput,
    TextToSpeechOutput,
    TextToSpeechParameters,
)
from .text_to_video import TextToVideoInput, TextToVideoOutput, TextToVideoParameters
from .token_classification import (
    TokenClassificationAggregationStrategy,
    TokenClassificationInput,
    TokenClassificationOutputElement,
    TokenClassificationParameters,
)
from .translation import TranslationInput, TranslationOutput, TranslationParameters, TranslationTruncationStrategy
from .video_classification import (
    VideoClassificationInput,
    VideoClassificationOutputElement,
    VideoClassificationOutputTransform,
    VideoClassificationParameters,
)
from .visual_question_answering import (
    VisualQuestionAnsweringInput,
    VisualQuestionAnsweringInputData,
    VisualQuestionAnsweringOutputElement,
    VisualQuestionAnsweringParameters,
)
from .zero_shot_classification import (
    ZeroShotClassificationInput,
    ZeroShotClassificationOutputElement,
    ZeroShotClassificationParameters,
)
from .zero_shot_image_classification import (
    ZeroShotImageClassificationInput,
    ZeroShotImageClassificationOutputElement,
    ZeroShotImageClassificationParameters,
)
from .zero_shot_object_detection import (
    ZeroShotObjectDetectionBoundingBox,
    ZeroShotObjectDetectionInput,
    ZeroShotObjectDetectionOutputElement,
    ZeroShotObjectDetectionParameters,
)
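
# --- Editor's note: illustrative sketch, not part of the library source. ---
# Downstream code imports these generated types from this package, e.g. the
# streaming helpers in _common.py shown earlier:
#
#     from huggingface_hub.inference._generated.types import ChatCompletionStreamOutput
#     chunk = ChatCompletionStreamOutput.parse_obj_as_instance(json_payload)
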
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff