structure saas with tools
@@ -0,0 +1,896 @@
"""
Type definitions and utilities for the `create_commit` API
"""

import base64
import io
import math
import os
import warnings
from collections import defaultdict
from contextlib import contextmanager
from dataclasses import dataclass, field
from itertools import groupby
from pathlib import Path, PurePosixPath
from typing import TYPE_CHECKING, Any, BinaryIO, Dict, Iterable, Iterator, List, Literal, Optional, Tuple, Union

from tqdm.contrib.concurrent import thread_map

from . import constants
from .errors import EntryNotFoundError, HfHubHTTPError, XetAuthorizationError, XetRefreshTokenError
from .file_download import hf_hub_url
from .lfs import UploadInfo, lfs_upload, post_lfs_batch_info
from .utils import (
    FORBIDDEN_FOLDERS,
    XetTokenType,
    chunk_iterable,
    fetch_xet_connection_info_from_repo_info,
    get_session,
    hf_raise_for_status,
    logging,
    sha,
    tqdm_stream_file,
    validate_hf_hub_args,
)
from .utils import tqdm as hf_tqdm
from .utils.tqdm import _get_progress_bar_context


if TYPE_CHECKING:
    from .hf_api import RepoFile


logger = logging.get_logger(__name__)


UploadMode = Literal["lfs", "regular"]

# Max is 1,000 per request on the Hub for HfApi.get_paths_info
# Otherwise we get:
# HfHubHTTPError: 413 Client Error: Payload Too Large for url: https://huggingface.co/api/datasets/xxx (Request ID: xxx)\n\ntoo many parameters
# See https://github.com/huggingface/huggingface_hub/issues/1503
FETCH_LFS_BATCH_SIZE = 500

UPLOAD_BATCH_MAX_NUM_FILES = 256

@dataclass
class CommitOperationDelete:
    """
    Data structure holding necessary info to delete a file or a folder from a repository
    on the Hub.

    Args:
        path_in_repo (`str`):
            Relative filepath in the repo, for example: `"checkpoints/1fec34a/weights.bin"`
            for a file or `"checkpoints/1fec34a/"` for a folder.
        is_folder (`bool` or `Literal["auto"]`, *optional*):
            Whether the delete operation applies to a folder or not. If "auto", the path
            type (file or folder) is guessed automatically by checking whether the path
            ends with a "/" (folder) or not (file). To explicitly set the path type, you
            can set `is_folder=True` or `is_folder=False`.
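
    Example (a minimal usage sketch; the repo paths are placeholders):

    ```python
    >>> from huggingface_hub import CommitOperationDelete

    >>> # Delete a single file
    >>> operation = CommitOperationDelete(path_in_repo="checkpoints/1fec34a/weights.bin")
    >>> operation.is_folder
    False

    >>> # Delete a folder: the trailing "/" makes `is_folder` resolve to `True`
    >>> operation = CommitOperationDelete(path_in_repo="checkpoints/1fec34a/")
    >>> operation.is_folder
    True
    ```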
    """

    path_in_repo: str
    is_folder: Union[bool, Literal["auto"]] = "auto"

    def __post_init__(self):
        self.path_in_repo = _validate_path_in_repo(self.path_in_repo)

        if self.is_folder == "auto":
            self.is_folder = self.path_in_repo.endswith("/")
        if not isinstance(self.is_folder, bool):
            raise ValueError(
                f"Wrong value for `is_folder`. Must be one of [`True`, `False`, `'auto'`]. Got '{self.is_folder}'."
            )


@dataclass
class CommitOperationCopy:
    """
    Data structure holding necessary info to copy a file in a repository on the Hub.

    Limitations:
        - Only LFS files can be copied. To copy a regular file, you need to download it locally and re-upload it.
        - Cross-repository copies are not supported.

    Note: you can combine a [`CommitOperationCopy`] and a [`CommitOperationDelete`] to rename an LFS file on the Hub.

    Args:
        src_path_in_repo (`str`):
            Relative filepath in the repo of the file to be copied, e.g. `"checkpoints/1fec34a/weights.bin"`.
        path_in_repo (`str`):
            Relative filepath in the repo where to copy the file, e.g. `"checkpoints/1fec34a/weights_copy.bin"`.
        src_revision (`str`, *optional*):
            The git revision of the file to be copied. Can be any valid git revision.
            Defaults to the target commit revision.
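
    Example (a hedged sketch of the rename pattern mentioned above; paths are placeholders):

    ```python
    >>> from huggingface_hub import CommitOperationCopy, CommitOperationDelete

    >>> # "Rename" an LFS file by copying it to the new path and deleting the old one
    >>> operations = [
    ...     CommitOperationCopy(
    ...         src_path_in_repo="checkpoints/1fec34a/weights.bin",
    ...         path_in_repo="checkpoints/1fec34a/weights_final.bin",
    ...     ),
    ...     CommitOperationDelete(path_in_repo="checkpoints/1fec34a/weights.bin"),
    ... ]
    ```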
    """

    src_path_in_repo: str
    path_in_repo: str
    src_revision: Optional[str] = None
    # set to the OID of the file to be copied if it has already been uploaded
    # useful to determine if a commit will be empty or not.
    _src_oid: Optional[str] = None
    # set to the OID of the file to copy to if it has already been uploaded
    # useful to determine if a commit will be empty or not.
    _dest_oid: Optional[str] = None

    def __post_init__(self):
        self.src_path_in_repo = _validate_path_in_repo(self.src_path_in_repo)
        self.path_in_repo = _validate_path_in_repo(self.path_in_repo)


@dataclass
class CommitOperationAdd:
    """
    Data structure holding necessary info to upload a file to a repository on the Hub.

    Args:
        path_in_repo (`str`):
            Relative filepath in the repo, for example: `"checkpoints/1fec34a/weights.bin"`
        path_or_fileobj (`str`, `Path`, `bytes`, or `BinaryIO`):
            Either:
            - a path to a local file (as `str` or `pathlib.Path`) to upload
            - a buffer of bytes (`bytes`) holding the content of the file to upload
            - a "file object" (subclass of `io.BufferedIOBase`), typically obtained
                with `open(path, "rb")`. It must support `seek()` and `tell()` methods.

    Raises:
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If `path_or_fileobj` is not one of `str`, `Path`, `bytes` or `io.BufferedIOBase`.
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If `path_or_fileobj` is a `str` or `Path` but not a path to an existing file.
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If `path_or_fileobj` is an `io.BufferedIOBase` but it doesn't support both
            `seek()` and `tell()`.
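
    Example (an illustrative sketch; local paths are placeholders and must exist on disk):

    ```python
    >>> from huggingface_hub import CommitOperationAdd

    >>> # From a local file path
    >>> CommitOperationAdd(path_in_repo="weights.bin", path_or_fileobj="./local/weights.bin")

    >>> # From in-memory bytes
    >>> CommitOperationAdd(path_in_repo="data.txt", path_or_fileobj=b"hello world")

    >>> # From a binary file object
    >>> with open("./local/weights.bin", "rb") as f:
    ...     operation = CommitOperationAdd(path_in_repo="weights.bin", path_or_fileobj=f)
    ```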
    """

    path_in_repo: str
    path_or_fileobj: Union[str, Path, bytes, BinaryIO]
    upload_info: UploadInfo = field(init=False, repr=False)

    # Internal attributes

    # set to "lfs" or "regular" once known
    _upload_mode: Optional[UploadMode] = field(init=False, repr=False, default=None)

    # set to True if .gitignore rules prevent the file from being uploaded as LFS
    # (server-side check)
    _should_ignore: Optional[bool] = field(init=False, repr=False, default=None)

    # set to the remote OID of the file if it has already been uploaded
    # useful to determine if a commit will be empty or not
    _remote_oid: Optional[str] = field(init=False, repr=False, default=None)

    # set to True once the file has been uploaded as LFS
    _is_uploaded: bool = field(init=False, repr=False, default=False)

    # set to True once the file has been committed
    _is_committed: bool = field(init=False, repr=False, default=False)

    def __post_init__(self) -> None:
        """Validates `path_or_fileobj` and computes `upload_info`."""
        self.path_in_repo = _validate_path_in_repo(self.path_in_repo)

        # Validate `path_or_fileobj` value
        if isinstance(self.path_or_fileobj, Path):
            self.path_or_fileobj = str(self.path_or_fileobj)
        if isinstance(self.path_or_fileobj, str):
            path_or_fileobj = os.path.normpath(os.path.expanduser(self.path_or_fileobj))
            if not os.path.isfile(path_or_fileobj):
                raise ValueError(f"Provided path: '{path_or_fileobj}' is not a file on the local file system")
        elif not isinstance(self.path_or_fileobj, (io.BufferedIOBase, bytes)):
            # ^^ Inspired from: https://stackoverflow.com/questions/44584829/how-to-determine-if-file-is-opened-in-binary-or-text-mode
            raise ValueError(
                "path_or_fileobj must be either an instance of str, bytes or"
                " io.BufferedIOBase. If you passed a file-like object, make sure it is"
                " in binary mode."
            )
        if isinstance(self.path_or_fileobj, io.BufferedIOBase):
            try:
                self.path_or_fileobj.tell()
                self.path_or_fileobj.seek(0, os.SEEK_CUR)
            except (OSError, AttributeError) as exc:
                raise ValueError(
                    "path_or_fileobj is a file-like object but does not implement seek() and tell()"
                ) from exc

        # Compute "upload_info" attribute
        if isinstance(self.path_or_fileobj, str):
            self.upload_info = UploadInfo.from_path(self.path_or_fileobj)
        elif isinstance(self.path_or_fileobj, bytes):
            self.upload_info = UploadInfo.from_bytes(self.path_or_fileobj)
        else:
            self.upload_info = UploadInfo.from_fileobj(self.path_or_fileobj)

    @contextmanager
    def as_file(self, with_tqdm: bool = False) -> Iterator[BinaryIO]:
        """
        A context manager that yields a file-like object for reading the underlying
        data behind `path_or_fileobj`.

        Args:
            with_tqdm (`bool`, *optional*, defaults to `False`):
                If True, iterating over the file object will display a progress bar. Only
                works if `path_or_fileobj` is a path to a local file. Pure bytes and
                buffers are not supported.

        Example:

        ```python
        >>> operation = CommitOperationAdd(
        ...     path_in_repo="remote/dir/weights.h5",
        ...     path_or_fileobj="./local/weights.h5",
        ... )
        CommitOperationAdd(path_in_repo='remote/dir/weights.h5', path_or_fileobj='./local/weights.h5')

        >>> with operation.as_file() as file:
        ...     content = file.read()

        >>> with operation.as_file(with_tqdm=True) as file:
        ...     while True:
        ...         data = file.read(1024)
        ...         if not data:
        ...             break
        config.json: 100%|█████████████████████████| 8.19k/8.19k [00:02<00:00, 3.72kB/s]

        >>> with operation.as_file(with_tqdm=True) as file:
        ...     requests.put(..., data=file)
        config.json: 100%|█████████████████████████| 8.19k/8.19k [00:02<00:00, 3.72kB/s]
        ```
        """
        if isinstance(self.path_or_fileobj, str) or isinstance(self.path_or_fileobj, Path):
            if with_tqdm:
                with tqdm_stream_file(self.path_or_fileobj) as file:
                    yield file
            else:
                with open(self.path_or_fileobj, "rb") as file:
                    yield file
        elif isinstance(self.path_or_fileobj, bytes):
            yield io.BytesIO(self.path_or_fileobj)
        elif isinstance(self.path_or_fileobj, io.BufferedIOBase):
            prev_pos = self.path_or_fileobj.tell()
            yield self.path_or_fileobj
            self.path_or_fileobj.seek(prev_pos, io.SEEK_SET)

    def b64content(self) -> bytes:
        """
        The base64-encoded content of `path_or_fileobj`

        Returns: `bytes`
        """
        with self.as_file() as file:
            return base64.b64encode(file.read())

    @property
    def _local_oid(self) -> Optional[str]:
        """Return the OID of the local file.

        This OID is then compared to `self._remote_oid` to check if the file has changed compared to the remote one.
        If the file did not change, we won't upload it again to prevent empty commits.

        For LFS files, the OID corresponds to the SHA256 of the file content (used as an LFS ref).
        For regular files, the OID corresponds to the SHA1 of the file content.
        Note: this is slightly different from git OID computation since the OID of an LFS file is usually the git-SHA1 of the
        pointer file content (not the actual file content). However, using the SHA256 is enough to detect changes
        and more convenient client-side.
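
        Example (an illustrative sketch of the two OID flavors, assuming standard `hashlib` calls and the usual
        git blob scheme of hashing `"blob <size>" + NUL + content`):

        ```python
        >>> import hashlib
        >>> content = b"hello"
        >>> lfs_oid = hashlib.sha256(content).hexdigest()  # OID used for LFS files
        >>> regular_oid = hashlib.sha1(b"blob %d\x00" % len(content) + content).hexdigest()  # git blob OID
        ```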
        """
        if self._upload_mode is None:
            return None
        elif self._upload_mode == "lfs":
            return self.upload_info.sha256.hex()
        else:
            # Regular file => compute sha1
            # => no need to read by chunk since the file is guaranteed to be <=5MB.
            with self.as_file() as file:
                return sha.git_hash(file.read())


def _validate_path_in_repo(path_in_repo: str) -> str:
    # Validate `path_in_repo` value to prevent a server-side issue
    if path_in_repo.startswith("/"):
        path_in_repo = path_in_repo[1:]
    if path_in_repo == "." or path_in_repo == ".." or path_in_repo.startswith("../"):
        raise ValueError(f"Invalid `path_in_repo` in CommitOperation: '{path_in_repo}'")
    if path_in_repo.startswith("./"):
        path_in_repo = path_in_repo[2:]
    for forbidden in FORBIDDEN_FOLDERS:
        if any(part == forbidden for part in path_in_repo.split("/")):
            raise ValueError(
                f"Invalid `path_in_repo` in CommitOperation: cannot update files under a '{forbidden}/' folder (path:"
                f" '{path_in_repo}')."
            )
    return path_in_repo


CommitOperation = Union[CommitOperationAdd, CommitOperationCopy, CommitOperationDelete]


def _warn_on_overwriting_operations(operations: List[CommitOperation]) -> None:
    """
    Warn user when a list of operations is expected to overwrite itself in a single
    commit.

    Rules:
    - If a filepath is updated by multiple `CommitOperationAdd` operations, a warning
      message is triggered.
    - If a filepath is updated at least once by a `CommitOperationAdd` and then deleted
      by a `CommitOperationDelete`, a warning is triggered.
    - If a `CommitOperationDelete` deletes a filepath that is then updated by a
      `CommitOperationAdd`, no warning is triggered. This is usually useless (no need to
      delete before upload) but can happen if a user deletes an entire folder and then
      adds new files to it.
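
    Example (an illustrative sketch of a combination that triggers the folder-deletion warning; paths and
    contents are placeholders):

    ```python
    >>> _warn_on_overwriting_operations(
    ...     [
    ...         CommitOperationAdd(path_in_repo="data/train.csv", path_or_fileobj=b"col_a,col_b\n1,2\n"),
    ...         CommitOperationDelete(path_in_repo="data/", is_folder=True),
    ...     ]
    ... )  # warns: the deleted folder contains a file updated in the same commit
    ```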
    """
    nb_additions_per_path: Dict[str, int] = defaultdict(int)
    for operation in operations:
        path_in_repo = operation.path_in_repo
        if isinstance(operation, CommitOperationAdd):
            if nb_additions_per_path[path_in_repo] > 0:
                warnings.warn(
                    "About to update multiple times the same file in the same commit:"
                    f" '{path_in_repo}'. This can cause undesired inconsistencies in"
                    " your repo."
                )
            nb_additions_per_path[path_in_repo] += 1
            for parent in PurePosixPath(path_in_repo).parents:
                # Also keep track of number of updated files per folder
                # => warns if deleting a folder overwrites some contained files
                nb_additions_per_path[str(parent)] += 1
        if isinstance(operation, CommitOperationDelete):
            if nb_additions_per_path[str(PurePosixPath(path_in_repo))] > 0:
                if operation.is_folder:
                    warnings.warn(
                        "About to delete a folder containing files that have just been"
                        f" updated within the same commit: '{path_in_repo}'. This can"
                        " cause undesired inconsistencies in your repo."
                    )
                else:
                    warnings.warn(
                        "About to delete a file that has just been updated within the"
                        f" same commit: '{path_in_repo}'. This can cause undesired"
                        " inconsistencies in your repo."
                    )


@validate_hf_hub_args
def _upload_lfs_files(
    *,
    additions: List[CommitOperationAdd],
    repo_type: str,
    repo_id: str,
    headers: Dict[str, str],
    endpoint: Optional[str] = None,
    num_threads: int = 5,
    revision: Optional[str] = None,
):
    """
    Uploads the content of `additions` to the Hub using the large file storage protocol.

    Relevant external documentation:
        - LFS Batch API: https://github.com/git-lfs/git-lfs/blob/main/docs/api/batch.md

    Args:
        additions (`List` of `CommitOperationAdd`):
            The files to be uploaded.
        repo_type (`str`):
            Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
        repo_id (`str`):
            A namespace (user or an organization) and a repo name separated
            by a `/`.
        headers (`Dict[str, str]`):
            Headers to use for the request, including authorization headers and user agent.
        num_threads (`int`, *optional*):
            The number of concurrent threads to use when uploading. Defaults to 5.
        revision (`str`, *optional*):
            The git revision to upload to.

    Raises:
        [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
            If an upload failed for any reason.
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If the server returns malformed responses.
        [`HTTPError`](https://requests.readthedocs.io/en/latest/api/#requests.HTTPError)
            If the LFS batch endpoint returned an HTTP error.
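
    Note (a hedged sketch of the per-file batch "action" entries consumed below, following the LFS Batch API
    documentation linked above; URLs and header values are placeholders):

    ```python
    {
        "oid": "<sha256 of the file content>",
        "size": 123456,
        "actions": {
            "upload": {"href": "https://example.com/upload-url", "header": {"Authorization": "<placeholder>"}},
        },
    }
    # An entry without an "actions" key means the content is already present upstream and is skipped.
    ```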
    """
    # Step 1: retrieve upload instructions from the LFS batch endpoint.
    # Upload instructions are retrieved by chunk of 256 files to avoid reaching
    # the payload limit.
    batch_actions: List[Dict] = []
    for chunk in chunk_iterable(additions, chunk_size=UPLOAD_BATCH_MAX_NUM_FILES):
        batch_actions_chunk, batch_errors_chunk = post_lfs_batch_info(
            upload_infos=[op.upload_info for op in chunk],
            repo_id=repo_id,
            repo_type=repo_type,
            revision=revision,
            endpoint=endpoint,
            headers=headers,
            token=None,  # already passed in 'headers'
        )

        # If at least 1 error, we do not retrieve information for other chunks
        if batch_errors_chunk:
            message = "\n".join(
                [
                    f"Encountered error for file with OID {err.get('oid')}: `{err.get('error', {}).get('message')}`"
                    for err in batch_errors_chunk
                ]
            )
            raise ValueError(f"LFS batch endpoint returned errors:\n{message}")

        batch_actions += batch_actions_chunk
    oid2addop = {add_op.upload_info.sha256.hex(): add_op for add_op in additions}

    # Step 2: ignore files that have already been uploaded
    filtered_actions = []
    for action in batch_actions:
        if action.get("actions") is None:
            logger.debug(
                f"Content of file {oid2addop[action['oid']].path_in_repo} is already"
                " present upstream - skipping upload."
            )
        else:
            filtered_actions.append(action)

    if len(filtered_actions) == 0:
        logger.debug("No LFS files to upload.")
        return

    # Step 3: upload files concurrently according to these instructions
    def _wrapped_lfs_upload(batch_action) -> None:
        try:
            operation = oid2addop[batch_action["oid"]]
            lfs_upload(operation=operation, lfs_batch_action=batch_action, headers=headers, endpoint=endpoint)
        except Exception as exc:
            raise RuntimeError(f"Error while uploading '{operation.path_in_repo}' to the Hub.") from exc

    if constants.HF_HUB_ENABLE_HF_TRANSFER:
        logger.debug(f"Uploading {len(filtered_actions)} LFS files to the Hub using `hf_transfer`.")
        for action in hf_tqdm(filtered_actions, name="huggingface_hub.lfs_upload"):
            _wrapped_lfs_upload(action)
    elif len(filtered_actions) == 1:
        logger.debug("Uploading 1 LFS file to the Hub")
        _wrapped_lfs_upload(filtered_actions[0])
    else:
        logger.debug(
            f"Uploading {len(filtered_actions)} LFS files to the Hub using up to {num_threads} threads concurrently"
        )
        thread_map(
            _wrapped_lfs_upload,
            filtered_actions,
            desc=f"Upload {len(filtered_actions)} LFS files",
            max_workers=num_threads,
            tqdm_class=hf_tqdm,
        )

@validate_hf_hub_args
def _upload_xet_files(
    *,
    additions: List[CommitOperationAdd],
    repo_type: str,
    repo_id: str,
    headers: Dict[str, str],
    endpoint: Optional[str] = None,
    revision: Optional[str] = None,
    create_pr: Optional[bool] = None,
):
    """
    Uploads the content of `additions` to the Hub using the xet storage protocol.
    This chunks the files and deduplicates the chunks before uploading them to xetcas storage.

    Args:
        additions (`List` of `CommitOperationAdd`):
            The files to be uploaded.
        repo_type (`str`):
            Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
        repo_id (`str`):
            A namespace (user or an organization) and a repo name separated
            by a `/`.
        headers (`Dict[str, str]`):
            Headers to use for the request, including authorization headers and user agent.
        endpoint (`str`, *optional*):
            The endpoint to use for the xetcas service. Defaults to `constants.ENDPOINT`.
        revision (`str`, *optional*):
            The git revision to upload to.
        create_pr (`bool`, *optional*):
            Whether or not to create a Pull Request with that commit.

    Raises:
        [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
            If an upload failed for any reason.
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If the server returns malformed responses or if the user is unauthorized to upload to xet storage.
        [`HTTPError`](https://requests.readthedocs.io/en/latest/api/#requests.HTTPError)
            If the LFS batch endpoint returned an HTTP error.

    **How it works:**
        The file upload system uses Xet storage, which is a content-addressable storage system that breaks files into chunks
        for efficient storage and transfer.

        `hf_xet.upload_files` manages uploading files by:
        - Taking a list of file paths to upload
        - Breaking files into smaller chunks for efficient storage
        - Avoiding duplicate storage by recognizing identical chunks across files
        - Connecting to a storage server (CAS server) that manages these chunks

        The upload process works like this:
        1. Create a local folder at ~/.cache/huggingface/xet/chunk-cache to store file chunks for reuse.
        2. Process files in parallel (up to 8 files at once):
            2.1. Read the file content.
            2.2. Split the file content into smaller chunks based on content patterns: each chunk gets a unique ID based on what's in it.
            2.3. For each chunk:
                - Check if it already exists in storage.
                - Skip uploading chunks that already exist.
            2.4. Group chunks into larger blocks for efficient transfer.
            2.5. Upload these blocks to the storage server.
            2.6. Create and upload information about how the file is structured.
        3. Return reference files that contain information about the uploaded files, which can be used later to download them.
    """
    if len(additions) == 0:
        return
    # at this point, we know that hf_xet is installed
    from hf_xet import upload_files

    try:
        xet_connection_info = fetch_xet_connection_info_from_repo_info(
            token_type=XetTokenType.WRITE,
            repo_id=repo_id,
            repo_type=repo_type,
            revision=revision,
            headers=headers,
            endpoint=endpoint,
            params={"create_pr": "1"} if create_pr else None,
        )
    except HfHubHTTPError as e:
        if e.response.status_code == 401:
            raise XetAuthorizationError(
                f"You are unauthorized to upload to xet storage for {repo_type}/{repo_id}. "
                f"Please check that you have configured your access token with write access to the repo."
            ) from e
        raise

    xet_endpoint = xet_connection_info.endpoint
    access_token_info = (xet_connection_info.access_token, xet_connection_info.expiration_unix_epoch)

    def token_refresher() -> Tuple[str, int]:
        new_xet_connection = fetch_xet_connection_info_from_repo_info(
            token_type=XetTokenType.WRITE,
            repo_id=repo_id,
            repo_type=repo_type,
            revision=revision,
            headers=headers,
            endpoint=endpoint,
            params={"create_pr": "1"} if create_pr else None,
        )
        if new_xet_connection is None:
            raise XetRefreshTokenError("Failed to refresh xet token")
        return new_xet_connection.access_token, new_xet_connection.expiration_unix_epoch

    num_chunks = math.ceil(len(additions) / UPLOAD_BATCH_MAX_NUM_FILES)
    num_chunks_num_digits = int(math.log10(num_chunks)) + 1
    for i, chunk in enumerate(chunk_iterable(additions, chunk_size=UPLOAD_BATCH_MAX_NUM_FILES)):
        _chunk = [op for op in chunk]
        paths = [str(op.path_or_fileobj) for op in _chunk]
        expected_size = sum([os.path.getsize(path) for path in paths])

        if num_chunks > 1:
            description = f"Uploading Batch [{str(i + 1).zfill(num_chunks_num_digits)}/{num_chunks}]..."
        else:
            description = "Uploading..."
        progress_cm = _get_progress_bar_context(
            desc=description,
            total=expected_size,
            initial=0,
            unit="B",
            unit_scale=True,
            name="huggingface_hub.xet_put",
            log_level=logger.getEffectiveLevel(),
        )
        with progress_cm as progress:

            def update_progress(increment: int):
                progress.update(increment)

            upload_files(paths, xet_endpoint, access_token_info, token_refresher, update_progress, repo_type)
    return


def _validate_preupload_info(preupload_info: dict):
    files = preupload_info.get("files")
    if not isinstance(files, list):
        raise ValueError("preupload_info is improperly formatted")
    for file_info in files:
        if not (
            isinstance(file_info, dict)
            and isinstance(file_info.get("path"), str)
            and isinstance(file_info.get("uploadMode"), str)
            and (file_info["uploadMode"] in ("lfs", "regular"))
        ):
            raise ValueError("preupload_info is improperly formatted:")
    return preupload_info

@validate_hf_hub_args
def _fetch_upload_modes(
    additions: Iterable[CommitOperationAdd],
    repo_type: str,
    repo_id: str,
    headers: Dict[str, str],
    revision: str,
    endpoint: Optional[str] = None,
    create_pr: bool = False,
    gitignore_content: Optional[str] = None,
) -> None:
    """
    Requests the Hub "preupload" endpoint to determine whether each input file should be uploaded as a regular git blob,
    as a git LFS blob, or as a XET file. Input `additions` are mutated in-place with the upload mode.

    Args:
        additions (`Iterable` of :class:`CommitOperationAdd`):
            Iterable of :class:`CommitOperationAdd` describing the files to
            upload to the Hub.
        repo_type (`str`):
            Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
        repo_id (`str`):
            A namespace (user or an organization) and a repo name separated
            by a `/`.
        headers (`Dict[str, str]`):
            Headers to use for the request, including authorization headers and user agent.
        revision (`str`):
            The git revision to upload the files to. Can be any valid git revision.
        gitignore_content (`str`, *optional*):
            The content of the `.gitignore` file to know which files should be ignored. The order of priority
            is to first check if `gitignore_content` is passed, then check if the `.gitignore` file is present
            in the list of files to commit and finally default to the `.gitignore` file already hosted on the Hub
            (if any).

    Raises:
        [`~utils.HfHubHTTPError`]
            If the Hub API returned an error.
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If the Hub API response is improperly formatted.
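
    Note (a hedged sketch of the response shape this helper relies on, limited to the fields actually read below;
    paths and values are placeholders and the `"oid"` field may be absent):

    ```python
    {
        "files": [
            {"path": "config.json", "uploadMode": "regular", "shouldIgnore": False, "oid": "<git blob sha1>"},
            {"path": "model.safetensors", "uploadMode": "lfs", "shouldIgnore": False, "oid": "<sha256>"},
        ]
    }
    ```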
    """
    endpoint = endpoint if endpoint is not None else constants.ENDPOINT

    # Fetch upload mode (LFS or regular) chunk by chunk.
    upload_modes: Dict[str, UploadMode] = {}
    should_ignore_info: Dict[str, bool] = {}
    oid_info: Dict[str, Optional[str]] = {}

    for chunk in chunk_iterable(additions, 256):
        payload: Dict = {
            "files": [
                {
                    "path": op.path_in_repo,
                    "sample": base64.b64encode(op.upload_info.sample).decode("ascii"),
                    "size": op.upload_info.size,
                }
                for op in chunk
            ]
        }
        if gitignore_content is not None:
            payload["gitIgnore"] = gitignore_content

        resp = get_session().post(
            f"{endpoint}/api/{repo_type}s/{repo_id}/preupload/{revision}",
            json=payload,
            headers=headers,
            params={"create_pr": "1"} if create_pr else None,
        )
        hf_raise_for_status(resp)
        preupload_info = _validate_preupload_info(resp.json())
        upload_modes.update(**{file["path"]: file["uploadMode"] for file in preupload_info["files"]})
        should_ignore_info.update(**{file["path"]: file["shouldIgnore"] for file in preupload_info["files"]})
        oid_info.update(**{file["path"]: file.get("oid") for file in preupload_info["files"]})

    # Set upload mode for each addition operation
    for addition in additions:
        addition._upload_mode = upload_modes[addition.path_in_repo]
        addition._should_ignore = should_ignore_info[addition.path_in_repo]
        addition._remote_oid = oid_info[addition.path_in_repo]

    # Empty files cannot be uploaded as LFS (S3 would fail with a 501 Not Implemented)
    # => empty files are uploaded as "regular" to still allow users to commit them.
    for addition in additions:
        if addition.upload_info.size == 0:
            addition._upload_mode = "regular"

@validate_hf_hub_args
def _fetch_files_to_copy(
    copies: Iterable[CommitOperationCopy],
    repo_type: str,
    repo_id: str,
    headers: Dict[str, str],
    revision: str,
    endpoint: Optional[str] = None,
) -> Dict[Tuple[str, Optional[str]], Union["RepoFile", bytes]]:
    """
    Fetch information about the files to copy.

    For LFS files, we only need their metadata (file size and sha256) while for regular files
    we need to download the raw content from the Hub.

    Args:
        copies (`Iterable` of :class:`CommitOperationCopy`):
            Iterable of :class:`CommitOperationCopy` describing the files to
            copy on the Hub.
        repo_type (`str`):
            Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
        repo_id (`str`):
            A namespace (user or an organization) and a repo name separated
            by a `/`.
        headers (`Dict[str, str]`):
            Headers to use for the request, including authorization headers and user agent.
        revision (`str`):
            The git revision to upload the files to. Can be any valid git revision.

    Returns: `Dict[Tuple[str, Optional[str]], Union[RepoFile, bytes]]`
        Key is the file path and revision of the file to copy.
        Value is the raw content as bytes (for regular files) or the file information as a RepoFile (for LFS files).

    Raises:
        [`~utils.HfHubHTTPError`]
            If the Hub API returned an error.
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If the Hub API response is improperly formatted.
    """
    from .hf_api import HfApi, RepoFolder

    hf_api = HfApi(endpoint=endpoint, headers=headers)
    files_to_copy: Dict[Tuple[str, Optional[str]], Union["RepoFile", bytes]] = {}
    # Store (path, revision) -> oid mapping
    oid_info: Dict[Tuple[str, Optional[str]], Optional[str]] = {}
    # 1. Fetch OIDs for destination paths in batches.
    dest_paths = [op.path_in_repo for op in copies]
    for offset in range(0, len(dest_paths), FETCH_LFS_BATCH_SIZE):
        dest_repo_files = hf_api.get_paths_info(
            repo_id=repo_id,
            paths=dest_paths[offset : offset + FETCH_LFS_BATCH_SIZE],
            revision=revision,
            repo_type=repo_type,
        )
        for file in dest_repo_files:
            if not isinstance(file, RepoFolder):
                oid_info[(file.path, revision)] = file.blob_id

    # 2. Group by source revision and fetch source file info in batches.
    for src_revision, operations in groupby(copies, key=lambda op: op.src_revision):
        operations = list(operations)  # type: ignore
        src_paths = [op.src_path_in_repo for op in operations]
        for offset in range(0, len(src_paths), FETCH_LFS_BATCH_SIZE):
            src_repo_files = hf_api.get_paths_info(
                repo_id=repo_id,
                paths=src_paths[offset : offset + FETCH_LFS_BATCH_SIZE],
                revision=src_revision or revision,
                repo_type=repo_type,
            )

            for src_repo_file in src_repo_files:
                if isinstance(src_repo_file, RepoFolder):
                    raise NotImplementedError("Copying a folder is not implemented.")
                oid_info[(src_repo_file.path, src_revision)] = src_repo_file.blob_id
                # If it's an LFS file, store the RepoFile object. Otherwise, download raw bytes.
                if src_repo_file.lfs:
                    files_to_copy[(src_repo_file.path, src_revision)] = src_repo_file
                else:
                    # TODO: (optimization) download regular files to copy concurrently
                    url = hf_hub_url(
                        endpoint=endpoint,
                        repo_type=repo_type,
                        repo_id=repo_id,
                        revision=src_revision or revision,
                        filename=src_repo_file.path,
                    )
                    response = get_session().get(url, headers=headers)
                    hf_raise_for_status(response)
                    files_to_copy[(src_repo_file.path, src_revision)] = response.content
        # 3. Ensure all operations found a corresponding file in the Hub
        # and track src/dest OIDs for each operation.
        for operation in operations:
            if (operation.src_path_in_repo, src_revision) not in files_to_copy:
                raise EntryNotFoundError(
                    f"Cannot copy {operation.src_path_in_repo} at revision "
                    f"{src_revision or revision}: file is missing on repo."
                )
            operation._src_oid = oid_info.get((operation.src_path_in_repo, operation.src_revision))
            operation._dest_oid = oid_info.get((operation.path_in_repo, revision))
    return files_to_copy


def _prepare_commit_payload(
    operations: Iterable[CommitOperation],
    files_to_copy: Dict[Tuple[str, Optional[str]], Union["RepoFile", bytes]],
    commit_message: str,
    commit_description: Optional[str] = None,
    parent_commit: Optional[str] = None,
) -> Iterable[Dict[str, Any]]:
    """
    Builds the payload to POST to the `/commit` API of the Hub.

    The payload is returned as an iterator so that it can be streamed as ndjson in the
    POST request.

    For more information, see:
        - https://github.com/huggingface/huggingface_hub/issues/1085#issuecomment-1265208073
        - http://ndjson.org/
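
    Example (a hedged sketch of the yielded items for a single small regular file; the operation and its
    `_upload_mode` are set up manually here for illustration):

    ```python
    >>> op = CommitOperationAdd(path_in_repo="file.txt", path_or_fileobj=b"content")
    >>> op._upload_mode = "regular"
    >>> list(_prepare_commit_payload([op], files_to_copy={}, commit_message="Upload file.txt"))
    [{'key': 'header', 'value': {'summary': 'Upload file.txt', 'description': ''}},
     {'key': 'file', 'value': {'content': 'Y29udGVudA==', 'path': 'file.txt', 'encoding': 'base64'}}]
    ```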
    """
    commit_description = commit_description if commit_description is not None else ""

    # 1. Send a header item with the commit metadata
    header_value = {"summary": commit_message, "description": commit_description}
    if parent_commit is not None:
        header_value["parentCommit"] = parent_commit
    yield {"key": "header", "value": header_value}

    nb_ignored_files = 0

    # 2. Send operations, one per line
    for operation in operations:
        # Skip ignored files
        if isinstance(operation, CommitOperationAdd) and operation._should_ignore:
            logger.debug(f"Skipping file '{operation.path_in_repo}' in commit (ignored by gitignore file).")
            nb_ignored_files += 1
            continue

        # 2.a. Case adding a regular file
        if isinstance(operation, CommitOperationAdd) and operation._upload_mode == "regular":
            yield {
                "key": "file",
                "value": {
                    "content": operation.b64content().decode(),
                    "path": operation.path_in_repo,
                    "encoding": "base64",
                },
            }
        # 2.b. Case adding an LFS file
        elif isinstance(operation, CommitOperationAdd) and operation._upload_mode == "lfs":
            yield {
                "key": "lfsFile",
                "value": {
                    "path": operation.path_in_repo,
                    "algo": "sha256",
                    "oid": operation.upload_info.sha256.hex(),
                    "size": operation.upload_info.size,
                },
            }
        # 2.c. Case deleting a file or folder
        elif isinstance(operation, CommitOperationDelete):
            yield {
                "key": "deletedFolder" if operation.is_folder else "deletedFile",
                "value": {"path": operation.path_in_repo},
            }
        # 2.d. Case copying a file or folder
        elif isinstance(operation, CommitOperationCopy):
            file_to_copy = files_to_copy[(operation.src_path_in_repo, operation.src_revision)]
            if isinstance(file_to_copy, bytes):
                yield {
                    "key": "file",
                    "value": {
                        "content": base64.b64encode(file_to_copy).decode(),
                        "path": operation.path_in_repo,
                        "encoding": "base64",
                    },
                }
            elif file_to_copy.lfs:
                yield {
                    "key": "lfsFile",
                    "value": {
                        "path": operation.path_in_repo,
                        "algo": "sha256",
                        "oid": file_to_copy.lfs.sha256,
                    },
                }
            else:
                raise ValueError(
                    "Malformed files_to_copy (should be raw file content as bytes or RepoFile objects with LFS info)."
                )
        # 2.e. Never expected to happen
        else:
            raise ValueError(
                f"Unknown operation to commit. Operation: {operation}. Upload mode:"
                f" {getattr(operation, '_upload_mode', None)}"
            )

    if nb_ignored_files > 0:
        logger.info(f"Skipped {nb_ignored_files} file(s) in commit (ignored by gitignore file).")