structure saas with tools
This commit is contained in:
100
.venv/lib/python3.10/site-packages/tokenizers/__init__.py
Normal file
100
.venv/lib/python3.10/site-packages/tokenizers/__init__.py
Normal file
@@ -0,0 +1,100 @@
|
||||
from enum import Enum
|
||||
from typing import List, Tuple, Union
|
||||
|
||||
|
||||
Offsets = Tuple[int, int]
|
||||
|
||||
TextInputSequence = str
|
||||
"""A :obj:`str` that represents an input sequence """
|
||||
|
||||
PreTokenizedInputSequence = Union[List[str], Tuple[str]]
|
||||
"""A pre-tokenized input sequence. Can be one of:
|
||||
|
||||
- A :obj:`List` of :obj:`str`
|
||||
- A :obj:`Tuple` of :obj:`str`
|
||||
"""
|
||||
|
||||
TextEncodeInput = Union[
|
||||
TextInputSequence,
|
||||
Tuple[TextInputSequence, TextInputSequence],
|
||||
List[TextInputSequence],
|
||||
]
|
||||
"""Represents a textual input for encoding. Can be either:
|
||||
|
||||
- A single sequence: :data:`~tokenizers.TextInputSequence`
|
||||
- A pair of sequences:
|
||||
|
||||
- A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence`
|
||||
- Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2
|
||||
"""
|
||||
|
||||
PreTokenizedEncodeInput = Union[
|
||||
PreTokenizedInputSequence,
|
||||
Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
|
||||
List[PreTokenizedInputSequence],
|
||||
]
|
||||
"""Represents a pre-tokenized input for encoding. Can be either:
|
||||
|
||||
- A single sequence: :data:`~tokenizers.PreTokenizedInputSequence`
|
||||
- A pair of sequences:
|
||||
|
||||
- A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence`
|
||||
- Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2
|
||||
"""
|
||||
|
||||
InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
|
||||
"""Represents all the possible types of input sequences for encoding. Can be:
|
||||
|
||||
- When ``is_pretokenized=False``: :data:`~TextInputSequence`
|
||||
- When ``is_pretokenized=True``: :data:`~PreTokenizedInputSequence`
|
||||
"""
|
||||
|
||||
EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
|
||||
"""Represents all the possible types of input for encoding. Can be:
|
||||
|
||||
- When ``is_pretokenized=False``: :data:`~TextEncodeInput`
|
||||
- When ``is_pretokenized=True``: :data:`~PreTokenizedEncodeInput`
|
||||
"""
|
||||
|
||||
|
||||
class OffsetReferential(Enum):
|
||||
ORIGINAL = "original"
|
||||
NORMALIZED = "normalized"
|
||||
|
||||
|
||||
class OffsetType(Enum):
|
||||
BYTE = "byte"
|
||||
CHAR = "char"
|
||||
|
||||
|
||||
class SplitDelimiterBehavior(Enum):
|
||||
REMOVED = "removed"
|
||||
ISOLATED = "isolated"
|
||||
MERGED_WITH_PREVIOUS = "merged_with_previous"
|
||||
MERGED_WITH_NEXT = "merged_with_next"
|
||||
CONTIGUOUS = "contiguous"
|
||||
|
||||
|
||||
from .tokenizers import (
|
||||
AddedToken,
|
||||
Encoding,
|
||||
NormalizedString,
|
||||
PreTokenizedString,
|
||||
Regex,
|
||||
Token,
|
||||
Tokenizer,
|
||||
decoders,
|
||||
models,
|
||||
normalizers,
|
||||
pre_tokenizers,
|
||||
processors,
|
||||
trainers,
|
||||
__version__,
|
||||
)
|
||||
from .implementations import (
|
||||
BertWordPieceTokenizer,
|
||||
ByteLevelBPETokenizer,
|
||||
CharBPETokenizer,
|
||||
SentencePieceBPETokenizer,
|
||||
SentencePieceUnigramTokenizer,
|
||||
)
|
||||
Reference in New Issue
Block a user