structure saas with tools

This commit is contained in:
Davidson Gomes
2025-04-25 15:30:54 -03:00
commit 1aef473937
16434 changed files with 6584257 additions and 0 deletions

View File

@@ -0,0 +1,100 @@
from enum import Enum
from typing import List, Tuple, Union
Offsets = Tuple[int, int]
TextInputSequence = str
"""A :obj:`str` that represents an input sequence """
PreTokenizedInputSequence = Union[List[str], Tuple[str]]
"""A pre-tokenized input sequence. Can be one of:
- A :obj:`List` of :obj:`str`
- A :obj:`Tuple` of :obj:`str`
"""
TextEncodeInput = Union[
TextInputSequence,
Tuple[TextInputSequence, TextInputSequence],
List[TextInputSequence],
]
"""Represents a textual input for encoding. Can be either:
- A single sequence: :data:`~tokenizers.TextInputSequence`
- A pair of sequences:
- A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence`
- Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2
"""
PreTokenizedEncodeInput = Union[
PreTokenizedInputSequence,
Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
List[PreTokenizedInputSequence],
]
"""Represents a pre-tokenized input for encoding. Can be either:
- A single sequence: :data:`~tokenizers.PreTokenizedInputSequence`
- A pair of sequences:
- A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence`
- Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2
"""
InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
"""Represents all the possible types of input sequences for encoding. Can be:
- When ``is_pretokenized=False``: :data:`~TextInputSequence`
- When ``is_pretokenized=True``: :data:`~PreTokenizedInputSequence`
"""
EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
"""Represents all the possible types of input for encoding. Can be:
- When ``is_pretokenized=False``: :data:`~TextEncodeInput`
- When ``is_pretokenized=True``: :data:`~PreTokenizedEncodeInput`
"""
class OffsetReferential(Enum):
ORIGINAL = "original"
NORMALIZED = "normalized"
class OffsetType(Enum):
BYTE = "byte"
CHAR = "char"
class SplitDelimiterBehavior(Enum):
REMOVED = "removed"
ISOLATED = "isolated"
MERGED_WITH_PREVIOUS = "merged_with_previous"
MERGED_WITH_NEXT = "merged_with_next"
CONTIGUOUS = "contiguous"
from .tokenizers import (
AddedToken,
Encoding,
NormalizedString,
PreTokenizedString,
Regex,
Token,
Tokenizer,
decoders,
models,
normalizers,
pre_tokenizers,
processors,
trainers,
__version__,
)
from .implementations import (
BertWordPieceTokenizer,
ByteLevelBPETokenizer,
CharBPETokenizer,
SentencePieceBPETokenizer,
SentencePieceUnigramTokenizer,
)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,15 @@
from .. import decoders
Decoder = decoders.Decoder
ByteLevel = decoders.ByteLevel
Replace = decoders.Replace
WordPiece = decoders.WordPiece
ByteFallback = decoders.ByteFallback
Fuse = decoders.Fuse
Strip = decoders.Strip
Metaspace = decoders.Metaspace
BPEDecoder = decoders.BPEDecoder
CTC = decoders.CTC
Sequence = decoders.Sequence
DecodeStream = decoders.DecodeStream

View File

@@ -0,0 +1,279 @@
# Generated content DO NOT EDIT
class DecodeStream:
"""
Class needed for streaming decode
"""
def __init__(self, skip_special_tokens):
pass
class Decoder:
"""
Base class for all decoders
This class is not supposed to be instantiated directly. Instead, any implementation of
a Decoder will return an instance of this class when instantiated.
"""
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
Args:
tokens (:obj:`List[str]`):
The list of tokens to decode
Returns:
:obj:`str`: The decoded string
"""
pass
class BPEDecoder(Decoder):
"""
BPEDecoder Decoder
Args:
suffix (:obj:`str`, `optional`, defaults to :obj:`</w>`):
The suffix that was used to caracterize an end-of-word. This suffix will
be replaced by whitespaces during the decoding
"""
def __init__(self, suffix="</w>"):
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
Args:
tokens (:obj:`List[str]`):
The list of tokens to decode
Returns:
:obj:`str`: The decoded string
"""
pass
class ByteFallback(Decoder):
"""
ByteFallback Decoder
ByteFallback is a simple trick which converts tokens looking like `<0x61>`
to pure bytes, and attempts to make them into a string. If the tokens
cannot be decoded you will get <20> instead for each inconvertible byte token
"""
def __init__(self):
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
Args:
tokens (:obj:`List[str]`):
The list of tokens to decode
Returns:
:obj:`str`: The decoded string
"""
pass
class ByteLevel(Decoder):
"""
ByteLevel Decoder
This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.ByteLevel`
:class:`~tokenizers.pre_tokenizers.PreTokenizer`.
"""
def __init__(self):
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
Args:
tokens (:obj:`List[str]`):
The list of tokens to decode
Returns:
:obj:`str`: The decoded string
"""
pass
class CTC(Decoder):
"""
CTC Decoder
Args:
pad_token (:obj:`str`, `optional`, defaults to :obj:`<pad>`):
The pad token used by CTC to delimit a new token.
word_delimiter_token (:obj:`str`, `optional`, defaults to :obj:`|`):
The word delimiter token. It will be replaced by a <space>
cleanup (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to cleanup some tokenization artifacts.
Mainly spaces before punctuation, and some abbreviated english forms.
"""
def __init__(self, pad_token="<pad>", word_delimiter_token="|", cleanup=True):
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
Args:
tokens (:obj:`List[str]`):
The list of tokens to decode
Returns:
:obj:`str`: The decoded string
"""
pass
class Fuse(Decoder):
"""
Fuse Decoder
Fuse simply fuses every token into a single string.
This is the last step of decoding, this decoder exists only if
there is need to add other decoders *after* the fusion
"""
def __init__(self):
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
Args:
tokens (:obj:`List[str]`):
The list of tokens to decode
Returns:
:obj:`str`: The decoded string
"""
pass
class Metaspace(Decoder):
"""
Metaspace Decoder
Args:
replacement (:obj:`str`, `optional`, defaults to :obj:`▁`):
The replacement character. Must be exactly one character. By default we
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
Choices: "always", "never", "first". First means the space is only added on the first
token (relevant when special tokens are used or other pre_tokenizer are used).
"""
def __init__(self, replacement="", prepend_scheme="always", split=True):
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
Args:
tokens (:obj:`List[str]`):
The list of tokens to decode
Returns:
:obj:`str`: The decoded string
"""
pass
class Replace(Decoder):
"""
Replace Decoder
This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.Replace`
:class:`~tokenizers.pre_tokenizers.PreTokenizer`.
"""
def __init__(self, pattern, content):
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
Args:
tokens (:obj:`List[str]`):
The list of tokens to decode
Returns:
:obj:`str`: The decoded string
"""
pass
class Sequence(Decoder):
"""
Sequence Decoder
Args:
decoders (:obj:`List[Decoder]`)
The decoders that need to be chained
"""
def __init__(self, decoders):
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
Args:
tokens (:obj:`List[str]`):
The list of tokens to decode
Returns:
:obj:`str`: The decoded string
"""
pass
class Strip(Decoder):
"""
Strip normalizer
Strips n left characters of each token, or n right characters of each token
"""
def __init__(self, content, left=0, right=0):
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
Args:
tokens (:obj:`List[str]`):
The list of tokens to decode
Returns:
:obj:`str`: The decoded string
"""
pass
class WordPiece(Decoder):
"""
WordPiece Decoder
Args:
prefix (:obj:`str`, `optional`, defaults to :obj:`##`):
The prefix to use for subwords that are not a beginning-of-word
cleanup (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
and some abbreviated english forms.
"""
def __init__(self, prefix="##", cleanup=True):
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
Args:
tokens (:obj:`List[str]`):
The list of tokens to decode
Returns:
:obj:`str`: The decoded string
"""
pass

View File

@@ -0,0 +1,6 @@
from .base_tokenizer import BaseTokenizer
from .bert_wordpiece import BertWordPieceTokenizer
from .byte_level_bpe import ByteLevelBPETokenizer
from .char_level_bpe import CharBPETokenizer
from .sentencepiece_bpe import SentencePieceBPETokenizer
from .sentencepiece_unigram import SentencePieceUnigramTokenizer

View File

@@ -0,0 +1,418 @@
from typing import Dict, List, Optional, Tuple, Union
from tokenizers import AddedToken, EncodeInput, Encoding, InputSequence, Tokenizer
from tokenizers.decoders import Decoder
from tokenizers.models import Model
from tokenizers.normalizers import Normalizer
from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.processors import PostProcessor
Offsets = Tuple[int, int]
class BaseTokenizer:
def __init__(self, tokenizer: Tokenizer, parameters=None):
self._tokenizer = tokenizer
self._parameters = parameters if parameters is not None else {}
def __repr__(self):
return "Tokenizer(vocabulary_size={}, {})".format(
self._tokenizer.get_vocab_size(),
", ".join(k + "=" + str(v) for k, v in self._parameters.items()),
)
def num_special_tokens_to_add(self, is_pair: bool) -> int:
"""
Return the number of special tokens that would be added for single/pair sentences.
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
:return:
"""
return self._tokenizer.num_special_tokens_to_add(is_pair)
def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
"""Returns the vocabulary
Args:
with_added_tokens: boolean:
Whether to include the added tokens in the vocabulary
Returns:
The vocabulary
"""
return self._tokenizer.get_vocab(with_added_tokens=with_added_tokens)
def get_added_tokens_decoder(self) -> Dict[int, AddedToken]:
"""Returns the added reverse vocabulary
Returns:
The added vocabulary mapping ints to AddedTokens
"""
return self._tokenizer.get_added_tokens_decoder()
def get_vocab_size(self, with_added_tokens: bool = True) -> int:
"""Return the size of vocabulary, with or without added tokens.
Args:
with_added_tokens: (`optional`) bool:
Whether to count in added special tokens or not
Returns:
Size of vocabulary
"""
return self._tokenizer.get_vocab_size(with_added_tokens=with_added_tokens)
def enable_padding(
self,
direction: Optional[str] = "right",
pad_to_multiple_of: Optional[int] = None,
pad_id: Optional[int] = 0,
pad_type_id: Optional[int] = 0,
pad_token: Optional[str] = "[PAD]",
length: Optional[int] = None,
):
"""Change the padding strategy
Args:
direction: (`optional`) str:
Can be one of: `right` or `left`
pad_to_multiple_of: (`optional`) unsigned int:
If specified, the padding length should always snap to the next multiple of
the given value. For example if we were going to pad with a length of 250 but
`pad_to_multiple_of=8` then we will pad to 256.
pad_id: (`optional`) unsigned int:
The indice to be used when padding
pad_type_id: (`optional`) unsigned int:
The type indice to be used when padding
pad_token: (`optional`) str:
The pad token to be used when padding
length: (`optional`) unsigned int:
If specified, the length at which to pad. If not specified
we pad using the size of the longest sequence in a batch
"""
return self._tokenizer.enable_padding(
direction=direction,
pad_to_multiple_of=pad_to_multiple_of,
pad_id=pad_id,
pad_type_id=pad_type_id,
pad_token=pad_token,
length=length,
)
def no_padding(self):
"""Disable padding"""
return self._tokenizer.no_padding()
@property
def padding(self) -> Optional[dict]:
"""Get the current padding parameters
Returns:
None if padding is disabled, a dict with the currently set parameters
if the padding is enabled.
"""
return self._tokenizer.padding
def enable_truncation(self, max_length: int, stride: Optional[int] = 0, strategy: Optional[str] = "longest_first"):
"""Change the truncation options
Args:
max_length: unsigned int:
The maximum length at which to truncate
stride: (`optional`) unsigned int:
The length of the previous first sequence to be included
in the overflowing sequence
strategy: (`optional`) str:
Can be one of `longest_first`, `only_first` or `only_second`
"""
return self._tokenizer.enable_truncation(max_length, stride=stride, strategy=strategy)
def no_truncation(self):
"""Disable truncation"""
return self._tokenizer.no_truncation()
@property
def truncation(self) -> Optional[dict]:
"""Get the current truncation parameters
Returns:
None if truncation is disabled, a dict with the current truncation parameters if
truncation is enabled
"""
return self._tokenizer.truncation
def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
"""Add the given tokens to the vocabulary
Args:
tokens: List[Union[str, AddedToken]]:
A list of tokens to add to the vocabulary. Each token can either be
a string, or an instance of AddedToken
Returns:
The number of tokens that were added to the vocabulary
"""
return self._tokenizer.add_tokens(tokens)
def add_special_tokens(self, special_tokens: List[Union[str, AddedToken]]) -> int:
"""Add the given special tokens to the vocabulary, and treat them as special tokens.
The special tokens will never be processed by the model, and will be
removed while decoding.
Args:
tokens: List[Union[str, AddedToken]]:
A list of special tokens to add to the vocabulary. Each token can either be
a string, or an instance of AddedToken
Returns:
The number of tokens that were added to the vocabulary
"""
return self._tokenizer.add_special_tokens(special_tokens)
def normalize(self, sequence: str) -> str:
"""Normalize the given sequence
Args:
sequence: str:
The sequence to normalize
Returns:
The normalized string
"""
return self._tokenizer.normalize(sequence)
def encode(
self,
sequence: InputSequence,
pair: Optional[InputSequence] = None,
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> Encoding:
"""Encode the given sequence and pair. This method can process raw text sequences as well
as already pre-tokenized sequences.
Args:
sequence: InputSequence:
The sequence we want to encode. This sequence can be either raw text or
pre-tokenized, according to the `is_pretokenized` argument:
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
- If `is_pretokenized=True`: `InputSequence` is expected to be
`Union[List[str], Tuple[str]]`
is_pretokenized: bool:
Whether the input is already pre-tokenized.
add_special_tokens: bool:
Whether to add the special tokens while encoding.
Returns:
An Encoding
"""
if sequence is None:
raise ValueError("encode: `sequence` can't be `None`")
return self._tokenizer.encode(sequence, pair, is_pretokenized, add_special_tokens)
def encode_batch(
self,
inputs: List[EncodeInput],
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> List[Encoding]:
"""Encode the given inputs. This method accept both raw text sequences as well as already
pre-tokenized sequences.
Args:
inputs: List[EncodeInput]:
A list of single sequences or pair sequences to encode. Each `EncodeInput` is
expected to be of the following form:
`Union[InputSequence, Tuple[InputSequence, InputSequence]]`
Each `InputSequence` can either be raw text or pre-tokenized,
according to the `is_pretokenized` argument:
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
- If `is_pretokenized=True`: `InputSequence` is expected to be
`Union[List[str], Tuple[str]]`
is_pretokenized: bool:
Whether the input is already pre-tokenized.
add_special_tokens: bool:
Whether to add the special tokens while encoding.
Returns:
A list of Encoding
"""
if inputs is None:
raise ValueError("encode_batch: `inputs` can't be `None`")
return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens)
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
"""Decode the given list of ids to a string sequence
Args:
ids: List[unsigned int]:
A list of ids to be decoded
skip_special_tokens: (`optional`) boolean:
Whether to remove all the special tokens from the output string
Returns:
The decoded string
"""
if ids is None:
raise ValueError("None input is not valid. Should be a list of integers.")
return self._tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
def decode_batch(self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True) -> str:
"""Decode the list of sequences to a list of string sequences
Args:
sequences: List[List[unsigned int]]:
A list of sequence of ids to be decoded
skip_special_tokens: (`optional`) boolean:
Whether to remove all the special tokens from the output strings
Returns:
A list of decoded strings
"""
if sequences is None:
raise ValueError("None input is not valid. Should be list of list of integers.")
return self._tokenizer.decode_batch(sequences, skip_special_tokens=skip_special_tokens)
def token_to_id(self, token: str) -> Optional[int]:
"""Convert the given token to its corresponding id
Args:
token: str:
The token to convert
Returns:
The corresponding id if it exists, None otherwise
"""
return self._tokenizer.token_to_id(token)
def id_to_token(self, id: int) -> Optional[str]:
"""Convert the given token id to its corresponding string
Args:
token: id:
The token id to convert
Returns:
The corresponding string if it exists, None otherwise
"""
return self._tokenizer.id_to_token(id)
def save_model(self, directory: str, prefix: Optional[str] = None):
"""Save the current model to the given directory
Args:
directory: str:
A path to the destination directory
prefix: (Optional) str:
An optional prefix, used to prefix each file name
"""
return self._tokenizer.model.save(directory, prefix=prefix)
def save(self, path: str, pretty: bool = True):
"""Save the current Tokenizer at the given path
Args:
path: str:
A path to the destination Tokenizer file
"""
return self._tokenizer.save(path, pretty)
def to_str(self, pretty: bool = False):
"""Get a serialized JSON version of the Tokenizer as a str
Args:
pretty: bool:
Whether the JSON string should be prettified
Returns:
str
"""
return self._tokenizer.to_str(pretty)
def post_process(
self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True
) -> Encoding:
"""Apply all the post-processing steps to the given encodings.
The various steps are:
1. Truncate according to global params (provided to `enable_truncation`)
2. Apply the PostProcessor
3. Pad according to global params. (provided to `enable_padding`)
Args:
encoding: Encoding:
The main Encoding to post process
pair: Optional[Encoding]:
An optional pair Encoding
add_special_tokens: bool:
Whether to add special tokens
Returns:
The resulting Encoding
"""
return self._tokenizer.post_process(encoding, pair, add_special_tokens)
@property
def model(self) -> Model:
return self._tokenizer.model
@model.setter
def model(self, model: Model):
self._tokenizer.model = model
@property
def normalizer(self) -> Normalizer:
return self._tokenizer.normalizer
@normalizer.setter
def normalizer(self, normalizer: Normalizer):
self._tokenizer.normalizer = normalizer
@property
def pre_tokenizer(self) -> PreTokenizer:
return self._tokenizer.pre_tokenizer
@pre_tokenizer.setter
def pre_tokenizer(self, pre_tokenizer: PreTokenizer):
self._tokenizer.pre_tokenizer = pre_tokenizer
@property
def post_processor(self) -> PostProcessor:
return self._tokenizer.post_processor
@post_processor.setter
def post_processor(self, post_processor: PostProcessor):
self._tokenizer.post_processor = post_processor
@property
def decoder(self) -> Decoder:
return self._tokenizer.decoder
@decoder.setter
def decoder(self, decoder: Decoder):
self._tokenizer.decoder = decoder

View File

@@ -0,0 +1,151 @@
from typing import Dict, Iterator, List, Optional, Union
from tokenizers import AddedToken, Tokenizer, decoders, trainers
from tokenizers.models import WordPiece
from tokenizers.normalizers import BertNormalizer
from tokenizers.pre_tokenizers import BertPreTokenizer
from tokenizers.processors import BertProcessing
from .base_tokenizer import BaseTokenizer
class BertWordPieceTokenizer(BaseTokenizer):
"""Bert WordPiece Tokenizer"""
def __init__(
self,
vocab: Optional[Union[str, Dict[str, int]]] = None,
unk_token: Union[str, AddedToken] = "[UNK]",
sep_token: Union[str, AddedToken] = "[SEP]",
cls_token: Union[str, AddedToken] = "[CLS]",
pad_token: Union[str, AddedToken] = "[PAD]",
mask_token: Union[str, AddedToken] = "[MASK]",
clean_text: bool = True,
handle_chinese_chars: bool = True,
strip_accents: Optional[bool] = None,
lowercase: bool = True,
wordpieces_prefix: str = "##",
):
if vocab is not None:
tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
else:
tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))
# Let the tokenizer know about special tokens if they are part of the vocab
if tokenizer.token_to_id(str(unk_token)) is not None:
tokenizer.add_special_tokens([str(unk_token)])
if tokenizer.token_to_id(str(sep_token)) is not None:
tokenizer.add_special_tokens([str(sep_token)])
if tokenizer.token_to_id(str(cls_token)) is not None:
tokenizer.add_special_tokens([str(cls_token)])
if tokenizer.token_to_id(str(pad_token)) is not None:
tokenizer.add_special_tokens([str(pad_token)])
if tokenizer.token_to_id(str(mask_token)) is not None:
tokenizer.add_special_tokens([str(mask_token)])
tokenizer.normalizer = BertNormalizer(
clean_text=clean_text,
handle_chinese_chars=handle_chinese_chars,
strip_accents=strip_accents,
lowercase=lowercase,
)
tokenizer.pre_tokenizer = BertPreTokenizer()
if vocab is not None:
sep_token_id = tokenizer.token_to_id(str(sep_token))
if sep_token_id is None:
raise TypeError("sep_token not found in the vocabulary")
cls_token_id = tokenizer.token_to_id(str(cls_token))
if cls_token_id is None:
raise TypeError("cls_token not found in the vocabulary")
tokenizer.post_processor = BertProcessing((str(sep_token), sep_token_id), (str(cls_token), cls_token_id))
tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)
parameters = {
"model": "BertWordPiece",
"unk_token": unk_token,
"sep_token": sep_token,
"cls_token": cls_token,
"pad_token": pad_token,
"mask_token": mask_token,
"clean_text": clean_text,
"handle_chinese_chars": handle_chinese_chars,
"strip_accents": strip_accents,
"lowercase": lowercase,
"wordpieces_prefix": wordpieces_prefix,
}
super().__init__(tokenizer, parameters)
@staticmethod
def from_file(vocab: str, **kwargs):
vocab = WordPiece.read_file(vocab)
return BertWordPieceTokenizer(vocab, **kwargs)
def train(
self,
files: Union[str, List[str]],
vocab_size: int = 30000,
min_frequency: int = 2,
limit_alphabet: int = 1000,
initial_alphabet: List[str] = [],
special_tokens: List[Union[str, AddedToken]] = [
"[PAD]",
"[UNK]",
"[CLS]",
"[SEP]",
"[MASK]",
],
show_progress: bool = True,
wordpieces_prefix: str = "##",
):
"""Train the model using the given files"""
trainer = trainers.WordPieceTrainer(
vocab_size=vocab_size,
min_frequency=min_frequency,
limit_alphabet=limit_alphabet,
initial_alphabet=initial_alphabet,
special_tokens=special_tokens,
show_progress=show_progress,
continuing_subword_prefix=wordpieces_prefix,
)
if isinstance(files, str):
files = [files]
self._tokenizer.train(files, trainer=trainer)
def train_from_iterator(
self,
iterator: Union[Iterator[str], Iterator[Iterator[str]]],
vocab_size: int = 30000,
min_frequency: int = 2,
limit_alphabet: int = 1000,
initial_alphabet: List[str] = [],
special_tokens: List[Union[str, AddedToken]] = [
"[PAD]",
"[UNK]",
"[CLS]",
"[SEP]",
"[MASK]",
],
show_progress: bool = True,
wordpieces_prefix: str = "##",
length: Optional[int] = None,
):
"""Train the model using the given iterator"""
trainer = trainers.WordPieceTrainer(
vocab_size=vocab_size,
min_frequency=min_frequency,
limit_alphabet=limit_alphabet,
initial_alphabet=initial_alphabet,
special_tokens=special_tokens,
show_progress=show_progress,
continuing_subword_prefix=wordpieces_prefix,
)
self._tokenizer.train_from_iterator(
iterator,
trainer=trainer,
length=length,
)

View File

@@ -0,0 +1,122 @@
from typing import Dict, Iterator, List, Optional, Tuple, Union
from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, processors, trainers
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, Sequence, unicode_normalizer_from_str
from .base_tokenizer import BaseTokenizer
class ByteLevelBPETokenizer(BaseTokenizer):
"""ByteLevelBPETokenizer
Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
"""
def __init__(
self,
vocab: Optional[Union[str, Dict[str, int]]] = None,
merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
add_prefix_space: bool = False,
lowercase: bool = False,
dropout: Optional[float] = None,
unicode_normalizer: Optional[str] = None,
continuing_subword_prefix: Optional[str] = None,
end_of_word_suffix: Optional[str] = None,
trim_offsets: bool = False,
):
if vocab is not None and merges is not None:
tokenizer = Tokenizer(
BPE(
vocab,
merges,
dropout=dropout,
continuing_subword_prefix=continuing_subword_prefix or "",
end_of_word_suffix=end_of_word_suffix or "",
)
)
else:
tokenizer = Tokenizer(BPE())
# Check for Unicode normalization first (before everything else)
normalizers = []
if unicode_normalizer:
normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
if lowercase:
normalizers += [Lowercase()]
# Create the normalizer structure
if len(normalizers) > 0:
if len(normalizers) > 1:
tokenizer.normalizer = Sequence(normalizers)
else:
tokenizer.normalizer = normalizers[0]
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel(trim_offsets=trim_offsets)
parameters = {
"model": "ByteLevelBPE",
"add_prefix_space": add_prefix_space,
"lowercase": lowercase,
"dropout": dropout,
"unicode_normalizer": unicode_normalizer,
"continuing_subword_prefix": continuing_subword_prefix,
"end_of_word_suffix": end_of_word_suffix,
"trim_offsets": trim_offsets,
}
super().__init__(tokenizer, parameters)
@staticmethod
def from_file(vocab_filename: str, merges_filename: str, **kwargs):
vocab, merges = BPE.read_file(vocab_filename, merges_filename)
return ByteLevelBPETokenizer(vocab, merges, **kwargs)
def train(
self,
files: Union[str, List[str]],
vocab_size: int = 30000,
min_frequency: int = 2,
show_progress: bool = True,
special_tokens: List[Union[str, AddedToken]] = [],
):
"""Train the model using the given files"""
trainer = trainers.BpeTrainer(
vocab_size=vocab_size,
min_frequency=min_frequency,
show_progress=show_progress,
special_tokens=special_tokens,
initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)
if isinstance(files, str):
files = [files]
self._tokenizer.train(files, trainer=trainer)
def train_from_iterator(
self,
iterator: Union[Iterator[str], Iterator[Iterator[str]]],
vocab_size: int = 30000,
min_frequency: int = 2,
show_progress: bool = True,
special_tokens: List[Union[str, AddedToken]] = [],
length: Optional[int] = None,
):
"""Train the model using the given iterator"""
trainer = trainers.BpeTrainer(
vocab_size=vocab_size,
min_frequency=min_frequency,
show_progress=show_progress,
special_tokens=special_tokens,
initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)
self._tokenizer.train_from_iterator(
iterator,
trainer=trainer,
length=length,
)

View File

@@ -0,0 +1,150 @@
from typing import Dict, Iterator, List, Optional, Tuple, Union
from .. import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
from ..models import BPE
from ..normalizers import BertNormalizer, Lowercase, Sequence, unicode_normalizer_from_str
from .base_tokenizer import BaseTokenizer
class CharBPETokenizer(BaseTokenizer):
"""Original BPE Tokenizer
Represents the BPE algorithm, as introduced by Rico Sennrich
(https://arxiv.org/abs/1508.07909)
The defaults settings corresponds to OpenAI GPT BPE tokenizers and differs from the original
Sennrich subword-nmt implementation by the following options that you can deactivate:
- adding a normalizer to clean up the text (deactivate with `bert_normalizer=False`) by:
* removing any control characters and replacing all whitespaces by the classic one.
* handle chinese chars by putting spaces around them.
* strip all accents.
- spitting on punctuation in addition to whitespaces (deactivate it with
`split_on_whitespace_only=True`)
"""
def __init__(
self,
vocab: Optional[Union[str, Dict[str, int]]] = None,
merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
unk_token: Union[str, AddedToken] = "<unk>",
suffix: str = "</w>",
dropout: Optional[float] = None,
lowercase: bool = False,
unicode_normalizer: Optional[str] = None,
bert_normalizer: bool = True,
split_on_whitespace_only: bool = False,
):
if vocab is not None and merges is not None:
tokenizer = Tokenizer(
BPE(
vocab,
merges,
dropout=dropout,
unk_token=str(unk_token),
end_of_word_suffix=suffix,
)
)
else:
tokenizer = Tokenizer(BPE(unk_token=str(unk_token), dropout=dropout, end_of_word_suffix=suffix))
if tokenizer.token_to_id(str(unk_token)) is not None:
tokenizer.add_special_tokens([str(unk_token)])
# Check for Unicode normalization first (before everything else)
normalizers = []
if unicode_normalizer:
normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
if bert_normalizer:
normalizers += [BertNormalizer(lowercase=False)]
if lowercase:
normalizers += [Lowercase()]
# Create the normalizer structure
if len(normalizers) > 0:
if len(normalizers) > 1:
tokenizer.normalizer = Sequence(normalizers)
else:
tokenizer.normalizer = normalizers[0]
if split_on_whitespace_only:
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
else:
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
tokenizer.decoder = decoders.BPEDecoder(suffix=suffix)
parameters = {
"model": "BPE",
"unk_token": unk_token,
"suffix": suffix,
"dropout": dropout,
"lowercase": lowercase,
"unicode_normalizer": unicode_normalizer,
"bert_normalizer": bert_normalizer,
"split_on_whitespace_only": split_on_whitespace_only,
}
super().__init__(tokenizer, parameters)
@staticmethod
def from_file(vocab_filename: str, merges_filename: str, **kwargs):
vocab, merges = BPE.read_file(vocab_filename, merges_filename)
return CharBPETokenizer(vocab, merges, **kwargs)
def train(
self,
files: Union[str, List[str]],
vocab_size: int = 30000,
min_frequency: int = 2,
special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
limit_alphabet: int = 1000,
initial_alphabet: List[str] = [],
suffix: Optional[str] = "</w>",
show_progress: bool = True,
):
"""Train the model using the given files"""
trainer = trainers.BpeTrainer(
vocab_size=vocab_size,
min_frequency=min_frequency,
special_tokens=special_tokens,
limit_alphabet=limit_alphabet,
initial_alphabet=initial_alphabet,
end_of_word_suffix=suffix,
show_progress=show_progress,
)
if isinstance(files, str):
files = [files]
self._tokenizer.train(files, trainer=trainer)
def train_from_iterator(
self,
iterator: Union[Iterator[str], Iterator[Iterator[str]]],
vocab_size: int = 30000,
min_frequency: int = 2,
special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
limit_alphabet: int = 1000,
initial_alphabet: List[str] = [],
suffix: Optional[str] = "</w>",
show_progress: bool = True,
length: Optional[int] = None,
):
"""Train the model using the given iterator"""
trainer = trainers.BpeTrainer(
vocab_size=vocab_size,
min_frequency=min_frequency,
special_tokens=special_tokens,
limit_alphabet=limit_alphabet,
initial_alphabet=initial_alphabet,
end_of_word_suffix=suffix,
show_progress=show_progress,
)
self._tokenizer.train_from_iterator(
iterator,
trainer=trainer,
length=length,
)

View File

@@ -0,0 +1,103 @@
from typing import Dict, Iterator, List, Optional, Tuple, Union
from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC
from .base_tokenizer import BaseTokenizer
class SentencePieceBPETokenizer(BaseTokenizer):
"""SentencePiece BPE Tokenizer
Represents the BPE algorithm, with the pretokenization used by SentencePiece
"""
def __init__(
self,
vocab: Optional[Union[str, Dict[str, int]]] = None,
merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
unk_token: Union[str, AddedToken] = "<unk>",
replacement: str = "",
add_prefix_space: bool = True,
dropout: Optional[float] = None,
fuse_unk: Optional[bool] = False,
):
if vocab is not None and merges is not None:
tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))
else:
tokenizer = Tokenizer(BPE(dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))
if tokenizer.token_to_id(str(unk_token)) is not None:
tokenizer.add_special_tokens([str(unk_token)])
tokenizer.normalizer = NFKC()
prepend_scheme = "always" if add_prefix_space else "never"
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
parameters = {
"model": "SentencePieceBPE",
"unk_token": unk_token,
"replacement": replacement,
"add_prefix_space": add_prefix_space,
"dropout": dropout,
}
super().__init__(tokenizer, parameters)
@staticmethod
def from_file(vocab_filename: str, merges_filename: str, **kwargs):
vocab, merges = BPE.read_file(vocab_filename, merges_filename)
return SentencePieceBPETokenizer(vocab, merges, **kwargs)
def train(
self,
files: Union[str, List[str]],
vocab_size: int = 30000,
min_frequency: int = 2,
special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
limit_alphabet: int = 1000,
initial_alphabet: List[str] = [],
show_progress: bool = True,
):
"""Train the model using the given files"""
trainer = trainers.BpeTrainer(
vocab_size=vocab_size,
min_frequency=min_frequency,
special_tokens=special_tokens,
limit_alphabet=limit_alphabet,
initial_alphabet=initial_alphabet,
show_progress=show_progress,
)
if isinstance(files, str):
files = [files]
self._tokenizer.train(files, trainer=trainer)
def train_from_iterator(
self,
iterator: Union[Iterator[str], Iterator[Iterator[str]]],
vocab_size: int = 30000,
min_frequency: int = 2,
special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
limit_alphabet: int = 1000,
initial_alphabet: List[str] = [],
show_progress: bool = True,
length: Optional[int] = None,
):
"""Train the model using the given iterator"""
trainer = trainers.BpeTrainer(
vocab_size=vocab_size,
min_frequency=min_frequency,
special_tokens=special_tokens,
limit_alphabet=limit_alphabet,
initial_alphabet=initial_alphabet,
show_progress=show_progress,
)
self._tokenizer.train_from_iterator(
iterator,
trainer=trainer,
length=length,
)

View File

@@ -0,0 +1,196 @@
import json
import os
from typing import Iterator, List, Optional, Union, Tuple
from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, trainers
from tokenizers.models import Unigram
from .base_tokenizer import BaseTokenizer
class SentencePieceUnigramTokenizer(BaseTokenizer):
"""SentencePiece Unigram Tokenizer
Represents the Unigram algorithm, with the pretokenization used by SentencePiece
"""
def __init__(
self,
vocab: Optional[List[Tuple[str, float]]] = None,
replacement: str = "",
add_prefix_space: bool = True,
):
if vocab is not None:
# Let Unigram(..) fail if only one of them is None
tokenizer = Tokenizer(Unigram(vocab))
else:
tokenizer = Tokenizer(Unigram())
tokenizer.normalizer = normalizers.Sequence(
[normalizers.Nmt(), normalizers.NFKC(), normalizers.Replace(Regex(" {2,}"), " ")]
)
prepend_scheme = "always" if add_prefix_space else "never"
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
parameters = {
"model": "SentencePieceUnigram",
"replacement": replacement,
"add_prefix_space": add_prefix_space,
}
super().__init__(tokenizer, parameters)
def train(
self,
files: Union[str, List[str]],
vocab_size: int = 8000,
show_progress: bool = True,
special_tokens: Optional[List[Union[str, AddedToken]]] = None,
initial_alphabet: Optional[List[str]] = None,
unk_token: Optional[str] = None,
):
"""
Train the model using the given files
Args:
files (:obj:`List[str]`):
A list of path to the files that we should use for training
vocab_size (:obj:`int`):
The size of the final vocabulary, including all tokens and alphabet.
show_progress (:obj:`bool`):
Whether to show progress bars while training.
special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
A list of special tokens the model should know of.
initial_alphabet (:obj:`List[str]`, `optional`):
A list of characters to include in the initial alphabet, even
if not seen in the training dataset.
If the strings contain more than one character, only the first one
is kept.
unk_token (:obj:`str`, `optional`):
The unknown token to be used by the model.
"""
if special_tokens is None:
special_tokens = []
if initial_alphabet is None:
initial_alphabet = []
trainer = trainers.UnigramTrainer(
vocab_size=vocab_size,
special_tokens=special_tokens,
show_progress=show_progress,
initial_alphabet=initial_alphabet,
unk_token=unk_token,
)
if isinstance(files, str):
files = [files]
self._tokenizer.train(files, trainer=trainer)
def train_from_iterator(
self,
iterator: Union[Iterator[str], Iterator[Iterator[str]]],
vocab_size: int = 8000,
show_progress: bool = True,
special_tokens: Optional[List[Union[str, AddedToken]]] = None,
initial_alphabet: Optional[List[str]] = None,
unk_token: Optional[str] = None,
length: Optional[int] = None,
):
"""
Train the model using the given iterator
Args:
iterator (:obj:`Union[Iterator[str], Iterator[Iterator[str]]]`):
Any iterator over strings or list of strings
vocab_size (:obj:`int`):
The size of the final vocabulary, including all tokens and alphabet.
show_progress (:obj:`bool`):
Whether to show progress bars while training.
special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
A list of special tokens the model should know of.
initial_alphabet (:obj:`List[str]`, `optional`):
A list of characters to include in the initial alphabet, even
if not seen in the training dataset.
If the strings contain more than one character, only the first one
is kept.
unk_token (:obj:`str`, `optional`):
The unknown token to be used by the model.
length (:obj:`int`, `optional`):
The total number of sequences in the iterator. This is used to
provide meaningful progress tracking
"""
if special_tokens is None:
special_tokens = []
if initial_alphabet is None:
initial_alphabet = []
trainer = trainers.UnigramTrainer(
vocab_size=vocab_size,
special_tokens=special_tokens,
show_progress=show_progress,
initial_alphabet=initial_alphabet,
unk_token=unk_token,
)
self._tokenizer.train_from_iterator(
iterator,
trainer=trainer,
length=length,
)
@staticmethod
def from_spm(filename: str):
try:
import sys
sys.path.append(".")
import sentencepiece_model_pb2 as model
except Exception:
raise Exception(
"You don't seem to have the required protobuf file, in order to use this function you need to run `pip install protobuf` and `wget https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_model_pb2.py` for us to be able to read the intrinsics of your spm_file. `pip install sentencepiece` is not required."
)
m = model.ModelProto()
m.ParseFromString(open(filename, "rb").read())
precompiled_charsmap = m.normalizer_spec.precompiled_charsmap
vocab = [(piece.piece, piece.score) for piece in m.pieces]
unk_id = m.trainer_spec.unk_id
model_type = m.trainer_spec.model_type
byte_fallback = m.trainer_spec.byte_fallback
if model_type != 1:
raise Exception(
"You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
)
replacement = ""
add_prefix_space = True
tokenizer = Tokenizer(Unigram(vocab, unk_id, byte_fallback))
if precompiled_charsmap:
tokenizer.normalizer = normalizers.Sequence(
[
normalizers.Precompiled(precompiled_charsmap),
normalizers.Replace(Regex(" {2,}"), " "),
]
)
else:
tokenizer.normalizer = normalizers.Sequence([normalizers.Replace(Regex(" {2,}"), " ")])
prepend_scheme = "always" if add_prefix_space else "never"
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
parameters = {
"model": "SentencePieceUnigram",
}
obj = BaseTokenizer.__new__(SentencePieceUnigramTokenizer, tokenizer, parameters)
BaseTokenizer.__init__(obj, tokenizer, parameters)
return obj

View File

@@ -0,0 +1,8 @@
# Generated content DO NOT EDIT
from .. import models
Model = models.Model
BPE = models.BPE
Unigram = models.Unigram
WordLevel = models.WordLevel
WordPiece = models.WordPiece

View File

@@ -0,0 +1,591 @@
# Generated content DO NOT EDIT
class Model:
"""
Base class for all models
The model represents the actual tokenization algorithm. This is the part that
will contain and manage the learned vocabulary.
This class cannot be constructed directly. Please use one of the concrete models.
"""
def get_trainer(self):
"""
Get the associated :class:`~tokenizers.trainers.Trainer`
Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
:class:`~tokenizers.models.Model`.
Returns:
:class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
"""
pass
def id_to_token(self, id):
"""
Get the token associated to an ID
Args:
id (:obj:`int`):
An ID to convert to a token
Returns:
:obj:`str`: The token associated to the ID
"""
pass
def save(self, folder, prefix):
"""
Save the current model
Save the current model in the given folder, using the given prefix for the various
files that will get created.
Any file with the same name that already exists in this folder will be overwritten.
Args:
folder (:obj:`str`):
The path to the target folder in which to save the various files
prefix (:obj:`str`, `optional`):
An optional prefix, used to prefix each file name
Returns:
:obj:`List[str]`: The list of saved files
"""
pass
def token_to_id(self, tokens):
"""
Get the ID associated to a token
Args:
token (:obj:`str`):
A token to convert to an ID
Returns:
:obj:`int`: The ID associated to the token
"""
pass
def tokenize(self, sequence):
"""
Tokenize a sequence
Args:
sequence (:obj:`str`):
A sequence to tokenize
Returns:
A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
"""
pass
class BPE(Model):
"""
An implementation of the BPE (Byte-Pair Encoding) algorithm
Args:
vocab (:obj:`Dict[str, int]`, `optional`):
A dictionary of string keys and their ids :obj:`{"am": 0,...}`
merges (:obj:`List[Tuple[str, str]]`, `optional`):
A list of pairs of tokens (:obj:`Tuple[str, str]`) :obj:`[("a", "b"),...]`
cache_capacity (:obj:`int`, `optional`):
The number of words that the BPE cache can contain. The cache allows
to speed-up the process by keeping the result of the merge operations
for a number of words.
dropout (:obj:`float`, `optional`):
A float between 0 and 1 that represents the BPE dropout to use.
unk_token (:obj:`str`, `optional`):
The unknown token to be used by the model.
continuing_subword_prefix (:obj:`str`, `optional`):
The prefix to attach to subword units that don't represent a beginning of word.
end_of_word_suffix (:obj:`str`, `optional`):
The suffix to attach to subword units that represent an end of word.
fuse_unk (:obj:`bool`, `optional`):
Whether to fuse any subsequent unknown tokens into a single one
byte_fallback (:obj:`bool`, `optional`):
Whether to use spm byte-fallback trick (defaults to False)
ignore_merges (:obj:`bool`, `optional`):
Whether or not to match tokens with the vocab before using merges.
"""
def __init__(
self,
vocab=None,
merges=None,
cache_capacity=None,
dropout=None,
unk_token=None,
continuing_subword_prefix=None,
end_of_word_suffix=None,
fuse_unk=None,
byte_fallback=False,
ignore_merges=False,
):
pass
@staticmethod
def from_file(cls, vocab, merge, **kwargs):
"""
Instantiate a BPE model from the given files.
This method is roughly equivalent to doing::
vocab, merges = BPE.read_file(vocab_filename, merges_filename)
bpe = BPE(vocab, merges)
If you don't need to keep the :obj:`vocab, merges` values lying around,
this method is more optimized than manually calling
:meth:`~tokenizers.models.BPE.read_file` to initialize a :class:`~tokenizers.models.BPE`
Args:
vocab (:obj:`str`):
The path to a :obj:`vocab.json` file
merges (:obj:`str`):
The path to a :obj:`merges.txt` file
Returns:
:class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files
"""
pass
def get_trainer(self):
"""
Get the associated :class:`~tokenizers.trainers.Trainer`
Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
:class:`~tokenizers.models.Model`.
Returns:
:class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
"""
pass
def id_to_token(self, id):
"""
Get the token associated to an ID
Args:
id (:obj:`int`):
An ID to convert to a token
Returns:
:obj:`str`: The token associated to the ID
"""
pass
@staticmethod
def read_file(self, vocab, merges):
"""
Read a :obj:`vocab.json` and a :obj:`merges.txt` files
This method provides a way to read and parse the content of these files,
returning the relevant data structures. If you want to instantiate some BPE models
from memory, this method gives you the expected input from the standard files.
Args:
vocab (:obj:`str`):
The path to a :obj:`vocab.json` file
merges (:obj:`str`):
The path to a :obj:`merges.txt` file
Returns:
A :obj:`Tuple` with the vocab and the merges:
The vocabulary and merges loaded into memory
"""
pass
def save(self, folder, prefix):
"""
Save the current model
Save the current model in the given folder, using the given prefix for the various
files that will get created.
Any file with the same name that already exists in this folder will be overwritten.
Args:
folder (:obj:`str`):
The path to the target folder in which to save the various files
prefix (:obj:`str`, `optional`):
An optional prefix, used to prefix each file name
Returns:
:obj:`List[str]`: The list of saved files
"""
pass
def token_to_id(self, tokens):
"""
Get the ID associated to a token
Args:
token (:obj:`str`):
A token to convert to an ID
Returns:
:obj:`int`: The ID associated to the token
"""
pass
def tokenize(self, sequence):
"""
Tokenize a sequence
Args:
sequence (:obj:`str`):
A sequence to tokenize
Returns:
A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
"""
pass
class Unigram(Model):
"""
An implementation of the Unigram algorithm
Args:
vocab (:obj:`List[Tuple[str, float]]`, `optional`, `optional`):
A list of vocabulary items and their relative score [("am", -0.2442),...]
"""
def __init__(self, vocab, unk_id, byte_fallback):
pass
def get_trainer(self):
"""
Get the associated :class:`~tokenizers.trainers.Trainer`
Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
:class:`~tokenizers.models.Model`.
Returns:
:class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
"""
pass
def id_to_token(self, id):
"""
Get the token associated to an ID
Args:
id (:obj:`int`):
An ID to convert to a token
Returns:
:obj:`str`: The token associated to the ID
"""
pass
def save(self, folder, prefix):
"""
Save the current model
Save the current model in the given folder, using the given prefix for the various
files that will get created.
Any file with the same name that already exists in this folder will be overwritten.
Args:
folder (:obj:`str`):
The path to the target folder in which to save the various files
prefix (:obj:`str`, `optional`):
An optional prefix, used to prefix each file name
Returns:
:obj:`List[str]`: The list of saved files
"""
pass
def token_to_id(self, tokens):
"""
Get the ID associated to a token
Args:
token (:obj:`str`):
A token to convert to an ID
Returns:
:obj:`int`: The ID associated to the token
"""
pass
def tokenize(self, sequence):
"""
Tokenize a sequence
Args:
sequence (:obj:`str`):
A sequence to tokenize
Returns:
A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
"""
pass
class WordLevel(Model):
"""
An implementation of the WordLevel algorithm
Most simple tokenizer model based on mapping tokens to their corresponding id.
Args:
vocab (:obj:`str`, `optional`):
A dictionary of string keys and their ids :obj:`{"am": 0,...}`
unk_token (:obj:`str`, `optional`):
The unknown token to be used by the model.
"""
def __init__(self, vocab, unk_token):
pass
@staticmethod
def from_file(vocab, unk_token):
"""
Instantiate a WordLevel model from the given file
This method is roughly equivalent to doing::
vocab = WordLevel.read_file(vocab_filename)
wordlevel = WordLevel(vocab)
If you don't need to keep the :obj:`vocab` values lying around, this method is
more optimized than manually calling :meth:`~tokenizers.models.WordLevel.read_file` to
initialize a :class:`~tokenizers.models.WordLevel`
Args:
vocab (:obj:`str`):
The path to a :obj:`vocab.json` file
Returns:
:class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file
"""
pass
def get_trainer(self):
"""
Get the associated :class:`~tokenizers.trainers.Trainer`
Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
:class:`~tokenizers.models.Model`.
Returns:
:class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
"""
pass
def id_to_token(self, id):
"""
Get the token associated to an ID
Args:
id (:obj:`int`):
An ID to convert to a token
Returns:
:obj:`str`: The token associated to the ID
"""
pass
@staticmethod
def read_file(vocab):
"""
Read a :obj:`vocab.json`
This method provides a way to read and parse the content of a vocabulary file,
returning the relevant data structures. If you want to instantiate some WordLevel models
from memory, this method gives you the expected input from the standard files.
Args:
vocab (:obj:`str`):
The path to a :obj:`vocab.json` file
Returns:
:obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
"""
pass
def save(self, folder, prefix):
"""
Save the current model
Save the current model in the given folder, using the given prefix for the various
files that will get created.
Any file with the same name that already exists in this folder will be overwritten.
Args:
folder (:obj:`str`):
The path to the target folder in which to save the various files
prefix (:obj:`str`, `optional`):
An optional prefix, used to prefix each file name
Returns:
:obj:`List[str]`: The list of saved files
"""
pass
def token_to_id(self, tokens):
"""
Get the ID associated to a token
Args:
token (:obj:`str`):
A token to convert to an ID
Returns:
:obj:`int`: The ID associated to the token
"""
pass
def tokenize(self, sequence):
"""
Tokenize a sequence
Args:
sequence (:obj:`str`):
A sequence to tokenize
Returns:
A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
"""
pass
class WordPiece(Model):
"""
An implementation of the WordPiece algorithm
Args:
vocab (:obj:`Dict[str, int]`, `optional`):
A dictionary of string keys and their ids :obj:`{"am": 0,...}`
unk_token (:obj:`str`, `optional`):
The unknown token to be used by the model.
max_input_chars_per_word (:obj:`int`, `optional`):
The maximum number of characters to authorize in a single word.
"""
def __init__(self, vocab, unk_token, max_input_chars_per_word):
pass
@staticmethod
def from_file(vocab, **kwargs):
"""
Instantiate a WordPiece model from the given file
This method is roughly equivalent to doing::
vocab = WordPiece.read_file(vocab_filename)
wordpiece = WordPiece(vocab)
If you don't need to keep the :obj:`vocab` values lying around, this method is
more optimized than manually calling :meth:`~tokenizers.models.WordPiece.read_file` to
initialize a :class:`~tokenizers.models.WordPiece`
Args:
vocab (:obj:`str`):
The path to a :obj:`vocab.txt` file
Returns:
:class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file
"""
pass
def get_trainer(self):
"""
Get the associated :class:`~tokenizers.trainers.Trainer`
Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
:class:`~tokenizers.models.Model`.
Returns:
:class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
"""
pass
def id_to_token(self, id):
"""
Get the token associated to an ID
Args:
id (:obj:`int`):
An ID to convert to a token
Returns:
:obj:`str`: The token associated to the ID
"""
pass
@staticmethod
def read_file(vocab):
"""
Read a :obj:`vocab.txt` file
This method provides a way to read and parse the content of a standard `vocab.txt`
file as used by the WordPiece Model, returning the relevant data structures. If you
want to instantiate some WordPiece models from memory, this method gives you the
expected input from the standard files.
Args:
vocab (:obj:`str`):
The path to a :obj:`vocab.txt` file
Returns:
:obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
"""
pass
def save(self, folder, prefix):
"""
Save the current model
Save the current model in the given folder, using the given prefix for the various
files that will get created.
Any file with the same name that already exists in this folder will be overwritten.
Args:
folder (:obj:`str`):
The path to the target folder in which to save the various files
prefix (:obj:`str`, `optional`):
An optional prefix, used to prefix each file name
Returns:
:obj:`List[str]`: The list of saved files
"""
pass
def token_to_id(self, tokens):
"""
Get the ID associated to a token
Args:
token (:obj:`str`):
A token to convert to an ID
Returns:
:obj:`int`: The ID associated to the token
"""
pass
def tokenize(self, sequence):
"""
Tokenize a sequence
Args:
sequence (:obj:`str`):
A sequence to tokenize
Returns:
A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
"""
pass

View File

@@ -0,0 +1,29 @@
from .. import normalizers
Normalizer = normalizers.Normalizer
BertNormalizer = normalizers.BertNormalizer
NFD = normalizers.NFD
NFKD = normalizers.NFKD
NFC = normalizers.NFC
NFKC = normalizers.NFKC
Sequence = normalizers.Sequence
Lowercase = normalizers.Lowercase
Prepend = normalizers.Prepend
Strip = normalizers.Strip
StripAccents = normalizers.StripAccents
Nmt = normalizers.Nmt
Precompiled = normalizers.Precompiled
Replace = normalizers.Replace
ByteLevel = normalizers.ByteLevel
NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
if normalizer not in NORMALIZERS:
raise ValueError(
"{} is not a known unicode normalizer. Available are {}".format(normalizer, NORMALIZERS.keys())
)
return NORMALIZERS[normalizer]()

View File

@@ -0,0 +1,636 @@
# Generated content DO NOT EDIT
class Normalizer:
"""
Base class for all normalizers
This class is not supposed to be instantiated directly. Instead, any implementation of a
Normalizer will return an instance of this class when instantiated.
"""
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
This method allows to modify a :class:`~tokenizers.NormalizedString` to
keep track of the alignment information. If you just want to see the result
of the normalization on a raw string, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
Args:
normalized (:class:`~tokenizers.NormalizedString`):
The normalized string on which to apply this
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
information. If you need to get/convert offsets, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize`
Args:
sequence (:obj:`str`):
A string to normalize
Returns:
:obj:`str`: A string after normalization
"""
pass
class BertNormalizer(Normalizer):
"""
BertNormalizer
Takes care of normalizing raw text before giving it to a Bert model.
This includes cleaning the text, handling accents, chinese chars and lowercasing
Args:
clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to clean the text, by removing any control characters
and replacing all whitespaces by the classic one.
handle_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to handle chinese chars by putting spaces around them.
strip_accents (:obj:`bool`, `optional`):
Whether to strip all accents. If this option is not specified (ie == None),
then it will be determined by the value for `lowercase` (as in the original Bert).
lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lowercase.
"""
def __init__(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
This method allows to modify a :class:`~tokenizers.NormalizedString` to
keep track of the alignment information. If you just want to see the result
of the normalization on a raw string, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
Args:
normalized (:class:`~tokenizers.NormalizedString`):
The normalized string on which to apply this
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
information. If you need to get/convert offsets, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize`
Args:
sequence (:obj:`str`):
A string to normalize
Returns:
:obj:`str`: A string after normalization
"""
pass
class ByteLevel(Normalizer):
"""
Bytelevel Normalizer
"""
def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
This method allows to modify a :class:`~tokenizers.NormalizedString` to
keep track of the alignment information. If you just want to see the result
of the normalization on a raw string, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
Args:
normalized (:class:`~tokenizers.NormalizedString`):
The normalized string on which to apply this
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
information. If you need to get/convert offsets, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize`
Args:
sequence (:obj:`str`):
A string to normalize
Returns:
:obj:`str`: A string after normalization
"""
pass
class Lowercase(Normalizer):
"""
Lowercase Normalizer
"""
def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
This method allows to modify a :class:`~tokenizers.NormalizedString` to
keep track of the alignment information. If you just want to see the result
of the normalization on a raw string, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
Args:
normalized (:class:`~tokenizers.NormalizedString`):
The normalized string on which to apply this
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
information. If you need to get/convert offsets, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize`
Args:
sequence (:obj:`str`):
A string to normalize
Returns:
:obj:`str`: A string after normalization
"""
pass
class NFC(Normalizer):
"""
NFC Unicode Normalizer
"""
def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
This method allows to modify a :class:`~tokenizers.NormalizedString` to
keep track of the alignment information. If you just want to see the result
of the normalization on a raw string, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
Args:
normalized (:class:`~tokenizers.NormalizedString`):
The normalized string on which to apply this
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
information. If you need to get/convert offsets, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize`
Args:
sequence (:obj:`str`):
A string to normalize
Returns:
:obj:`str`: A string after normalization
"""
pass
class NFD(Normalizer):
"""
NFD Unicode Normalizer
"""
def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
This method allows to modify a :class:`~tokenizers.NormalizedString` to
keep track of the alignment information. If you just want to see the result
of the normalization on a raw string, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
Args:
normalized (:class:`~tokenizers.NormalizedString`):
The normalized string on which to apply this
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
information. If you need to get/convert offsets, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize`
Args:
sequence (:obj:`str`):
A string to normalize
Returns:
:obj:`str`: A string after normalization
"""
pass
class NFKC(Normalizer):
"""
NFKC Unicode Normalizer
"""
def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
This method allows to modify a :class:`~tokenizers.NormalizedString` to
keep track of the alignment information. If you just want to see the result
of the normalization on a raw string, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
Args:
normalized (:class:`~tokenizers.NormalizedString`):
The normalized string on which to apply this
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
information. If you need to get/convert offsets, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize`
Args:
sequence (:obj:`str`):
A string to normalize
Returns:
:obj:`str`: A string after normalization
"""
pass
class NFKD(Normalizer):
"""
NFKD Unicode Normalizer
"""
def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
This method allows to modify a :class:`~tokenizers.NormalizedString` to
keep track of the alignment information. If you just want to see the result
of the normalization on a raw string, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
Args:
normalized (:class:`~tokenizers.NormalizedString`):
The normalized string on which to apply this
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
information. If you need to get/convert offsets, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize`
Args:
sequence (:obj:`str`):
A string to normalize
Returns:
:obj:`str`: A string after normalization
"""
pass
class Nmt(Normalizer):
"""
Nmt normalizer
"""
def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
This method allows to modify a :class:`~tokenizers.NormalizedString` to
keep track of the alignment information. If you just want to see the result
of the normalization on a raw string, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
Args:
normalized (:class:`~tokenizers.NormalizedString`):
The normalized string on which to apply this
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
information. If you need to get/convert offsets, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize`
Args:
sequence (:obj:`str`):
A string to normalize
Returns:
:obj:`str`: A string after normalization
"""
pass
class Precompiled(Normalizer):
"""
Precompiled normalizer
Don't use manually it is used for compatibility for SentencePiece.
"""
def __init__(self, precompiled_charsmap):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
This method allows to modify a :class:`~tokenizers.NormalizedString` to
keep track of the alignment information. If you just want to see the result
of the normalization on a raw string, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
Args:
normalized (:class:`~tokenizers.NormalizedString`):
The normalized string on which to apply this
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
information. If you need to get/convert offsets, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize`
Args:
sequence (:obj:`str`):
A string to normalize
Returns:
:obj:`str`: A string after normalization
"""
pass
class Prepend(Normalizer):
"""
Prepend normalizer
"""
def __init__(self, prepend):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
This method allows to modify a :class:`~tokenizers.NormalizedString` to
keep track of the alignment information. If you just want to see the result
of the normalization on a raw string, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
Args:
normalized (:class:`~tokenizers.NormalizedString`):
The normalized string on which to apply this
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
information. If you need to get/convert offsets, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize`
Args:
sequence (:obj:`str`):
A string to normalize
Returns:
:obj:`str`: A string after normalization
"""
pass
class Replace(Normalizer):
"""
Replace normalizer
"""
def __init__(self, pattern, content):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
This method allows to modify a :class:`~tokenizers.NormalizedString` to
keep track of the alignment information. If you just want to see the result
of the normalization on a raw string, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
Args:
normalized (:class:`~tokenizers.NormalizedString`):
The normalized string on which to apply this
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
information. If you need to get/convert offsets, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize`
Args:
sequence (:obj:`str`):
A string to normalize
Returns:
:obj:`str`: A string after normalization
"""
pass
class Sequence(Normalizer):
"""
Allows concatenating multiple other Normalizer as a Sequence.
All the normalizers run in sequence in the given order
Args:
normalizers (:obj:`List[Normalizer]`):
A list of Normalizer to be run as a sequence
"""
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
This method allows to modify a :class:`~tokenizers.NormalizedString` to
keep track of the alignment information. If you just want to see the result
of the normalization on a raw string, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
Args:
normalized (:class:`~tokenizers.NormalizedString`):
The normalized string on which to apply this
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
information. If you need to get/convert offsets, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize`
Args:
sequence (:obj:`str`):
A string to normalize
Returns:
:obj:`str`: A string after normalization
"""
pass
class Strip(Normalizer):
"""
Strip normalizer
"""
def __init__(self, left=True, right=True):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
This method allows to modify a :class:`~tokenizers.NormalizedString` to
keep track of the alignment information. If you just want to see the result
of the normalization on a raw string, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
Args:
normalized (:class:`~tokenizers.NormalizedString`):
The normalized string on which to apply this
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
information. If you need to get/convert offsets, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize`
Args:
sequence (:obj:`str`):
A string to normalize
Returns:
:obj:`str`: A string after normalization
"""
pass
class StripAccents(Normalizer):
"""
StripAccents normalizer
"""
def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
This method allows to modify a :class:`~tokenizers.NormalizedString` to
keep track of the alignment information. If you just want to see the result
of the normalization on a raw string, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
Args:
normalized (:class:`~tokenizers.NormalizedString`):
The normalized string on which to apply this
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
information. If you need to get/convert offsets, you can use
:meth:`~tokenizers.normalizers.Normalizer.normalize`
Args:
sequence (:obj:`str`):
A string to normalize
Returns:
:obj:`str`: A string after normalization
"""
pass

View File

@@ -0,0 +1,15 @@
# Generated content DO NOT EDIT
from .. import pre_tokenizers
PreTokenizer = pre_tokenizers.PreTokenizer
BertPreTokenizer = pre_tokenizers.BertPreTokenizer
ByteLevel = pre_tokenizers.ByteLevel
CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit
Digits = pre_tokenizers.Digits
Metaspace = pre_tokenizers.Metaspace
Punctuation = pre_tokenizers.Punctuation
Sequence = pre_tokenizers.Sequence
Split = pre_tokenizers.Split
UnicodeScripts = pre_tokenizers.UnicodeScripts
Whitespace = pre_tokenizers.Whitespace
WhitespaceSplit = pre_tokenizers.WhitespaceSplit

View File

@@ -0,0 +1,610 @@
# Generated content DO NOT EDIT
class PreTokenizer:
"""
Base class for all pre-tokenizers
This class is not supposed to be instantiated directly. Instead, any implementation of a
PreTokenizer will return an instance of this class when instantiated.
"""
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
class BertPreTokenizer(PreTokenizer):
"""
BertPreTokenizer
This pre-tokenizer splits tokens on spaces, and also on punctuation.
Each occurrence of a punctuation character will be treated separately.
"""
def __init__(self):
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
class ByteLevel(PreTokenizer):
"""
ByteLevel PreTokenizer
This pre-tokenizer takes care of replacing all bytes of the given string
with a corresponding representation, as well as splitting into words.
Args:
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
use_regex (:obj:`bool`, `optional`, defaults to :obj:`True`):
Set this to :obj:`False` to prevent this `pre_tokenizer` from using
the GPT2 specific regexp for spliting on whitespace.
"""
def __init__(self, add_prefix_space=True, use_regex=True):
pass
@staticmethod
def alphabet():
"""
Returns the alphabet used by this PreTokenizer.
Since the ByteLevel works as its name suggests, at the byte level, it
encodes each byte value to a unique visible character. This means that there is a
total of 256 different characters composing this alphabet.
Returns:
:obj:`List[str]`: A list of characters that compose the alphabet
"""
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
class CharDelimiterSplit(PreTokenizer):
"""
This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
Args:
delimiter: str:
The delimiter char that will be used to split input
"""
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
class Digits(PreTokenizer):
"""
This pre-tokenizer simply splits using the digits in separate tokens
Args:
individual_digits (:obj:`bool`, `optional`, defaults to :obj:`False`):
If set to True, digits will each be separated as follows::
"Call 123 please" -> "Call ", "1", "2", "3", " please"
If set to False, digits will grouped as follows::
"Call 123 please" -> "Call ", "123", " please"
"""
def __init__(self, individual_digits=False):
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
class Metaspace(PreTokenizer):
"""
Metaspace pre-tokenizer
This pre-tokenizer replaces any whitespace by the provided replacement character.
It then tries to split on these spaces.
Args:
replacement (:obj:`str`, `optional`, defaults to :obj:`▁`):
The replacement character. Must be exactly one character. By default we
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
Choices: "always", "never", "first". First means the space is only added on the first
token (relevant when special tokens are used or other pre_tokenizer are used).
"""
def __init__(self, replacement="_", prepend_scheme="always", split=True):
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
class Punctuation(PreTokenizer):
"""
This pre-tokenizer simply splits on punctuation as individual characters.
Args:
behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
The behavior to use when splitting.
Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
"contiguous"
"""
def __init__(self, behavior="isolated"):
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
class Sequence(PreTokenizer):
"""
This pre-tokenizer composes other pre_tokenizers and applies them in sequence
"""
def __init__(self, pretokenizers):
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
class Split(PreTokenizer):
"""
Split PreTokenizer
This versatile pre-tokenizer splits using the provided pattern and
according to the provided behavior. The pattern can be inverted by
making use of the invert flag.
Args:
pattern (:obj:`str` or :class:`~tokenizers.Regex`):
A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`.
If you want to use a regex pattern, it has to be wrapped around a `tokenizers.Regex`,
otherwise we consider is as a string pattern. For example `pattern="|"`
means you want to split on `|` (imagine a csv file for example), while
`pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'.
behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
The behavior to use when splitting.
Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
"contiguous"
invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to invert the pattern.
"""
def __init__(self, pattern, behavior, invert=False):
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
class UnicodeScripts(PreTokenizer):
"""
This pre-tokenizer splits on characters that belong to different language family
It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt
Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
This mimicks SentencePiece Unigram implementation.
"""
def __init__(self):
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
class Whitespace(PreTokenizer):
"""
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
"""
def __init__(self):
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
class WhitespaceSplit(PreTokenizer):
"""
This pre-tokenizer simply splits on the whitespace. Works like `.split()`
"""
def __init__(self):
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass

View File

@@ -0,0 +1,9 @@
# Generated content DO NOT EDIT
from .. import processors
PostProcessor = processors.PostProcessor
BertProcessing = processors.BertProcessing
ByteLevel = processors.ByteLevel
RobertaProcessing = processors.RobertaProcessing
Sequence = processors.Sequence
TemplateProcessing = processors.TemplateProcessing

View File

@@ -0,0 +1,342 @@
# Generated content DO NOT EDIT
class PostProcessor:
"""
Base class for all post-processors
This class is not supposed to be instantiated directly. Instead, any implementation of
a PostProcessor will return an instance of this class when instantiated.
"""
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
Args:
is_pair (:obj:`bool`):
Whether the input would be a pair of sequences
Returns:
:obj:`int`: The number of tokens to add
"""
pass
def process(self, encoding, pair=None, add_special_tokens=True):
"""
Post-process the given encodings, generating the final one
Args:
encoding (:class:`~tokenizers.Encoding`):
The encoding for the first sequence
pair (:class:`~tokenizers.Encoding`, `optional`):
The encoding for the pair sequence
add_special_tokens (:obj:`bool`):
Whether to add the special tokens
Return:
:class:`~tokenizers.Encoding`: The final encoding
"""
pass
class BertProcessing(PostProcessor):
"""
This post-processor takes care of adding the special tokens needed by
a Bert model:
- a SEP token
- a CLS token
Args:
sep (:obj:`Tuple[str, int]`):
A tuple with the string representation of the SEP token, and its id
cls (:obj:`Tuple[str, int]`):
A tuple with the string representation of the CLS token, and its id
"""
def __init__(self, sep, cls):
pass
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
Args:
is_pair (:obj:`bool`):
Whether the input would be a pair of sequences
Returns:
:obj:`int`: The number of tokens to add
"""
pass
def process(self, encoding, pair=None, add_special_tokens=True):
"""
Post-process the given encodings, generating the final one
Args:
encoding (:class:`~tokenizers.Encoding`):
The encoding for the first sequence
pair (:class:`~tokenizers.Encoding`, `optional`):
The encoding for the pair sequence
add_special_tokens (:obj:`bool`):
Whether to add the special tokens
Return:
:class:`~tokenizers.Encoding`: The final encoding
"""
pass
class ByteLevel(PostProcessor):
"""
This post-processor takes care of trimming the offsets.
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
want the offsets to include these whitespaces, then this PostProcessor must be used.
Args:
trim_offsets (:obj:`bool`):
Whether to trim the whitespaces from the produced offsets.
"""
def __init__(self, trim_offsets=True):
pass
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
Args:
is_pair (:obj:`bool`):
Whether the input would be a pair of sequences
Returns:
:obj:`int`: The number of tokens to add
"""
pass
def process(self, encoding, pair=None, add_special_tokens=True):
"""
Post-process the given encodings, generating the final one
Args:
encoding (:class:`~tokenizers.Encoding`):
The encoding for the first sequence
pair (:class:`~tokenizers.Encoding`, `optional`):
The encoding for the pair sequence
add_special_tokens (:obj:`bool`):
Whether to add the special tokens
Return:
:class:`~tokenizers.Encoding`: The final encoding
"""
pass
class RobertaProcessing(PostProcessor):
"""
This post-processor takes care of adding the special tokens needed by
a Roberta model:
- a SEP token
- a CLS token
It also takes care of trimming the offsets.
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
want the offsets to include these whitespaces, then this PostProcessor should be initialized
with :obj:`trim_offsets=True`
Args:
sep (:obj:`Tuple[str, int]`):
A tuple with the string representation of the SEP token, and its id
cls (:obj:`Tuple[str, int]`):
A tuple with the string representation of the CLS token, and its id
trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to trim the whitespaces from the produced offsets.
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether the add_prefix_space option was enabled during pre-tokenization. This
is relevant because it defines the way the offsets are trimmed out.
"""
def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True):
pass
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
Args:
is_pair (:obj:`bool`):
Whether the input would be a pair of sequences
Returns:
:obj:`int`: The number of tokens to add
"""
pass
def process(self, encoding, pair=None, add_special_tokens=True):
"""
Post-process the given encodings, generating the final one
Args:
encoding (:class:`~tokenizers.Encoding`):
The encoding for the first sequence
pair (:class:`~tokenizers.Encoding`, `optional`):
The encoding for the pair sequence
add_special_tokens (:obj:`bool`):
Whether to add the special tokens
Return:
:class:`~tokenizers.Encoding`: The final encoding
"""
pass
class Sequence(PostProcessor):
"""
Sequence Processor
Args:
processors (:obj:`List[PostProcessor]`)
The processors that need to be chained
"""
def __init__(self, processors):
pass
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
Args:
is_pair (:obj:`bool`):
Whether the input would be a pair of sequences
Returns:
:obj:`int`: The number of tokens to add
"""
pass
def process(self, encoding, pair=None, add_special_tokens=True):
"""
Post-process the given encodings, generating the final one
Args:
encoding (:class:`~tokenizers.Encoding`):
The encoding for the first sequence
pair (:class:`~tokenizers.Encoding`, `optional`):
The encoding for the pair sequence
add_special_tokens (:obj:`bool`):
Whether to add the special tokens
Return:
:class:`~tokenizers.Encoding`: The final encoding
"""
pass
class TemplateProcessing(PostProcessor):
"""
Provides a way to specify templates in order to add the special tokens to each
input sequence as relevant.
Let's take :obj:`BERT` tokenizer as an example. It uses two special tokens, used to
delimitate each sequence. :obj:`[CLS]` is always used at the beginning of the first
sequence, and :obj:`[SEP]` is added at the end of both the first, and the pair
sequences. The final result looks like this:
- Single sequence: :obj:`[CLS] Hello there [SEP]`
- Pair sequences: :obj:`[CLS] My name is Anthony [SEP] What is my name? [SEP]`
With the type ids as following::
[CLS] ... [SEP] ... [SEP]
0 0 0 1 1
You can achieve such behavior using a TemplateProcessing::
TemplateProcessing(
single="[CLS] $0 [SEP]",
pair="[CLS] $A [SEP] $B:1 [SEP]:1",
special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
)
In this example, each input sequence is identified using a ``$`` construct. This identifier
lets us specify each input sequence, and the type_id to use. When nothing is specified,
it uses the default values. Here are the different ways to specify it:
- Specifying the sequence, with default ``type_id == 0``: ``$A`` or ``$B``
- Specifying the `type_id` with default ``sequence == A``: ``$0``, ``$1``, ``$2``, ...
- Specifying both: ``$A:0``, ``$B:1``, ...
The same construct is used for special tokens: ``<identifier>(:<type_id>)?``.
**Warning**: You must ensure that you are giving the correct tokens/ids as these
will be added to the Encoding without any further check. If the given ids correspond
to something totally different in a `Tokenizer` using this `PostProcessor`, it
might lead to unexpected results.
Args:
single (:obj:`Template`):
The template used for single sequences
pair (:obj:`Template`):
The template used when both sequences are specified
special_tokens (:obj:`Tokens`):
The list of special tokens used in each sequences
Types:
Template (:obj:`str` or :obj:`List`):
- If a :obj:`str` is provided, the whitespace is used as delimiter between tokens
- If a :obj:`List[str]` is provided, a list of tokens
Tokens (:obj:`List[Union[Tuple[int, str], Tuple[str, int], dict]]`):
- A :obj:`Tuple` with both a token and its associated ID, in any order
- A :obj:`dict` with the following keys:
- "id": :obj:`str` => The special token id, as specified in the Template
- "ids": :obj:`List[int]` => The associated IDs
- "tokens": :obj:`List[str]` => The associated tokens
The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have
the same length.
"""
def __init__(self, single, pair, special_tokens):
pass
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
Args:
is_pair (:obj:`bool`):
Whether the input would be a pair of sequences
Returns:
:obj:`int`: The number of tokens to add
"""
pass
def process(self, encoding, pair=None, add_special_tokens=True):
"""
Post-process the given encodings, generating the final one
Args:
encoding (:class:`~tokenizers.Encoding`):
The encoding for the first sequence
pair (:class:`~tokenizers.Encoding`, `optional`):
The encoding for the pair sequence
add_special_tokens (:obj:`bool`):
Whether to add the special tokens
Return:
:class:`~tokenizers.Encoding`: The final encoding
"""
pass

View File

@@ -0,0 +1 @@
from .visualizer import Annotation, EncodingVisualizer

View File

@@ -0,0 +1,170 @@
.tokenized-text {
width:100%;
padding:2rem;
max-height: 400px;
overflow-y: auto;
box-sizing:border-box;
line-height:4rem; /* Lots of space between lines */
font-family: "Roboto Light", "Ubuntu Light", "Ubuntu", monospace;
box-shadow: 2px 2px 2px rgba(0,0,0,0.2);
background-color: rgba(0,0,0,0.01);
letter-spacing:2px; /* Give some extra separation between chars */
}
.non-token{
/* White space and other things the tokenizer ignores*/
white-space: pre;
letter-spacing:4px;
border-top:1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more ovious*/
border-bottom:1px solid #A0A0A0;
line-height: 1rem;
height: calc(100% - 2px);
}
.token {
white-space: pre;
position:relative;
color:black;
letter-spacing:2px;
}
.annotation{
white-space:nowrap; /* Important - ensures that annotations appears even if the annotated text wraps a line */
border-radius:4px;
position:relative;
width:fit-content;
}
.annotation:before {
/*The before holds the text and the after holds the background*/
z-index:1000; /* Make sure this is above the background */
content:attr(data-label); /* The annotations label is on a data attribute */
color:white;
position:absolute;
font-size:1rem;
text-align:center;
font-weight:bold;
top:1.75rem;
line-height:0;
left:0;
width:100%;
padding:0.5rem 0;
/* These make it so an annotation doesn't stretch beyond the annotated text if the label is longer*/
overflow: hidden;
white-space: nowrap;
text-overflow:ellipsis;
}
.annotation:after {
content:attr(data-label); /* The content defines the width of the annotation*/
position:absolute;
font-size:0.75rem;
text-align:center;
font-weight:bold;
text-overflow:ellipsis;
top:1.75rem;
line-height:0;
overflow: hidden;
white-space: nowrap;
left:0;
width:100%; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/
padding:0.5rem 0;
/* Nast hack below:
We set the annotations color in code because we don't know the colors at css time.
But you can't pass a color as a data attribute to get it into the pseudo element (this thing)
So to get around that, annotations have the color set on them with a style attribute and then we
can get the color with currentColor.
Annotations wrap tokens and tokens set the color back to black
*/
background-color: currentColor;
}
.annotation:hover::after, .annotation:hover::before{
/* When the user hovers over an annotation expand the label to display in full
*/
min-width: fit-content;
}
.annotation:hover{
/* Emphasize the annotation start end with a border on hover*/
border-color: currentColor;
border: 2px solid;
}
.special-token:not(:empty){
/*
A none empty special token is like UNK (as opposed to CLS which has no representation in the text )
*/
position:relative;
}
.special-token:empty::before{
/* Special tokens that don't have text are displayed as pseudo elements so we dont select them with the mouse*/
content:attr(data-stok);
background:#202020;
font-size:0.75rem;
color:white;
margin: 0 0.25rem;
padding: 0.25rem;
border-radius:4px
}
.special-token:not(:empty):before {
/* Special tokens that have text (UNK) are displayed above the actual text*/
content:attr(data-stok);
position:absolute;
bottom:1.75rem;
min-width:100%;
width:100%;
height:1rem;
line-height:1rem;
font-size:1rem;
text-align:center;
color:white;
font-weight:bold;
background:#202020;
border-radius:10%;
}
/*
We want to alternate the color of tokens, but we can't use nth child because tokens might be broken up by annotations
instead we apply even and odd class at generation time and color them that way
*/
.even-token{
background:#DCDCDC ;
border: 1px solid #DCDCDC;
}
.odd-token{
background:#A0A0A0;
border: 1px solid #A0A0A0;
}
.even-token.multi-token,.odd-token.multi-token{
background: repeating-linear-gradient(
45deg,
transparent,
transparent 1px,
#ccc 1px,
#ccc 1px
),
/* on "bottom" */
linear-gradient(
to bottom,
#FFB6C1,
#999
);
}
.multi-token:hover::after {
content:"This char has more than 1 token"; /* The content defines the width of the annotation*/
color:white;
background-color: black;
position:absolute;
font-size:0.75rem;
text-align:center;
font-weight:bold;
text-overflow:ellipsis;
top:1.75rem;
line-height:0;
overflow: hidden;
white-space: nowrap;
left:0;
width:fit-content; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/
padding:0.5rem 0;
}

View File

@@ -0,0 +1,403 @@
import itertools
import os
import re
from string import Template
from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple
from tokenizers import Encoding, Tokenizer
dirname = os.path.dirname(__file__)
css_filename = os.path.join(dirname, "visualizer-styles.css")
with open(css_filename) as f:
css = f.read()
class Annotation:
start: int
end: int
label: int
def __init__(self, start: int, end: int, label: str):
self.start = start
self.end = end
self.label = label
AnnotationList = List[Annotation]
PartialIntList = List[Optional[int]]
class CharStateKey(NamedTuple):
token_ix: Optional[int]
anno_ix: Optional[int]
class CharState:
char_ix: Optional[int]
def __init__(self, char_ix):
self.char_ix = char_ix
self.anno_ix: Optional[int] = None
self.tokens: List[int] = []
@property
def token_ix(self):
return self.tokens[0] if len(self.tokens) > 0 else None
@property
def is_multitoken(self):
"""
BPE tokenizers can output more than one token for a char
"""
return len(self.tokens) > 1
def partition_key(self) -> CharStateKey:
return CharStateKey(
token_ix=self.token_ix,
anno_ix=self.anno_ix,
)
class Aligned:
pass
class EncodingVisualizer:
"""
Build an EncodingVisualizer
Args:
tokenizer (:class:`~tokenizers.Tokenizer`):
A tokenizer instance
default_to_notebook (:obj:`bool`):
Whether to render html output in a notebook by default
annotation_converter (:obj:`Callable`, `optional`):
An optional (lambda) function that takes an annotation in any format and returns
an Annotation object
"""
unk_token_regex = re.compile("(.{1}\b)?(unk|oov)(\b.{1})?", flags=re.IGNORECASE)
def __init__(
self,
tokenizer: Tokenizer,
default_to_notebook: bool = True,
annotation_converter: Optional[Callable[[Any], Annotation]] = None,
):
if default_to_notebook:
try:
from IPython.core.display import HTML, display
except ImportError:
raise Exception(
"""We couldn't import IPython utils for html display.
Are you running in a notebook?
You can also pass `default_to_notebook=False` to get back raw HTML
"""
)
self.tokenizer = tokenizer
self.default_to_notebook = default_to_notebook
self.annotation_coverter = annotation_converter
pass
def __call__(
self,
text: str,
annotations: AnnotationList = [],
default_to_notebook: Optional[bool] = None,
) -> Optional[str]:
"""
Build a visualization of the given text
Args:
text (:obj:`str`):
The text to tokenize
annotations (:obj:`List[Annotation]`, `optional`):
An optional list of annotations of the text. The can either be an annotation class
or anything else if you instantiated the visualizer with a converter function
default_to_notebook (:obj:`bool`, `optional`, defaults to `False`):
If True, will render the html in a notebook. Otherwise returns an html string.
Returns:
The HTML string if default_to_notebook is False, otherwise (default) returns None and
renders the HTML in the notebook
"""
final_default_to_notebook = self.default_to_notebook
if default_to_notebook is not None:
final_default_to_notebook = default_to_notebook
if final_default_to_notebook:
try:
from IPython.core.display import HTML, display
except ImportError:
raise Exception(
"""We couldn't import IPython utils for html display.
Are you running in a notebook?"""
)
if self.annotation_coverter is not None:
annotations = list(map(self.annotation_coverter, annotations))
encoding = self.tokenizer.encode(text)
html = EncodingVisualizer.__make_html(text, encoding, annotations)
if final_default_to_notebook:
display(HTML(html))
else:
return html
@staticmethod
def calculate_label_colors(annotations: AnnotationList) -> Dict[str, str]:
"""
Generates a color palette for all the labels in a given set of annotations
Args:
annotations (:obj:`Annotation`):
A list of annotations
Returns:
:obj:`dict`: A dictionary mapping labels to colors in HSL format
"""
if len(annotations) == 0:
return {}
labels = set(map(lambda x: x.label, annotations))
num_labels = len(labels)
h_step = int(255 / num_labels)
if h_step < 20:
h_step = 20
s = 32
l = 64 # noqa: E741
h = 10
colors = {}
for label in sorted(labels): # sort so we always get the same colors for a given set of labels
colors[label] = f"hsl({h},{s}%,{l}%"
h += h_step
return colors
@staticmethod
def consecutive_chars_to_html(
consecutive_chars_list: List[CharState],
text: str,
encoding: Encoding,
):
"""
Converts a list of "consecutive chars" into a single HTML element.
Chars are consecutive if they fall under the same word, token and annotation.
The CharState class is a named tuple with a "partition_key" method that makes it easy to
compare if two chars are consecutive.
Args:
consecutive_chars_list (:obj:`List[CharState]`):
A list of CharStates that have been grouped together
text (:obj:`str`):
The original text being processed
encoding (:class:`~tokenizers.Encoding`):
The encoding returned from the tokenizer
Returns:
:obj:`str`: The HTML span for a set of consecutive chars
"""
first = consecutive_chars_list[0]
if first.char_ix is None:
# its a special token
stoken = encoding.tokens[first.token_ix]
# special tokens are represented as empty spans. We use the data attribute and css
# magic to display it
return f'<span class="special-token" data-stoken={stoken}></span>'
# We're not in a special token so this group has a start and end.
last = consecutive_chars_list[-1]
start = first.char_ix
end = last.char_ix + 1
span_text = text[start:end]
css_classes = [] # What css classes will we apply on the resulting span
data_items = {} # What data attributes will we apply on the result span
if first.token_ix is not None:
# We can either be in a token or not (e.g. in white space)
css_classes.append("token")
if first.is_multitoken:
css_classes.append("multi-token")
if first.token_ix % 2:
# We use this to color alternating tokens.
# A token might be split by an annotation that ends in the middle of it, so this
# lets us visually indicate a consecutive token despite its possible splitting in
# the html markup
css_classes.append("odd-token")
else:
# Like above, but a different color so we can see the tokens alternate
css_classes.append("even-token")
if EncodingVisualizer.unk_token_regex.search(encoding.tokens[first.token_ix]) is not None:
# This is a special token that is in the text. probably UNK
css_classes.append("special-token")
# TODO is this the right name for the data attribute ?
data_items["stok"] = encoding.tokens[first.token_ix]
else:
# In this case we are looking at a group/single char that is not tokenized.
# e.g. white space
css_classes.append("non-token")
css = f'''class="{" ".join(css_classes)}"'''
data = ""
for key, val in data_items.items():
data += f' data-{key}="{val}"'
return f"<span {css} {data} >{span_text}</span>"
@staticmethod
def __make_html(text: str, encoding: Encoding, annotations: AnnotationList) -> str:
char_states = EncodingVisualizer.__make_char_states(text, encoding, annotations)
current_consecutive_chars = [char_states[0]]
prev_anno_ix = char_states[0].anno_ix
spans = []
label_colors_dict = EncodingVisualizer.calculate_label_colors(annotations)
cur_anno_ix = char_states[0].anno_ix
if cur_anno_ix is not None:
# If we started in an annotation make a span for it
anno = annotations[cur_anno_ix]
label = anno.label
color = label_colors_dict[label]
spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
for cs in char_states[1:]:
cur_anno_ix = cs.anno_ix
if cur_anno_ix != prev_anno_ix:
# If we've transitioned in or out of an annotation
spans.append(
# Create a span from the current consecutive characters
EncodingVisualizer.consecutive_chars_to_html(
current_consecutive_chars,
text=text,
encoding=encoding,
)
)
current_consecutive_chars = [cs]
if prev_anno_ix is not None:
# if we transitioned out of an annotation close it's span
spans.append("</span>")
if cur_anno_ix is not None:
# If we entered a new annotation make a span for it
anno = annotations[cur_anno_ix]
label = anno.label
color = label_colors_dict[label]
spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
prev_anno_ix = cur_anno_ix
if cs.partition_key() == current_consecutive_chars[0].partition_key():
# If the current charchter is in the same "group" as the previous one
current_consecutive_chars.append(cs)
else:
# Otherwise we make a span for the previous group
spans.append(
EncodingVisualizer.consecutive_chars_to_html(
current_consecutive_chars,
text=text,
encoding=encoding,
)
)
# An reset the consecutive_char_list to form a new group
current_consecutive_chars = [cs]
# All that's left is to fill out the final span
# TODO I think there is an edge case here where an annotation's span might not close
spans.append(
EncodingVisualizer.consecutive_chars_to_html(
current_consecutive_chars,
text=text,
encoding=encoding,
)
)
res = HTMLBody(spans) # Send the list of spans to the body of our html
return res
@staticmethod
def __make_anno_map(text: str, annotations: AnnotationList) -> PartialIntList:
"""
Args:
text (:obj:`str`):
The raw text we want to align to
annotations (:obj:`AnnotationList`):
A (possibly empty) list of annotations
Returns:
A list of length len(text) whose entry at index i is None if there is no annotation on
character i or k, the index of the annotation that covers index i where k is with
respect to the list of annotations
"""
annotation_map = [None] * len(text)
for anno_ix, a in enumerate(annotations):
for i in range(a.start, a.end):
annotation_map[i] = anno_ix
return annotation_map
@staticmethod
def __make_char_states(text: str, encoding: Encoding, annotations: AnnotationList) -> List[CharState]:
"""
For each character in the original text, we emit a tuple representing it's "state":
* which token_ix it corresponds to
* which word_ix it corresponds to
* which annotation_ix it corresponds to
Args:
text (:obj:`str`):
The raw text we want to align to
annotations (:obj:`List[Annotation]`):
A (possibly empty) list of annotations
encoding: (:class:`~tokenizers.Encoding`):
The encoding returned from the tokenizer
Returns:
:obj:`List[CharState]`: A list of CharStates, indicating for each char in the text what
it's state is
"""
annotation_map = EncodingVisualizer.__make_anno_map(text, annotations)
# Todo make this a dataclass or named tuple
char_states: List[CharState] = [CharState(char_ix) for char_ix in range(len(text))]
for token_ix, token in enumerate(encoding.tokens):
offsets = encoding.token_to_chars(token_ix)
if offsets is not None:
start, end = offsets
for i in range(start, end):
char_states[i].tokens.append(token_ix)
for char_ix, anno_ix in enumerate(annotation_map):
char_states[char_ix].anno_ix = anno_ix
return char_states
def HTMLBody(children: List[str], css_styles=css) -> str:
"""
Generates the full html with css from a list of html spans
Args:
children (:obj:`List[str]`):
A list of strings, assumed to be html elements
css_styles (:obj:`str`, `optional`):
Optional alternative implementation of the css
Returns:
:obj:`str`: An HTML string with style markup
"""
children_text = "".join(children)
return f"""
<html>
<head>
<style>
{css_styles}
</style>
</head>
<body>
<div class="tokenized-text" dir=auto>
{children_text}
</div>
</body>
</html>
"""

View File

@@ -0,0 +1,8 @@
# Generated content DO NOT EDIT
from .. import trainers
Trainer = trainers.Trainer
BpeTrainer = trainers.BpeTrainer
UnigramTrainer = trainers.UnigramTrainer
WordLevelTrainer = trainers.WordLevelTrainer
WordPieceTrainer = trainers.WordPieceTrainer

View File

@@ -0,0 +1,156 @@
# Generated content DO NOT EDIT
class Trainer:
"""
Base class for all trainers
This class is not supposed to be instantiated directly. Instead, any implementation of a
Trainer will return an instance of this class when instantiated.
"""
class BpeTrainer(Trainer):
"""
Trainer capable of training a BPE model
Args:
vocab_size (:obj:`int`, `optional`):
The size of the final vocabulary, including all tokens and alphabet.
min_frequency (:obj:`int`, `optional`):
The minimum frequency a pair should have in order to be merged.
show_progress (:obj:`bool`, `optional`):
Whether to show progress bars while training.
special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
A list of special tokens the model should know of.
limit_alphabet (:obj:`int`, `optional`):
The maximum different characters to keep in the alphabet.
initial_alphabet (:obj:`List[str]`, `optional`):
A list of characters to include in the initial alphabet, even
if not seen in the training dataset.
If the strings contain more than one character, only the first one
is kept.
continuing_subword_prefix (:obj:`str`, `optional`):
A prefix to be used for every subword that is not a beginning-of-word.
end_of_word_suffix (:obj:`str`, `optional`):
A suffix to be used for every subword that is a end-of-word.
max_token_length (:obj:`int`, `optional`):
Prevents creating tokens longer than the specified size.
This can help with reducing polluting your vocabulary with
highly repetitive tokens like `======` for wikipedia
"""
class UnigramTrainer(Trainer):
"""
Trainer capable of training a Unigram model
Args:
vocab_size (:obj:`int`):
The size of the final vocabulary, including all tokens and alphabet.
show_progress (:obj:`bool`):
Whether to show progress bars while training.
special_tokens (:obj:`List[Union[str, AddedToken]]`):
A list of special tokens the model should know of.
initial_alphabet (:obj:`List[str]`):
A list of characters to include in the initial alphabet, even
if not seen in the training dataset.
If the strings contain more than one character, only the first one
is kept.
shrinking_factor (:obj:`float`):
The shrinking factor used at each step of the training to prune the
vocabulary.
unk_token (:obj:`str`):
The token used for out-of-vocabulary tokens.
max_piece_length (:obj:`int`):
The maximum length of a given token.
n_sub_iterations (:obj:`int`):
The number of iterations of the EM algorithm to perform before
pruning the vocabulary.
"""
def __init__(
self,
vocab_size=8000,
show_progress=True,
special_tokens=[],
shrinking_factor=0.75,
unk_token=None,
max_piece_length=16,
n_sub_iterations=2,
):
pass
class WordLevelTrainer(Trainer):
"""
Trainer capable of training a WorldLevel model
Args:
vocab_size (:obj:`int`, `optional`):
The size of the final vocabulary, including all tokens and alphabet.
min_frequency (:obj:`int`, `optional`):
The minimum frequency a pair should have in order to be merged.
show_progress (:obj:`bool`, `optional`):
Whether to show progress bars while training.
special_tokens (:obj:`List[Union[str, AddedToken]]`):
A list of special tokens the model should know of.
"""
class WordPieceTrainer(Trainer):
"""
Trainer capable of training a WordPiece model
Args:
vocab_size (:obj:`int`, `optional`):
The size of the final vocabulary, including all tokens and alphabet.
min_frequency (:obj:`int`, `optional`):
The minimum frequency a pair should have in order to be merged.
show_progress (:obj:`bool`, `optional`):
Whether to show progress bars while training.
special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
A list of special tokens the model should know of.
limit_alphabet (:obj:`int`, `optional`):
The maximum different characters to keep in the alphabet.
initial_alphabet (:obj:`List[str]`, `optional`):
A list of characters to include in the initial alphabet, even
if not seen in the training dataset.
If the strings contain more than one character, only the first one
is kept.
continuing_subword_prefix (:obj:`str`, `optional`):
A prefix to be used for every subword that is not a beginning-of-word.
end_of_word_suffix (:obj:`str`, `optional`):
A suffix to be used for every subword that is a end-of-word.
"""
def __init__(
self,
vocab_size=30000,
min_frequency=0,
show_progress=True,
special_tokens=[],
limit_alphabet=None,
initial_alphabet=[],
continuing_subword_prefix="##",
end_of_word_suffix=None,
):
pass