structure saas with tools
This commit is contained in:
100  .venv/lib/python3.10/site-packages/tokenizers/__init__.py  Normal file
@@ -0,0 +1,100 @@
from enum import Enum
from typing import List, Tuple, Union


Offsets = Tuple[int, int]

TextInputSequence = str
"""A :obj:`str` that represents an input sequence """

PreTokenizedInputSequence = Union[List[str], Tuple[str]]
"""A pre-tokenized input sequence. Can be one of:

    - A :obj:`List` of :obj:`str`
    - A :obj:`Tuple` of :obj:`str`
"""

TextEncodeInput = Union[
    TextInputSequence,
    Tuple[TextInputSequence, TextInputSequence],
    List[TextInputSequence],
]
"""Represents a textual input for encoding. Can be either:

    - A single sequence: :data:`~tokenizers.TextInputSequence`
    - A pair of sequences:

      - A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence`
      - Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2
"""

PreTokenizedEncodeInput = Union[
    PreTokenizedInputSequence,
    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
    List[PreTokenizedInputSequence],
]
"""Represents a pre-tokenized input for encoding. Can be either:

    - A single sequence: :data:`~tokenizers.PreTokenizedInputSequence`
    - A pair of sequences:

      - A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence`
      - Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2
"""

InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
"""Represents all the possible types of input sequences for encoding. Can be:

    - When ``is_pretokenized=False``: :data:`~TextInputSequence`
    - When ``is_pretokenized=True``: :data:`~PreTokenizedInputSequence`
"""

EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
"""Represents all the possible types of input for encoding. Can be:

    - When ``is_pretokenized=False``: :data:`~TextEncodeInput`
    - When ``is_pretokenized=True``: :data:`~PreTokenizedEncodeInput`
"""


class OffsetReferential(Enum):
    ORIGINAL = "original"
    NORMALIZED = "normalized"


class OffsetType(Enum):
    BYTE = "byte"
    CHAR = "char"


class SplitDelimiterBehavior(Enum):
    REMOVED = "removed"
    ISOLATED = "isolated"
    MERGED_WITH_PREVIOUS = "merged_with_previous"
    MERGED_WITH_NEXT = "merged_with_next"
    CONTIGUOUS = "contiguous"


from .tokenizers import (
    AddedToken,
    Encoding,
    NormalizedString,
    PreTokenizedString,
    Regex,
    Token,
    Tokenizer,
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    __version__,
)
from .implementations import (
    BertWordPieceTokenizer,
    ByteLevelBPETokenizer,
    CharBPETokenizer,
    SentencePieceBPETokenizer,
    SentencePieceUnigramTokenizer,
)
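The aliases above are plain typing unions. A minimal illustration (not part of the committed file; the strings are invented) of the three input shapes they describe:

from tokenizers import PreTokenizedEncodeInput, TextEncodeInput, TextInputSequence

single: TextInputSequence = "Hello world"
pair: TextEncodeInput = ("Hello world", "How are you?")  # a pair of raw sequences
pretok: PreTokenizedEncodeInput = (["Hello", "world"], ["How", "are", "you?"])  # pre-tokenized pair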
1238  .venv/lib/python3.10/site-packages/tokenizers/__init__.pyi  Normal file
File diff suppressed because it is too large
Binary file not shown.
@@ -0,0 +1,15 @@
from .. import decoders


Decoder = decoders.Decoder
ByteLevel = decoders.ByteLevel
Replace = decoders.Replace
WordPiece = decoders.WordPiece
ByteFallback = decoders.ByteFallback
Fuse = decoders.Fuse
Strip = decoders.Strip
Metaspace = decoders.Metaspace
BPEDecoder = decoders.BPEDecoder
CTC = decoders.CTC
Sequence = decoders.Sequence
DecodeStream = decoders.DecodeStream
@@ -0,0 +1,279 @@
|
||||
# Generated content DO NOT EDIT
|
||||
class DecodeStream:
|
||||
"""
|
||||
Class needed for streaming decode
|
||||
|
||||
"""
|
||||
def __init__(self, skip_special_tokens):
|
||||
pass
|
||||
|
||||
class Decoder:
|
||||
"""
|
||||
Base class for all decoders
|
||||
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of
|
||||
a Decoder will return an instance of this class when instantiated.
|
||||
"""
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of tokens to a final string
|
||||
|
||||
Args:
|
||||
tokens (:obj:`List[str]`):
|
||||
The list of tokens to decode
|
||||
|
||||
Returns:
|
||||
:obj:`str`: The decoded string
|
||||
"""
|
||||
pass
|
||||
|
||||
class BPEDecoder(Decoder):
|
||||
"""
|
||||
BPEDecoder Decoder
|
||||
|
||||
Args:
|
||||
suffix (:obj:`str`, `optional`, defaults to :obj:`</w>`):
|
||||
The suffix that was used to characterize an end-of-word. This suffix will
|
||||
be replaced by whitespaces during the decoding
|
||||
"""
|
||||
def __init__(self, suffix="</w>"):
|
||||
pass
|
||||
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of tokens to a final string
|
||||
|
||||
Args:
|
||||
tokens (:obj:`List[str]`):
|
||||
The list of tokens to decode
|
||||
|
||||
Returns:
|
||||
:obj:`str`: The decoded string
|
||||
"""
|
||||
pass
|
||||
|
||||
class ByteFallback(Decoder):
|
||||
"""
|
||||
ByteFallback Decoder
|
||||
ByteFallback is a simple trick which converts tokens looking like `<0x61>`
|
||||
to pure bytes, and attempts to make them into a string. If the tokens
|
||||
cannot be decoded you will get � instead for each inconvertible byte token
|
||||
|
||||
"""
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of tokens to a final string
|
||||
|
||||
Args:
|
||||
tokens (:obj:`List[str]`):
|
||||
The list of tokens to decode
|
||||
|
||||
Returns:
|
||||
:obj:`str`: The decoded string
|
||||
"""
|
||||
pass
|
||||
|
||||
class ByteLevel(Decoder):
|
||||
"""
|
||||
ByteLevel Decoder
|
||||
|
||||
This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.ByteLevel`
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`.
|
||||
"""
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of tokens to a final string
|
||||
|
||||
Args:
|
||||
tokens (:obj:`List[str]`):
|
||||
The list of tokens to decode
|
||||
|
||||
Returns:
|
||||
:obj:`str`: The decoded string
|
||||
"""
|
||||
pass
|
||||
|
||||
class CTC(Decoder):
|
||||
"""
|
||||
CTC Decoder
|
||||
|
||||
Args:
|
||||
pad_token (:obj:`str`, `optional`, defaults to :obj:`<pad>`):
|
||||
The pad token used by CTC to delimit a new token.
|
||||
word_delimiter_token (:obj:`str`, `optional`, defaults to :obj:`|`):
|
||||
The word delimiter token. It will be replaced by a <space>
|
||||
cleanup (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether to cleanup some tokenization artifacts.
|
||||
Mainly spaces before punctuation, and some abbreviated english forms.
|
||||
"""
|
||||
def __init__(self, pad_token="<pad>", word_delimiter_token="|", cleanup=True):
|
||||
pass
|
||||
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of tokens to a final string
|
||||
|
||||
Args:
|
||||
tokens (:obj:`List[str]`):
|
||||
The list of tokens to decode
|
||||
|
||||
Returns:
|
||||
:obj:`str`: The decoded string
|
||||
"""
|
||||
pass
|
||||
|
||||
class Fuse(Decoder):
|
||||
"""
|
||||
Fuse Decoder
|
||||
Fuse simply fuses every token into a single string.
|
||||
This is the last step of decoding; this decoder exists only if
there is a need to add other decoders *after* the fusion.
|
||||
"""
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of tokens to a final string
|
||||
|
||||
Args:
|
||||
tokens (:obj:`List[str]`):
|
||||
The list of tokens to decode
|
||||
|
||||
Returns:
|
||||
:obj:`str`: The decoded string
|
||||
"""
|
||||
pass
|
||||
|
||||
class Metaspace(Decoder):
|
||||
"""
|
||||
Metaspace Decoder
|
||||
|
||||
Args:
|
||||
replacement (:obj:`str`, `optional`, defaults to :obj:`▁`):
|
||||
The replacement character. Must be exactly one character. By default we
|
||||
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
|
||||
|
||||
prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
|
||||
Whether to add a space to the first word if there isn't already one. This
|
||||
lets us treat `hello` exactly like `say hello`.
|
||||
Choices: "always", "never", "first". First means the space is only added on the first
|
||||
token (relevant when special tokens are used or another pre-tokenizer is used).
|
||||
"""
|
||||
def __init__(self, replacement="▁", prepend_scheme="always", split=True):
|
||||
pass
|
||||
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of tokens to a final string
|
||||
|
||||
Args:
|
||||
tokens (:obj:`List[str]`):
|
||||
The list of tokens to decode
|
||||
|
||||
Returns:
|
||||
:obj:`str`: The decoded string
|
||||
"""
|
||||
pass
|
||||
|
||||
class Replace(Decoder):
|
||||
"""
|
||||
Replace Decoder
|
||||
|
||||
This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.Replace`
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`.
|
||||
"""
|
||||
def __init__(self, pattern, content):
|
||||
pass
|
||||
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of tokens to a final string
|
||||
|
||||
Args:
|
||||
tokens (:obj:`List[str]`):
|
||||
The list of tokens to decode
|
||||
|
||||
Returns:
|
||||
:obj:`str`: The decoded string
|
||||
"""
|
||||
pass
|
||||
|
||||
class Sequence(Decoder):
|
||||
"""
|
||||
Sequence Decoder
|
||||
|
||||
Args:
|
||||
decoders (:obj:`List[Decoder]`):
|
||||
The decoders that need to be chained
|
||||
"""
|
||||
def __init__(self, decoders):
|
||||
pass
|
||||
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of tokens to a final string
|
||||
|
||||
Args:
|
||||
tokens (:obj:`List[str]`):
|
||||
The list of tokens to decode
|
||||
|
||||
Returns:
|
||||
:obj:`str`: The decoded string
|
||||
"""
|
||||
pass
|
||||
|
||||
class Strip(Decoder):
|
||||
"""
|
||||
Strip decoder
|
||||
Strips n left characters of each token, or n right characters of each token
|
||||
"""
|
||||
def __init__(self, content, left=0, right=0):
|
||||
pass
|
||||
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of tokens to a final string
|
||||
|
||||
Args:
|
||||
tokens (:obj:`List[str]`):
|
||||
The list of tokens to decode
|
||||
|
||||
Returns:
|
||||
:obj:`str`: The decoded string
|
||||
"""
|
||||
pass
|
||||
|
||||
class WordPiece(Decoder):
|
||||
"""
|
||||
WordPiece Decoder
|
||||
|
||||
Args:
|
||||
prefix (:obj:`str`, `optional`, defaults to :obj:`##`):
|
||||
The prefix to use for subwords that are not a beginning-of-word
|
||||
|
||||
cleanup (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
|
||||
and some abbreviated english forms.
|
||||
"""
|
||||
def __init__(self, prefix="##", cleanup=True):
|
||||
pass
|
||||
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of tokens to a final string
|
||||
|
||||
Args:
|
||||
tokens (:obj:`List[str]`):
|
||||
The list of tokens to decode
|
||||
|
||||
Returns:
|
||||
:obj:`str`: The decoded string
|
||||
"""
|
||||
pass
|
||||
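A brief usage sketch for the decoder stubs above (token list invented; the printed result is shown for orientation only):

from tokenizers import decoders

# Sequence chains decoders; WordPiece merges "##"-prefixed continuation pieces.
dec = decoders.Sequence([decoders.WordPiece(prefix="##", cleanup=True)])
print(dec.decode(["hug", "##ging", "face"]))  # "hugging face"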
Binary file not shown.
@@ -0,0 +1,6 @@
from .base_tokenizer import BaseTokenizer
from .bert_wordpiece import BertWordPieceTokenizer
from .byte_level_bpe import ByteLevelBPETokenizer
from .char_level_bpe import CharBPETokenizer
from .sentencepiece_bpe import SentencePieceBPETokenizer
from .sentencepiece_unigram import SentencePieceUnigramTokenizer
Binary file not shown.
@@ -0,0 +1,418 @@
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
|
||||
from tokenizers import AddedToken, EncodeInput, Encoding, InputSequence, Tokenizer
|
||||
from tokenizers.decoders import Decoder
|
||||
from tokenizers.models import Model
|
||||
from tokenizers.normalizers import Normalizer
|
||||
from tokenizers.pre_tokenizers import PreTokenizer
|
||||
from tokenizers.processors import PostProcessor
|
||||
|
||||
|
||||
Offsets = Tuple[int, int]
|
||||
|
||||
|
||||
class BaseTokenizer:
|
||||
def __init__(self, tokenizer: Tokenizer, parameters=None):
|
||||
self._tokenizer = tokenizer
|
||||
self._parameters = parameters if parameters is not None else {}
|
||||
|
||||
def __repr__(self):
|
||||
return "Tokenizer(vocabulary_size={}, {})".format(
|
||||
self._tokenizer.get_vocab_size(),
|
||||
", ".join(k + "=" + str(v) for k, v in self._parameters.items()),
|
||||
)
|
||||
|
||||
def num_special_tokens_to_add(self, is_pair: bool) -> int:
|
||||
"""
|
||||
Return the number of special tokens that would be added for single/pair sentences.
|
||||
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
|
||||
:return: The number of special tokens that would be added
|
||||
"""
|
||||
return self._tokenizer.num_special_tokens_to_add(is_pair)
|
||||
|
||||
def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
|
||||
"""Returns the vocabulary
|
||||
|
||||
Args:
|
||||
with_added_tokens: boolean:
|
||||
Whether to include the added tokens in the vocabulary
|
||||
|
||||
Returns:
|
||||
The vocabulary
|
||||
"""
|
||||
return self._tokenizer.get_vocab(with_added_tokens=with_added_tokens)
|
||||
|
||||
def get_added_tokens_decoder(self) -> Dict[int, AddedToken]:
|
||||
"""Returns the added reverse vocabulary
|
||||
|
||||
Returns:
|
||||
The added vocabulary mapping ints to AddedTokens
|
||||
"""
|
||||
return self._tokenizer.get_added_tokens_decoder()
|
||||
|
||||
def get_vocab_size(self, with_added_tokens: bool = True) -> int:
|
||||
"""Return the size of vocabulary, with or without added tokens.
|
||||
|
||||
Args:
|
||||
with_added_tokens: (`optional`) bool:
|
||||
Whether to count in added special tokens or not
|
||||
|
||||
Returns:
|
||||
Size of vocabulary
|
||||
"""
|
||||
return self._tokenizer.get_vocab_size(with_added_tokens=with_added_tokens)
|
||||
|
||||
def enable_padding(
|
||||
self,
|
||||
direction: Optional[str] = "right",
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
pad_id: Optional[int] = 0,
|
||||
pad_type_id: Optional[int] = 0,
|
||||
pad_token: Optional[str] = "[PAD]",
|
||||
length: Optional[int] = None,
|
||||
):
|
||||
"""Change the padding strategy
|
||||
|
||||
Args:
|
||||
direction: (`optional`) str:
|
||||
Can be one of: `right` or `left`
|
||||
|
||||
pad_to_multiple_of: (`optional`) unsigned int:
|
||||
If specified, the padding length should always snap to the next multiple of
|
||||
the given value. For example if we were going to pad with a length of 250 but
|
||||
`pad_to_multiple_of=8` then we will pad to 256.
|
||||
|
||||
pad_id: (`optional`) unsigned int:
|
||||
The index to be used when padding
|
||||
|
||||
pad_type_id: (`optional`) unsigned int:
|
||||
The type index to be used when padding
|
||||
|
||||
pad_token: (`optional`) str:
|
||||
The pad token to be used when padding
|
||||
|
||||
length: (`optional`) unsigned int:
|
||||
If specified, the length at which to pad. If not specified
|
||||
we pad using the size of the longest sequence in a batch
|
||||
"""
|
||||
return self._tokenizer.enable_padding(
|
||||
direction=direction,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
pad_id=pad_id,
|
||||
pad_type_id=pad_type_id,
|
||||
pad_token=pad_token,
|
||||
length=length,
|
||||
)
|
||||
|
||||
def no_padding(self):
|
||||
"""Disable padding"""
|
||||
return self._tokenizer.no_padding()
|
||||
|
||||
@property
|
||||
def padding(self) -> Optional[dict]:
|
||||
"""Get the current padding parameters
|
||||
|
||||
Returns:
|
||||
None if padding is disabled, a dict with the currently set parameters
|
||||
if the padding is enabled.
|
||||
"""
|
||||
return self._tokenizer.padding
|
||||
|
||||
def enable_truncation(self, max_length: int, stride: Optional[int] = 0, strategy: Optional[str] = "longest_first"):
|
||||
"""Change the truncation options
|
||||
|
||||
Args:
|
||||
max_length: unsigned int:
|
||||
The maximum length at which to truncate
|
||||
|
||||
stride: (`optional`) unsigned int:
|
||||
The length of the previous first sequence to be included
|
||||
in the overflowing sequence
|
||||
|
||||
strategy: (`optional`) str:
|
||||
Can be one of `longest_first`, `only_first` or `only_second`
|
||||
"""
|
||||
return self._tokenizer.enable_truncation(max_length, stride=stride, strategy=strategy)
|
||||
|
||||
def no_truncation(self):
|
||||
"""Disable truncation"""
|
||||
return self._tokenizer.no_truncation()
|
||||
|
||||
@property
|
||||
def truncation(self) -> Optional[dict]:
|
||||
"""Get the current truncation parameters
|
||||
|
||||
Returns:
|
||||
None if truncation is disabled, a dict with the current truncation parameters if
|
||||
truncation is enabled
|
||||
"""
|
||||
return self._tokenizer.truncation
|
||||
|
||||
def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
|
||||
"""Add the given tokens to the vocabulary
|
||||
|
||||
Args:
|
||||
tokens: List[Union[str, AddedToken]]:
|
||||
A list of tokens to add to the vocabulary. Each token can either be
|
||||
a string, or an instance of AddedToken
|
||||
|
||||
Returns:
|
||||
The number of tokens that were added to the vocabulary
|
||||
"""
|
||||
return self._tokenizer.add_tokens(tokens)
|
||||
|
||||
def add_special_tokens(self, special_tokens: List[Union[str, AddedToken]]) -> int:
|
||||
"""Add the given special tokens to the vocabulary, and treat them as special tokens.
|
||||
|
||||
The special tokens will never be processed by the model, and will be
|
||||
removed while decoding.
|
||||
|
||||
Args:
|
||||
tokens: List[Union[str, AddedToken]]:
|
||||
A list of special tokens to add to the vocabulary. Each token can either be
|
||||
a string, or an instance of AddedToken
|
||||
|
||||
Returns:
|
||||
The number of tokens that were added to the vocabulary
|
||||
"""
|
||||
return self._tokenizer.add_special_tokens(special_tokens)
|
||||
|
||||
def normalize(self, sequence: str) -> str:
|
||||
"""Normalize the given sequence
|
||||
|
||||
Args:
|
||||
sequence: str:
|
||||
The sequence to normalize
|
||||
|
||||
Returns:
|
||||
The normalized string
|
||||
"""
|
||||
return self._tokenizer.normalize(sequence)
|
||||
|
||||
def encode(
|
||||
self,
|
||||
sequence: InputSequence,
|
||||
pair: Optional[InputSequence] = None,
|
||||
is_pretokenized: bool = False,
|
||||
add_special_tokens: bool = True,
|
||||
) -> Encoding:
|
||||
"""Encode the given sequence and pair. This method can process raw text sequences as well
|
||||
as already pre-tokenized sequences.
|
||||
|
||||
Args:
|
||||
sequence: InputSequence:
|
||||
The sequence we want to encode. This sequence can be either raw text or
|
||||
pre-tokenized, according to the `is_pretokenized` argument:
|
||||
|
||||
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
|
||||
- If `is_pretokenized=True`: `InputSequence` is expected to be
|
||||
`Union[List[str], Tuple[str]]`
|
||||
|
||||
is_pretokenized: bool:
|
||||
Whether the input is already pre-tokenized.
|
||||
|
||||
add_special_tokens: bool:
|
||||
Whether to add the special tokens while encoding.
|
||||
|
||||
Returns:
|
||||
An Encoding
|
||||
"""
|
||||
if sequence is None:
|
||||
raise ValueError("encode: `sequence` can't be `None`")
|
||||
|
||||
return self._tokenizer.encode(sequence, pair, is_pretokenized, add_special_tokens)
|
||||
|
||||
def encode_batch(
|
||||
self,
|
||||
inputs: List[EncodeInput],
|
||||
is_pretokenized: bool = False,
|
||||
add_special_tokens: bool = True,
|
||||
) -> List[Encoding]:
|
||||
"""Encode the given inputs. This method accept both raw text sequences as well as already
|
||||
pre-tokenized sequences.
|
||||
|
||||
Args:
|
||||
inputs: List[EncodeInput]:
|
||||
A list of single sequences or pair sequences to encode. Each `EncodeInput` is
|
||||
expected to be of the following form:
|
||||
`Union[InputSequence, Tuple[InputSequence, InputSequence]]`
|
||||
|
||||
Each `InputSequence` can either be raw text or pre-tokenized,
|
||||
according to the `is_pretokenized` argument:
|
||||
|
||||
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
|
||||
- If `is_pretokenized=True`: `InputSequence` is expected to be
|
||||
`Union[List[str], Tuple[str]]`
|
||||
|
||||
is_pretokenized: bool:
|
||||
Whether the input is already pre-tokenized.
|
||||
|
||||
add_special_tokens: bool:
|
||||
Whether to add the special tokens while encoding.
|
||||
|
||||
Returns:
|
||||
A list of Encoding
|
||||
"""
|
||||
|
||||
if inputs is None:
|
||||
raise ValueError("encode_batch: `inputs` can't be `None`")
|
||||
|
||||
return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens)
|
||||
|
||||
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
|
||||
"""Decode the given list of ids to a string sequence
|
||||
|
||||
Args:
|
||||
ids: List[unsigned int]:
|
||||
A list of ids to be decoded
|
||||
|
||||
skip_special_tokens: (`optional`) boolean:
|
||||
Whether to remove all the special tokens from the output string
|
||||
|
||||
Returns:
|
||||
The decoded string
|
||||
"""
|
||||
if ids is None:
|
||||
raise ValueError("None input is not valid. Should be a list of integers.")
|
||||
|
||||
return self._tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
|
||||
|
||||
def decode_batch(self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True) -> str:
|
||||
"""Decode the list of sequences to a list of string sequences
|
||||
|
||||
Args:
|
||||
sequences: List[List[unsigned int]]:
|
||||
A list of sequence of ids to be decoded
|
||||
|
||||
skip_special_tokens: (`optional`) boolean:
|
||||
Whether to remove all the special tokens from the output strings
|
||||
|
||||
Returns:
|
||||
A list of decoded strings
|
||||
"""
|
||||
if sequences is None:
|
||||
raise ValueError("None input is not valid. Should be list of list of integers.")
|
||||
|
||||
return self._tokenizer.decode_batch(sequences, skip_special_tokens=skip_special_tokens)
|
||||
|
||||
def token_to_id(self, token: str) -> Optional[int]:
|
||||
"""Convert the given token to its corresponding id
|
||||
|
||||
Args:
|
||||
token: str:
|
||||
The token to convert
|
||||
|
||||
Returns:
|
||||
The corresponding id if it exists, None otherwise
|
||||
"""
|
||||
return self._tokenizer.token_to_id(token)
|
||||
|
||||
def id_to_token(self, id: int) -> Optional[str]:
|
||||
"""Convert the given token id to its corresponding string
|
||||
|
||||
Args:
|
||||
id: int:
|
||||
The token id to convert
|
||||
|
||||
Returns:
|
||||
The corresponding string if it exists, None otherwise
|
||||
"""
|
||||
return self._tokenizer.id_to_token(id)
|
||||
|
||||
def save_model(self, directory: str, prefix: Optional[str] = None):
|
||||
"""Save the current model to the given directory
|
||||
|
||||
Args:
|
||||
directory: str:
|
||||
A path to the destination directory
|
||||
|
||||
prefix: (Optional) str:
|
||||
An optional prefix, used to prefix each file name
|
||||
"""
|
||||
return self._tokenizer.model.save(directory, prefix=prefix)
|
||||
|
||||
def save(self, path: str, pretty: bool = True):
|
||||
"""Save the current Tokenizer at the given path
|
||||
|
||||
Args:
|
||||
path: str:
|
||||
A path to the destination Tokenizer file
|
||||
"""
|
||||
return self._tokenizer.save(path, pretty)
|
||||
|
||||
def to_str(self, pretty: bool = False):
|
||||
"""Get a serialized JSON version of the Tokenizer as a str
|
||||
|
||||
Args:
|
||||
pretty: bool:
|
||||
Whether the JSON string should be prettified
|
||||
|
||||
Returns:
|
||||
str
|
||||
"""
|
||||
return self._tokenizer.to_str(pretty)
|
||||
|
||||
def post_process(
|
||||
self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True
|
||||
) -> Encoding:
|
||||
"""Apply all the post-processing steps to the given encodings.
|
||||
|
||||
The various steps are:
|
||||
1. Truncate according to global params (provided to `enable_truncation`)
|
||||
2. Apply the PostProcessor
|
||||
3. Pad according to global params. (provided to `enable_padding`)
|
||||
|
||||
Args:
|
||||
encoding: Encoding:
|
||||
The main Encoding to post process
|
||||
|
||||
pair: Optional[Encoding]:
|
||||
An optional pair Encoding
|
||||
|
||||
add_special_tokens: bool:
|
||||
Whether to add special tokens
|
||||
|
||||
Returns:
|
||||
The resulting Encoding
|
||||
"""
|
||||
return self._tokenizer.post_process(encoding, pair, add_special_tokens)
|
||||
|
||||
@property
|
||||
def model(self) -> Model:
|
||||
return self._tokenizer.model
|
||||
|
||||
@model.setter
|
||||
def model(self, model: Model):
|
||||
self._tokenizer.model = model
|
||||
|
||||
@property
|
||||
def normalizer(self) -> Normalizer:
|
||||
return self._tokenizer.normalizer
|
||||
|
||||
@normalizer.setter
|
||||
def normalizer(self, normalizer: Normalizer):
|
||||
self._tokenizer.normalizer = normalizer
|
||||
|
||||
@property
|
||||
def pre_tokenizer(self) -> PreTokenizer:
|
||||
return self._tokenizer.pre_tokenizer
|
||||
|
||||
@pre_tokenizer.setter
|
||||
def pre_tokenizer(self, pre_tokenizer: PreTokenizer):
|
||||
self._tokenizer.pre_tokenizer = pre_tokenizer
|
||||
|
||||
@property
|
||||
def post_processor(self) -> PostProcessor:
|
||||
return self._tokenizer.post_processor
|
||||
|
||||
@post_processor.setter
|
||||
def post_processor(self, post_processor: PostProcessor):
|
||||
self._tokenizer.post_processor = post_processor
|
||||
|
||||
@property
|
||||
def decoder(self) -> Decoder:
|
||||
return self._tokenizer.decoder
|
||||
|
||||
@decoder.setter
|
||||
def decoder(self, decoder: Decoder):
|
||||
self._tokenizer.decoder = decoder
|
||||
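A minimal sketch of the BaseTokenizer padding/truncation surface defined above, assuming a tiny hand-written WordLevel vocab (everything here is invented for illustration):

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.implementations import BaseTokenizer

vocab = {"[UNK]": 0, "[PAD]": 1, "hello": 2, "world": 3}
bt = BaseTokenizer(Tokenizer(WordLevel(vocab, unk_token="[UNK]")))
bt.pre_tokenizer = Whitespace()          # the wrapper exposes the pipeline as properties
bt.enable_padding(pad_token="[PAD]", pad_id=1, length=4)
bt.enable_truncation(max_length=4)
print(bt.encode("hello world").tokens)   # ['hello', 'world', '[PAD]', '[PAD]']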
@@ -0,0 +1,151 @@
|
||||
from typing import Dict, Iterator, List, Optional, Union
|
||||
|
||||
from tokenizers import AddedToken, Tokenizer, decoders, trainers
|
||||
from tokenizers.models import WordPiece
|
||||
from tokenizers.normalizers import BertNormalizer
|
||||
from tokenizers.pre_tokenizers import BertPreTokenizer
|
||||
from tokenizers.processors import BertProcessing
|
||||
|
||||
from .base_tokenizer import BaseTokenizer
|
||||
|
||||
|
||||
class BertWordPieceTokenizer(BaseTokenizer):
|
||||
"""Bert WordPiece Tokenizer"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab: Optional[Union[str, Dict[str, int]]] = None,
|
||||
unk_token: Union[str, AddedToken] = "[UNK]",
|
||||
sep_token: Union[str, AddedToken] = "[SEP]",
|
||||
cls_token: Union[str, AddedToken] = "[CLS]",
|
||||
pad_token: Union[str, AddedToken] = "[PAD]",
|
||||
mask_token: Union[str, AddedToken] = "[MASK]",
|
||||
clean_text: bool = True,
|
||||
handle_chinese_chars: bool = True,
|
||||
strip_accents: Optional[bool] = None,
|
||||
lowercase: bool = True,
|
||||
wordpieces_prefix: str = "##",
|
||||
):
|
||||
if vocab is not None:
|
||||
tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
|
||||
else:
|
||||
tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))
|
||||
|
||||
# Let the tokenizer know about special tokens if they are part of the vocab
|
||||
if tokenizer.token_to_id(str(unk_token)) is not None:
|
||||
tokenizer.add_special_tokens([str(unk_token)])
|
||||
if tokenizer.token_to_id(str(sep_token)) is not None:
|
||||
tokenizer.add_special_tokens([str(sep_token)])
|
||||
if tokenizer.token_to_id(str(cls_token)) is not None:
|
||||
tokenizer.add_special_tokens([str(cls_token)])
|
||||
if tokenizer.token_to_id(str(pad_token)) is not None:
|
||||
tokenizer.add_special_tokens([str(pad_token)])
|
||||
if tokenizer.token_to_id(str(mask_token)) is not None:
|
||||
tokenizer.add_special_tokens([str(mask_token)])
|
||||
|
||||
tokenizer.normalizer = BertNormalizer(
|
||||
clean_text=clean_text,
|
||||
handle_chinese_chars=handle_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
lowercase=lowercase,
|
||||
)
|
||||
tokenizer.pre_tokenizer = BertPreTokenizer()
|
||||
|
||||
if vocab is not None:
|
||||
sep_token_id = tokenizer.token_to_id(str(sep_token))
|
||||
if sep_token_id is None:
|
||||
raise TypeError("sep_token not found in the vocabulary")
|
||||
cls_token_id = tokenizer.token_to_id(str(cls_token))
|
||||
if cls_token_id is None:
|
||||
raise TypeError("cls_token not found in the vocabulary")
|
||||
|
||||
tokenizer.post_processor = BertProcessing((str(sep_token), sep_token_id), (str(cls_token), cls_token_id))
|
||||
tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)
|
||||
|
||||
parameters = {
|
||||
"model": "BertWordPiece",
|
||||
"unk_token": unk_token,
|
||||
"sep_token": sep_token,
|
||||
"cls_token": cls_token,
|
||||
"pad_token": pad_token,
|
||||
"mask_token": mask_token,
|
||||
"clean_text": clean_text,
|
||||
"handle_chinese_chars": handle_chinese_chars,
|
||||
"strip_accents": strip_accents,
|
||||
"lowercase": lowercase,
|
||||
"wordpieces_prefix": wordpieces_prefix,
|
||||
}
|
||||
|
||||
super().__init__(tokenizer, parameters)
|
||||
|
||||
@staticmethod
|
||||
def from_file(vocab: str, **kwargs):
|
||||
vocab = WordPiece.read_file(vocab)
|
||||
return BertWordPieceTokenizer(vocab, **kwargs)
|
||||
|
||||
def train(
|
||||
self,
|
||||
files: Union[str, List[str]],
|
||||
vocab_size: int = 30000,
|
||||
min_frequency: int = 2,
|
||||
limit_alphabet: int = 1000,
|
||||
initial_alphabet: List[str] = [],
|
||||
special_tokens: List[Union[str, AddedToken]] = [
|
||||
"[PAD]",
|
||||
"[UNK]",
|
||||
"[CLS]",
|
||||
"[SEP]",
|
||||
"[MASK]",
|
||||
],
|
||||
show_progress: bool = True,
|
||||
wordpieces_prefix: str = "##",
|
||||
):
|
||||
"""Train the model using the given files"""
|
||||
|
||||
trainer = trainers.WordPieceTrainer(
|
||||
vocab_size=vocab_size,
|
||||
min_frequency=min_frequency,
|
||||
limit_alphabet=limit_alphabet,
|
||||
initial_alphabet=initial_alphabet,
|
||||
special_tokens=special_tokens,
|
||||
show_progress=show_progress,
|
||||
continuing_subword_prefix=wordpieces_prefix,
|
||||
)
|
||||
if isinstance(files, str):
|
||||
files = [files]
|
||||
self._tokenizer.train(files, trainer=trainer)
|
||||
|
||||
def train_from_iterator(
|
||||
self,
|
||||
iterator: Union[Iterator[str], Iterator[Iterator[str]]],
|
||||
vocab_size: int = 30000,
|
||||
min_frequency: int = 2,
|
||||
limit_alphabet: int = 1000,
|
||||
initial_alphabet: List[str] = [],
|
||||
special_tokens: List[Union[str, AddedToken]] = [
|
||||
"[PAD]",
|
||||
"[UNK]",
|
||||
"[CLS]",
|
||||
"[SEP]",
|
||||
"[MASK]",
|
||||
],
|
||||
show_progress: bool = True,
|
||||
wordpieces_prefix: str = "##",
|
||||
length: Optional[int] = None,
|
||||
):
|
||||
"""Train the model using the given iterator"""
|
||||
|
||||
trainer = trainers.WordPieceTrainer(
|
||||
vocab_size=vocab_size,
|
||||
min_frequency=min_frequency,
|
||||
limit_alphabet=limit_alphabet,
|
||||
initial_alphabet=initial_alphabet,
|
||||
special_tokens=special_tokens,
|
||||
show_progress=show_progress,
|
||||
continuing_subword_prefix=wordpieces_prefix,
|
||||
)
|
||||
self._tokenizer.train_from_iterator(
|
||||
iterator,
|
||||
trainer=trainer,
|
||||
length=length,
|
||||
)
|
||||
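A hedged training sketch for the class above (corpus invented). Note that BertProcessing is only attached when a vocab is passed at construction, so a tokenizer trained from scratch like this does not add [CLS]/[SEP] yet:

from tokenizers import BertWordPieceTokenizer

tok = BertWordPieceTokenizer(lowercase=True)
tok.train_from_iterator(["hello world", "hello there"] * 10, vocab_size=100)
print(tok.encode("Hello world").tokens)  # lowercased WordPiece pieces, no [CLS]/[SEP]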
@@ -0,0 +1,122 @@
|
||||
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
||||
|
||||
from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, processors, trainers
|
||||
from tokenizers.models import BPE
|
||||
from tokenizers.normalizers import Lowercase, Sequence, unicode_normalizer_from_str
|
||||
|
||||
from .base_tokenizer import BaseTokenizer
|
||||
|
||||
|
||||
class ByteLevelBPETokenizer(BaseTokenizer):
|
||||
"""ByteLevelBPETokenizer
|
||||
|
||||
Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab: Optional[Union[str, Dict[str, int]]] = None,
|
||||
merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
|
||||
add_prefix_space: bool = False,
|
||||
lowercase: bool = False,
|
||||
dropout: Optional[float] = None,
|
||||
unicode_normalizer: Optional[str] = None,
|
||||
continuing_subword_prefix: Optional[str] = None,
|
||||
end_of_word_suffix: Optional[str] = None,
|
||||
trim_offsets: bool = False,
|
||||
):
|
||||
if vocab is not None and merges is not None:
|
||||
tokenizer = Tokenizer(
|
||||
BPE(
|
||||
vocab,
|
||||
merges,
|
||||
dropout=dropout,
|
||||
continuing_subword_prefix=continuing_subword_prefix or "",
|
||||
end_of_word_suffix=end_of_word_suffix or "",
|
||||
)
|
||||
)
|
||||
else:
|
||||
tokenizer = Tokenizer(BPE())
|
||||
|
||||
# Check for Unicode normalization first (before everything else)
|
||||
normalizers = []
|
||||
|
||||
if unicode_normalizer:
|
||||
normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
|
||||
|
||||
if lowercase:
|
||||
normalizers += [Lowercase()]
|
||||
|
||||
# Create the normalizer structure
|
||||
if len(normalizers) > 0:
|
||||
if len(normalizers) > 1:
|
||||
tokenizer.normalizer = Sequence(normalizers)
|
||||
else:
|
||||
tokenizer.normalizer = normalizers[0]
|
||||
|
||||
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
|
||||
tokenizer.decoder = decoders.ByteLevel()
|
||||
tokenizer.post_processor = processors.ByteLevel(trim_offsets=trim_offsets)
|
||||
|
||||
parameters = {
|
||||
"model": "ByteLevelBPE",
|
||||
"add_prefix_space": add_prefix_space,
|
||||
"lowercase": lowercase,
|
||||
"dropout": dropout,
|
||||
"unicode_normalizer": unicode_normalizer,
|
||||
"continuing_subword_prefix": continuing_subword_prefix,
|
||||
"end_of_word_suffix": end_of_word_suffix,
|
||||
"trim_offsets": trim_offsets,
|
||||
}
|
||||
|
||||
super().__init__(tokenizer, parameters)
|
||||
|
||||
@staticmethod
|
||||
def from_file(vocab_filename: str, merges_filename: str, **kwargs):
|
||||
vocab, merges = BPE.read_file(vocab_filename, merges_filename)
|
||||
return ByteLevelBPETokenizer(vocab, merges, **kwargs)
|
||||
|
||||
def train(
|
||||
self,
|
||||
files: Union[str, List[str]],
|
||||
vocab_size: int = 30000,
|
||||
min_frequency: int = 2,
|
||||
show_progress: bool = True,
|
||||
special_tokens: List[Union[str, AddedToken]] = [],
|
||||
):
|
||||
"""Train the model using the given files"""
|
||||
|
||||
trainer = trainers.BpeTrainer(
|
||||
vocab_size=vocab_size,
|
||||
min_frequency=min_frequency,
|
||||
show_progress=show_progress,
|
||||
special_tokens=special_tokens,
|
||||
initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
|
||||
)
|
||||
if isinstance(files, str):
|
||||
files = [files]
|
||||
self._tokenizer.train(files, trainer=trainer)
|
||||
|
||||
def train_from_iterator(
|
||||
self,
|
||||
iterator: Union[Iterator[str], Iterator[Iterator[str]]],
|
||||
vocab_size: int = 30000,
|
||||
min_frequency: int = 2,
|
||||
show_progress: bool = True,
|
||||
special_tokens: List[Union[str, AddedToken]] = [],
|
||||
length: Optional[int] = None,
|
||||
):
|
||||
"""Train the model using the given iterator"""
|
||||
|
||||
trainer = trainers.BpeTrainer(
|
||||
vocab_size=vocab_size,
|
||||
min_frequency=min_frequency,
|
||||
show_progress=show_progress,
|
||||
special_tokens=special_tokens,
|
||||
initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
|
||||
)
|
||||
self._tokenizer.train_from_iterator(
|
||||
iterator,
|
||||
trainer=trainer,
|
||||
length=length,
|
||||
)
|
||||
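A short sketch for ByteLevelBPETokenizer (training data invented); vocab_size has to cover the 256-entry ByteLevel alphabet plus any special tokens:

from tokenizers import ByteLevelBPETokenizer

tok = ByteLevelBPETokenizer()
tok.train_from_iterator(["hello world", "hello there"] * 10, vocab_size=300, special_tokens=["<pad>"])
print(tok.encode("hello world").tokens)  # byte-level pieces, e.g. a 'Ġ'-marked leading space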
@@ -0,0 +1,150 @@
|
||||
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
||||
|
||||
from .. import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
|
||||
from ..models import BPE
|
||||
from ..normalizers import BertNormalizer, Lowercase, Sequence, unicode_normalizer_from_str
|
||||
from .base_tokenizer import BaseTokenizer
|
||||
|
||||
|
||||
class CharBPETokenizer(BaseTokenizer):
|
||||
"""Original BPE Tokenizer
|
||||
|
||||
Represents the BPE algorithm, as introduced by Rico Sennrich
|
||||
(https://arxiv.org/abs/1508.07909)
|
||||
|
||||
The default settings correspond to the OpenAI GPT BPE tokenizer and differ from the original
Sennrich subword-nmt implementation by the following options that you can deactivate:
- adding a normalizer to clean up the text (deactivate with `bert_normalizer=False`) by:
* removing any control characters and replacing all whitespaces with the classic one,
* handling Chinese characters by putting spaces around them,
* stripping all accents,
- splitting on punctuation in addition to whitespace (deactivate it with
`split_on_whitespace_only=True`)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab: Optional[Union[str, Dict[str, int]]] = None,
|
||||
merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
|
||||
unk_token: Union[str, AddedToken] = "<unk>",
|
||||
suffix: str = "</w>",
|
||||
dropout: Optional[float] = None,
|
||||
lowercase: bool = False,
|
||||
unicode_normalizer: Optional[str] = None,
|
||||
bert_normalizer: bool = True,
|
||||
split_on_whitespace_only: bool = False,
|
||||
):
|
||||
if vocab is not None and merges is not None:
|
||||
tokenizer = Tokenizer(
|
||||
BPE(
|
||||
vocab,
|
||||
merges,
|
||||
dropout=dropout,
|
||||
unk_token=str(unk_token),
|
||||
end_of_word_suffix=suffix,
|
||||
)
|
||||
)
|
||||
else:
|
||||
tokenizer = Tokenizer(BPE(unk_token=str(unk_token), dropout=dropout, end_of_word_suffix=suffix))
|
||||
|
||||
if tokenizer.token_to_id(str(unk_token)) is not None:
|
||||
tokenizer.add_special_tokens([str(unk_token)])
|
||||
|
||||
# Check for Unicode normalization first (before everything else)
|
||||
normalizers = []
|
||||
|
||||
if unicode_normalizer:
|
||||
normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
|
||||
|
||||
if bert_normalizer:
|
||||
normalizers += [BertNormalizer(lowercase=False)]
|
||||
|
||||
if lowercase:
|
||||
normalizers += [Lowercase()]
|
||||
|
||||
# Create the normalizer structure
|
||||
if len(normalizers) > 0:
|
||||
if len(normalizers) > 1:
|
||||
tokenizer.normalizer = Sequence(normalizers)
|
||||
else:
|
||||
tokenizer.normalizer = normalizers[0]
|
||||
|
||||
if split_on_whitespace_only:
|
||||
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
|
||||
else:
|
||||
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
|
||||
|
||||
tokenizer.decoder = decoders.BPEDecoder(suffix=suffix)
|
||||
|
||||
parameters = {
|
||||
"model": "BPE",
|
||||
"unk_token": unk_token,
|
||||
"suffix": suffix,
|
||||
"dropout": dropout,
|
||||
"lowercase": lowercase,
|
||||
"unicode_normalizer": unicode_normalizer,
|
||||
"bert_normalizer": bert_normalizer,
|
||||
"split_on_whitespace_only": split_on_whitespace_only,
|
||||
}
|
||||
|
||||
super().__init__(tokenizer, parameters)
|
||||
|
||||
@staticmethod
|
||||
def from_file(vocab_filename: str, merges_filename: str, **kwargs):
|
||||
vocab, merges = BPE.read_file(vocab_filename, merges_filename)
|
||||
return CharBPETokenizer(vocab, merges, **kwargs)
|
||||
|
||||
def train(
|
||||
self,
|
||||
files: Union[str, List[str]],
|
||||
vocab_size: int = 30000,
|
||||
min_frequency: int = 2,
|
||||
special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
|
||||
limit_alphabet: int = 1000,
|
||||
initial_alphabet: List[str] = [],
|
||||
suffix: Optional[str] = "</w>",
|
||||
show_progress: bool = True,
|
||||
):
|
||||
"""Train the model using the given files"""
|
||||
|
||||
trainer = trainers.BpeTrainer(
|
||||
vocab_size=vocab_size,
|
||||
min_frequency=min_frequency,
|
||||
special_tokens=special_tokens,
|
||||
limit_alphabet=limit_alphabet,
|
||||
initial_alphabet=initial_alphabet,
|
||||
end_of_word_suffix=suffix,
|
||||
show_progress=show_progress,
|
||||
)
|
||||
if isinstance(files, str):
|
||||
files = [files]
|
||||
self._tokenizer.train(files, trainer=trainer)
|
||||
|
||||
def train_from_iterator(
|
||||
self,
|
||||
iterator: Union[Iterator[str], Iterator[Iterator[str]]],
|
||||
vocab_size: int = 30000,
|
||||
min_frequency: int = 2,
|
||||
special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
|
||||
limit_alphabet: int = 1000,
|
||||
initial_alphabet: List[str] = [],
|
||||
suffix: Optional[str] = "</w>",
|
||||
show_progress: bool = True,
|
||||
length: Optional[int] = None,
|
||||
):
|
||||
"""Train the model using the given iterator"""
|
||||
|
||||
trainer = trainers.BpeTrainer(
|
||||
vocab_size=vocab_size,
|
||||
min_frequency=min_frequency,
|
||||
special_tokens=special_tokens,
|
||||
limit_alphabet=limit_alphabet,
|
||||
initial_alphabet=initial_alphabet,
|
||||
end_of_word_suffix=suffix,
|
||||
show_progress=show_progress,
|
||||
)
|
||||
self._tokenizer.train_from_iterator(
|
||||
iterator,
|
||||
trainer=trainer,
|
||||
length=length,
|
||||
)
|
||||
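A short sketch for CharBPETokenizer (training data invented); end-of-word pieces carry the "</w>" suffix described in the docstring:

from tokenizers import CharBPETokenizer

tok = CharBPETokenizer(lowercase=True)
tok.train_from_iterator(["hello world", "hello there"] * 10, vocab_size=100)
print(tok.encode("Hello world").tokens)  # e.g. ['hello</w>', 'world</w>']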
@@ -0,0 +1,103 @@
from typing import Dict, Iterator, List, Optional, Tuple, Union

from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC

from .base_tokenizer import BaseTokenizer


class SentencePieceBPETokenizer(BaseTokenizer):
    """SentencePiece BPE Tokenizer

    Represents the BPE algorithm, with the pretokenization used by SentencePiece
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        replacement: str = "▁",
        add_prefix_space: bool = True,
        dropout: Optional[float] = None,
        fuse_unk: Optional[bool] = False,
    ):
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))
        else:
            tokenizer = Tokenizer(BPE(dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))

        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])

        tokenizer.normalizer = NFKC()
        prepend_scheme = "always" if add_prefix_space else "never"
        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
        tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)

        parameters = {
            "model": "SentencePieceBPE",
            "unk_token": unk_token,
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
        return SentencePieceBPETokenizer(vocab, merges, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        show_progress: bool = True,
    ):
        """Train the model using the given files"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            show_progress=show_progress,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        show_progress: bool = True,
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            show_progress=show_progress,
        )
        self._tokenizer.train_from_iterator(
            iterator,
            trainer=trainer,
            length=length,
        )
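The same call pattern works for the SentencePiece-style BPE above (data invented); the Metaspace pre-tokenizer marks word starts with "▁":

from tokenizers import SentencePieceBPETokenizer

tok = SentencePieceBPETokenizer()
tok.train_from_iterator(["hello world", "hello there"] * 10, vocab_size=100)
print(tok.encode("hello world").tokens)  # e.g. ['▁hello', '▁world']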
@@ -0,0 +1,196 @@
|
||||
import json
|
||||
import os
|
||||
from typing import Iterator, List, Optional, Union, Tuple
|
||||
|
||||
from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, trainers
|
||||
from tokenizers.models import Unigram
|
||||
|
||||
from .base_tokenizer import BaseTokenizer
|
||||
|
||||
|
||||
class SentencePieceUnigramTokenizer(BaseTokenizer):
|
||||
"""SentencePiece Unigram Tokenizer
|
||||
|
||||
Represents the Unigram algorithm, with the pretokenization used by SentencePiece
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab: Optional[List[Tuple[str, float]]] = None,
|
||||
replacement: str = "▁",
|
||||
add_prefix_space: bool = True,
|
||||
):
|
||||
if vocab is not None:
|
||||
# Let Unigram(..) fail if only one of them is None
|
||||
tokenizer = Tokenizer(Unigram(vocab))
|
||||
else:
|
||||
tokenizer = Tokenizer(Unigram())
|
||||
|
||||
tokenizer.normalizer = normalizers.Sequence(
|
||||
[normalizers.Nmt(), normalizers.NFKC(), normalizers.Replace(Regex(" {2,}"), " ")]
|
||||
)
|
||||
prepend_scheme = "always" if add_prefix_space else "never"
|
||||
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
|
||||
tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
|
||||
|
||||
parameters = {
|
||||
"model": "SentencePieceUnigram",
|
||||
"replacement": replacement,
|
||||
"add_prefix_space": add_prefix_space,
|
||||
}
|
||||
|
||||
super().__init__(tokenizer, parameters)
|
||||
|
||||
def train(
|
||||
self,
|
||||
files: Union[str, List[str]],
|
||||
vocab_size: int = 8000,
|
||||
show_progress: bool = True,
|
||||
special_tokens: Optional[List[Union[str, AddedToken]]] = None,
|
||||
initial_alphabet: Optional[List[str]] = None,
|
||||
unk_token: Optional[str] = None,
|
||||
):
|
||||
"""
|
||||
Train the model using the given files
|
||||
|
||||
Args:
|
||||
files (:obj:`List[str]`):
|
||||
A list of path to the files that we should use for training
|
||||
vocab_size (:obj:`int`):
|
||||
The size of the final vocabulary, including all tokens and alphabet.
|
||||
show_progress (:obj:`bool`):
|
||||
Whether to show progress bars while training.
|
||||
special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
|
||||
A list of special tokens the model should know of.
|
||||
initial_alphabet (:obj:`List[str]`, `optional`):
|
||||
A list of characters to include in the initial alphabet, even
|
||||
if not seen in the training dataset.
|
||||
If the strings contain more than one character, only the first one
|
||||
is kept.
|
||||
unk_token (:obj:`str`, `optional`):
|
||||
The unknown token to be used by the model.
|
||||
"""
|
||||
|
||||
if special_tokens is None:
|
||||
special_tokens = []
|
||||
|
||||
if initial_alphabet is None:
|
||||
initial_alphabet = []
|
||||
|
||||
trainer = trainers.UnigramTrainer(
|
||||
vocab_size=vocab_size,
|
||||
special_tokens=special_tokens,
|
||||
show_progress=show_progress,
|
||||
initial_alphabet=initial_alphabet,
|
||||
unk_token=unk_token,
|
||||
)
|
||||
|
||||
if isinstance(files, str):
|
||||
files = [files]
|
||||
self._tokenizer.train(files, trainer=trainer)
|
||||
|
||||
def train_from_iterator(
|
||||
self,
|
||||
iterator: Union[Iterator[str], Iterator[Iterator[str]]],
|
||||
vocab_size: int = 8000,
|
||||
show_progress: bool = True,
|
||||
special_tokens: Optional[List[Union[str, AddedToken]]] = None,
|
||||
initial_alphabet: Optional[List[str]] = None,
|
||||
unk_token: Optional[str] = None,
|
||||
length: Optional[int] = None,
|
||||
):
|
||||
"""
|
||||
Train the model using the given iterator
|
||||
|
||||
Args:
|
||||
iterator (:obj:`Union[Iterator[str], Iterator[Iterator[str]]]`):
|
||||
Any iterator over strings or list of strings
|
||||
vocab_size (:obj:`int`):
|
||||
The size of the final vocabulary, including all tokens and alphabet.
|
||||
show_progress (:obj:`bool`):
|
||||
Whether to show progress bars while training.
|
||||
special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
|
||||
A list of special tokens the model should know of.
|
||||
initial_alphabet (:obj:`List[str]`, `optional`):
|
||||
A list of characters to include in the initial alphabet, even
|
||||
if not seen in the training dataset.
|
||||
If the strings contain more than one character, only the first one
|
||||
is kept.
|
||||
unk_token (:obj:`str`, `optional`):
|
||||
The unknown token to be used by the model.
|
||||
length (:obj:`int`, `optional`):
|
||||
The total number of sequences in the iterator. This is used to
|
||||
provide meaningful progress tracking
|
||||
"""
|
||||
|
||||
if special_tokens is None:
|
||||
special_tokens = []
|
||||
|
||||
if initial_alphabet is None:
|
||||
initial_alphabet = []
|
||||
|
||||
trainer = trainers.UnigramTrainer(
|
||||
vocab_size=vocab_size,
|
||||
special_tokens=special_tokens,
|
||||
show_progress=show_progress,
|
||||
initial_alphabet=initial_alphabet,
|
||||
unk_token=unk_token,
|
||||
)
|
||||
|
||||
self._tokenizer.train_from_iterator(
|
||||
iterator,
|
||||
trainer=trainer,
|
||||
length=length,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def from_spm(filename: str):
|
||||
try:
|
||||
import sys
|
||||
|
||||
sys.path.append(".")
|
||||
|
||||
import sentencepiece_model_pb2 as model
|
||||
except Exception:
|
||||
raise Exception(
|
||||
"You don't seem to have the required protobuf file, in order to use this function you need to run `pip install protobuf` and `wget https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_model_pb2.py` for us to be able to read the intrinsics of your spm_file. `pip install sentencepiece` is not required."
|
||||
)
|
||||
|
||||
m = model.ModelProto()
|
||||
m.ParseFromString(open(filename, "rb").read())
|
||||
|
||||
precompiled_charsmap = m.normalizer_spec.precompiled_charsmap
|
||||
vocab = [(piece.piece, piece.score) for piece in m.pieces]
|
||||
unk_id = m.trainer_spec.unk_id
|
||||
model_type = m.trainer_spec.model_type
|
||||
byte_fallback = m.trainer_spec.byte_fallback
|
||||
if model_type != 1:
|
||||
raise Exception(
|
||||
"You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
|
||||
)
|
||||
|
||||
replacement = "▁"
|
||||
add_prefix_space = True
|
||||
|
||||
tokenizer = Tokenizer(Unigram(vocab, unk_id, byte_fallback))
|
||||
|
||||
if precompiled_charsmap:
|
||||
tokenizer.normalizer = normalizers.Sequence(
|
||||
[
|
||||
normalizers.Precompiled(precompiled_charsmap),
|
||||
normalizers.Replace(Regex(" {2,}"), " "),
|
||||
]
|
||||
)
|
||||
else:
|
||||
tokenizer.normalizer = normalizers.Sequence([normalizers.Replace(Regex(" {2,}"), " ")])
|
||||
prepend_scheme = "always" if add_prefix_space else "never"
|
||||
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
|
||||
tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
|
||||
|
||||
parameters = {
|
||||
"model": "SentencePieceUnigram",
|
||||
}
|
||||
|
||||
obj = BaseTokenizer.__new__(SentencePieceUnigramTokenizer, tokenizer, parameters)
|
||||
BaseTokenizer.__init__(obj, tokenizer, parameters)
|
||||
return obj
|
||||
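A hedged sketch for the Unigram variant above (tiny invented corpus, sized only to show the call sequence, not to produce a useful vocabulary):

from tokenizers import SentencePieceUnigramTokenizer

tok = SentencePieceUnigramTokenizer()
tok.train_from_iterator(
    ["hello world", "hello there", "hi friends"] * 20,
    vocab_size=30,
    special_tokens=["<unk>"],
    unk_token="<unk>",
)
print(tok.encode("hello world").tokens)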
@@ -0,0 +1,8 @@
# Generated content DO NOT EDIT
from .. import models

Model = models.Model
BPE = models.BPE
Unigram = models.Unigram
WordLevel = models.WordLevel
WordPiece = models.WordPiece
@@ -0,0 +1,591 @@
|
||||
# Generated content DO NOT EDIT
|
||||
class Model:
|
||||
"""
|
||||
Base class for all models
|
||||
|
||||
The model represents the actual tokenization algorithm. This is the part that
|
||||
will contain and manage the learned vocabulary.
|
||||
|
||||
This class cannot be constructed directly. Please use one of the concrete models.
|
||||
"""
|
||||
def get_trainer(self):
|
||||
"""
|
||||
Get the associated :class:`~tokenizers.trainers.Trainer`
|
||||
|
||||
Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
|
||||
:class:`~tokenizers.models.Model`.
|
||||
|
||||
Returns:
|
||||
:class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
|
||||
"""
|
||||
pass
|
||||
|
||||
def id_to_token(self, id):
|
||||
"""
|
||||
Get the token associated to an ID
|
||||
|
||||
Args:
|
||||
id (:obj:`int`):
|
||||
An ID to convert to a token
|
||||
|
||||
Returns:
|
||||
:obj:`str`: The token associated to the ID
|
||||
"""
|
||||
pass
|
||||
|
||||
def save(self, folder, prefix):
|
||||
"""
|
||||
Save the current model
|
||||
|
||||
Save the current model in the given folder, using the given prefix for the various
|
||||
files that will get created.
|
||||
Any file with the same name that already exists in this folder will be overwritten.
|
||||
|
||||
Args:
|
||||
folder (:obj:`str`):
|
||||
The path to the target folder in which to save the various files
|
||||
|
||||
prefix (:obj:`str`, `optional`):
|
||||
An optional prefix, used to prefix each file name
|
||||
|
||||
Returns:
|
||||
:obj:`List[str]`: The list of saved files
|
||||
"""
|
||||
pass
|
||||
|
||||
def token_to_id(self, tokens):
|
||||
"""
|
||||
Get the ID associated to a token
|
||||
|
||||
Args:
|
||||
token (:obj:`str`):
|
||||
A token to convert to an ID
|
||||
|
||||
Returns:
|
||||
:obj:`int`: The ID associated to the token
|
||||
"""
|
||||
pass
|
||||
|
||||
def tokenize(self, sequence):
|
||||
"""
|
||||
Tokenize a sequence
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A sequence to tokenize
|
||||
|
||||
Returns:
|
||||
A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
|
||||
"""
|
||||
pass
|
||||
|
||||
class BPE(Model):
|
||||
"""
|
||||
An implementation of the BPE (Byte-Pair Encoding) algorithm
|
||||
|
||||
Args:
|
||||
vocab (:obj:`Dict[str, int]`, `optional`):
|
||||
A dictionary of string keys and their ids :obj:`{"am": 0,...}`
|
||||
|
||||
merges (:obj:`List[Tuple[str, str]]`, `optional`):
|
||||
A list of pairs of tokens (:obj:`Tuple[str, str]`) :obj:`[("a", "b"),...]`
|
||||
|
||||
cache_capacity (:obj:`int`, `optional`):
|
||||
The number of words that the BPE cache can contain. The cache speeds
up the process by keeping the result of the merge operations
|
||||
for a number of words.
|
||||
|
||||
dropout (:obj:`float`, `optional`):
|
||||
A float between 0 and 1 that represents the BPE dropout to use.
|
||||
|
||||
unk_token (:obj:`str`, `optional`):
|
||||
The unknown token to be used by the model.
|
||||
|
||||
continuing_subword_prefix (:obj:`str`, `optional`):
|
||||
The prefix to attach to subword units that don't represent a beginning of word.
|
||||
|
||||
end_of_word_suffix (:obj:`str`, `optional`):
|
||||
The suffix to attach to subword units that represent an end of word.
|
||||
|
||||
fuse_unk (:obj:`bool`, `optional`):
|
||||
Whether to fuse any subsequent unknown tokens into a single one
|
||||
|
||||
byte_fallback (:obj:`bool`, `optional`):
|
||||
Whether to use spm byte-fallback trick (defaults to False)
|
||||
|
||||
ignore_merges (:obj:`bool`, `optional`):
|
||||
Whether or not to match tokens with the vocab before using merges.
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
vocab=None,
|
||||
merges=None,
|
||||
cache_capacity=None,
|
||||
dropout=None,
|
||||
unk_token=None,
|
||||
continuing_subword_prefix=None,
|
||||
end_of_word_suffix=None,
|
||||
fuse_unk=None,
|
||||
byte_fallback=False,
|
||||
ignore_merges=False,
|
||||
):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def from_file(cls, vocab, merge, **kwargs):
|
||||
"""
|
||||
Instantiate a BPE model from the given files.
|
||||
|
||||
This method is roughly equivalent to doing::
|
||||
|
||||
vocab, merges = BPE.read_file(vocab_filename, merges_filename)
|
||||
bpe = BPE(vocab, merges)
|
||||
|
||||
If you don't need to keep the :obj:`vocab, merges` values lying around,
|
||||
this method is more optimized than manually calling
|
||||
:meth:`~tokenizers.models.BPE.read_file` to initialize a :class:`~tokenizers.models.BPE`
|
||||
|
||||
Args:
|
||||
vocab (:obj:`str`):
|
||||
The path to a :obj:`vocab.json` file
|
||||
|
||||
merges (:obj:`str`):
|
||||
The path to a :obj:`merges.txt` file
|
||||
|
||||
Returns:
|
||||
:class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files
|
||||
"""
|
||||
pass
|
||||
|
||||
def get_trainer(self):
|
||||
"""
|
||||
Get the associated :class:`~tokenizers.trainers.Trainer`
|
||||
|
||||
Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
|
||||
:class:`~tokenizers.models.Model`.
|
||||
|
||||
Returns:
|
||||
:class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
|
||||
"""
|
||||
pass
|
||||
|
||||
def id_to_token(self, id):
|
||||
"""
|
||||
Get the token associated to an ID
|
||||
|
||||
Args:
|
||||
id (:obj:`int`):
|
||||
An ID to convert to a token
|
||||
|
||||
Returns:
|
||||
:obj:`str`: The token associated to the ID
|
||||
"""
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def read_file(vocab, merges):
|
||||
"""
|
||||
Read a :obj:`vocab.json` and a :obj:`merges.txt` files
|
||||
|
||||
This method provides a way to read and parse the content of these files,
|
||||
returning the relevant data structures. If you want to instantiate some BPE models
|
||||
from memory, this method gives you the expected input from the standard files.
|
||||
|
||||
Args:
|
||||
vocab (:obj:`str`):
|
||||
The path to a :obj:`vocab.json` file
|
||||
|
||||
merges (:obj:`str`):
|
||||
The path to a :obj:`merges.txt` file
|
||||
|
||||
Returns:
|
||||
A :obj:`Tuple` with the vocab and the merges:
|
||||
The vocabulary and merges loaded into memory
|
||||
"""
|
||||
pass
|
||||
|
||||
def save(self, folder, prefix):
|
||||
"""
|
||||
Save the current model
|
||||
|
||||
Save the current model in the given folder, using the given prefix for the various
|
||||
files that will get created.
|
||||
Any file with the same name that already exists in this folder will be overwritten.
|
||||
|
||||
Args:
|
||||
folder (:obj:`str`):
|
||||
The path to the target folder in which to save the various files
|
||||
|
||||
prefix (:obj:`str`, `optional`):
|
||||
An optional prefix, used to prefix each file name
|
||||
|
||||
Returns:
|
||||
:obj:`List[str]`: The list of saved files
|
||||
"""
|
||||
pass
|
||||
|
||||
def token_to_id(self, token):
|
||||
"""
|
||||
Get the ID associated to a token
|
||||
|
||||
Args:
|
||||
token (:obj:`str`):
|
||||
A token to convert to an ID
|
||||
|
||||
Returns:
|
||||
:obj:`int`: The ID associated to the token
|
||||
"""
|
||||
pass
|
||||
|
||||
def tokenize(self, sequence):
|
||||
"""
|
||||
Tokenize a sequence
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A sequence to tokenize
|
||||
|
||||
Returns:
|
||||
A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
|
||||
"""
|
||||
pass
|
||||
|
||||
class Unigram(Model):
|
||||
"""
|
||||
An implementation of the Unigram algorithm
|
||||
|
||||
Args:
|
||||
vocab (:obj:`List[Tuple[str, float]]`, `optional`):
|
||||
A list of vocabulary items and their relative score [("am", -0.2442),...]
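
Example — a minimal construction sketch assuming the signature above; the
tokens and scores are made-up placeholders::

    from tokenizers.models import Unigram

    vocab = [("<unk>", 0.0), ("hello", -1.5), ("world", -2.0)]
    model = Unigram(vocab, 0, False)  # vocab, unk_id, byte_fallback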
|
||||
"""
|
||||
def __init__(self, vocab, unk_id, byte_fallback):
|
||||
pass
|
||||
|
||||
def get_trainer(self):
|
||||
"""
|
||||
Get the associated :class:`~tokenizers.trainers.Trainer`
|
||||
|
||||
Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
|
||||
:class:`~tokenizers.models.Model`.
|
||||
|
||||
Returns:
|
||||
:class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
|
||||
"""
|
||||
pass
|
||||
|
||||
def id_to_token(self, id):
|
||||
"""
|
||||
Get the token associated to an ID
|
||||
|
||||
Args:
|
||||
id (:obj:`int`):
|
||||
An ID to convert to a token
|
||||
|
||||
Returns:
|
||||
:obj:`str`: The token associated to the ID
|
||||
"""
|
||||
pass
|
||||
|
||||
def save(self, folder, prefix):
|
||||
"""
|
||||
Save the current model
|
||||
|
||||
Save the current model in the given folder, using the given prefix for the various
|
||||
files that will get created.
|
||||
Any file with the same name that already exists in this folder will be overwritten.
|
||||
|
||||
Args:
|
||||
folder (:obj:`str`):
|
||||
The path to the target folder in which to save the various files
|
||||
|
||||
prefix (:obj:`str`, `optional`):
|
||||
An optional prefix, used to prefix each file name
|
||||
|
||||
Returns:
|
||||
:obj:`List[str]`: The list of saved files
|
||||
"""
|
||||
pass
|
||||
|
||||
def token_to_id(self, token):
|
||||
"""
|
||||
Get the ID associated to a token
|
||||
|
||||
Args:
|
||||
token (:obj:`str`):
|
||||
A token to convert to an ID
|
||||
|
||||
Returns:
|
||||
:obj:`int`: The ID associated to the token
|
||||
"""
|
||||
pass
|
||||
|
||||
def tokenize(self, sequence):
|
||||
"""
|
||||
Tokenize a sequence
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A sequence to tokenize
|
||||
|
||||
Returns:
|
||||
A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
|
||||
"""
|
||||
pass
|
||||
|
||||
class WordLevel(Model):
|
||||
"""
|
||||
An implementation of the WordLevel algorithm
|
||||
|
||||
The simplest tokenizer model, based on mapping tokens to their corresponding IDs.
|
||||
|
||||
Args:
|
||||
vocab (:obj:`Dict[str, int]`, `optional`):
|
||||
A dictionary of string keys and their ids :obj:`{"am": 0,...}`
|
||||
|
||||
unk_token (:obj:`str`, `optional`):
|
||||
The unknown token to be used by the model.
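
Example — a minimal construction sketch (the vocabulary is a made-up placeholder)::

    from tokenizers.models import WordLevel

    vocab = {"[UNK]": 0, "hello": 1, "world": 2}
    model = WordLevel(vocab=vocab, unk_token="[UNK]")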
|
||||
"""
|
||||
def __init__(self, vocab, unk_token):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def from_file(vocab, unk_token):
|
||||
"""
|
||||
Instantiate a WordLevel model from the given file
|
||||
|
||||
This method is roughly equivalent to doing::
|
||||
|
||||
vocab = WordLevel.read_file(vocab_filename)
|
||||
wordlevel = WordLevel(vocab)
|
||||
|
||||
If you don't need to keep the :obj:`vocab` values lying around, this method is
|
||||
more optimized than manually calling :meth:`~tokenizers.models.WordLevel.read_file` to
|
||||
initialize a :class:`~tokenizers.models.WordLevel`
|
||||
|
||||
Args:
|
||||
vocab (:obj:`str`):
|
||||
The path to a :obj:`vocab.json` file
|
||||
|
||||
Returns:
|
||||
:class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file
|
||||
"""
|
||||
pass
|
||||
|
||||
def get_trainer(self):
|
||||
"""
|
||||
Get the associated :class:`~tokenizers.trainers.Trainer`
|
||||
|
||||
Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
|
||||
:class:`~tokenizers.models.Model`.
|
||||
|
||||
Returns:
|
||||
:class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
|
||||
"""
|
||||
pass
|
||||
|
||||
def id_to_token(self, id):
|
||||
"""
|
||||
Get the token associated to an ID
|
||||
|
||||
Args:
|
||||
id (:obj:`int`):
|
||||
An ID to convert to a token
|
||||
|
||||
Returns:
|
||||
:obj:`str`: The token associated to the ID
|
||||
"""
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def read_file(vocab):
|
||||
"""
|
||||
Read a :obj:`vocab.json`
|
||||
|
||||
This method provides a way to read and parse the content of a vocabulary file,
|
||||
returning the relevant data structures. If you want to instantiate some WordLevel models
|
||||
from memory, this method gives you the expected input from the standard files.
|
||||
|
||||
Args:
|
||||
vocab (:obj:`str`):
|
||||
The path to a :obj:`vocab.json` file
|
||||
|
||||
Returns:
|
||||
:obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
|
||||
"""
|
||||
pass
|
||||
|
||||
def save(self, folder, prefix):
|
||||
"""
|
||||
Save the current model
|
||||
|
||||
Save the current model in the given folder, using the given prefix for the various
|
||||
files that will get created.
|
||||
Any file with the same name that already exists in this folder will be overwritten.
|
||||
|
||||
Args:
|
||||
folder (:obj:`str`):
|
||||
The path to the target folder in which to save the various files
|
||||
|
||||
prefix (:obj:`str`, `optional`):
|
||||
An optional prefix, used to prefix each file name
|
||||
|
||||
Returns:
|
||||
:obj:`List[str]`: The list of saved files
|
||||
"""
|
||||
pass
|
||||
|
||||
def token_to_id(self, token):
|
||||
"""
|
||||
Get the ID associated to a token
|
||||
|
||||
Args:
|
||||
token (:obj:`str`):
|
||||
A token to convert to an ID
|
||||
|
||||
Returns:
|
||||
:obj:`int`: The ID associated to the token
|
||||
"""
|
||||
pass
|
||||
|
||||
def tokenize(self, sequence):
|
||||
"""
|
||||
Tokenize a sequence
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A sequence to tokenize
|
||||
|
||||
Returns:
|
||||
A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
|
||||
"""
|
||||
pass
|
||||
|
||||
class WordPiece(Model):
|
||||
"""
|
||||
An implementation of the WordPiece algorithm
|
||||
|
||||
Args:
|
||||
vocab (:obj:`Dict[str, int]`, `optional`):
|
||||
A dictionary of string keys and their ids :obj:`{"am": 0,...}`
|
||||
|
||||
unk_token (:obj:`str`, `optional`):
|
||||
The unknown token to be used by the model.
|
||||
|
||||
max_input_chars_per_word (:obj:`int`, `optional`):
|
||||
The maximum number of characters to authorize in a single word.
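
Example — a minimal construction sketch (the vocabulary is a made-up placeholder)::

    from tokenizers.models import WordPiece

    vocab = {"[UNK]": 0, "hell": 1, "##o": 2}
    model = WordPiece(vocab=vocab, unk_token="[UNK]", max_input_chars_per_word=100)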
|
||||
"""
|
||||
def __init__(self, vocab, unk_token, max_input_chars_per_word):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def from_file(vocab, **kwargs):
|
||||
"""
|
||||
Instantiate a WordPiece model from the given file
|
||||
|
||||
This method is roughly equivalent to doing::
|
||||
|
||||
vocab = WordPiece.read_file(vocab_filename)
|
||||
wordpiece = WordPiece(vocab)
|
||||
|
||||
If you don't need to keep the :obj:`vocab` values lying around, this method is
|
||||
more optimized than manually calling :meth:`~tokenizers.models.WordPiece.read_file` to
|
||||
initialize a :class:`~tokenizers.models.WordPiece`
|
||||
|
||||
Args:
|
||||
vocab (:obj:`str`):
|
||||
The path to a :obj:`vocab.txt` file
|
||||
|
||||
Returns:
|
||||
:class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file
|
||||
"""
|
||||
pass
|
||||
|
||||
def get_trainer(self):
|
||||
"""
|
||||
Get the associated :class:`~tokenizers.trainers.Trainer`
|
||||
|
||||
Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
|
||||
:class:`~tokenizers.models.Model`.
|
||||
|
||||
Returns:
|
||||
:class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
|
||||
"""
|
||||
pass
|
||||
|
||||
def id_to_token(self, id):
|
||||
"""
|
||||
Get the token associated to an ID
|
||||
|
||||
Args:
|
||||
id (:obj:`int`):
|
||||
An ID to convert to a token
|
||||
|
||||
Returns:
|
||||
:obj:`str`: The token associated to the ID
|
||||
"""
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def read_file(vocab):
|
||||
"""
|
||||
Read a :obj:`vocab.txt` file
|
||||
|
||||
This method provides a way to read and parse the content of a standard `vocab.txt`
|
||||
file as used by the WordPiece Model, returning the relevant data structures. If you
|
||||
want to instantiate some WordPiece models from memory, this method gives you the
|
||||
expected input from the standard files.
|
||||
|
||||
Args:
|
||||
vocab (:obj:`str`):
|
||||
The path to a :obj:`vocab.txt` file
|
||||
|
||||
Returns:
|
||||
:obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
|
||||
"""
|
||||
pass
|
||||
|
||||
def save(self, folder, prefix):
|
||||
"""
|
||||
Save the current model
|
||||
|
||||
Save the current model in the given folder, using the given prefix for the various
|
||||
files that will get created.
|
||||
Any file with the same name that already exists in this folder will be overwritten.
|
||||
|
||||
Args:
|
||||
folder (:obj:`str`):
|
||||
The path to the target folder in which to save the various files
|
||||
|
||||
prefix (:obj:`str`, `optional`):
|
||||
An optional prefix, used to prefix each file name
|
||||
|
||||
Returns:
|
||||
:obj:`List[str]`: The list of saved files
|
||||
"""
|
||||
pass
|
||||
|
||||
def token_to_id(self, token):
|
||||
"""
|
||||
Get the ID associated to a token
|
||||
|
||||
Args:
|
||||
token (:obj:`str`):
|
||||
A token to convert to an ID
|
||||
|
||||
Returns:
|
||||
:obj:`int`: The ID associated to the token
|
||||
"""
|
||||
pass
|
||||
|
||||
def tokenize(self, sequence):
|
||||
"""
|
||||
Tokenize a sequence
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A sequence to tokenize
|
||||
|
||||
Returns:
|
||||
A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
|
||||
"""
|
||||
pass
|
||||
Binary file not shown.
@@ -0,0 +1,29 @@
|
||||
from .. import normalizers
|
||||
|
||||
|
||||
Normalizer = normalizers.Normalizer
|
||||
BertNormalizer = normalizers.BertNormalizer
|
||||
NFD = normalizers.NFD
|
||||
NFKD = normalizers.NFKD
|
||||
NFC = normalizers.NFC
|
||||
NFKC = normalizers.NFKC
|
||||
Sequence = normalizers.Sequence
|
||||
Lowercase = normalizers.Lowercase
|
||||
Prepend = normalizers.Prepend
|
||||
Strip = normalizers.Strip
|
||||
StripAccents = normalizers.StripAccents
|
||||
Nmt = normalizers.Nmt
|
||||
Precompiled = normalizers.Precompiled
|
||||
Replace = normalizers.Replace
|
||||
ByteLevel = normalizers.ByteLevel
|
||||
|
||||
NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
|
||||
|
||||
|
||||
def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
|
||||
if normalizer not in NORMALIZERS:
|
||||
raise ValueError(
|
||||
"{} is not a known unicode normalizer. Available are {}".format(normalizer, NORMALIZERS.keys())
|
||||
)
|
||||
|
||||
return NORMALIZERS[normalizer]()
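

# Illustrative usage of the helper above (not executed on import):
#
#   normalizer = unicode_normalizer_from_str("nfkc")
#   normalizer.normalize_str("ﬁ")   # NFKC folds the "ﬁ" ligature into "fi"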
|
||||
@@ -0,0 +1,636 @@
|
||||
# Generated content DO NOT EDIT
|
||||
class Normalizer:
|
||||
"""
|
||||
Base class for all normalizers
|
||||
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of a
|
||||
Normalizer will return an instance of this class when instantiated.
|
||||
"""
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
||||
keep track of the alignment information. If you just want to see the result
|
||||
of the normalization on a raw string, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
||||
|
||||
Args:
|
||||
normalized (:class:`~tokenizers.NormalizedString`):
|
||||
The normalized string on which to apply this
|
||||
:class:`~tokenizers.normalizers.Normalizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
||||
information. If you need to get/convert offsets, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to normalize
|
||||
|
||||
Returns:
|
||||
:obj:`str`: A string after normalization
|
||||
"""
|
||||
pass
|
||||
|
||||
class BertNormalizer(Normalizer):
|
||||
"""
|
||||
BertNormalizer
|
||||
|
||||
Takes care of normalizing raw text before giving it to a Bert model.
|
||||
This includes cleaning the text, handling accents, Chinese characters, and lowercasing
|
||||
|
||||
Args:
|
||||
clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether to clean the text, by removing any control characters
|
||||
and replacing all whitespaces by the classic one.
|
||||
|
||||
handle_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether to handle Chinese characters by putting spaces around them.
|
||||
|
||||
strip_accents (:obj:`bool`, `optional`):
|
||||
Whether to strip all accents. If this option is not specified (i.e. :obj:`None`),
|
||||
then it will be determined by the value for `lowercase` (as in the original Bert).
|
||||
|
||||
lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether to lowercase.
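
Example — an illustrative sketch of the typical defaults::

    from tokenizers.normalizers import BertNormalizer

    normalizer = BertNormalizer(lowercase=True)
    # with strip_accents left as None, accents are stripped because
    # lowercasing is enabled, so this typically yields "hello"
    normalizer.normalize_str("Héllò")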
|
||||
"""
|
||||
def __init__(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True):
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
||||
keep track of the alignment information. If you just want to see the result
|
||||
of the normalization on a raw string, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
||||
|
||||
Args:
|
||||
normalized (:class:`~tokenizers.NormalizedString`):
|
||||
The normalized string on which to apply this
|
||||
:class:`~tokenizers.normalizers.Normalizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
||||
information. If you need to get/convert offsets, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to normalize
|
||||
|
||||
Returns:
|
||||
:obj:`str`: A string after normalization
|
||||
"""
|
||||
pass
|
||||
|
||||
class ByteLevel(Normalizer):
|
||||
"""
|
||||
ByteLevel Normalizer
|
||||
"""
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
||||
keep track of the alignment information. If you just want to see the result
|
||||
of the normalization on a raw string, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
||||
|
||||
Args:
|
||||
normalized (:class:`~tokenizers.NormalizedString`):
|
||||
The normalized string on which to apply this
|
||||
:class:`~tokenizers.normalizers.Normalizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
||||
information. If you need to get/convert offsets, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to normalize
|
||||
|
||||
Returns:
|
||||
:obj:`str`: A string after normalization
|
||||
"""
|
||||
pass
|
||||
|
||||
class Lowercase(Normalizer):
|
||||
"""
|
||||
Lowercase Normalizer
|
||||
"""
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
||||
keep track of the alignment information. If you just want to see the result
|
||||
of the normalization on a raw string, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
||||
|
||||
Args:
|
||||
normalized (:class:`~tokenizers.NormalizedString`):
|
||||
The normalized string on which to apply this
|
||||
:class:`~tokenizers.normalizers.Normalizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
||||
information. If you need to get/convert offsets, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to normalize
|
||||
|
||||
Returns:
|
||||
:obj:`str`: A string after normalization
|
||||
"""
|
||||
pass
|
||||
|
||||
class NFC(Normalizer):
|
||||
"""
|
||||
NFC Unicode Normalizer
|
||||
"""
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
||||
keep track of the alignment information. If you just want to see the result
|
||||
of the normalization on a raw string, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
||||
|
||||
Args:
|
||||
normalized (:class:`~tokenizers.NormalizedString`):
|
||||
The normalized string on which to apply this
|
||||
:class:`~tokenizers.normalizers.Normalizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
||||
information. If you need to get/convert offsets, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to normalize
|
||||
|
||||
Returns:
|
||||
:obj:`str`: A string after normalization
|
||||
"""
|
||||
pass
|
||||
|
||||
class NFD(Normalizer):
|
||||
"""
|
||||
NFD Unicode Normalizer
|
||||
"""
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
||||
keep track of the alignment information. If you just want to see the result
|
||||
of the normalization on a raw string, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
||||
|
||||
Args:
|
||||
normalized (:class:`~tokenizers.NormalizedString`):
|
||||
The normalized string on which to apply this
|
||||
:class:`~tokenizers.normalizers.Normalizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
||||
information. If you need to get/convert offsets, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to normalize
|
||||
|
||||
Returns:
|
||||
:obj:`str`: A string after normalization
|
||||
"""
|
||||
pass
|
||||
|
||||
class NFKC(Normalizer):
|
||||
"""
|
||||
NFKC Unicode Normalizer
|
||||
"""
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
||||
keep track of the alignment information. If you just want to see the result
|
||||
of the normalization on a raw string, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
||||
|
||||
Args:
|
||||
normalized (:class:`~tokenizers.NormalizedString`):
|
||||
The normalized string on which to apply this
|
||||
:class:`~tokenizers.normalizers.Normalizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
||||
information. If you need to get/convert offsets, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to normalize
|
||||
|
||||
Returns:
|
||||
:obj:`str`: A string after normalization
|
||||
"""
|
||||
pass
|
||||
|
||||
class NFKD(Normalizer):
|
||||
"""
|
||||
NFKD Unicode Normalizer
|
||||
"""
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
||||
keep track of the alignment information. If you just want to see the result
|
||||
of the normalization on a raw string, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
||||
|
||||
Args:
|
||||
normalized (:class:`~tokenizers.NormalizedString`):
|
||||
The normalized string on which to apply this
|
||||
:class:`~tokenizers.normalizers.Normalizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
||||
information. If you need to get/convert offsets, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to normalize
|
||||
|
||||
Returns:
|
||||
:obj:`str`: A string after normalization
|
||||
"""
|
||||
pass
|
||||
|
||||
class Nmt(Normalizer):
|
||||
"""
|
||||
Nmt normalizer
|
||||
"""
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
||||
keep track of the alignment information. If you just want to see the result
|
||||
of the normalization on a raw string, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
||||
|
||||
Args:
|
||||
normalized (:class:`~tokenizers.NormalizedString`):
|
||||
The normalized string on which to apply this
|
||||
:class:`~tokenizers.normalizers.Normalizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
||||
information. If you need to get/convert offsets, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to normalize
|
||||
|
||||
Returns:
|
||||
:obj:`str`: A string after normalization
|
||||
"""
|
||||
pass
|
||||
|
||||
class Precompiled(Normalizer):
|
||||
"""
|
||||
Precompiled normalizer
|
||||
Don't use manually; it is used for compatibility with SentencePiece.
|
||||
"""
|
||||
def __init__(self, precompiled_charsmap):
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
||||
keep track of the alignment information. If you just want to see the result
|
||||
of the normalization on a raw string, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
||||
|
||||
Args:
|
||||
normalized (:class:`~tokenizers.NormalizedString`):
|
||||
The normalized string on which to apply this
|
||||
:class:`~tokenizers.normalizers.Normalizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
||||
information. If you need to get/convert offsets, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to normalize
|
||||
|
||||
Returns:
|
||||
:obj:`str`: A string after normalization
|
||||
"""
|
||||
pass
|
||||
|
||||
class Prepend(Normalizer):
|
||||
"""
|
||||
Prepend normalizer
|
||||
"""
|
||||
def __init__(self, prepend):
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
||||
keep track of the alignment information. If you just want to see the result
|
||||
of the normalization on a raw string, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
||||
|
||||
Args:
|
||||
normalized (:class:`~tokenizers.NormalizedString`):
|
||||
The normalized string on which to apply this
|
||||
:class:`~tokenizers.normalizers.Normalizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
||||
information. If you need to get/convert offsets, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to normalize
|
||||
|
||||
Returns:
|
||||
:obj:`str`: A string after normalization
|
||||
"""
|
||||
pass
|
||||
|
||||
class Replace(Normalizer):
|
||||
"""
|
||||
Replace normalizer
|
||||
"""
|
||||
def __init__(self, pattern, content):
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
||||
keep track of the alignment information. If you just want to see the result
|
||||
of the normalization on a raw string, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
||||
|
||||
Args:
|
||||
normalized (:class:`~tokenizers.NormalizedString`):
|
||||
The normalized string on which to apply this
|
||||
:class:`~tokenizers.normalizers.Normalizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
||||
information. If you need to get/convert offsets, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to normalize
|
||||
|
||||
Returns:
|
||||
:obj:`str`: A string after normalization
|
||||
"""
|
||||
pass
|
||||
|
||||
class Sequence(Normalizer):
|
||||
"""
|
||||
Allows concatenating multiple other Normalizers as a Sequence.
|
||||
All the normalizers run in sequence in the given order
|
||||
|
||||
Args:
|
||||
normalizers (:obj:`List[Normalizer]`):
|
||||
A list of Normalizer to be run as a sequence
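
Example — an illustrative sketch chaining three normalizers::

    from tokenizers import normalizers
    from tokenizers.normalizers import NFD, StripAccents, Lowercase

    normalizer = normalizers.Sequence([NFD(), StripAccents(), Lowercase()])
    normalizer.normalize_str("Héllò")  # typically "hello"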
|
||||
"""
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
||||
keep track of the alignment information. If you just want to see the result
|
||||
of the normalization on a raw string, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
||||
|
||||
Args:
|
||||
normalized (:class:`~tokenizers.NormalizedString`):
|
||||
The normalized string on which to apply this
|
||||
:class:`~tokenizers.normalizers.Normalizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
||||
information. If you need to get/convert offsets, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to normalize
|
||||
|
||||
Returns:
|
||||
:obj:`str`: A string after normalization
|
||||
"""
|
||||
pass
|
||||
|
||||
class Strip(Normalizer):
|
||||
"""
|
||||
Strip normalizer
|
||||
"""
|
||||
def __init__(self, left=True, right=True):
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
||||
keep track of the alignment information. If you just want to see the result
|
||||
of the normalization on a raw string, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
||||
|
||||
Args:
|
||||
normalized (:class:`~tokenizers.NormalizedString`):
|
||||
The normalized string on which to apply this
|
||||
:class:`~tokenizers.normalizers.Normalizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
||||
information. If you need to get/convert offsets, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to normalize
|
||||
|
||||
Returns:
|
||||
:obj:`str`: A string after normalization
|
||||
"""
|
||||
pass
|
||||
|
||||
class StripAccents(Normalizer):
|
||||
"""
|
||||
StripAccents normalizer
|
||||
"""
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
||||
keep track of the alignment information. If you just want to see the result
|
||||
of the normalization on a raw string, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
||||
|
||||
Args:
|
||||
normalized (:class:`~tokenizers.NormalizedString`):
|
||||
The normalized string on which to apply this
|
||||
:class:`~tokenizers.normalizers.Normalizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
||||
information. If you need to get/convert offsets, you can use
|
||||
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to normalize
|
||||
|
||||
Returns:
|
||||
:obj:`str`: A string after normalization
|
||||
"""
|
||||
pass
|
||||
Binary file not shown.
@@ -0,0 +1,15 @@
|
||||
# Generated content DO NOT EDIT
|
||||
from .. import pre_tokenizers
|
||||
|
||||
PreTokenizer = pre_tokenizers.PreTokenizer
|
||||
BertPreTokenizer = pre_tokenizers.BertPreTokenizer
|
||||
ByteLevel = pre_tokenizers.ByteLevel
|
||||
CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit
|
||||
Digits = pre_tokenizers.Digits
|
||||
Metaspace = pre_tokenizers.Metaspace
|
||||
Punctuation = pre_tokenizers.Punctuation
|
||||
Sequence = pre_tokenizers.Sequence
|
||||
Split = pre_tokenizers.Split
|
||||
UnicodeScripts = pre_tokenizers.UnicodeScripts
|
||||
Whitespace = pre_tokenizers.Whitespace
|
||||
WhitespaceSplit = pre_tokenizers.WhitespaceSplit
|
||||
@@ -0,0 +1,610 @@
|
||||
# Generated content DO NOT EDIT
|
||||
class PreTokenizer:
|
||||
"""
|
||||
Base class for all pre-tokenizers
|
||||
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of a
|
||||
PreTokenizer will return an instance of this class when instantiated.
|
||||
"""
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuple with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
class BertPreTokenizer(PreTokenizer):
|
||||
"""
|
||||
BertPreTokenizer
|
||||
|
||||
This pre-tokenizer splits tokens on spaces, and also on punctuation.
|
||||
Each occurrence of a punctuation character will be treated separately.
|
||||
"""
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuple with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
class ByteLevel(PreTokenizer):
|
||||
"""
|
||||
ByteLevel PreTokenizer
|
||||
|
||||
This pre-tokenizer takes care of replacing all bytes of the given string
|
||||
with a corresponding representation, as well as splitting into words.
|
||||
|
||||
Args:
|
||||
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether to add a space to the first word if there isn't already one. This
|
||||
lets us treat `hello` exactly like `say hello`.
|
||||
use_regex (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Set this to :obj:`False` to prevent this `pre_tokenizer` from using
|
||||
the GPT-2 specific regexp for splitting on whitespace.
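
Example — an illustrative sketch::

    from tokenizers.pre_tokenizers import ByteLevel

    pre_tokenizer = ByteLevel(add_prefix_space=True)
    # returns a list of (piece, offsets) tuples, with whitespace encoded
    # as the visible byte-level character "Ġ"
    pre_tokenizer.pre_tokenize_str("Hello world")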
|
||||
"""
|
||||
def __init__(self, add_prefix_space=True, use_regex=True):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def alphabet():
|
||||
"""
|
||||
Returns the alphabet used by this PreTokenizer.
|
||||
|
||||
Since the ByteLevel works as its name suggests, at the byte level, it
|
||||
encodes each byte value to a unique visible character. This means that there is a
|
||||
total of 256 different characters composing this alphabet.
|
||||
|
||||
Returns:
|
||||
:obj:`List[str]`: A list of characters that compose the alphabet
|
||||
"""
|
||||
pass
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuple with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
class CharDelimiterSplit(PreTokenizer):
|
||||
"""
|
||||
This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
|
||||
|
||||
Args:
|
||||
delimiter (:obj:`str`):
|
||||
The delimiter char that will be used to split input
|
||||
"""
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuple with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
class Digits(PreTokenizer):
|
||||
"""
|
||||
This pre-tokenizer simply splits numbers out into separate tokens
|
||||
|
||||
Args:
|
||||
individual_digits (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
If set to True, digits will each be separated as follows::
|
||||
|
||||
"Call 123 please" -> "Call ", "1", "2", "3", " please"
|
||||
|
||||
If set to False, digits will be grouped as follows::
|
||||
|
||||
"Call 123 please" -> "Call ", "123", " please"
|
||||
"""
|
||||
def __init__(self, individual_digits=False):
|
||||
pass
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuple with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
class Metaspace(PreTokenizer):
|
||||
"""
|
||||
Metaspace pre-tokenizer
|
||||
|
||||
This pre-tokenizer replaces any whitespace by the provided replacement character.
|
||||
It then tries to split on these spaces.
|
||||
|
||||
Args:
|
||||
replacement (:obj:`str`, `optional`, defaults to :obj:`▁`):
|
||||
The replacement character. Must be exactly one character. By default we
|
||||
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
|
||||
|
||||
prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
|
||||
Whether to add a space to the first word if there isn't already one. This
|
||||
lets us treat `hello` exactly like `say hello`.
|
||||
Choices: "always", "never", "first". First means the space is only added on the first
|
||||
token (relevant when special tokens are used or other pre_tokenizer are used).
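
Example — an illustrative sketch::

    from tokenizers.pre_tokenizers import Metaspace

    pre_tokenizer = Metaspace()
    # whitespace is replaced by the "▁" meta symbol before splitting,
    # yielding pieces like "▁Hello" and "▁world" with their offsets
    pre_tokenizer.pre_tokenize_str("Hello world")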
|
||||
|
||||
"""
|
||||
def __init__(self, replacement="▁", prepend_scheme="always", split=True):
|
||||
pass
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuple with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
class Punctuation(PreTokenizer):
|
||||
"""
|
||||
This pre-tokenizer simply splits on punctuation as individual characters.
|
||||
|
||||
Args:
|
||||
behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
|
||||
The behavior to use when splitting.
|
||||
Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
|
||||
"contiguous"
|
||||
"""
|
||||
def __init__(self, behavior="isolated"):
|
||||
pass
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuple with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
class Sequence(PreTokenizer):
|
||||
"""
|
||||
This pre-tokenizer composes other pre_tokenizers and applies them in sequence
|
||||
"""
|
||||
def __init__(self, pretokenizers):
|
||||
pass
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuple with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
class Split(PreTokenizer):
|
||||
"""
|
||||
Split PreTokenizer
|
||||
|
||||
This versatile pre-tokenizer splits using the provided pattern and
|
||||
according to the provided behavior. The pattern can be inverted by
|
||||
making use of the invert flag.
|
||||
|
||||
Args:
|
||||
pattern (:obj:`str` or :class:`~tokenizers.Regex`):
|
||||
A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`.
|
||||
If you want to use a regex pattern, it has to be wrapped around a `tokenizers.Regex`,
|
||||
otherwise we consider it as a string pattern. For example `pattern="|"`
|
||||
means you want to split on `|` (imagine a csv file for example), while
|
||||
`pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'.
|
||||
behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
|
||||
The behavior to use when splitting.
|
||||
Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
|
||||
"contiguous"
|
||||
|
||||
invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether to invert the pattern.
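
Example — an illustrative sketch of string vs. regex patterns::

    from tokenizers import Regex
    from tokenizers.pre_tokenizers import Split

    # split on a literal "|", keeping the delimiter as an isolated piece
    Split(pattern="|", behavior="isolated")
    # split on a regex (one or more digits), dropping the matched delimiter
    Split(pattern=Regex(r"\d+"), behavior="removed")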
|
||||
"""
|
||||
def __init__(self, pattern, behavior, invert=False):
|
||||
pass
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuple with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
class UnicodeScripts(PreTokenizer):
|
||||
"""
|
||||
This pre-tokenizer splits on characters that belong to different language families
|
||||
It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt
|
||||
Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
|
||||
This mimics the SentencePiece Unigram implementation.
|
||||
"""
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuple with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
class Whitespace(PreTokenizer):
|
||||
"""
|
||||
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
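
Example — an illustrative sketch::

    from tokenizers.pre_tokenizers import Whitespace

    Whitespace().pre_tokenize_str("Hello, world!")
    # roughly: [("Hello", (0, 5)), (",", (5, 6)), ("world", (7, 12)), ("!", (12, 13))]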
|
||||
"""
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
|
||||
    def pre_tokenize_str(self, sequence):
        """
        Pre-tokenize the given string

        This method provides a way to visualize the effect of a
        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
        alignment, nor does it provide all the capabilities of the
        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`

        Args:
            sequence (:obj:`str`):
                A string to pre-tokenize

        Returns:
            :obj:`List[Tuple[str, Offsets]]`:
                A list of tuples with the pre-tokenized parts and their offsets
        """
        pass

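# Illustrative sketch (not part of the generated stub above): how
# `Whitespace.pre_tokenize_str` behaves on a small input, assuming the
# `tokenizers` package is installed.
from tokenizers.pre_tokenizers import Whitespace

print(Whitespace().pre_tokenize_str("Hello, world!"))
# [('Hello', (0, 5)), (',', (5, 6)), ('world', (7, 12)), ('!', (12, 13))]
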
class WhitespaceSplit(PreTokenizer):
    """
    This pre-tokenizer simply splits on whitespace. Works like `.split()`
    """
    def __init__(self):
        pass

    def pre_tokenize(self, pretok):
        """
        Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place

        This method allows you to modify a :class:`~tokenizers.PreTokenizedString` to
        keep track of the pre-tokenization, and leverage the capabilities of the
        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
        the pre-tokenization of a raw string, you can use
        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`

        Args:
            pretok (:class:`~tokenizers.PreTokenizedString`):
                The pre-tokenized string on which to apply this
                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
        """
        pass

    def pre_tokenize_str(self, sequence):
        """
        Pre-tokenize the given string

        This method provides a way to visualize the effect of a
        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
        alignment, nor does it provide all the capabilities of the
        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`

        Args:
            sequence (:obj:`str`):
                A string to pre-tokenize

        Returns:
            :obj:`List[Tuple[str, Offsets]]`:
                A list of tuples with the pre-tokenized parts and their offsets
        """
        pass
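# Illustrative sketch: unlike `Whitespace` above, `WhitespaceSplit` keeps punctuation
# attached to the surrounding word, mirroring `str.split()`.
from tokenizers.pre_tokenizers import WhitespaceSplit

print(WhitespaceSplit().pre_tokenize_str("Hello, world!"))
# [('Hello,', (0, 6)), ('world!', (7, 13))]
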
Binary file not shown.
@@ -0,0 +1,9 @@
# Generated content DO NOT EDIT
from .. import processors

PostProcessor = processors.PostProcessor
BertProcessing = processors.BertProcessing
ByteLevel = processors.ByteLevel
RobertaProcessing = processors.RobertaProcessing
Sequence = processors.Sequence
TemplateProcessing = processors.TemplateProcessing
@@ -0,0 +1,342 @@
|
||||
# Generated content DO NOT EDIT
|
||||
class PostProcessor:
|
||||
"""
|
||||
Base class for all post-processors
|
||||
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of
|
||||
a PostProcessor will return an instance of this class when instantiated.
|
||||
"""
|
||||
def num_special_tokens_to_add(self, is_pair):
|
||||
"""
|
||||
Return the number of special tokens that would be added for single/pair sentences.
|
||||
|
||||
Args:
|
||||
is_pair (:obj:`bool`):
|
||||
Whether the input would be a pair of sequences
|
||||
|
||||
Returns:
|
||||
:obj:`int`: The number of tokens to add
|
||||
"""
|
||||
pass
|
||||
|
||||
def process(self, encoding, pair=None, add_special_tokens=True):
|
||||
"""
|
||||
Post-process the given encodings, generating the final one
|
||||
|
||||
Args:
|
||||
encoding (:class:`~tokenizers.Encoding`):
|
||||
The encoding for the first sequence
|
||||
|
||||
pair (:class:`~tokenizers.Encoding`, `optional`):
|
||||
The encoding for the pair sequence
|
||||
|
||||
add_special_tokens (:obj:`bool`):
|
||||
Whether to add the special tokens
|
||||
|
||||
Return:
|
||||
:class:`~tokenizers.Encoding`: The final encoding
|
||||
"""
|
||||
pass
|
||||
|
||||
class BertProcessing(PostProcessor):
|
||||
"""
|
||||
This post-processor takes care of adding the special tokens needed by
|
||||
a Bert model:
|
||||
|
||||
- a SEP token
|
||||
- a CLS token
|
||||
|
||||
Args:
|
||||
sep (:obj:`Tuple[str, int]`):
|
||||
A tuple with the string representation of the SEP token, and its id
|
||||
|
||||
cls (:obj:`Tuple[str, int]`):
|
||||
A tuple with the string representation of the CLS token, and its id
|
||||
"""
|
||||
def __init__(self, sep, cls):
|
||||
pass
|
||||
|
||||
def num_special_tokens_to_add(self, is_pair):
|
||||
"""
|
||||
Return the number of special tokens that would be added for single/pair sentences.
|
||||
|
||||
Args:
|
||||
is_pair (:obj:`bool`):
|
||||
Whether the input would be a pair of sequences
|
||||
|
||||
Returns:
|
||||
:obj:`int`: The number of tokens to add
|
||||
"""
|
||||
pass
|
||||
|
||||
def process(self, encoding, pair=None, add_special_tokens=True):
|
||||
"""
|
||||
Post-process the given encodings, generating the final one
|
||||
|
||||
Args:
|
||||
encoding (:class:`~tokenizers.Encoding`):
|
||||
The encoding for the first sequence
|
||||
|
||||
pair (:class:`~tokenizers.Encoding`, `optional`):
|
||||
The encoding for the pair sequence
|
||||
|
||||
add_special_tokens (:obj:`bool`):
|
||||
Whether to add the special tokens
|
||||
|
||||
Return:
|
||||
:class:`~tokenizers.Encoding`: The final encoding
|
||||
"""
|
||||
pass
|
||||
|
||||
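# Illustrative sketch: wiring `BertProcessing` into a tokenizer so that `encode()`
# adds the special tokens automatically. The ids 101/102 are hypothetical and must
# match the vocabulary of the model actually used.
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.processors import BertProcessing

tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.post_processor = BertProcessing(sep=("[SEP]", 102), cls=("[CLS]", 101))
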
class ByteLevel(PostProcessor):
|
||||
"""
|
||||
This post-processor takes care of trimming the offsets.
|
||||
|
||||
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
|
||||
want the offsets to include these whitespaces, then this PostProcessor must be used.
|
||||
|
||||
Args:
|
||||
trim_offsets (:obj:`bool`):
|
||||
Whether to trim the whitespaces from the produced offsets.
|
||||
"""
|
||||
def __init__(self, trim_offsets=True):
|
||||
pass
|
||||
|
||||
def num_special_tokens_to_add(self, is_pair):
|
||||
"""
|
||||
Return the number of special tokens that would be added for single/pair sentences.
|
||||
|
||||
Args:
|
||||
is_pair (:obj:`bool`):
|
||||
Whether the input would be a pair of sequences
|
||||
|
||||
Returns:
|
||||
:obj:`int`: The number of tokens to add
|
||||
"""
|
||||
pass
|
||||
|
||||
def process(self, encoding, pair=None, add_special_tokens=True):
|
||||
"""
|
||||
Post-process the given encodings, generating the final one
|
||||
|
||||
Args:
|
||||
encoding (:class:`~tokenizers.Encoding`):
|
||||
The encoding for the first sequence
|
||||
|
||||
pair (:class:`~tokenizers.Encoding`, `optional`):
|
||||
The encoding for the pair sequence
|
||||
|
||||
add_special_tokens (:obj:`bool`):
|
||||
Whether to add the special tokens
|
||||
|
||||
Return:
|
||||
:class:`~tokenizers.Encoding`: The final encoding
|
||||
"""
|
||||
pass
|
||||
|
||||
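# Illustrative sketch: the ByteLevel post-processor is typically paired with the
# ByteLevel pre-tokenizer so that offsets returned by `encode()` exclude the leading
# whitespace that byte-level BPE folds into its tokens.
from tokenizers import Tokenizer, models, pre_tokenizers, processors

tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
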
class RobertaProcessing(PostProcessor):
|
||||
"""
|
||||
This post-processor takes care of adding the special tokens needed by
|
||||
a Roberta model:
|
||||
|
||||
- a SEP token
|
||||
- a CLS token
|
||||
|
||||
It also takes care of trimming the offsets.
|
||||
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
|
||||
want the offsets to include these whitespaces, then this PostProcessor should be initialized
|
||||
with :obj:`trim_offsets=True`
|
||||
|
||||
Args:
|
||||
sep (:obj:`Tuple[str, int]`):
|
||||
A tuple with the string representation of the SEP token, and its id
|
||||
|
||||
cls (:obj:`Tuple[str, int]`):
|
||||
A tuple with the string representation of the CLS token, and its id
|
||||
|
||||
trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether to trim the whitespaces from the produced offsets.
|
||||
|
||||
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether the add_prefix_space option was enabled during pre-tokenization. This
|
||||
is relevant because it defines the way the offsets are trimmed out.
|
||||
"""
|
||||
def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True):
|
||||
pass
|
||||
|
||||
def num_special_tokens_to_add(self, is_pair):
|
||||
"""
|
||||
Return the number of special tokens that would be added for single/pair sentences.
|
||||
|
||||
Args:
|
||||
is_pair (:obj:`bool`):
|
||||
Whether the input would be a pair of sequences
|
||||
|
||||
Returns:
|
||||
:obj:`int`: The number of tokens to add
|
||||
"""
|
||||
pass
|
||||
|
||||
def process(self, encoding, pair=None, add_special_tokens=True):
|
||||
"""
|
||||
Post-process the given encodings, generating the final one
|
||||
|
||||
Args:
|
||||
encoding (:class:`~tokenizers.Encoding`):
|
||||
The encoding for the first sequence
|
||||
|
||||
pair (:class:`~tokenizers.Encoding`, `optional`):
|
||||
The encoding for the pair sequence
|
||||
|
||||
add_special_tokens (:obj:`bool`):
|
||||
Whether to add the special tokens
|
||||
|
||||
Return:
|
||||
:class:`~tokenizers.Encoding`: The final encoding
|
||||
"""
|
||||
pass
|
||||
|
||||
class Sequence(PostProcessor):
|
||||
"""
|
||||
Sequence Processor
|
||||
|
||||
Args:
|
||||
processors (:obj:`List[PostProcessor]`):
The processors that need to be chained
|
||||
"""
|
||||
def __init__(self, processors):
|
||||
pass
|
||||
|
||||
def num_special_tokens_to_add(self, is_pair):
|
||||
"""
|
||||
Return the number of special tokens that would be added for single/pair sentences.
|
||||
|
||||
Args:
|
||||
is_pair (:obj:`bool`):
|
||||
Whether the input would be a pair of sequences
|
||||
|
||||
Returns:
|
||||
:obj:`int`: The number of tokens to add
|
||||
"""
|
||||
pass
|
||||
|
||||
def process(self, encoding, pair=None, add_special_tokens=True):
|
||||
"""
|
||||
Post-process the given encodings, generating the final one
|
||||
|
||||
Args:
|
||||
encoding (:class:`~tokenizers.Encoding`):
|
||||
The encoding for the first sequence
|
||||
|
||||
pair (:class:`~tokenizers.Encoding`, `optional`):
|
||||
The encoding for the pair sequence
|
||||
|
||||
add_special_tokens (:obj:`bool`):
|
||||
Whether to add the special tokens
|
||||
|
||||
Return:
|
||||
:class:`~tokenizers.Encoding`: The final encoding
|
||||
"""
|
||||
pass
|
||||
|
||||
class TemplateProcessing(PostProcessor):
|
||||
"""
|
||||
Provides a way to specify templates in order to add the special tokens to each
|
||||
input sequence as relevant.
|
||||
|
||||
Let's take the :obj:`BERT` tokenizer as an example. It uses two special tokens to
delimit each sequence. :obj:`[CLS]` is always used at the beginning of the first
|
||||
sequence, and :obj:`[SEP]` is added at the end of both the first, and the pair
|
||||
sequences. The final result looks like this:
|
||||
|
||||
- Single sequence: :obj:`[CLS] Hello there [SEP]`
|
||||
- Pair sequences: :obj:`[CLS] My name is Anthony [SEP] What is my name? [SEP]`
|
||||
|
||||
With the type ids as follows::
|
||||
|
||||
[CLS] ... [SEP] ... [SEP]
|
||||
0 0 0 1 1
|
||||
|
||||
You can achieve such behavior using a TemplateProcessing::
|
||||
|
||||
TemplateProcessing(
|
||||
single="[CLS] $0 [SEP]",
|
||||
pair="[CLS] $A [SEP] $B:1 [SEP]:1",
|
||||
special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
|
||||
)
|
||||
|
||||
In this example, each input sequence is identified using a ``$`` construct. This identifier
|
||||
lets us specify each input sequence, and the type_id to use. When nothing is specified,
|
||||
it uses the default values. Here are the different ways to specify it:
|
||||
|
||||
- Specifying the sequence, with default ``type_id == 0``: ``$A`` or ``$B``
|
||||
- Specifying the `type_id` with default ``sequence == A``: ``$0``, ``$1``, ``$2``, ...
|
||||
- Specifying both: ``$A:0``, ``$B:1``, ...
|
||||
|
||||
The same construct is used for special tokens: ``<identifier>(:<type_id>)?``.
|
||||
|
||||
**Warning**: You must ensure that you are giving the correct tokens/ids as these
|
||||
will be added to the Encoding without any further check. If the given ids correspond
|
||||
to something totally different in a `Tokenizer` using this `PostProcessor`, it
|
||||
might lead to unexpected results.
|
||||
|
||||
Args:
|
||||
single (:obj:`Template`):
|
||||
The template used for single sequences
|
||||
|
||||
pair (:obj:`Template`):
|
||||
The template used when both sequences are specified
|
||||
|
||||
special_tokens (:obj:`Tokens`):
|
||||
The list of special tokens used in each sequence
|
||||
|
||||
Types:
|
||||
|
||||
Template (:obj:`str` or :obj:`List`):
|
||||
- If a :obj:`str` is provided, whitespace is used as the delimiter between tokens
|
||||
- If a :obj:`List[str]` is provided, a list of tokens
|
||||
|
||||
Tokens (:obj:`List[Union[Tuple[int, str], Tuple[str, int], dict]]`):
|
||||
- A :obj:`Tuple` with both a token and its associated ID, in any order
|
||||
- A :obj:`dict` with the following keys:
|
||||
- "id": :obj:`str` => The special token id, as specified in the Template
|
||||
- "ids": :obj:`List[int]` => The associated IDs
|
||||
- "tokens": :obj:`List[str]` => The associated tokens
|
||||
|
||||
The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have
|
||||
the same length.
|
||||
"""
|
||||
def __init__(self, single, pair, special_tokens):
|
||||
pass
|
||||
|
||||
def num_special_tokens_to_add(self, is_pair):
|
||||
"""
|
||||
Return the number of special tokens that would be added for single/pair sentences.
|
||||
|
||||
Args:
|
||||
is_pair (:obj:`bool`):
|
||||
Whether the input would be a pair of sequences
|
||||
|
||||
Returns:
|
||||
:obj:`int`: The number of tokens to add
|
||||
"""
|
||||
pass
|
||||
|
||||
def process(self, encoding, pair=None, add_special_tokens=True):
|
||||
"""
|
||||
Post-process the given encodings, generating the final one
|
||||
|
||||
Args:
|
||||
encoding (:class:`~tokenizers.Encoding`):
|
||||
The encoding for the first sequence
|
||||
|
||||
pair (:class:`~tokenizers.Encoding`, `optional`):
|
||||
The encoding for the pair sequence
|
||||
|
||||
add_special_tokens (:obj:`bool`):
|
||||
Whether to add the special tokens
|
||||
|
||||
Return:
|
||||
:class:`~tokenizers.Encoding`: The final encoding
|
||||
"""
|
||||
pass
|
||||
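# Illustrative sketch: the template from the docstring above, attached to a tokenizer.
# The BPE model here is only a stand-in, and the ids 1 and 0 must match the ids of
# [CLS] and [SEP] in the vocabulary actually used.
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.processors import TemplateProcessing

tokenizer = Tokenizer(BPE())  # use your own trained tokenizer here
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $0 [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
)
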
Binary file not shown.
BIN
.venv/lib/python3.10/site-packages/tokenizers/tokenizers.abi3.so
Executable file
Binary file not shown.
@@ -0,0 +1 @@
|
||||
from .visualizer import Annotation, EncodingVisualizer
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,170 @@
|
||||
.tokenized-text {
|
||||
width:100%;
|
||||
padding:2rem;
|
||||
max-height: 400px;
|
||||
overflow-y: auto;
|
||||
box-sizing:border-box;
|
||||
line-height:4rem; /* Lots of space between lines */
|
||||
font-family: "Roboto Light", "Ubuntu Light", "Ubuntu", monospace;
|
||||
box-shadow: 2px 2px 2px rgba(0,0,0,0.2);
|
||||
background-color: rgba(0,0,0,0.01);
|
||||
letter-spacing:2px; /* Give some extra separation between chars */
|
||||
}
|
||||
.non-token{
|
||||
/* White space and other things the tokenizer ignores*/
|
||||
white-space: pre;
|
||||
letter-spacing:4px;
|
||||
border-top:1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more obvious */
|
||||
border-bottom:1px solid #A0A0A0;
|
||||
line-height: 1rem;
|
||||
height: calc(100% - 2px);
|
||||
}
|
||||
|
||||
.token {
|
||||
white-space: pre;
|
||||
position:relative;
|
||||
color:black;
|
||||
letter-spacing:2px;
|
||||
}
|
||||
|
||||
.annotation{
|
||||
white-space:nowrap; /* Important - ensures that annotations appears even if the annotated text wraps a line */
|
||||
border-radius:4px;
|
||||
position:relative;
|
||||
width:fit-content;
|
||||
}
|
||||
.annotation:before {
|
||||
/*The before holds the text and the after holds the background*/
|
||||
z-index:1000; /* Make sure this is above the background */
|
||||
content:attr(data-label); /* The annotations label is on a data attribute */
|
||||
color:white;
|
||||
position:absolute;
|
||||
font-size:1rem;
|
||||
text-align:center;
|
||||
font-weight:bold;
|
||||
|
||||
top:1.75rem;
|
||||
line-height:0;
|
||||
left:0;
|
||||
width:100%;
|
||||
padding:0.5rem 0;
|
||||
/* These make it so an annotation doesn't stretch beyond the annotated text if the label is longer*/
|
||||
overflow: hidden;
|
||||
white-space: nowrap;
|
||||
text-overflow:ellipsis;
|
||||
}
|
||||
|
||||
.annotation:after {
|
||||
content:attr(data-label); /* The content defines the width of the annotation*/
|
||||
position:absolute;
|
||||
font-size:0.75rem;
|
||||
text-align:center;
|
||||
font-weight:bold;
|
||||
text-overflow:ellipsis;
|
||||
top:1.75rem;
|
||||
line-height:0;
|
||||
overflow: hidden;
|
||||
white-space: nowrap;
|
||||
|
||||
left:0;
|
||||
width:100%; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/
|
||||
|
||||
padding:0.5rem 0;
|
||||
/* Nasty hack below:
|
||||
We set the annotations color in code because we don't know the colors at css time.
|
||||
But you can't pass a color as a data attribute to get it into the pseudo element (this thing)
|
||||
So to get around that, annotations have the color set on them with a style attribute and then we
|
||||
can get the color with currentColor.
|
||||
Annotations wrap tokens and tokens set the color back to black
|
||||
*/
|
||||
background-color: currentColor;
|
||||
}
|
||||
.annotation:hover::after, .annotation:hover::before{
|
||||
/* When the user hovers over an annotation expand the label to display in full
|
||||
*/
|
||||
min-width: fit-content;
|
||||
}
|
||||
|
||||
.annotation:hover{
|
||||
/* Emphasize the annotation start end with a border on hover*/
|
||||
border-color: currentColor;
|
||||
border: 2px solid;
|
||||
}
|
||||
.special-token:not(:empty){
|
||||
/*
|
||||
A non-empty special token is like UNK (as opposed to CLS, which has no representation in the text)
|
||||
*/
|
||||
position:relative;
|
||||
}
|
||||
.special-token:empty::before{
|
||||
/* Special tokens that don't have text are displayed as pseudo elements so we don't select them with the mouse */
|
||||
content:attr(data-stok);
|
||||
background:#202020;
|
||||
font-size:0.75rem;
|
||||
color:white;
|
||||
margin: 0 0.25rem;
|
||||
padding: 0.25rem;
|
||||
border-radius:4px
|
||||
}
|
||||
|
||||
.special-token:not(:empty):before {
|
||||
/* Special tokens that have text (UNK) are displayed above the actual text*/
|
||||
content:attr(data-stok);
|
||||
position:absolute;
|
||||
bottom:1.75rem;
|
||||
min-width:100%;
|
||||
width:100%;
|
||||
height:1rem;
|
||||
line-height:1rem;
|
||||
font-size:1rem;
|
||||
text-align:center;
|
||||
color:white;
|
||||
font-weight:bold;
|
||||
background:#202020;
|
||||
border-radius:10%;
|
||||
}
|
||||
/*
|
||||
We want to alternate the color of tokens, but we can't use nth child because tokens might be broken up by annotations
|
||||
instead we apply even and odd class at generation time and color them that way
|
||||
*/
|
||||
.even-token{
|
||||
background:#DCDCDC ;
|
||||
border: 1px solid #DCDCDC;
|
||||
}
|
||||
.odd-token{
|
||||
background:#A0A0A0;
|
||||
border: 1px solid #A0A0A0;
|
||||
}
|
||||
.even-token.multi-token,.odd-token.multi-token{
|
||||
background: repeating-linear-gradient(
|
||||
45deg,
|
||||
transparent,
|
||||
transparent 1px,
|
||||
#ccc 1px,
|
||||
#ccc 1px
|
||||
),
|
||||
/* on "bottom" */
|
||||
linear-gradient(
|
||||
to bottom,
|
||||
#FFB6C1,
|
||||
#999
|
||||
);
|
||||
}
|
||||
|
||||
.multi-token:hover::after {
|
||||
content:"This char has more than 1 token"; /* The content defines the width of the annotation*/
|
||||
color:white;
|
||||
background-color: black;
|
||||
position:absolute;
|
||||
font-size:0.75rem;
|
||||
text-align:center;
|
||||
font-weight:bold;
|
||||
text-overflow:ellipsis;
|
||||
top:1.75rem;
|
||||
line-height:0;
|
||||
overflow: hidden;
|
||||
white-space: nowrap;
|
||||
left:0;
|
||||
width:fit-content; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/
|
||||
padding:0.5rem 0;
|
||||
}
|
||||
@@ -0,0 +1,403 @@
|
||||
import itertools
|
||||
import os
|
||||
import re
|
||||
from string import Template
|
||||
from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple
|
||||
|
||||
from tokenizers import Encoding, Tokenizer
|
||||
|
||||
|
||||
dirname = os.path.dirname(__file__)
|
||||
css_filename = os.path.join(dirname, "visualizer-styles.css")
|
||||
with open(css_filename) as f:
|
||||
css = f.read()
|
||||
|
||||
|
||||
class Annotation:
|
||||
start: int
|
||||
end: int
|
||||
label: str
|
||||
|
||||
def __init__(self, start: int, end: int, label: str):
|
||||
self.start = start
|
||||
self.end = end
|
||||
self.label = label
|
||||
|
||||
|
||||
AnnotationList = List[Annotation]
|
||||
PartialIntList = List[Optional[int]]
|
||||
|
||||
|
||||
class CharStateKey(NamedTuple):
|
||||
token_ix: Optional[int]
|
||||
anno_ix: Optional[int]
|
||||
|
||||
|
||||
class CharState:
|
||||
char_ix: Optional[int]
|
||||
|
||||
def __init__(self, char_ix):
|
||||
self.char_ix = char_ix
|
||||
|
||||
self.anno_ix: Optional[int] = None
|
||||
self.tokens: List[int] = []
|
||||
|
||||
@property
|
||||
def token_ix(self):
|
||||
return self.tokens[0] if len(self.tokens) > 0 else None
|
||||
|
||||
@property
|
||||
def is_multitoken(self):
|
||||
"""
|
||||
BPE tokenizers can output more than one token for a char
|
||||
"""
|
||||
return len(self.tokens) > 1
|
||||
|
||||
def partition_key(self) -> CharStateKey:
|
||||
return CharStateKey(
|
||||
token_ix=self.token_ix,
|
||||
anno_ix=self.anno_ix,
|
||||
)
|
||||
|
||||
|
||||
class Aligned:
|
||||
pass
|
||||
|
||||
|
||||
class EncodingVisualizer:
|
||||
"""
|
||||
Build an EncodingVisualizer
|
||||
|
||||
Args:
|
||||
|
||||
tokenizer (:class:`~tokenizers.Tokenizer`):
|
||||
A tokenizer instance
|
||||
|
||||
default_to_notebook (:obj:`bool`):
|
||||
Whether to render html output in a notebook by default
|
||||
|
||||
annotation_converter (:obj:`Callable`, `optional`):
|
||||
An optional (lambda) function that takes an annotation in any format and returns
|
||||
an Annotation object
|
||||
"""
|
||||
|
||||
unk_token_regex = re.compile(r"(.{1}\b)?(unk|oov)(\b.{1})?", flags=re.IGNORECASE)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
tokenizer: Tokenizer,
|
||||
default_to_notebook: bool = True,
|
||||
annotation_converter: Optional[Callable[[Any], Annotation]] = None,
|
||||
):
|
||||
if default_to_notebook:
|
||||
try:
|
||||
from IPython.core.display import HTML, display
|
||||
except ImportError:
|
||||
raise Exception(
|
||||
"""We couldn't import IPython utils for html display.
|
||||
Are you running in a notebook?
|
||||
You can also pass `default_to_notebook=False` to get back raw HTML
|
||||
"""
|
||||
)
|
||||
|
||||
self.tokenizer = tokenizer
|
||||
self.default_to_notebook = default_to_notebook
|
||||
self.annotation_converter = annotation_converter
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
text: str,
|
||||
annotations: AnnotationList = [],
|
||||
default_to_notebook: Optional[bool] = None,
|
||||
) -> Optional[str]:
|
||||
"""
|
||||
Build a visualization of the given text
|
||||
|
||||
Args:
|
||||
text (:obj:`str`):
|
||||
The text to tokenize
|
||||
|
||||
annotations (:obj:`List[Annotation]`, `optional`):
|
||||
An optional list of annotations of the text. They can either be an annotation class
|
||||
or anything else if you instantiated the visualizer with a converter function
|
||||
|
||||
default_to_notebook (:obj:`bool`, `optional`, defaults to the value set at construction):
|
||||
If True, will render the html in a notebook. Otherwise returns an html string.
|
||||
|
||||
Returns:
|
||||
The HTML string if default_to_notebook is False, otherwise (default) returns None and
|
||||
renders the HTML in the notebook
|
||||
|
||||
"""
|
||||
final_default_to_notebook = self.default_to_notebook
|
||||
if default_to_notebook is not None:
|
||||
final_default_to_notebook = default_to_notebook
|
||||
if final_default_to_notebook:
|
||||
try:
|
||||
from IPython.core.display import HTML, display
|
||||
except ImportError:
|
||||
raise Exception(
|
||||
"""We couldn't import IPython utils for html display.
|
||||
Are you running in a notebook?"""
|
||||
)
|
||||
if self.annotation_converter is not None:
    annotations = list(map(self.annotation_converter, annotations))
|
||||
encoding = self.tokenizer.encode(text)
|
||||
html = EncodingVisualizer.__make_html(text, encoding, annotations)
|
||||
if final_default_to_notebook:
|
||||
display(HTML(html))
|
||||
else:
|
||||
return html
|
||||
|
||||
@staticmethod
|
||||
def calculate_label_colors(annotations: AnnotationList) -> Dict[str, str]:
|
||||
"""
|
||||
Generates a color palette for all the labels in a given set of annotations
|
||||
|
||||
Args:
|
||||
annotations (:obj:`Annotation`):
|
||||
A list of annotations
|
||||
|
||||
Returns:
|
||||
:obj:`dict`: A dictionary mapping labels to colors in HSL format
|
||||
"""
|
||||
if len(annotations) == 0:
|
||||
return {}
|
||||
labels = set(map(lambda x: x.label, annotations))
|
||||
num_labels = len(labels)
|
||||
h_step = int(255 / num_labels)
|
||||
if h_step < 20:
|
||||
h_step = 20
|
||||
s = 32
|
||||
l = 64 # noqa: E741
|
||||
h = 10
|
||||
colors = {}
|
||||
|
||||
for label in sorted(labels): # sort so we always get the same colors for a given set of labels
|
||||
colors[label] = f"hsl({h},{s}%,{l}%)"
|
||||
h += h_step
|
||||
return colors
|
||||
|
||||
@staticmethod
|
||||
def consecutive_chars_to_html(
|
||||
consecutive_chars_list: List[CharState],
|
||||
text: str,
|
||||
encoding: Encoding,
|
||||
):
|
||||
"""
|
||||
Converts a list of "consecutive chars" into a single HTML element.
|
||||
Chars are consecutive if they fall under the same word, token and annotation.
|
||||
The CharState class has a "partition_key" method that makes it easy to
|
||||
compare if two chars are consecutive.
|
||||
|
||||
Args:
|
||||
consecutive_chars_list (:obj:`List[CharState]`):
|
||||
A list of CharStates that have been grouped together
|
||||
|
||||
text (:obj:`str`):
|
||||
The original text being processed
|
||||
|
||||
encoding (:class:`~tokenizers.Encoding`):
|
||||
The encoding returned from the tokenizer
|
||||
|
||||
Returns:
|
||||
:obj:`str`: The HTML span for a set of consecutive chars
|
||||
"""
|
||||
first = consecutive_chars_list[0]
|
||||
if first.char_ix is None:
|
||||
# it's a special token
stoken = encoding.tokens[first.token_ix]
# special tokens are represented as empty spans. We use the data attribute and css
# magic to display it
return f'<span class="special-token" data-stok="{stoken}"></span>'
|
||||
# We're not in a special token so this group has a start and end.
|
||||
last = consecutive_chars_list[-1]
|
||||
start = first.char_ix
|
||||
end = last.char_ix + 1
|
||||
span_text = text[start:end]
|
||||
css_classes = [] # What css classes will we apply on the resulting span
|
||||
data_items = {} # What data attributes will we apply on the result span
|
||||
if first.token_ix is not None:
|
||||
# We can either be in a token or not (e.g. in white space)
|
||||
css_classes.append("token")
|
||||
if first.is_multitoken:
|
||||
css_classes.append("multi-token")
|
||||
if first.token_ix % 2:
|
||||
# We use this to color alternating tokens.
|
||||
# A token might be split by an annotation that ends in the middle of it, so this
|
||||
# lets us visually indicate a consecutive token despite its possible splitting in
|
||||
# the html markup
|
||||
css_classes.append("odd-token")
|
||||
else:
|
||||
# Like above, but a different color so we can see the tokens alternate
|
||||
css_classes.append("even-token")
|
||||
if EncodingVisualizer.unk_token_regex.search(encoding.tokens[first.token_ix]) is not None:
|
||||
# This is a special token that is in the text. probably UNK
|
||||
css_classes.append("special-token")
|
||||
# TODO is this the right name for the data attribute ?
|
||||
data_items["stok"] = encoding.tokens[first.token_ix]
|
||||
else:
|
||||
# In this case we are looking at a group/single char that is not tokenized.
|
||||
# e.g. white space
|
||||
css_classes.append("non-token")
|
||||
css = f'''class="{" ".join(css_classes)}"'''
|
||||
data = ""
|
||||
for key, val in data_items.items():
|
||||
data += f' data-{key}="{val}"'
|
||||
return f"<span {css} {data} >{span_text}</span>"
|
||||
|
||||
@staticmethod
|
||||
def __make_html(text: str, encoding: Encoding, annotations: AnnotationList) -> str:
|
||||
char_states = EncodingVisualizer.__make_char_states(text, encoding, annotations)
|
||||
current_consecutive_chars = [char_states[0]]
|
||||
prev_anno_ix = char_states[0].anno_ix
|
||||
spans = []
|
||||
label_colors_dict = EncodingVisualizer.calculate_label_colors(annotations)
|
||||
cur_anno_ix = char_states[0].anno_ix
|
||||
if cur_anno_ix is not None:
|
||||
# If we started in an annotation make a span for it
|
||||
anno = annotations[cur_anno_ix]
|
||||
label = anno.label
|
||||
color = label_colors_dict[label]
|
||||
spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
|
||||
|
||||
for cs in char_states[1:]:
|
||||
cur_anno_ix = cs.anno_ix
|
||||
if cur_anno_ix != prev_anno_ix:
|
||||
# If we've transitioned in or out of an annotation
|
||||
spans.append(
|
||||
# Create a span from the current consecutive characters
|
||||
EncodingVisualizer.consecutive_chars_to_html(
|
||||
current_consecutive_chars,
|
||||
text=text,
|
||||
encoding=encoding,
|
||||
)
|
||||
)
|
||||
current_consecutive_chars = [cs]
|
||||
|
||||
if prev_anno_ix is not None:
|
||||
# if we transitioned out of an annotation, close its span
|
||||
spans.append("</span>")
|
||||
if cur_anno_ix is not None:
|
||||
# If we entered a new annotation make a span for it
|
||||
anno = annotations[cur_anno_ix]
|
||||
label = anno.label
|
||||
color = label_colors_dict[label]
|
||||
spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
|
||||
prev_anno_ix = cur_anno_ix
|
||||
|
||||
if cs.partition_key() == current_consecutive_chars[0].partition_key():
|
||||
# If the current character is in the same "group" as the previous one
|
||||
current_consecutive_chars.append(cs)
|
||||
else:
|
||||
# Otherwise we make a span for the previous group
|
||||
spans.append(
|
||||
EncodingVisualizer.consecutive_chars_to_html(
|
||||
current_consecutive_chars,
|
||||
text=text,
|
||||
encoding=encoding,
|
||||
)
|
||||
)
|
||||
# And reset the consecutive_char_list to form a new group
|
||||
current_consecutive_chars = [cs]
|
||||
# All that's left is to fill out the final span
|
||||
# TODO I think there is an edge case here where an annotation's span might not close
|
||||
spans.append(
|
||||
EncodingVisualizer.consecutive_chars_to_html(
|
||||
current_consecutive_chars,
|
||||
text=text,
|
||||
encoding=encoding,
|
||||
)
|
||||
)
|
||||
res = HTMLBody(spans) # Send the list of spans to the body of our html
|
||||
return res
|
||||
|
||||
@staticmethod
|
||||
def __make_anno_map(text: str, annotations: AnnotationList) -> PartialIntList:
|
||||
"""
|
||||
Args:
|
||||
text (:obj:`str`):
|
||||
The raw text we want to align to
|
||||
|
||||
annotations (:obj:`AnnotationList`):
|
||||
A (possibly empty) list of annotations
|
||||
|
||||
Returns:
|
||||
A list of length len(text) whose entry at index i is None if there is no annotation on
character i, or k, the index of the annotation that covers character i (where k is an
index into the list of annotations)
|
||||
"""
|
||||
annotation_map = [None] * len(text)
|
||||
for anno_ix, a in enumerate(annotations):
|
||||
for i in range(a.start, a.end):
|
||||
annotation_map[i] = anno_ix
|
||||
return annotation_map
|
||||
|
||||
@staticmethod
|
||||
def __make_char_states(text: str, encoding: Encoding, annotations: AnnotationList) -> List[CharState]:
|
||||
"""
|
||||
For each character in the original text, we emit a tuple representing its "state":
|
||||
|
||||
* which token_ix it corresponds to
|
||||
* which word_ix it corresponds to
|
||||
* which annotation_ix it corresponds to
|
||||
|
||||
Args:
|
||||
text (:obj:`str`):
|
||||
The raw text we want to align to
|
||||
|
||||
annotations (:obj:`List[Annotation]`):
|
||||
A (possibly empty) list of annotations
|
||||
|
||||
encoding: (:class:`~tokenizers.Encoding`):
|
||||
The encoding returned from the tokenizer
|
||||
|
||||
Returns:
|
||||
:obj:`List[CharState]`: A list of CharStates, indicating for each char in the text what
|
||||
its state is
|
||||
"""
|
||||
annotation_map = EncodingVisualizer.__make_anno_map(text, annotations)
|
||||
# Todo make this a dataclass or named tuple
|
||||
char_states: List[CharState] = [CharState(char_ix) for char_ix in range(len(text))]
|
||||
for token_ix, token in enumerate(encoding.tokens):
|
||||
offsets = encoding.token_to_chars(token_ix)
|
||||
if offsets is not None:
|
||||
start, end = offsets
|
||||
for i in range(start, end):
|
||||
char_states[i].tokens.append(token_ix)
|
||||
for char_ix, anno_ix in enumerate(annotation_map):
|
||||
char_states[char_ix].anno_ix = anno_ix
|
||||
|
||||
return char_states
|
||||
|
||||
|
||||
def HTMLBody(children: List[str], css_styles=css) -> str:
|
||||
"""
|
||||
Generates the full html with css from a list of html spans
|
||||
|
||||
Args:
|
||||
children (:obj:`List[str]`):
|
||||
A list of strings, assumed to be html elements
|
||||
|
||||
css_styles (:obj:`str`, `optional`):
|
||||
Optional alternative implementation of the css
|
||||
|
||||
Returns:
|
||||
:obj:`str`: An HTML string with style markup
|
||||
"""
|
||||
children_text = "".join(children)
|
||||
return f"""
|
||||
<html>
|
||||
<head>
|
||||
<style>
|
||||
{css_styles}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="tokenized-text" dir=auto>
|
||||
{children_text}
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
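# Illustrative sketch: typical use of the EncodingVisualizer defined above. Any
# Tokenizer works; `from_pretrained` fetches one from the Hugging Face Hub (network
# access assumed), and the annotation span/label below are made up for the example.
from tokenizers import Tokenizer
from tokenizers.tools import Annotation, EncodingVisualizer

tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
visualizer = EncodingVisualizer(tokenizer, default_to_notebook=False)
annotations = [Annotation(start=0, end=5, label="PERSON")]  # covers "Maria"
html = visualizer("Maria moved to Lisbon last year.", annotations=annotations)
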
@@ -0,0 +1,8 @@
|
||||
# Generated content DO NOT EDIT
|
||||
from .. import trainers
|
||||
|
||||
Trainer = trainers.Trainer
|
||||
BpeTrainer = trainers.BpeTrainer
|
||||
UnigramTrainer = trainers.UnigramTrainer
|
||||
WordLevelTrainer = trainers.WordLevelTrainer
|
||||
WordPieceTrainer = trainers.WordPieceTrainer
|
||||
@@ -0,0 +1,156 @@
|
||||
# Generated content DO NOT EDIT
|
||||
class Trainer:
|
||||
"""
|
||||
Base class for all trainers
|
||||
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of a
|
||||
Trainer will return an instance of this class when instantiated.
|
||||
"""
|
||||
|
||||
class BpeTrainer(Trainer):
|
||||
"""
|
||||
Trainer capable of training a BPE model
|
||||
|
||||
Args:
|
||||
vocab_size (:obj:`int`, `optional`):
|
||||
The size of the final vocabulary, including all tokens and alphabet.
|
||||
|
||||
min_frequency (:obj:`int`, `optional`):
|
||||
The minimum frequency a pair should have in order to be merged.
|
||||
|
||||
show_progress (:obj:`bool`, `optional`):
|
||||
Whether to show progress bars while training.
|
||||
|
||||
special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
|
||||
A list of special tokens the model should know of.
|
||||
|
||||
limit_alphabet (:obj:`int`, `optional`):
|
||||
The maximum number of different characters to keep in the alphabet.
|
||||
|
||||
initial_alphabet (:obj:`List[str]`, `optional`):
|
||||
A list of characters to include in the initial alphabet, even
|
||||
if not seen in the training dataset.
|
||||
If the strings contain more than one character, only the first one
|
||||
is kept.
|
||||
|
||||
continuing_subword_prefix (:obj:`str`, `optional`):
|
||||
A prefix to be used for every subword that is not a beginning-of-word.
|
||||
|
||||
end_of_word_suffix (:obj:`str`, `optional`):
|
||||
A suffix to be used for every subword that is an end-of-word.
|
||||
|
||||
max_token_length (:obj:`int`, `optional`):
|
||||
Prevents creating tokens longer than the specified size.
|
||||
This can help to reduce the pollution of your vocabulary with
highly repetitive tokens like `======` for Wikipedia
|
||||
|
||||
"""
|
||||
|
||||
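# Illustrative sketch: training a small BPE model with the trainer described above.
# The two in-memory sentences stand in for a real corpus.
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(vocab_size=5000, min_frequency=2, special_tokens=["[UNK]"])
tokenizer.train_from_iterator(["first example sentence", "second example sentence"], trainer)
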
class UnigramTrainer(Trainer):
|
||||
"""
|
||||
Trainer capable of training a Unigram model
|
||||
|
||||
Args:
|
||||
vocab_size (:obj:`int`):
|
||||
The size of the final vocabulary, including all tokens and alphabet.
|
||||
|
||||
show_progress (:obj:`bool`):
|
||||
Whether to show progress bars while training.
|
||||
|
||||
special_tokens (:obj:`List[Union[str, AddedToken]]`):
|
||||
A list of special tokens the model should know of.
|
||||
|
||||
initial_alphabet (:obj:`List[str]`):
|
||||
A list of characters to include in the initial alphabet, even
|
||||
if not seen in the training dataset.
|
||||
If the strings contain more than one character, only the first one
|
||||
is kept.
|
||||
|
||||
shrinking_factor (:obj:`float`):
|
||||
The shrinking factor used at each step of the training to prune the
|
||||
vocabulary.
|
||||
|
||||
unk_token (:obj:`str`):
|
||||
The token used for out-of-vocabulary tokens.
|
||||
|
||||
max_piece_length (:obj:`int`):
|
||||
The maximum length of a given token.
|
||||
|
||||
n_sub_iterations (:obj:`int`):
|
||||
The number of iterations of the EM algorithm to perform before
|
||||
pruning the vocabulary.
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=8000,
|
||||
show_progress=True,
|
||||
special_tokens=[],
|
||||
shrinking_factor=0.75,
|
||||
unk_token=None,
|
||||
max_piece_length=16,
|
||||
n_sub_iterations=2,
|
||||
):
|
||||
pass
|
||||
|
||||
class WordLevelTrainer(Trainer):
|
||||
"""
|
||||
Trainer capable of training a WordLevel model
|
||||
|
||||
Args:
|
||||
vocab_size (:obj:`int`, `optional`):
|
||||
The size of the final vocabulary, including all tokens and alphabet.
|
||||
|
||||
min_frequency (:obj:`int`, `optional`):
|
||||
The minimum frequency a pair should have in order to be merged.
|
||||
|
||||
show_progress (:obj:`bool`, `optional`):
|
||||
Whether to show progress bars while training.
|
||||
|
||||
special_tokens (:obj:`List[Union[str, AddedToken]]`):
|
||||
A list of special tokens the model should know of.
|
||||
"""
|
||||
|
||||
class WordPieceTrainer(Trainer):
|
||||
"""
|
||||
Trainer capable of training a WordPiece model
|
||||
|
||||
Args:
|
||||
vocab_size (:obj:`int`, `optional`):
|
||||
The size of the final vocabulary, including all tokens and alphabet.
|
||||
|
||||
min_frequency (:obj:`int`, `optional`):
|
||||
The minimum frequency a pair should have in order to be merged.
|
||||
|
||||
show_progress (:obj:`bool`, `optional`):
|
||||
Whether to show progress bars while training.
|
||||
|
||||
special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
|
||||
A list of special tokens the model should know of.
|
||||
|
||||
limit_alphabet (:obj:`int`, `optional`):
|
||||
The maximum number of different characters to keep in the alphabet.
|
||||
|
||||
initial_alphabet (:obj:`List[str]`, `optional`):
|
||||
A list of characters to include in the initial alphabet, even
|
||||
if not seen in the training dataset.
|
||||
If the strings contain more than one character, only the first one
|
||||
is kept.
|
||||
|
||||
continuing_subword_prefix (:obj:`str`, `optional`):
|
||||
A prefix to be used for every subword that is not a beginning-of-word.
|
||||
|
||||
end_of_word_suffix (:obj:`str`, `optional`):
|
||||
A suffix to be used for every subword that is an end-of-word.
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=30000,
|
||||
min_frequency=0,
|
||||
show_progress=True,
|
||||
special_tokens=[],
|
||||
limit_alphabet=None,
|
||||
initial_alphabet=[],
|
||||
continuing_subword_prefix="##",
|
||||
end_of_word_suffix=None,
|
||||
):
|
||||
pass
|
||||
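# Illustrative sketch: training a WordPiece model with the trainer described above.
# "corpus.txt" is a placeholder path to a plain-text training file.
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer

tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
trainer = WordPieceTrainer(
    vocab_size=30000,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
tokenizer.train(["corpus.txt"], trainer)
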
Binary file not shown.