# Tokenizers
The tokenizer has multiple concrete implementations for different tokenization algorithms.
The main Tokenizer
interface defines the contract for creating encoders and decoders, while concrete implementations
handle specific tokenization methods like SentencePiece and tiktoken.
```mermaid
classDiagram
    class Tokenizer {
        <<abstract>>
        +create_encoder(task, lang, mode, device)*
        +create_raw_encoder(device)*
        +create_decoder(skip_special_tokens)*
        +vocab_info: VocabularyInfo*
    }
    class BasicSentencePieceTokenizer {
        -_model: SentencePieceModel
        -_vocab_info: VocabularyInfo
        +create_encoder(task, lang, mode, device)
        +create_raw_encoder(device)
        +create_decoder(skip_special_tokens)
    }
    class RawSentencePieceTokenizer {
        -_model: SentencePieceModel
        -_vocab_info: VocabularyInfo
        +create_encoder(task, lang, mode, device)
        +create_raw_encoder(device)
        +create_decoder(skip_special_tokens)
    }
    class TiktokenTokenizer {
        -_model: TiktokenModel
        -_vocab_info: VocabularyInfo
        +create_encoder(task, lang, mode, device)
        +create_raw_encoder(device)
        +create_decoder(skip_special_tokens)
    }
    class CharTokenizer {
        -_vocab_info: VocabularyInfo
        +create_encoder(task, lang, mode, device)
        +create_raw_encoder(device)
        +create_decoder(skip_special_tokens)
    }
    Tokenizer <|-- BasicSentencePieceTokenizer
    Tokenizer <|-- RawSentencePieceTokenizer
    Tokenizer <|-- TiktokenTokenizer
    Tokenizer <|-- CharTokenizer
```