Source code for fairseq2.data.tokenizers.tokenizer

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from __future__ import annotations

from abc import ABC, abstractmethod
from collections.abc import Sequence

from torch import Tensor

from fairseq2.data.tokenizers.vocab_info import VocabularyInfo
from fairseq2.device import Device



[docs]
class Tokenizer(ABC):
    """Represents a tokenizer to encode and decode text."""


[docs]
    @abstractmethod
    def create_encoder(
        self,
        *,
        task: str | None = None,
        lang: str | None = None,
        mode: str | None = None,
        device: Device | None = None,
        pin_memory: bool = False,
    ) -> TokenEncoder:
        """Constructs a token encoder.

        The valid arguments for the ``task``, ``lang``, and ``mode`` parameters
        are implementation specific. Refer to concrete ``Tokenizer``
        subclasses for more information.

        :param task:
            The task for which to generate token indices. Typically, ``task`` is
            used to distinguish between different tasks such as 'translation' or
            'transcription'.
        :param lang:
            The language of generated token indices. Typically, multilingual
            translation tasks use ``lang`` to distinguish between different
            languages such as 'en-US' or 'de-DE'.
        :param mode:
            The mode in which to generate token indices. Typically, translation
            tasks use ``mode`` to distinguish between different modes such as
            'source' or 'target'.
        :param device:
            The device on which to construct tensors.
        :param pin_memory:
            If ``True``, uses pinned memory while constructing tensors.
        """



[docs]
    @abstractmethod
    def create_raw_encoder(
        self, *, device: Device | None = None, pin_memory: bool = False
    ) -> TokenEncoder:
        """Constructs a raw token encoder with no control symbols.

        :param device: The device on which to construct tensors.
        :param pin_memory: If ``True``, uses pinned memory for tensors.
        """



[docs]
    @abstractmethod
    def create_decoder(self, *, skip_special_tokens: bool = False) -> TokenDecoder:
        """Constructs a token decoder."""


    @property
    @abstractmethod
    def vocab_info(self) -> VocabularyInfo:
        """The vocabulary information associated with the tokenizer."""




[docs]
class TokenEncoder(ABC):
    """Encodes text into tokens or token indices."""

    @abstractmethod
    def __call__(self, text: str) -> Tensor:
        """
        :param text: The text to encode.
        """


[docs]
    @abstractmethod
    def encode_as_tokens(self, text: str) -> list[str]:
        """
        :param text: The text to encode.
        """


    @property
    @abstractmethod
    def prefix_indices(self) -> Tensor | None:
        """
        Gets the indices of the prefix tokens. *Shape:* :math:`(S)`, where
        :math:`S` is the number of indices.
        """

    @property
    @abstractmethod
    def suffix_indices(self) -> Tensor | None:
        """
        Gets the indices of the suffix tokens. *Shape:* :math:`(S)`, where
        :math:`S` is the number of indices.
        """




[docs]
class TokenDecoder(ABC):
    """Decodes text from tokens or token indices."""

    @abstractmethod
    def __call__(self, token_indices: Tensor) -> str: ...


[docs]
    @abstractmethod
    def decode_from_tokens(self, tokens: Sequence[str]) -> str: ...