Skip to content

Vocabulary

minnt.Vocabulary

A class for managing mapping between strings and indices.

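The vocabulary is initialized with a list of strings, and additionally can contain two special tokens:

  • a padding token minnt.Vocabulary.PAD_TOKEN, which, if present, is always at index minnt.Vocabulary.PAD=0;
  • an unknown token minnt.Vocabulary.UNK_TOKEN, which, if present, is at index minnt.Vocabulary.UNK, equal to either 0 or 1 (depending on whether the padding token is present); the index of this token is returned when looking up a string not present in the vocabulary.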

Info

A Vocabulary instance can be pickled and unpickled efficiently as a list of strings; the required string-to-index mapping is reconstructed upon unpickling.

Source code in minnt/vocabulary.py
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
class Vocabulary:
    """A class for managing mapping between strings and indices.

    The vocabulary is initialized with a list of strings, and additionally can contain
    two special tokens:

    - a padding token [minnt.Vocabulary.PAD_TOKEN][], which, if present, is always at index
      [minnt.Vocabulary.PAD][]=0;
    - an unknown token [minnt.Vocabulary.UNK_TOKEN][], which, if present, is either at index
      [minnt.Vocabulary.UNK][] 0 or 1 (depending on whether the padding token is present);
      the index of this token is returned when looking up a string not present in the vocabulary.

    Info:
      A `Vocabulary` instance can be pickled and unpickled efficiently as a list of strings;
      the required string-to-index mapping is reconstructed upon unpickling.
    """
    PAD: int | None
    """The index of the padding token, either `None` or `0`."""
    PAD_TOKEN: str = "[PAD]"
    """The string representing the padding token."""

    UNK: int | None
    """The index of the unknown token, either `None`, `0`, or `1`."""
    UNK_TOKEN: str = "[UNK]"
    """The string representing the unknown token."""

    def __init__(self, strings: Iterable[str], add_pad: bool = False, add_unk: bool = False) -> None:
        """Initialize the vocabulary with the given list of strings.

        The strings might be prepended with special tokens for padding and unknown tokens, respectively,
        depending on the values of `add_pad` and `add_unk`.

        Note:
          If the given strings already contain special tokens on expected indices, they are recognized
          correctly and no duplicates are added even if `add_pad` and/or `add_unk` are `True`.

        Parameters:
          strings: An iterable of strings to include in the vocabulary.
          add_pad: Whether to add a padding token [minnt.Vocabulary.PAD_TOKEN][] at index 0
            and set [minnt.Vocabulary.PAD][]=0.
          add_unk: Whether to add an unknown token [minnt.Vocabulary.UNK_TOKEN][] at index 0 or 1
            (depending on whether the padding token is added) and set [minnt.Vocabulary.UNK][] accordingly.
        """
        # Get the first two strings (if any) to check for special tokens.
        it, head = iter(strings), []
        try:
            head.append(next(it))
            head.append(next(it))
        except StopIteration:
            it = None

        # Start by adding the special tokens.
        self._strings = []

        if add_pad or (head and head[0] == self.PAD_TOKEN):  # Add PAD token if required or present.
            self.PAD = 0
            self._strings.append(self.PAD_TOKEN)
            (head and head[0] == self.PAD_TOKEN) and head.pop(0)
        else:
            self.PAD = None

        if add_unk or (head and head[0] == self.UNK_TOKEN):  # Add UNK token if required or present.
            self.UNK = len(self._strings)
            (head and head[0] == self.UNK_TOKEN) or self._strings.append(self.UNK_TOKEN)
        else:
            self.UNK = None

        # Now add the remaining strings, both from `head` and from `it`.
        self._strings.extend(head)
        it is not None and self._strings.extend(it)

        self._string_map = {string: index for index, string in enumerate(self._strings)}

    def __len__(self) -> int:
        """The number of strings in the vocabulary.

        Returns:
          The size of the vocabulary.
        """
        return len(self._strings)

    def __iter__(self) -> Iterable[str]:
        """Return an iterator over strings in the vocabulary.

        Returns:
          An iterator over strings in the vocabulary.
        """
        return iter(self._strings)

    def __getstate__(self) -> dict:
        state = self.__dict__.copy()
        del state["_string_map"]
        return state

    def __setstate__(self, state: dict) -> None:
        self.__dict__.update(state)
        self._string_map = {string: index for index, string in enumerate(self._strings)}

    def add(self, string: str) -> int:
        """If not already present, add the given string to the end of the vocabulary.

        Parameters:
          string: The string to add.

        Returns:
          The index of the newly added string (or the index of the existing string if it was already present).
        """
        index = self._string_map.get(string)
        if index is None:
            index = len(self._strings)
            self._strings.append(string)
            self._string_map[string] = index
        return index

    def string(self, index: int) -> str:
        """Convert vocabulary index to string.

        Parameters:
          index: The vocabulary index.

        Returns:
          The string corresponding to the given index.
        """
        return self._strings[index]

    def strings(self, indices: Iterable[int]) -> list[str]:
        """Convert a sequence of vocabulary indices to strings.

        Parameters:
          indices: An iterable of vocabulary indices.

        Returns:
          A list of strings corresponding to the given indices.
        """
        return [self._strings[index] for index in indices]

    def index(self, string: str, add_missing: bool = False) -> int | None:
        """Convert string to vocabulary index.

        Parameters:
          string: The string to convert.
          add_missing: Whether to add the string to the vocabulary if not present.

        Returns:
          The index corresponding to the given string. If the string is not found in the vocabulary, then

            - if `add_missing` is `True`, the string is added to the end of the vocabulary and its index returned;
            - if the [minnt.Vocabulary.UNK_TOKEN][] was added to the vocabulary, its index is returned;
            - otherwise, `None` is returned.
        """
        if add_missing:
            return self.add(string)
        else:
            return self._string_map.get(string, self.UNK)

    def indices(self, strings: Iterable[str], add_missing: bool = False) -> list[int | None]:
        """Convert a sequence of strings to vocabulary indices.

        Parameters:
          strings: An iterable of strings to convert.
            add_missing: Whether to add strings not present in the vocabulary.

        Returns:
          A list of indices corresponding to the given strings. For each string not found in the vocabulary:

            - if `add_missing` is `True`, the string is added to the end of the vocabulary and its index returned;
            - if the [minnt.Vocabulary.UNK_TOKEN][] was added to the vocabulary, its index is returned;
            - otherwise, `None` is returned.
        """
        if add_missing:
            return [self.add(string) for string in strings]
        else:
            return [self._string_map.get(string, self.UNK) for string in strings]

PAD instance-attribute

PAD: int | None

The index of the padding token, either None or 0.

PAD_TOKEN class-attribute instance-attribute

PAD_TOKEN: str = '[PAD]'

The string representing the padding token.

UNK instance-attribute

UNK: int | None

The index of the unknown token, either None, 0, or 1.

UNK_TOKEN class-attribute instance-attribute

UNK_TOKEN: str = '[UNK]'

The string representing the unknown token.

__init__

__init__(
    strings: Iterable[str], add_pad: bool = False, add_unk: bool = False
) -> None

Initialize the vocabulary with the given list of strings.

The strings might be prepended with special tokens for padding and unknown tokens, respectively, depending on the values of add_pad and add_unk.

Note

If the given strings already contain special tokens on expected indices, they are recognized correctly and no duplicates are added even if add_pad and/or add_unk are True.

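Parameters:

  • strings (Iterable[str]) –

    An iterable of strings to include in the vocabulary.

  • add_pad (bool, default: False ) –

    Whether to add a padding token minnt.Vocabulary.PAD_TOKEN at index 0 and set minnt.Vocabulary.PAD=0.

  • add_unk (bool, default: False ) –

    Whether to add an unknown token minnt.Vocabulary.UNK_TOKEN at index 0 or 1 (depending on whether the padding token is added) and set minnt.Vocabulary.UNK accordingly.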

Source code in minnt/vocabulary.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
def __init__(self, strings: Iterable[str], add_pad: bool = False, add_unk: bool = False) -> None:
    """Initialize the vocabulary with the given list of strings.

    The strings might be prepended with special tokens for padding and unknown tokens, respectively,
    depending on the values of `add_pad` and `add_unk`.

    Note:
      If the given strings already contain special tokens on expected indices, they are recognized
      correctly and no duplicates are added even if `add_pad` and/or `add_unk` are `True`.

    Parameters:
      strings: An iterable of strings to include in the vocabulary.
      add_pad: Whether to add a padding token [minnt.Vocabulary.PAD_TOKEN][] at index 0
        and set [minnt.Vocabulary.PAD][]=0.
      add_unk: Whether to add an unknown token [minnt.Vocabulary.UNK_TOKEN][] at index 0 or 1
        (depending on whether the padding token is added) and set [minnt.Vocabulary.UNK][] accordingly.
    """
    # Peek at the first two strings (if any) to check for special tokens; the iterable
    # is consumed through a single iterator so that one-shot iterables are supported.
    it, head = iter(strings), []
    try:
        head.append(next(it))
        head.append(next(it))
    except StopIteration:
        it = None  # Fewer than two strings were available; nothing remains to be read.

    # Start by adding the special tokens.
    self._strings = []

    starts_with_pad = bool(head) and head[0] == self.PAD_TOKEN
    if add_pad or starts_with_pad:  # Add PAD token if required or present.
        self.PAD = 0
        self._strings.append(self.PAD_TOKEN)
        if starts_with_pad:
            head.pop(0)  # Already stored above; drop it so it is not added twice.
    else:
        self.PAD = None

    # Checked only after a leading PAD token has been removed from `head`.
    starts_with_unk = bool(head) and head[0] == self.UNK_TOKEN
    if add_unk or starts_with_unk:  # Add UNK token if required or present.
        self.UNK = len(self._strings)
        if not starts_with_unk:
            # When the UNK token leads `head`, it is stored by the `extend` below instead.
            self._strings.append(self.UNK_TOKEN)
    else:
        self.UNK = None

    # Now add the remaining strings, both from `head` and from `it`.
    self._strings.extend(head)
    if it is not None:
        self._strings.extend(it)

    # If a string occurs several times, the mapping keeps the index of its last occurrence.
    self._string_map = {string: index for index, string in enumerate(self._strings)}

__len__

__len__() -> int

The number of strings in the vocabulary.

Returns:

  • int

    The size of the vocabulary.

Source code in minnt/vocabulary.py
82
83
84
85
86
87
88
def __len__(self) -> int:
    """Report how many strings the vocabulary currently holds.

    Returns:
      The number of stored strings.
    """
    size = len(self._strings)
    return size

__iter__

__iter__() -> Iterable[str]

Return an iterator over strings in the vocabulary.

Returns:

  • Iterable[str]

    An iterator over strings in the vocabulary.

Source code in minnt/vocabulary.py
90
91
92
93
94
95
96
def __iter__(self) -> Iterable[str]:
    """Iterate over the vocabulary strings in index order.

    Returns:
      An iterator yielding every stored string, starting at index 0.
    """
    stored_strings = self._strings
    return iter(stored_strings)

add

add(string: str) -> int

If not already present, add the given string to the end of the vocabulary.

Parameters:

  • string (str) –

    The string to add.

Returns:

  • int

    The index of the newly added string (or the index of the existing string if it was already present).

Source code in minnt/vocabulary.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def add(self, string: str) -> int:
    """Append the given string to the vocabulary unless it is already stored.

    Parameters:
      string: The string to add.

    Returns:
      The index of the string: the existing one when it was already present,
      otherwise the index of the freshly appended entry.
    """
    known_index = self._string_map.get(string)
    if known_index is not None:
        return known_index

    # The new entry goes to the end, so its index equals the current size.
    new_index = len(self._strings)
    self._strings.append(string)
    self._string_map[string] = new_index
    return new_index

string

string(index: int) -> str

Convert vocabulary index to string.

Parameters:

  • index (int) –

    The vocabulary index.

Returns:

  • str

    The string corresponding to the given index.

Source code in minnt/vocabulary.py
123
124
125
126
127
128
129
130
131
132
def string(self, index: int) -> str:
    """Look up the string stored at the given vocabulary index.

    Parameters:
      index: The vocabulary index.

    Returns:
      The string stored at that index.
    """
    entry = self._strings[index]
    return entry

strings

strings(indices: Iterable[int]) -> list[str]

Convert a sequence of vocabulary indices to strings.

Parameters:

  • indices (Iterable[int]) –

    An iterable of vocabulary indices.

Returns:

  • list[str]

    A list of strings corresponding to the given indices.

Source code in minnt/vocabulary.py
134
135
136
137
138
139
140
141
142
143
def strings(self, indices: Iterable[int]) -> list[str]:
    """Look up the strings stored at the given vocabulary indices.

    Parameters:
      indices: An iterable of vocabulary indices.

    Returns:
      A list with one string per given index, in the same order.
    """
    table = self._strings
    return [table[position] for position in indices]

index

index(string: str, add_missing: bool = False) -> int | None

Convert string to vocabulary index.

Parameters:

  • string (str) –

    The string to convert.

  • add_missing (bool, default: False ) –

    Whether to add the string to the vocabulary if not present.

Returns:

  • int | None

    The index corresponding to the given string. If the string is not found in the vocabulary, then

    • if add_missing is True, the string is added to the end of the vocabulary and its index returned;
    • if the minnt.Vocabulary.UNK_TOKEN was added to the vocabulary, its index is returned;
    • otherwise, None is returned.
Source code in minnt/vocabulary.py
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def index(self, string: str, add_missing: bool = False) -> int | None:
    """Convert string to vocabulary index.

    Parameters:
      string: The string to convert.
      add_missing: Whether to add the string to the vocabulary if not present.

    Returns:
      The index corresponding to the given string. If the string is not found in the vocabulary, then

        - if `add_missing` is `True`, the string is added to the end of the vocabulary and its index returned;
        - if the [minnt.Vocabulary.UNK_TOKEN][] was added to the vocabulary, its index is returned;
        - otherwise, `None` is returned.
    """
    if add_missing:
        return self.add(string)
    else:
        return self._string_map.get(string, self.UNK)

indices

indices(strings: Iterable[str], add_missing: bool = False) -> list[int | None]

Convert a sequence of strings to vocabulary indices.

Parameters:

  • strings (Iterable[str]) –

    An iterable of strings to convert.

  • add_missing (bool, default: False ) –

    Whether to add strings not present in the vocabulary.

Returns:

  • list[int | None]

    A list of indices corresponding to the given strings. For each string not found in the vocabulary:

    • if add_missing is True, the string is added to the end of the vocabulary and its index returned;
    • if the minnt.Vocabulary.UNK_TOKEN was added to the vocabulary, its index is returned;
    • otherwise, None is returned.
Source code in minnt/vocabulary.py
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
def indices(self, strings: Iterable[str], add_missing: bool = False) -> list[int | None]:
    """Convert a sequence of strings to vocabulary indices.

    Parameters:
      strings: An iterable of strings to convert.
        add_missing: Whether to add strings not present in the vocabulary.

    Returns:
      A list of indices corresponding to the given strings. For each string not found in the vocabulary:

        - if `add_missing` is `True`, the string is added to the end of the vocabulary and its index returned;
        - if the [minnt.Vocabulary.UNK_TOKEN][] was added to the vocabulary, its index is returned;
        - otherwise, `None` is returned.
    """
    if add_missing:
        return [self.add(string) for string in strings]
    else:
        return [self._string_map.get(string, self.UNK) for string in strings]