from keras_hub.src.api_export import keras_hub_export
from keras_hub.src.models.distil_bert.distil_bert_backbone import (
DistilBertBackbone,
)
from keras_hub.src.tokenizers.word_piece_tokenizer import WordPieceTokenizer


@keras_hub_export(
[
"keras_hub.tokenizers.DistilBertTokenizer",
"keras_hub.models.DistilBertTokenizer",
]
)
class DistilBertTokenizer(WordPieceTokenizer):
"""A DistilBERT tokenizer using WordPiece subword segmentation.
This tokenizer class will tokenize raw strings into integer sequences and
is based on `keras_hub.tokenizers.WordPieceTokenizer`. Unlike the
underlying tokenizer, it will check for all special tokens needed by
DistilBERT models and provides a `from_preset()` method to automatically
download a matching vocabulary for a DistilBERT preset.
If input is a batch of strings (rank > 0), the layer will output a
`tf.RaggedTensor` where the last dimension of the output is ragged.
If input is a scalar string (rank == 0), the layer will output a dense
`tf.Tensor` with static shape `[None]`.
Args:
vocabulary: A list of strings or a string filename path. If
passing a list, each element of the list should be a single word
piece token string. If passing a filename, the file should be a
plain text file containing a single word piece token per line.
lowercase: If `True`, the input text will be first lowered before
tokenization.
special_tokens_in_strings: bool. A bool to indicate if the tokenizer
should expect special tokens in input strings that should be
tokenized and mapped correctly to their ids. Defaults to False.
Examples:
```python
# Unbatched input.
tokenizer = keras_hub.models.DistilBertTokenizer.from_preset(
"distil_bert_base_en_uncased",
)
tokenizer("The quick brown fox jumped.")
# Batched input.
tokenizer(["The quick brown fox jumped.", "The fox slept."])
# Detokenization.
tokenizer.detokenize(tokenizer("The quick brown fox jumped."))
# Custom vocabulary.
vocab = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
vocab += ["The", "quick", "brown", "fox", "jumped", "."]
tokenizer = keras_hub.models.DistilBertTokenizer(vocabulary=vocab)
tokenizer("The quick brown fox jumped.")
```
"""
backbone_cls = DistilBertBackbone
def __init__(
self,
vocabulary,
lowercase=False,
**kwargs,
):
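        # Register DistilBERT's reserved tokens before building the base
        # tokenizer so that `WordPieceTokenizer` tracks them as special
        # tokens.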
self._add_special_token("[CLS]", "cls_token")
self._add_special_token("[SEP]", "sep_token")
self._add_special_token("[PAD]", "pad_token")
self._add_special_token("[MASK]", "mask_token")
# Also add `tokenizer.start_token` and `tokenizer.end_token` for
# compatibility with other tokenizers.
self._add_special_token("[CLS]", "start_token")
self._add_special_token("[SEP]", "end_token")
super().__init__(
vocabulary=vocabulary,
lowercase=lowercase,
**kwargs,
)
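

# A minimal, illustrative usage sketch (not part of the library API): it
# assumes the toy vocabulary below and demonstrates the
# `special_tokens_in_strings` option described in the class docstring.
# Guarded so importing this module stays side-effect free.
if __name__ == "__main__":
    vocab = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    vocab += ["the", "quick", "brown", "fox", "."]

    # With `special_tokens_in_strings=True`, literal "[CLS]" / "[SEP]"
    # markers appearing in the input string are mapped to their reserved
    # ids instead of being treated as plain text.
    tokenizer = DistilBertTokenizer(
        vocabulary=vocab,
        special_tokens_in_strings=True,
    )
    print(tokenizer("[CLS] the quick brown fox . [SEP]"))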