Remove the use of SentencePieceTrainer from tests #1283

Merged (15 commits) on Oct 26, 2023
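Summary of the change: each SentencePiece-based test previously trained a throwaway SentencePiece model in setUp(); the tests now load small pre-generated protos checked into keras_nlp/tests/test_data, with regeneration scripts added under tools/sentencepiece_testing. A condensed before/after sketch of the pattern, using the ALBERT tokenizer as the example (the BeforeThisPR/AfterThisPR class names are illustrative only, not part of the PR):

import io
import os

import sentencepiece

from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer
from keras_nlp.tests.test_case import TestCase


class BeforeThisPR(TestCase):
    def setUp(self):
        # Old pattern: train a tiny WORD-level model on the fly in every setUp().
        bytes_io = io.BytesIO()
        sentencepiece.SentencePieceTrainer.train(
            sentence_iterator=iter(["the quick brown fox", "the earth is round"]),
            model_writer=bytes_io,
            vocab_size=12,
            model_type="WORD",
            pad_id=0,
            unk_id=1,
            bos_id=2,
            eos_id=3,
            pad_piece="<pad>",
            unk_piece="<unk>",
            bos_piece="[CLS]",
            eos_piece="[SEP]",
            user_defined_symbols="[MASK]",
        )
        self.tokenizer = AlbertTokenizer(proto=bytes_io.getvalue())


class AfterThisPR(TestCase):
    def setUp(self):
        # New pattern: load a proto generated once by
        # tools/sentencepiece_testing/create_albert_test_proto.py.
        self.tokenizer = AlbertTokenizer(
            proto=os.path.join(self.get_test_data_dir(), "albert_test_vocab.spm")
        )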
29 changes: 8 additions & 21 deletions keras_nlp/models/albert/albert_classifier_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.albert.albert_backbone import AlbertBackbone
from keras_nlp.models.albert.albert_classifier import AlbertClassifier
@@ -27,26 +26,14 @@
class AlbertClassifierTest(TestCase):
def setUp(self):
# Setup model.
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
)
self.preprocessor = AlbertPreprocessor(
AlbertTokenizer(proto=bytes_io.getvalue()),
sequence_length=5,
AlbertTokenizer(
# Generated using create_albert_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "albert_test_vocab.spm"
),
sequence_length=5,
)
)
self.backbone = AlbertBackbone(
vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
25 changes: 6 additions & 19 deletions keras_nlp/models/albert/albert_masked_lm_preprocessor_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.albert.albert_masked_lm_preprocessor import (
AlbertMaskedLMPreprocessor,
@@ -26,24 +25,12 @@

class AlbertMaskedLMPreprocessorTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
self.tokenizer = AlbertTokenizer(
# Generated using create_albert_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "albert_test_vocab.spm"
)
)
self.tokenizer = AlbertTokenizer(proto=bytes_io.getvalue())
self.init_kwargs = {
"tokenizer": self.tokenizer,
# Simplify our testing by masking every available token.
28 changes: 8 additions & 20 deletions keras_nlp/models/albert/albert_masked_lm_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.albert.albert_backbone import AlbertBackbone
from keras_nlp.models.albert.albert_masked_lm import AlbertMaskedLM
@@ -29,25 +28,14 @@
class AlbertMaskedLMTest(TestCase):
def setUp(self):
# Setup model.
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
)
self.preprocessor = AlbertMaskedLMPreprocessor(
AlbertTokenizer(proto=bytes_io.getvalue()),
AlbertTokenizer(
# Generated using create_albert_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "albert_test_vocab.spm"
),
sequence_length=5,
),
# Simplify our testing by masking every available token.
mask_selection_rate=1.0,
mask_token_rate=1.0,
25 changes: 6 additions & 19 deletions keras_nlp/models/albert/albert_preprocessor_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.albert.albert_preprocessor import AlbertPreprocessor
from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer
@@ -24,24 +23,12 @@

class AlbertPreprocessorTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
self.tokenizer = AlbertTokenizer(
# Generated using create_albert_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "albert_test_vocab.spm"
)
)
self.tokenizer = AlbertTokenizer(proto=bytes_io.getvalue())
self.init_kwargs = {
"tokenizer": self.tokenizer,
"sequence_length": 8,
43 changes: 13 additions & 30 deletions keras_nlp/models/albert/albert_tokenizer_test.py
@@ -12,35 +12,22 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer
from keras_nlp.tests.test_case import TestCase


class AlbertTokenizerTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
)
self.init_kwargs = {"proto": bytes_io.getvalue()}
self.init_kwargs = {
# Generated using create_albert_test_proto.py
"proto": os.path.join(
self.get_test_data_dir(), "albert_test_vocab.spm"
)
}
self.input_data = ["the quick brown fox.", "the earth is round."]

def test_tokenizer_basics(self):
@@ -52,17 +39,13 @@ def test_tokenizer_basics(self):
)

def test_errors_missing_special_tokens(self):
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(["abc"]),
model_writer=bytes_io,
vocab_size=5,
pad_id=-1,
eos_id=-1,
bos_id=-1,
)
with self.assertRaises(ValueError):
AlbertTokenizer(proto=bytes_io.getvalue())
AlbertTokenizer(
# Generated using create_no_special_token_proto.py
proto=os.path.join(
self.get_test_data_dir(), "no_special_token_vocab.spm"
)
)

@pytest.mark.large
def test_smallest_preset(self):
27 changes: 7 additions & 20 deletions keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone
from keras_nlp.models.deberta_v3.deberta_v3_classifier import (
@@ -31,25 +30,13 @@
class DebertaV3ClassifierTest(TestCase):
def setUp(self):
# Setup model.
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
bos_id=1,
eos_id=2,
unk_id=3,
pad_piece="[PAD]",
bos_piece="[CLS]",
eos_piece="[SEP]",
unk_piece="[UNK]",
user_defined_symbols="[MASK]",
)
self.preprocessor = DebertaV3Preprocessor(
DebertaV3Tokenizer(proto=bytes_io.getvalue()),
DebertaV3Tokenizer(
# Generated using create_deberta_v3_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "deberta_v3_test_vocab.spm"
)
),
sequence_length=5,
)
self.backbone = DebertaV3Backbone(
keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.deberta_v3.deberta_v3_masked_lm_preprocessor import (
DebertaV3MaskedLMPreprocessor,
@@ -26,24 +25,12 @@

class DebertaV3MaskedLMPreprocessorTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
bos_id=1,
eos_id=2,
unk_id=3,
pad_piece="[PAD]",
bos_piece="[CLS]",
eos_piece="[SEP]",
unk_piece="[UNK]",
user_defined_symbols="[MASK]",
self.tokenizer = DebertaV3Tokenizer(
# Generated using create_deberta_v3_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "deberta_v3_test_vocab.spm"
)
)
self.tokenizer = DebertaV3Tokenizer(proto=bytes_io.getvalue())
self.init_kwargs = {
"tokenizer": self.tokenizer,
# Simplify our testing by masking every available token.
27 changes: 7 additions & 20 deletions keras_nlp/models/deberta_v3/deberta_v3_masked_lm_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone
from keras_nlp.models.deberta_v3.deberta_v3_masked_lm import DebertaV3MaskedLM
@@ -29,25 +28,13 @@
class DebertaV3MaskedLMTest(TestCase):
def setUp(self):
# Setup model.
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
bos_id=1,
eos_id=2,
unk_id=3,
pad_piece="[PAD]",
bos_piece="[CLS]",
eos_piece="[SEP]",
unk_piece="[UNK]",
user_defined_symbols="[MASK]",
)
self.preprocessor = DebertaV3MaskedLMPreprocessor(
DebertaV3Tokenizer(proto=bytes_io.getvalue()),
DebertaV3Tokenizer(
# Generated using create_deberta_v3_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "deberta_v3_test_vocab.spm"
)
),
# Simplify our testing by masking every available token.
mask_selection_rate=1.0,
mask_token_rate=1.0,
25 changes: 6 additions & 19 deletions keras_nlp/models/deberta_v3/deberta_v3_preprocessor_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.deberta_v3.deberta_v3_preprocessor import (
DebertaV3Preprocessor,
@@ -26,24 +25,12 @@

class DebertaV3PreprocessorTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
bos_id=1,
eos_id=2,
unk_id=3,
pad_piece="[PAD]",
bos_piece="[CLS]",
eos_piece="[SEP]",
unk_piece="[UNK]",
user_defined_symbols="[MASK]",
self.tokenizer = DebertaV3Tokenizer(
# Generated using create_deberta_v3_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "deberta_v3_test_vocab.spm"
)
)
self.tokenizer = DebertaV3Tokenizer(proto=bytes_io.getvalue())
self.init_kwargs = {
"tokenizer": self.tokenizer,
"sequence_length": 8,
51 changes: 17 additions & 34 deletions keras_nlp/models/deberta_v3/deberta_v3_tokenizer_test.py
@@ -12,64 +12,47 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.deberta_v3.deberta_v3_tokenizer import DebertaV3Tokenizer
from keras_nlp.tests.test_case import TestCase


class DebertaV3TokenizerTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=11,
model_type="WORD",
pad_id=0,
bos_id=1,
eos_id=2,
unk_id=3,
pad_piece="[PAD]",
bos_piece="[CLS]",
eos_piece="[SEP]",
unk_piece="[UNK]",
# Generated using create_deberta_v3_test_proto.py
proto = os.path.join(
self.get_test_data_dir(), "deberta_v3_test_vocab.spm"
)
self.tokenizer = DebertaV3Tokenizer(proto=bytes_io.getvalue())
self.init_kwargs = {"proto": bytes_io.getvalue()}
self.tokenizer = DebertaV3Tokenizer(proto=proto)
self.init_kwargs = {"proto": proto}
self.input_data = ["the quick brown fox.", "the earth is round."]

def test_tokenizer_basics(self):
self.run_preprocessing_layer_test(
cls=DebertaV3Tokenizer,
init_kwargs=self.init_kwargs,
input_data=self.input_data,
expected_output=[[4, 9, 5, 3], [4, 6, 8, 3]],
expected_output=[[5, 10, 6, 3], [5, 7, 9, 3]],
)

def test_errors_missing_special_tokens(self):
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(["abc"]),
model_writer=bytes_io,
vocab_size=5,
pad_id=-1,
eos_id=-1,
bos_id=-1,
)
with self.assertRaises(ValueError):
DebertaV3Tokenizer(proto=bytes_io.getvalue())
DebertaV3Tokenizer(
# Generated using create_no_special_token_proto.py
proto=os.path.join(
self.get_test_data_dir(), "no_special_token_vocab.spm"
)
)

def test_mask_token_handling(self):
tokenizer = DebertaV3Tokenizer(**self.init_kwargs)
self.assertEqual(tokenizer.get_vocabulary()[11], "[MASK]")
self.assertEqual(tokenizer.id_to_token(11), "[MASK]")
self.assertEqual(tokenizer.token_to_id("[MASK]"), 11)
input_data = [[4, 9, 5, 7, self.tokenizer.mask_token_id]]
self.assertEqual(tokenizer.get_vocabulary()[4], "[MASK]")
self.assertEqual(tokenizer.id_to_token(4), "[MASK]")
self.assertEqual(tokenizer.token_to_id("[MASK]"), 4)
input_data = [[5, 10, 6, 8, self.tokenizer.mask_token_id]]
output = tokenizer.detokenize(input_data)
self.assertEqual(output, ["the quick brown fox"])

27 changes: 7 additions & 20 deletions keras_nlp/models/f_net/f_net_classifier_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.f_net.f_net_backbone import FNetBackbone
from keras_nlp.models.f_net.f_net_classifier import FNetClassifier
@@ -27,25 +26,13 @@
class FNetClassifierTest(TestCase):
def setUp(self):
# Setup model.
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
)
self.preprocessor = FNetPreprocessor(
FNetTokenizer(proto=bytes_io.getvalue()),
FNetTokenizer(
# Generated using create_f_net_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "f_net_test_vocab.spm"
)
),
sequence_length=5,
)
self.backbone = FNetBackbone(
23 changes: 4 additions & 19 deletions keras_nlp/models/f_net/f_net_masked_lm_preprocessor_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.f_net.f_net_masked_lm_preprocessor import (
FNetMaskedLMPreprocessor,
@@ -26,24 +25,10 @@

class FNetMaskedLMPreprocessorTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
self.tokenizer = FNetTokenizer(
# Generated using create_f_net_test_proto.py
proto=os.path.join(self.get_test_data_dir(), "f_net_test_vocab.spm")
)
self.tokenizer = FNetTokenizer(proto=bytes_io.getvalue())
self.init_kwargs = {
"tokenizer": self.tokenizer,
# Simplify our testing by masking every available token.
27 changes: 7 additions & 20 deletions keras_nlp/models/f_net/f_net_masked_lm_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.f_net.f_net_backbone import FNetBackbone
from keras_nlp.models.f_net.f_net_masked_lm import FNetMaskedLM
@@ -29,25 +28,13 @@
class FNetMaskedLMTest(TestCase):
def setUp(self):
# Setup model.
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
)
self.preprocessor = FNetMaskedLMPreprocessor(
FNetTokenizer(proto=bytes_io.getvalue()),
FNetTokenizer(
# Generated using create_f_net_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "f_net_test_vocab.spm"
)
),
# Simplify our testing by masking every available token.
mask_selection_rate=1.0,
mask_token_rate=1.0,
23 changes: 4 additions & 19 deletions keras_nlp/models/f_net/f_net_preprocessor_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.f_net.f_net_preprocessor import FNetPreprocessor
from keras_nlp.models.f_net.f_net_tokenizer import FNetTokenizer
@@ -24,24 +23,10 @@

class FNetPreprocessorTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
self.tokenizer = FNetTokenizer(
# Generated using create_f_net_test_proto.py
proto=os.path.join(self.get_test_data_dir(), "f_net_test_vocab.spm")
)
self.tokenizer = FNetTokenizer(proto=bytes_io.getvalue())
self.init_kwargs = {
"tokenizer": self.tokenizer,
"sequence_length": 8,
43 changes: 13 additions & 30 deletions keras_nlp/models/f_net/f_net_tokenizer_test.py
@@ -12,35 +12,22 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.f_net.f_net_tokenizer import FNetTokenizer
from keras_nlp.tests.test_case import TestCase


class FNetTokenizerTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
)
self.init_kwargs = {"proto": bytes_io.getvalue()}
self.init_kwargs = {
# Generated using create_f_net_test_proto.py
"proto": os.path.join(
self.get_test_data_dir(), "f_net_test_vocab.spm"
)
}
self.input_data = ["the quick brown fox.", "the earth is round."]

def test_tokenizer_basics(self):
@@ -52,17 +39,13 @@ def test_tokenizer_basics(self):
)

def test_errors_missing_special_tokens(self):
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(["abc"]),
model_writer=bytes_io,
vocab_size=5,
pad_id=-1,
eos_id=-1,
bos_id=-1,
)
with self.assertRaises(ValueError):
FNetTokenizer(proto=bytes_io.getvalue())
FNetTokenizer(
# Generated using create_no_special_token_proto.py
proto=os.path.join(
self.get_test_data_dir(), "no_special_token_vocab.spm"
)
)

@pytest.mark.large
def test_smallest_preset(self):
43 changes: 11 additions & 32 deletions keras_nlp/models/t5/t5_tokenizer_test.py
@@ -12,37 +12,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece
import tensorflow as tf

from keras_nlp.models.t5.t5_tokenizer import T5Tokenizer
from keras_nlp.tests.test_case import TestCase


class T5TokenizerTest(TestCase):
def setUp(self):
bytes_io = io.BytesIO()
vocab_data = tf.data.Dataset.from_tensor_slices(
["the quick brown fox", "the earth is round"]
)
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=vocab_data.as_numpy_iterator(),
model_writer=bytes_io,
vocab_size=11,
model_type="WORD",
bos_id=-1,
pad_id=0,
eos_id=1,
unk_id=2,
pad_piece="<pad>",
eos_piece="</s>",
unk_piece="<unk>",
user_defined_symbols="[MASK]",
)
self.init_kwargs = {"proto": bytes_io.getvalue()}
self.init_kwargs = {
# Generated using create_t5_test_proto.py
"proto": os.path.join(self.get_test_data_dir(), "t5_test_vocab.spm")
}
self.input_data = ["the quick brown fox.", "the earth is round."]

def test_tokenizer_basics(self):
@@ -54,17 +37,13 @@ def test_tokenizer_basics(self):
)

def test_errors_missing_special_tokens(self):
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(["abc"]),
model_writer=bytes_io,
vocab_size=5,
pad_id=-1,
eos_id=-1,
bos_id=-1,
)
with self.assertRaises(ValueError):
T5Tokenizer(proto=bytes_io.getvalue())
T5Tokenizer(
# Generated using create_no_special_token_proto.py
proto=os.path.join(
self.get_test_data_dir(), "no_special_token_vocab.spm"
)
)

@pytest.mark.large
def test_smallest_preset(self):
21 changes: 7 additions & 14 deletions keras_nlp/models/xlm_roberta/xlm_roberta_classifier_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.xlm_roberta.xlm_roberta_backbone import XLMRobertaBackbone
from keras_nlp.models.xlm_roberta.xlm_roberta_classifier import (
@@ -33,19 +32,13 @@
class XLMRobertaClassifierTest(TestCase):
def setUp(self):
# Setup model.
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=10,
model_type="WORD",
unk_id=0,
bos_id=1,
eos_id=2,
)
self.preprocessor = XLMRobertaPreprocessor(
XLMRobertaTokenizer(proto=bytes_io.getvalue()),
XLMRobertaTokenizer(
# Generated using create_xlm_roberta_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "xlm_roberta_test_vocab.spm"
)
),
sequence_length=5,
)
self.backbone = XLMRobertaBackbone(
keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.xlm_roberta.xlm_roberta_masked_lm_preprocessor import (
XLMRobertaMaskedLMPreprocessor,
@@ -28,19 +27,12 @@

class XLMRobertaMaskedLMPreprocessorTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=11,
model_type="WORD",
unk_id=0,
bos_id=1,
eos_id=2,
user_defined_symbols="[MASK]",
self.tokenizer = XLMRobertaTokenizer(
# Generated using create_xlm_roberta_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "xlm_roberta_test_vocab.spm"
)
)
self.tokenizer = XLMRobertaTokenizer(proto=bytes_io.getvalue())
self.init_kwargs = {
"tokenizer": self.tokenizer,
# Simplify our testing by masking every available token.
@@ -59,11 +51,11 @@ def test_preprocessor_basics(self):
input_data=self.input_data,
expected_output=(
{
"token_ids": [[0, 12, 12, 12, 12, 2, 1, 1, 1, 1, 1, 1]],
"token_ids": [[0, 13, 13, 13, 13, 2, 1, 1, 1, 1, 1, 1]],
"padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]],
"mask_positions": [[1, 2, 3, 4]],
},
[[5, 10, 6, 8]],
[[6, 11, 7, 9]],
[[1.0, 1.0, 1.0, 1.0]],
),
)
@@ -80,7 +72,7 @@ def test_no_masking_zero_rate(self):
no_mask_preprocessor(input_data),
(
{
"token_ids": [[0, 5, 10, 6, 8, 2, 1, 1, 1, 1, 1, 1]],
"token_ids": [[0, 6, 11, 7, 9, 2, 1, 1, 1, 1, 1, 1]],
"padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]],
"mask_positions": [[0, 0, 0, 0]],
},
24 changes: 8 additions & 16 deletions keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.xlm_roberta.xlm_roberta_backbone import XLMRobertaBackbone
from keras_nlp.models.xlm_roberta.xlm_roberta_masked_lm import (
@@ -33,20 +32,13 @@
class XLMRobertaMaskedLMTest(TestCase):
def setUp(self):
# Setup model.
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=11,
model_type="WORD",
unk_id=0,
bos_id=1,
eos_id=2,
user_defined_symbols="[MASK]",
)
self.preprocessor = XLMRobertaMaskedLMPreprocessor(
XLMRobertaTokenizer(proto=bytes_io.getvalue()),
XLMRobertaTokenizer(
# Generated using create_xlm_roberta_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "xlm_roberta_test_vocab.spm"
)
),
# Simplify our testing by masking every available token.
mask_selection_rate=1.0,
mask_token_rate=1.0,
@@ -76,7 +68,7 @@ def test_masked_lm_basics(self):
cls=XLMRobertaMaskedLM,
init_kwargs=self.init_kwargs,
train_data=self.train_data,
expected_output_shape=(2, 5, 13),
expected_output_shape=(2, 5, 14),
)

@pytest.mark.large
25 changes: 6 additions & 19 deletions keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor import (
XLMRobertaPreprocessor,
@@ -28,24 +27,12 @@

class XLMRobertaPreprocessorTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
self.tokenizer = XLMRobertaTokenizer(
# Generated using create_xlm_roberta_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "xlm_roberta_test_vocab.spm"
)
)
self.tokenizer = XLMRobertaTokenizer(proto=bytes_io.getvalue())
self.init_kwargs = {
"tokenizer": self.tokenizer,
"sequence_length": 8,
24 changes: 8 additions & 16 deletions keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.xlm_roberta.xlm_roberta_tokenizer import (
XLMRobertaTokenizer,
@@ -25,27 +24,20 @@

class XLMRobertaTokenizerTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=11,
model_type="WORD",
unk_id=0,
bos_id=1,
eos_id=2,
user_defined_symbols="[MASK]",
)
self.init_kwargs = {"proto": bytes_io.getvalue()}
self.init_kwargs = {
# Generated using create_xlm_roberta_test_proto.py
"proto": os.path.join(
self.get_test_data_dir(), "xlm_roberta_test_vocab.spm"
)
}
self.input_data = ["the quick brown fox.", "the earth is round."]

def test_tokenizer_basics(self):
self.run_preprocessing_layer_test(
cls=XLMRobertaTokenizer,
init_kwargs=self.init_kwargs,
input_data=self.input_data,
expected_output=[[5, 10, 6, 3], [5, 7, 9, 3]],
expected_output=[[6, 11, 7, 2], [6, 8, 10, 2]],
)

@pytest.mark.large
4 changes: 4 additions & 0 deletions keras_nlp/tests/test_case.py
@@ -14,6 +14,7 @@

import json
import os
import pathlib
import re

import tensorflow as tf
@@ -417,3 +418,6 @@ def compare(actual, expected):
self.assertAllClose(actual, expected, atol=0.01, rtol=0.01)

tree.map_structure(compare, output, expected_partial_output)

def get_test_data_dir(self):
return str(pathlib.Path(__file__).parent / "test_data")
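The helper added above resolves paths relative to keras_nlp/tests, so the test files shown earlier can locate the checked-in assets regardless of the working directory. A minimal usage sketch (inside any TestCase subclass, with AlbertTokenizer imported as in the tests above):

# Assets are addressed relative to the test package, e.g.:
proto = os.path.join(self.get_test_data_dir(), "albert_test_vocab.spm")
tokenizer = AlbertTokenizer(proto=proto)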
Binary file added keras_nlp/tests/test_data/albert_test_vocab.spm
Binary file added keras_nlp/tests/test_data/deberta_v3_test_vocab.spm
Binary file added keras_nlp/tests/test_data/f_net_test_vocab.spm
Binary file added keras_nlp/tests/test_data/no_special_token_vocab.spm
Binary file added keras_nlp/tests/test_data/t5_test_vocab.spm
Binary file added keras_nlp/tests/test_data/tokenizer_test_vocab.spm
Binary file added keras_nlp/tests/test_data/xlm_roberta_test_vocab.spm
26 changes: 7 additions & 19 deletions keras_nlp/tokenizers/sentence_piece_tokenizer_test.py
@@ -12,10 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import sentencepiece
import tensorflow as tf

from keras_nlp.tests.test_case import TestCase
@@ -25,17 +23,9 @@
class SentencePieceTokenizerTest(TestCase):
def setUp(self):
super().setUp()
bytes_io = io.BytesIO()
vocab_data = tf.data.Dataset.from_tensor_slices(
["the quick brown fox."]
self.proto = os.path.join(
self.get_test_data_dir(), "tokenizer_test_vocab.spm"
)
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=vocab_data.as_numpy_iterator(),
model_writer=bytes_io,
vocab_size=7,
model_type="WORD",
)
self.proto = bytes_io.getvalue()

def test_tokenize(self):
input_data = ["the quick brown fox."]
@@ -112,15 +102,13 @@ def test_error_id_out_of_vocabulary(self):
with self.assertRaises(ValueError):
tokenizer.id_to_token(-1)

def test_from_file(self):
filepath = os.path.join(self.get_temp_dir(), "model.txt")
input_data = ["the quick brown fox."]
with tf.io.gfile.GFile(filepath, "wb") as file:
file.write(self.proto)
def test_from_bytes(self):
with tf.io.gfile.GFile(self.proto, "rb") as file:
proto = file.read()
tokenizer = SentencePieceTokenizer(
proto=filepath,
proto=proto,
)
output_data = tokenizer(input_data)
output_data = tokenizer(["the quick brown fox."])
self.assertAllEqual(output_data, [[6, 5, 3, 4]])

def test_tokenize_then_batch(self):
13 changes: 13 additions & 0 deletions tools/sentencepiece_testing/__init__.py
@@ -0,0 +1,13 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
37 changes: 37 additions & 0 deletions tools/sentencepiece_testing/create_albert_test_proto.py
@@ -0,0 +1,37 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from tools.sentencepiece_testing.utils import train_sentencepiece


def main():
train_sentencepiece(
["the quick brown fox", "the earth is round"],
"albert_test_vocab.spm",
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
)


if __name__ == "__main__":
main()
37 changes: 37 additions & 0 deletions tools/sentencepiece_testing/create_deberta_v3_test_proto.py
@@ -0,0 +1,37 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from tools.sentencepiece_testing.utils import train_sentencepiece


def main():
train_sentencepiece(
["the quick brown fox", "the earth is round"],
"deberta_v3_test_vocab.spm",
vocab_size=12,
model_type="WORD",
pad_id=0,
bos_id=1,
eos_id=2,
unk_id=3,
pad_piece="[PAD]",
bos_piece="[CLS]",
eos_piece="[SEP]",
unk_piece="[UNK]",
user_defined_symbols="[MASK]",
)


if __name__ == "__main__":
main()
37 changes: 37 additions & 0 deletions tools/sentencepiece_testing/create_f_net_test_proto.py
@@ -0,0 +1,37 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from tools.sentencepiece_testing.utils import train_sentencepiece


def main():
train_sentencepiece(
["the quick brown fox", "the earth is round"],
"f_net_test_vocab.spm",
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
)


if __name__ == "__main__":
main()
30 changes: 30 additions & 0 deletions tools/sentencepiece_testing/create_no_special_token_proto.py
@@ -0,0 +1,30 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from tools.sentencepiece_testing.utils import train_sentencepiece


def main():
train_sentencepiece(
["abc"],
"no_special_token_vocab.spm",
vocab_size=5,
pad_id=-1,
eos_id=-1,
bos_id=-1,
)


if __name__ == "__main__":
main()
@@ -0,0 +1,28 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from tools.sentencepiece_testing.utils import train_sentencepiece


def main():
train_sentencepiece(
["the quick brown fox."],
"tokenizer_test_vocab.spm",
vocab_size=7,
model_type="WORD",
)


if __name__ == "__main__":
main()
36 changes: 36 additions & 0 deletions tools/sentencepiece_testing/create_t5_test_proto.py
@@ -0,0 +1,36 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from tools.sentencepiece_testing.utils import train_sentencepiece


def main():
train_sentencepiece(
["the quick brown fox", "the earth is round"],
"t5_test_vocab.spm",
vocab_size=11,
model_type="WORD",
bos_id=-1,
pad_id=0,
eos_id=1,
unk_id=2,
pad_piece="<pad>",
eos_piece="</s>",
unk_piece="<unk>",
user_defined_symbols="[MASK]",
)


if __name__ == "__main__":
main()
37 changes: 37 additions & 0 deletions tools/sentencepiece_testing/create_xlm_roberta_test_proto.py
@@ -0,0 +1,37 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from tools.sentencepiece_testing.utils import train_sentencepiece


def main():
train_sentencepiece(
["the quick brown fox", "the earth is round"],
"xlm_roberta_test_vocab.spm",
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
)


if __name__ == "__main__":
main()
33 changes: 33 additions & 0 deletions tools/sentencepiece_testing/utils.py
@@ -0,0 +1,33 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import pathlib

import sentencepiece


def train_sentencepiece(data, filename, *args, **kwargs):
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(data), model_writer=bytes_io, *args, **kwargs
)
with open(
pathlib.Path(__file__).parent.parent.parent
/ "keras_nlp"
/ "tests"
/ "test_data"
/ filename,
mode="wb",
) as f:
f.write(bytes_io.getbuffer())
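For reference, a hedged sketch of how the new assets are regenerated: each create_*_test_proto.py script above calls train_sentencepiece, which trains in memory and writes the named file into keras_nlp/tests/test_data. The module invocation below is an assumption about how the scripts are meant to be run (from the repository root, so the tools package is importable), and the output filename is hypothetical rather than one of the checked-in assets.

# Regenerate an asset (assumed invocation, from the repository root):
#   python -m tools.sentencepiece_testing.create_albert_test_proto
#
# Or call the helper directly; this writes
# keras_nlp/tests/test_data/example_test_vocab.spm (hypothetical filename).
from tools.sentencepiece_testing.utils import train_sentencepiece

train_sentencepiece(
    ["the quick brown fox", "the earth is round"],
    "example_test_vocab.spm",
    vocab_size=12,
    model_type="WORD",
)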