Remove the use of SentencePieceTrainer from tests #1283

Merged (15 commits) on Oct 26, 2023
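Summary of the change: each SentencePiece-based test previously trained a throwaway SentencePiece model in setUp(); the tests now load small pre-generated protos checked into keras_nlp/tests/test_data, with regeneration scripts added under tools/sentencepiece_testing. A condensed before/after sketch of the pattern, using the ALBERT tokenizer as the example (the BeforeThisPR/AfterThisPR class names are illustrative only, not part of the PR):

import io
import os

import sentencepiece

from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer
from keras_nlp.tests.test_case import TestCase


class BeforeThisPR(TestCase):
    def setUp(self):
        # Old pattern: train a tiny WORD-level model on the fly in every setUp().
        bytes_io = io.BytesIO()
        sentencepiece.SentencePieceTrainer.train(
            sentence_iterator=iter(["the quick brown fox", "the earth is round"]),
            model_writer=bytes_io,
            vocab_size=12,
            model_type="WORD",
            pad_id=0,
            unk_id=1,
            bos_id=2,
            eos_id=3,
            pad_piece="<pad>",
            unk_piece="<unk>",
            bos_piece="[CLS]",
            eos_piece="[SEP]",
            user_defined_symbols="[MASK]",
        )
        self.tokenizer = AlbertTokenizer(proto=bytes_io.getvalue())


class AfterThisPR(TestCase):
    def setUp(self):
        # New pattern: load a proto generated once by
        # tools/sentencepiece_testing/create_albert_test_proto.py.
        self.tokenizer = AlbertTokenizer(
            proto=os.path.join(self.get_test_data_dir(), "albert_test_vocab.spm")
        )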
29 changes: 8 additions & 21 deletions keras_nlp/models/albert/albert_classifier_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.albert.albert_backbone import AlbertBackbone
from keras_nlp.models.albert.albert_classifier import AlbertClassifier
@@ -27,26 +26,14 @@
class AlbertClassifierTest(TestCase):
def setUp(self):
# Setup model.
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
)
self.preprocessor = AlbertPreprocessor(
AlbertTokenizer(proto=bytes_io.getvalue()),
sequence_length=5,
AlbertTokenizer(
# Generated using create_albert_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "albert_test_vocab.spm"
),
sequence_length=5,
)
)
self.backbone = AlbertBackbone(
vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
25 changes: 6 additions & 19 deletions keras_nlp/models/albert/albert_masked_lm_preprocessor_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.albert.albert_masked_lm_preprocessor import (
AlbertMaskedLMPreprocessor,
@@ -26,24 +25,12 @@

class AlbertMaskedLMPreprocessorTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
self.tokenizer = AlbertTokenizer(
# Generated using create_albert_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "albert_test_vocab.spm"
)
)
self.tokenizer = AlbertTokenizer(proto=bytes_io.getvalue())
self.init_kwargs = {
"tokenizer": self.tokenizer,
# Simplify our testing by masking every available token.
28 changes: 8 additions & 20 deletions keras_nlp/models/albert/albert_masked_lm_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.albert.albert_backbone import AlbertBackbone
from keras_nlp.models.albert.albert_masked_lm import AlbertMaskedLM
@@ -29,25 +28,14 @@
class AlbertMaskedLMTest(TestCase):
def setUp(self):
# Setup model.
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
)
self.preprocessor = AlbertMaskedLMPreprocessor(
AlbertTokenizer(proto=bytes_io.getvalue()),
AlbertTokenizer(
# Generated using create_albert_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "albert_test_vocab.spm"
),
sequence_length=5,
),
# Simplify our testing by masking every available token.
mask_selection_rate=1.0,
mask_token_rate=1.0,
25 changes: 6 additions & 19 deletions keras_nlp/models/albert/albert_preprocessor_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.albert.albert_preprocessor import AlbertPreprocessor
from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer
@@ -24,24 +23,12 @@

class AlbertPreprocessorTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
self.tokenizer = AlbertTokenizer(
# Generated using create_albert_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "albert_test_vocab.spm"
)
)
self.tokenizer = AlbertTokenizer(proto=bytes_io.getvalue())
self.init_kwargs = {
"tokenizer": self.tokenizer,
"sequence_length": 8,
43 changes: 13 additions & 30 deletions keras_nlp/models/albert/albert_tokenizer_test.py
@@ -12,35 +12,22 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer
from keras_nlp.tests.test_case import TestCase


class AlbertTokenizerTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
)
self.init_kwargs = {"proto": bytes_io.getvalue()}
self.init_kwargs = {
# Generated using create_albert_test_proto.py
"proto": os.path.join(
self.get_test_data_dir(), "albert_test_vocab.spm"
)
}
self.input_data = ["the quick brown fox.", "the earth is round."]

def test_tokenizer_basics(self):
@@ -52,17 +39,13 @@ def test_tokenizer_basics(self):
)

def test_errors_missing_special_tokens(self):
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(["abc"]),
model_writer=bytes_io,
vocab_size=5,
pad_id=-1,
eos_id=-1,
bos_id=-1,
)
with self.assertRaises(ValueError):
AlbertTokenizer(proto=bytes_io.getvalue())
AlbertTokenizer(
# Generated using create_no_special_token_proto.py
proto=os.path.join(
self.get_test_data_dir(), "no_special_token_vocab.spm"
)
)

@pytest.mark.large
def test_smallest_preset(self):
27 changes: 7 additions & 20 deletions keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone
from keras_nlp.models.deberta_v3.deberta_v3_classifier import (
@@ -31,25 +30,13 @@
class DebertaV3ClassifierTest(TestCase):
def setUp(self):
# Setup model.
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
bos_id=1,
eos_id=2,
unk_id=3,
pad_piece="[PAD]",
bos_piece="[CLS]",
eos_piece="[SEP]",
unk_piece="[UNK]",
user_defined_symbols="[MASK]",
)
self.preprocessor = DebertaV3Preprocessor(
DebertaV3Tokenizer(proto=bytes_io.getvalue()),
DebertaV3Tokenizer(
# Generated using create_deberta_v3_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "deberta_v3_test_vocab.spm"
)
),
sequence_length=5,
)
self.backbone = DebertaV3Backbone(
keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.deberta_v3.deberta_v3_masked_lm_preprocessor import (
DebertaV3MaskedLMPreprocessor,
@@ -26,24 +25,12 @@

class DebertaV3MaskedLMPreprocessorTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
bos_id=1,
eos_id=2,
unk_id=3,
pad_piece="[PAD]",
bos_piece="[CLS]",
eos_piece="[SEP]",
unk_piece="[UNK]",
user_defined_symbols="[MASK]",
self.tokenizer = DebertaV3Tokenizer(
# Generated using create_deberta_v3_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "deberta_v3_test_vocab.spm"
)
)
self.tokenizer = DebertaV3Tokenizer(proto=bytes_io.getvalue())
self.init_kwargs = {
"tokenizer": self.tokenizer,
# Simplify our testing by masking every available token.
27 changes: 7 additions & 20 deletions keras_nlp/models/deberta_v3/deberta_v3_masked_lm_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone
from keras_nlp.models.deberta_v3.deberta_v3_masked_lm import DebertaV3MaskedLM
@@ -29,25 +28,13 @@
class DebertaV3MaskedLMTest(TestCase):
def setUp(self):
# Setup model.
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
bos_id=1,
eos_id=2,
unk_id=3,
pad_piece="[PAD]",
bos_piece="[CLS]",
eos_piece="[SEP]",
unk_piece="[UNK]",
user_defined_symbols="[MASK]",
)
self.preprocessor = DebertaV3MaskedLMPreprocessor(
DebertaV3Tokenizer(proto=bytes_io.getvalue()),
DebertaV3Tokenizer(
# Generated using create_deberta_v3_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "deberta_v3_test_vocab.spm"
)
),
# Simplify our testing by masking every available token.
mask_selection_rate=1.0,
mask_token_rate=1.0,
25 changes: 6 additions & 19 deletions keras_nlp/models/deberta_v3/deberta_v3_preprocessor_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.deberta_v3.deberta_v3_preprocessor import (
DebertaV3Preprocessor,
@@ -26,24 +25,12 @@

class DebertaV3PreprocessorTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
bos_id=1,
eos_id=2,
unk_id=3,
pad_piece="[PAD]",
bos_piece="[CLS]",
eos_piece="[SEP]",
unk_piece="[UNK]",
user_defined_symbols="[MASK]",
self.tokenizer = DebertaV3Tokenizer(
# Generated using create_deberta_v3_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "deberta_v3_test_vocab.spm"
)
)
self.tokenizer = DebertaV3Tokenizer(proto=bytes_io.getvalue())
self.init_kwargs = {
"tokenizer": self.tokenizer,
"sequence_length": 8,
51 changes: 17 additions & 34 deletions keras_nlp/models/deberta_v3/deberta_v3_tokenizer_test.py
@@ -12,64 +12,47 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.deberta_v3.deberta_v3_tokenizer import DebertaV3Tokenizer
from keras_nlp.tests.test_case import TestCase


class DebertaV3TokenizerTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=11,
model_type="WORD",
pad_id=0,
bos_id=1,
eos_id=2,
unk_id=3,
pad_piece="[PAD]",
bos_piece="[CLS]",
eos_piece="[SEP]",
unk_piece="[UNK]",
# Generated using create_deberta_v3_test_proto.py
proto = os.path.join(
self.get_test_data_dir(), "deberta_v3_test_vocab.spm"
)
self.tokenizer = DebertaV3Tokenizer(proto=bytes_io.getvalue())
self.init_kwargs = {"proto": bytes_io.getvalue()}
self.tokenizer = DebertaV3Tokenizer(proto=proto)
self.init_kwargs = {"proto": proto}
self.input_data = ["the quick brown fox.", "the earth is round."]

def test_tokenizer_basics(self):
self.run_preprocessing_layer_test(
cls=DebertaV3Tokenizer,
init_kwargs=self.init_kwargs,
input_data=self.input_data,
expected_output=[[4, 9, 5, 3], [4, 6, 8, 3]],
expected_output=[[5, 10, 6, 3], [5, 7, 9, 3]],
)

def test_errors_missing_special_tokens(self):
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(["abc"]),
model_writer=bytes_io,
vocab_size=5,
pad_id=-1,
eos_id=-1,
bos_id=-1,
)
with self.assertRaises(ValueError):
DebertaV3Tokenizer(proto=bytes_io.getvalue())
DebertaV3Tokenizer(
# Generated using create_no_special_token_proto.py
proto=os.path.join(
self.get_test_data_dir(), "no_special_token_vocab.spm"
)
)

def test_mask_token_handling(self):
tokenizer = DebertaV3Tokenizer(**self.init_kwargs)
self.assertEqual(tokenizer.get_vocabulary()[11], "[MASK]")
self.assertEqual(tokenizer.id_to_token(11), "[MASK]")
self.assertEqual(tokenizer.token_to_id("[MASK]"), 11)
input_data = [[4, 9, 5, 7, self.tokenizer.mask_token_id]]
self.assertEqual(tokenizer.get_vocabulary()[4], "[MASK]")
self.assertEqual(tokenizer.id_to_token(4), "[MASK]")
self.assertEqual(tokenizer.token_to_id("[MASK]"), 4)
input_data = [[5, 10, 6, 8, self.tokenizer.mask_token_id]]
output = tokenizer.detokenize(input_data)
self.assertEqual(output, ["the quick brown fox"])

27 changes: 7 additions & 20 deletions keras_nlp/models/f_net/f_net_classifier_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.f_net.f_net_backbone import FNetBackbone
from keras_nlp.models.f_net.f_net_classifier import FNetClassifier
@@ -27,25 +26,13 @@
class FNetClassifierTest(TestCase):
def setUp(self):
# Setup model.
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
)
self.preprocessor = FNetPreprocessor(
FNetTokenizer(proto=bytes_io.getvalue()),
FNetTokenizer(
# Generated using create_f_net_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "f_net_test_vocab.spm"
)
),
sequence_length=5,
)
self.backbone = FNetBackbone(
23 changes: 4 additions & 19 deletions keras_nlp/models/f_net/f_net_masked_lm_preprocessor_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.f_net.f_net_masked_lm_preprocessor import (
FNetMaskedLMPreprocessor,
@@ -26,24 +25,10 @@

class FNetMaskedLMPreprocessorTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
self.tokenizer = FNetTokenizer(
# Generated using create_f_net_test_proto.py
proto=os.path.join(self.get_test_data_dir(), "f_net_test_vocab.spm")
)
self.tokenizer = FNetTokenizer(proto=bytes_io.getvalue())
self.init_kwargs = {
"tokenizer": self.tokenizer,
# Simplify our testing by masking every available token.
27 changes: 7 additions & 20 deletions keras_nlp/models/f_net/f_net_masked_lm_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.f_net.f_net_backbone import FNetBackbone
from keras_nlp.models.f_net.f_net_masked_lm import FNetMaskedLM
@@ -29,25 +28,13 @@
class FNetMaskedLMTest(TestCase):
def setUp(self):
# Setup model.
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
)
self.preprocessor = FNetMaskedLMPreprocessor(
FNetTokenizer(proto=bytes_io.getvalue()),
FNetTokenizer(
# Generated using create_f_net_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "f_net_test_vocab.spm"
)
),
# Simplify our testing by masking every available token.
mask_selection_rate=1.0,
mask_token_rate=1.0,
23 changes: 4 additions & 19 deletions keras_nlp/models/f_net/f_net_preprocessor_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.f_net.f_net_preprocessor import FNetPreprocessor
from keras_nlp.models.f_net.f_net_tokenizer import FNetTokenizer
@@ -24,24 +23,10 @@

class FNetPreprocessorTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
self.tokenizer = FNetTokenizer(
# Generated using create_f_net_test_proto.py
proto=os.path.join(self.get_test_data_dir(), "f_net_test_vocab.spm")
)
self.tokenizer = FNetTokenizer(proto=bytes_io.getvalue())
self.init_kwargs = {
"tokenizer": self.tokenizer,
"sequence_length": 8,
43 changes: 13 additions & 30 deletions keras_nlp/models/f_net/f_net_tokenizer_test.py
@@ -12,35 +12,22 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.f_net.f_net_tokenizer import FNetTokenizer
from keras_nlp.tests.test_case import TestCase


class FNetTokenizerTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
)
self.init_kwargs = {"proto": bytes_io.getvalue()}
self.init_kwargs = {
# Generated using create_f_net_test_proto.py
"proto": os.path.join(
self.get_test_data_dir(), "f_net_test_vocab.spm"
)
}
self.input_data = ["the quick brown fox.", "the earth is round."]

def test_tokenizer_basics(self):
@@ -52,17 +39,13 @@ def test_tokenizer_basics(self):
)

def test_errors_missing_special_tokens(self):
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(["abc"]),
model_writer=bytes_io,
vocab_size=5,
pad_id=-1,
eos_id=-1,
bos_id=-1,
)
with self.assertRaises(ValueError):
FNetTokenizer(proto=bytes_io.getvalue())
FNetTokenizer(
# Generated using create_no_special_token_proto.py
proto=os.path.join(
self.get_test_data_dir(), "no_special_token_vocab.spm"
)
)

@pytest.mark.large
def test_smallest_preset(self):
43 changes: 11 additions & 32 deletions keras_nlp/models/t5/t5_tokenizer_test.py
@@ -12,37 +12,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece
import tensorflow as tf

from keras_nlp.models.t5.t5_tokenizer import T5Tokenizer
from keras_nlp.tests.test_case import TestCase


class T5TokenizerTest(TestCase):
def setUp(self):
bytes_io = io.BytesIO()
vocab_data = tf.data.Dataset.from_tensor_slices(
["the quick brown fox", "the earth is round"]
)
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=vocab_data.as_numpy_iterator(),
model_writer=bytes_io,
vocab_size=11,
model_type="WORD",
bos_id=-1,
pad_id=0,
eos_id=1,
unk_id=2,
pad_piece="<pad>",
eos_piece="</s>",
unk_piece="<unk>",
user_defined_symbols="[MASK]",
)
self.init_kwargs = {"proto": bytes_io.getvalue()}
self.init_kwargs = {
# Generated using create_t5_test_proto.py
"proto": os.path.join(self.get_test_data_dir(), "t5_test_vocab.spm")
}
self.input_data = ["the quick brown fox.", "the earth is round."]

def test_tokenizer_basics(self):
@@ -54,17 +37,13 @@ def test_tokenizer_basics(self):
)

def test_errors_missing_special_tokens(self):
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(["abc"]),
model_writer=bytes_io,
vocab_size=5,
pad_id=-1,
eos_id=-1,
bos_id=-1,
)
with self.assertRaises(ValueError):
T5Tokenizer(proto=bytes_io.getvalue())
T5Tokenizer(
# Generated using create_no_special_token_proto.py
proto=os.path.join(
self.get_test_data_dir(), "no_special_token_vocab.spm"
)
)

@pytest.mark.large
def test_smallest_preset(self):
21 changes: 7 additions & 14 deletions keras_nlp/models/xlm_roberta/xlm_roberta_classifier_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.xlm_roberta.xlm_roberta_backbone import XLMRobertaBackbone
from keras_nlp.models.xlm_roberta.xlm_roberta_classifier import (
@@ -33,19 +32,13 @@
class XLMRobertaClassifierTest(TestCase):
def setUp(self):
# Setup model.
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=10,
model_type="WORD",
unk_id=0,
bos_id=1,
eos_id=2,
)
self.preprocessor = XLMRobertaPreprocessor(
XLMRobertaTokenizer(proto=bytes_io.getvalue()),
XLMRobertaTokenizer(
# Generated using create_xlm_roberta_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "xlm_roberta_test_vocab.spm"
)
),
sequence_length=5,
)
self.backbone = XLMRobertaBackbone(
keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.xlm_roberta.xlm_roberta_masked_lm_preprocessor import (
XLMRobertaMaskedLMPreprocessor,
@@ -28,19 +27,12 @@

class XLMRobertaMaskedLMPreprocessorTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=11,
model_type="WORD",
unk_id=0,
bos_id=1,
eos_id=2,
user_defined_symbols="[MASK]",
self.tokenizer = XLMRobertaTokenizer(
# Generated using create_xlm_roberta_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "xlm_roberta_test_vocab.spm"
)
)
self.tokenizer = XLMRobertaTokenizer(proto=bytes_io.getvalue())
self.init_kwargs = {
"tokenizer": self.tokenizer,
# Simplify our testing by masking every available token.
@@ -59,11 +51,11 @@ def test_preprocessor_basics(self):
input_data=self.input_data,
expected_output=(
{
"token_ids": [[0, 12, 12, 12, 12, 2, 1, 1, 1, 1, 1, 1]],
"token_ids": [[0, 13, 13, 13, 13, 2, 1, 1, 1, 1, 1, 1]],
"padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]],
"mask_positions": [[1, 2, 3, 4]],
},
[[5, 10, 6, 8]],
[[6, 11, 7, 9]],
[[1.0, 1.0, 1.0, 1.0]],
),
)
@@ -80,7 +72,7 @@ def test_no_masking_zero_rate(self):
no_mask_preprocessor(input_data),
(
{
"token_ids": [[0, 5, 10, 6, 8, 2, 1, 1, 1, 1, 1, 1]],
"token_ids": [[0, 6, 11, 7, 9, 2, 1, 1, 1, 1, 1, 1]],
"padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]],
"mask_positions": [[0, 0, 0, 0]],
},
24 changes: 8 additions & 16 deletions keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.xlm_roberta.xlm_roberta_backbone import XLMRobertaBackbone
from keras_nlp.models.xlm_roberta.xlm_roberta_masked_lm import (
@@ -33,20 +32,13 @@
class XLMRobertaMaskedLMTest(TestCase):
def setUp(self):
# Setup model.
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=11,
model_type="WORD",
unk_id=0,
bos_id=1,
eos_id=2,
user_defined_symbols="[MASK]",
)
self.preprocessor = XLMRobertaMaskedLMPreprocessor(
XLMRobertaTokenizer(proto=bytes_io.getvalue()),
XLMRobertaTokenizer(
# Generated using create_xlm_roberta_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "xlm_roberta_test_vocab.spm"
)
),
# Simplify our testing by masking every available token.
mask_selection_rate=1.0,
mask_token_rate=1.0,
@@ -76,7 +68,7 @@ def test_masked_lm_basics(self):
cls=XLMRobertaMaskedLM,
init_kwargs=self.init_kwargs,
train_data=self.train_data,
expected_output_shape=(2, 5, 13),
expected_output_shape=(2, 5, 14),
)

@pytest.mark.large
25 changes: 6 additions & 19 deletions keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor import (
XLMRobertaPreprocessor,
@@ -28,24 +27,12 @@

class XLMRobertaPreprocessorTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
self.tokenizer = XLMRobertaTokenizer(
# Generated using create_xlm_roberta_test_proto.py
proto=os.path.join(
self.get_test_data_dir(), "xlm_roberta_test_vocab.spm"
)
)
self.tokenizer = XLMRobertaTokenizer(proto=bytes_io.getvalue())
self.init_kwargs = {
"tokenizer": self.tokenizer,
"sequence_length": 8,
24 changes: 8 additions & 16 deletions keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import pytest
import sentencepiece

from keras_nlp.models.xlm_roberta.xlm_roberta_tokenizer import (
XLMRobertaTokenizer,
@@ -25,27 +24,20 @@

class XLMRobertaTokenizerTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=11,
model_type="WORD",
unk_id=0,
bos_id=1,
eos_id=2,
user_defined_symbols="[MASK]",
)
self.init_kwargs = {"proto": bytes_io.getvalue()}
self.init_kwargs = {
# Generated using create_xlm_roberta_test_proto.py
"proto": os.path.join(
self.get_test_data_dir(), "xlm_roberta_test_vocab.spm"
)
}
self.input_data = ["the quick brown fox.", "the earth is round."]

def test_tokenizer_basics(self):
self.run_preprocessing_layer_test(
cls=XLMRobertaTokenizer,
init_kwargs=self.init_kwargs,
input_data=self.input_data,
expected_output=[[5, 10, 6, 3], [5, 7, 9, 3]],
expected_output=[[6, 11, 7, 2], [6, 8, 10, 2]],
)

@pytest.mark.large
4 changes: 4 additions & 0 deletions keras_nlp/tests/test_case.py
@@ -14,6 +14,7 @@

import json
import os
import pathlib
import re

import tensorflow as tf
@@ -417,3 +418,6 @@ def compare(actual, expected):
self.assertAllClose(actual, expected, atol=0.01, rtol=0.01)

tree.map_structure(compare, output, expected_partial_output)

def get_test_data_dir(self):
return str(pathlib.Path(__file__).parent / "test_data")
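The helper added above resolves paths relative to keras_nlp/tests, so the test files shown earlier can locate the checked-in assets regardless of the working directory. A minimal usage sketch (inside any TestCase subclass, with AlbertTokenizer imported as in the tests above):

# Assets are addressed relative to the test package, e.g.:
proto = os.path.join(self.get_test_data_dir(), "albert_test_vocab.spm")
tokenizer = AlbertTokenizer(proto=proto)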
Binary file added keras_nlp/tests/test_data/albert_test_vocab.spm
Binary file added keras_nlp/tests/test_data/deberta_v3_test_vocab.spm
Binary file added keras_nlp/tests/test_data/f_net_test_vocab.spm
Binary file added keras_nlp/tests/test_data/no_special_token_vocab.spm
Binary file added keras_nlp/tests/test_data/t5_test_vocab.spm
Binary file added keras_nlp/tests/test_data/tokenizer_test_vocab.spm
Binary file added keras_nlp/tests/test_data/xlm_roberta_test_vocab.spm
26 changes: 7 additions & 19 deletions keras_nlp/tokenizers/sentence_piece_tokenizer_test.py
@@ -12,10 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

import sentencepiece
import tensorflow as tf

from keras_nlp.tests.test_case import TestCase
@@ -25,17 +23,9 @@
class SentencePieceTokenizerTest(TestCase):
def setUp(self):
super().setUp()
bytes_io = io.BytesIO()
vocab_data = tf.data.Dataset.from_tensor_slices(
["the quick brown fox."]
self.proto = os.path.join(
self.get_test_data_dir(), "tokenizer_test_vocab.spm"
)
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=vocab_data.as_numpy_iterator(),
model_writer=bytes_io,
vocab_size=7,
model_type="WORD",
)
self.proto = bytes_io.getvalue()

def test_tokenize(self):
input_data = ["the quick brown fox."]
@@ -112,15 +102,13 @@ def test_error_id_out_of_vocabulary(self):
with self.assertRaises(ValueError):
tokenizer.id_to_token(-1)

def test_from_file(self):
filepath = os.path.join(self.get_temp_dir(), "model.txt")
input_data = ["the quick brown fox."]
with tf.io.gfile.GFile(filepath, "wb") as file:
file.write(self.proto)
def test_from_bytes(self):
with tf.io.gfile.GFile(self.proto, "rb") as file:
proto = file.read()
tokenizer = SentencePieceTokenizer(
proto=filepath,
proto=proto,
)
output_data = tokenizer(input_data)
output_data = tokenizer(["the quick brown fox."])
self.assertAllEqual(output_data, [[6, 5, 3, 4]])

def test_tokenize_then_batch(self):
13 changes: 13 additions & 0 deletions tools/sentencepiece_testing/__init__.py
@@ -0,0 +1,13 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
37 changes: 37 additions & 0 deletions tools/sentencepiece_testing/create_albert_test_proto.py
@@ -0,0 +1,37 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from tools.sentencepiece_testing.utils import train_sentencepiece


def main():
train_sentencepiece(
["the quick brown fox", "the earth is round"],
"albert_test_vocab.spm",
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
)


if __name__ == "__main__":
main()
37 changes: 37 additions & 0 deletions tools/sentencepiece_testing/create_deberta_v3_test_proto.py
@@ -0,0 +1,37 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from tools.sentencepiece_testing.utils import train_sentencepiece


def main():
train_sentencepiece(
["the quick brown fox", "the earth is round"],
"deberta_v3_test_vocab.spm",
vocab_size=12,
model_type="WORD",
pad_id=0,
bos_id=1,
eos_id=2,
unk_id=3,
pad_piece="[PAD]",
bos_piece="[CLS]",
eos_piece="[SEP]",
unk_piece="[UNK]",
user_defined_symbols="[MASK]",
)


if __name__ == "__main__":
main()
37 changes: 37 additions & 0 deletions tools/sentencepiece_testing/create_f_net_test_proto.py
@@ -0,0 +1,37 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from tools.sentencepiece_testing.utils import train_sentencepiece


def main():
train_sentencepiece(
["the quick brown fox", "the earth is round"],
"f_net_test_vocab.spm",
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
)


if __name__ == "__main__":
main()
30 changes: 30 additions & 0 deletions tools/sentencepiece_testing/create_no_special_token_proto.py
@@ -0,0 +1,30 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from tools.sentencepiece_testing.utils import train_sentencepiece


def main():
train_sentencepiece(
["abc"],
"no_special_token_vocab.spm",
vocab_size=5,
pad_id=-1,
eos_id=-1,
bos_id=-1,
)


if __name__ == "__main__":
main()
@@ -0,0 +1,28 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from tools.sentencepiece_testing.utils import train_sentencepiece


def main():
train_sentencepiece(
["the quick brown fox."],
"tokenizer_test_vocab.spm",
vocab_size=7,
model_type="WORD",
)


if __name__ == "__main__":
main()
36 changes: 36 additions & 0 deletions tools/sentencepiece_testing/create_t5_test_proto.py
@@ -0,0 +1,36 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from tools.sentencepiece_testing.utils import train_sentencepiece


def main():
train_sentencepiece(
["the quick brown fox", "the earth is round"],
"t5_test_vocab.spm",
vocab_size=11,
model_type="WORD",
bos_id=-1,
pad_id=0,
eos_id=1,
unk_id=2,
pad_piece="<pad>",
eos_piece="</s>",
unk_piece="<unk>",
user_defined_symbols="[MASK]",
)


if __name__ == "__main__":
main()
37 changes: 37 additions & 0 deletions tools/sentencepiece_testing/create_xlm_roberta_test_proto.py
@@ -0,0 +1,37 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from tools.sentencepiece_testing.utils import train_sentencepiece


def main():
train_sentencepiece(
["the quick brown fox", "the earth is round"],
"xlm_roberta_test_vocab.spm",
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
)


if __name__ == "__main__":
main()
33 changes: 33 additions & 0 deletions tools/sentencepiece_testing/utils.py
@@ -0,0 +1,33 @@
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import pathlib

import sentencepiece


def train_sentencepiece(data, filename, *args, **kwargs):
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(data), model_writer=bytes_io, *args, **kwargs
)
with open(
pathlib.Path(__file__).parent.parent.parent
/ "keras_nlp"
/ "tests"
/ "test_data"
/ filename,
mode="wb",
) as f:
f.write(bytes_io.getbuffer())
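For reference, a hedged sketch of how the new assets are regenerated: each create_*_test_proto.py script above calls train_sentencepiece, which trains in memory and writes the named file into keras_nlp/tests/test_data. The module invocation below is an assumption about how the scripts are meant to be run (from the repository root, so the tools package is importable), and the output filename is hypothetical rather than one of the checked-in assets.

# Regenerate an asset (assumed invocation, from the repository root):
#   python -m tools.sentencepiece_testing.create_albert_test_proto
#
# Or call the helper directly; this writes
# keras_nlp/tests/test_data/example_test_vocab.spm (hypothetical filename).
from tools.sentencepiece_testing.utils import train_sentencepiece

train_sentencepiece(
    ["the quick brown fox", "the earth is round"],
    "example_test_vocab.spm",
    vocab_size=12,
    model_type="WORD",
)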