Add ALBERT Presets #655

Merged (8 commits) on Jan 18, 2023. Showing changes from 2 of 8 commits.
keras_nlp/models/albert/albert_backbone.py (21 additions, 0 deletions)

@@ -14,12 +14,17 @@

"""ALBERT backbone model."""

import copy

import tensorflow as tf
from tensorflow import keras

from keras_nlp.layers.position_embedding import PositionEmbedding
from keras_nlp.layers.transformer_encoder import TransformerEncoder
from keras_nlp.models.albert.albert_presets import backbone_presets
from keras_nlp.models.backbone import Backbone
from keras_nlp.utils.python_utils import classproperty
from keras_nlp.utils.python_utils import format_docstring


def albert_kernel_initializer(stddev=0.02):
@@ -264,3 +269,19 @@ def get_config(self):
            "name": self.name,
            "trainable": self.trainable,
        }

    @classproperty
    def presets(cls):
        return copy.deepcopy(backbone_presets)

    @classmethod
    def from_preset(cls, preset, load_weights=True, **kwargs):
        return super().from_preset(preset, load_weights, **kwargs)


AlbertBackbone.from_preset.__func__.__doc__ = Backbone.from_preset.__doc__
format_docstring(
    model_name=AlbertBackbone.__name__,
    example_preset_name="albert_base_en_uncased",
    preset_names='", "'.join(AlbertBackbone.presets),
)(AlbertBackbone.from_preset.__func__)
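For reference, a minimal usage sketch of the API this wiring enables, assuming `AlbertBackbone` is exported under `keras_nlp.models` like the other ALBERT symbols in this PR:

```python
import keras_nlp

# `presets` is a classproperty returning a copy of the metadata dict
# defined in albert_presets.py.
print(list(keras_nlp.models.AlbertBackbone.presets))
# ['albert_base_en_uncased', 'albert_large_en_uncased', ...]

# Download config and weights, then build the backbone. Pass
# `load_weights=False` for a randomly initialized architecture.
model = keras_nlp.models.AlbertBackbone.from_preset(
    "albert_base_en_uncased",
    load_weights=True,
)
```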
keras_nlp/models/albert/albert_preprocessor.py (22 additions, 0 deletions)

@@ -13,16 +13,20 @@
# limitations under the License.
"""ALBERT preprocessor layer."""

import copy

from tensorflow import keras

from keras_nlp.layers.multi_segment_packer import MultiSegmentPacker
from keras_nlp.models.albert.albert_presets import backbone_presets
from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer
from keras_nlp.models.preprocessor import Preprocessor
from keras_nlp.utils.keras_utils import (
    convert_inputs_to_list_of_tensor_segments,
)
from keras_nlp.utils.keras_utils import pack_x_y_sample_weight
from keras_nlp.utils.python_utils import classproperty
from keras_nlp.utils.python_utils import format_docstring


@keras.utils.register_keras_serializable(package="keras_nlp")
@@ -179,3 +183,21 @@ def call(self, x, y=None, sample_weight=None):
    @classproperty
    def tokenizer_cls(cls):
        return AlbertTokenizer

    @classproperty
    def presets(cls):
        return copy.deepcopy(backbone_presets)

    @classmethod
    def from_preset(cls, preset, **kwargs):
        return super().from_preset(preset, **kwargs)


AlbertPreprocessor.from_preset.__func__.__doc__ = (
    Preprocessor.from_preset.__doc__
)
format_docstring(
    preprocessor_name=AlbertPreprocessor.__name__,
    example_preset_name="albert_base_en_uncased",
    preset_names='", "'.join(AlbertPreprocessor.presets),
)(AlbertPreprocessor.from_preset.__func__)
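Likewise, a short sketch of the preprocessor entry point; the `sequence_length=4` override mirrors what the smoke test in `albert_presets_test.py` below exercises:

```python
import keras_nlp

# Build a preprocessor from the preset vocabulary; extra kwargs such as
# `sequence_length` are forwarded to the constructor.
preprocessor = keras_nlp.models.AlbertPreprocessor.from_preset(
    "albert_base_en_uncased",
    sequence_length=4,
)

# Packs a raw string into dense features; per the smoke test below, the
# "token_ids" entry of the output here is [2, 13, 1, 3].
features = preprocessor("The quick brown fox.")
```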
keras_nlp/models/albert/albert_presets.py (114 additions, 0 deletions)

@@ -0,0 +1,114 @@
# Copyright 2022 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ALBERT model preset configurations."""


backbone_presets = {
    "albert_base_en_uncased": {
        "config": {
            "vocabulary_size": 30000,
            "num_layers": 12,
            "num_heads": 12,
            "num_groups": 1,
            "num_inner_repetitions": 1,
            "embedding_dim": 128,
            "hidden_dim": 768,
            "intermediate_dim": 3072,
            "dropout": 0.0,
            "max_sequence_length": 512,
            "num_segments": 2,
        },
        "preprocessor_config": {},
        "description": (
            "Base size of ALBERT where all input is lowercased. "
            "Trained on English Wikipedia + BooksCorpus."
        ),
        "weights_url": "https://drive.google.com/uc?export=download&id=1RzTTa8nMcBc84nARvJmHal5SndpKbDUa",

    [Review thread on the "weights_url" line above]

    Member: Still need to review, but this is awesome! Super great to model how to do this for contributors.

    @abheesht17 (Collaborator, Author), Jan 14, 2023: @mattdangerw, there is a cap on the size of the file you can download from GDrive. For example, the extra_large and extra_extra_large tests fail because their weights are larger than 200 MB; instead of the actual file, an HTML page is downloaded. I have observed the same for FNet.

"weights_hash": "b83ccf3418dd84adc569324183176813",
"spm_proto_url": "https://drive.google.com/uc?export=download&id=1-0C5mWLPRxNaY3yuzdqut-Wi7VaKo2gX",
"spm_proto_hash": "73e62ff8e90f951f24c8b907913039a5",
},
"albert_large_en_uncased": {
"config": {
"vocabulary_size": 30000,
"num_layers": 24,
"num_heads": 16,
"num_groups": 1,
"num_inner_repetitions": 1,
"embedding_dim": 128,
"hidden_dim": 1024,
"intermediate_dim": 4096,
"dropout": 0,
"max_sequence_length": 512,
"num_segments": 2,
},
"preprocessor_config": {},
"description": (
"Large size of ALBERT where all input is lowercased. "
"Trained on English Wikipedia + BooksCorpus."
),
"weights_url": "https://drive.google.com/uc?export=download&id=1PfMgFPNIb4K9hiLx1Ik5Qcgol_3pOsLE",
"weights_hash": "c7754804efb245f06dd6e7ced32e082c",
"spm_proto_url": "https://drive.google.com/uc?export=download&id=1-0C5mWLPRxNaY3yuzdqut-Wi7VaKo2gX",
"spm_proto_hash": "73e62ff8e90f951f24c8b907913039a5",
},
"albert_extra_large_en_uncased": {
"config": {
"vocabulary_size": 30000,
"num_layers": 24,
"num_heads": 16,
"num_groups": 1,
"num_inner_repetitions": 1,
"embedding_dim": 128,
"hidden_dim": 2048,
"intermediate_dim": 8192,
"dropout": 0,
"max_sequence_length": 512,
"num_segments": 2,
},
"preprocessor_config": {},
"description": (
"Extra Large size of ALBERT where all input is lowercased. "
"Trained on English Wikipedia + BooksCorpus."
),
"weights_url": "https://drive.google.com/uc?export=download&id=1cFQuP3QdHZ7JIA62fQxG8f9kKnv4CR-D",
"weights_hash": "beff185687874df846fda98386fcab29",
"spm_proto_url": "https://drive.google.com/uc?export=download&id=1-0C5mWLPRxNaY3yuzdqut-Wi7VaKo2gX",
"spm_proto_hash": "73e62ff8e90f951f24c8b907913039a5",
},
"albert_extra_extra_large_en_uncased": {
"config": {
"vocabulary_size": 30000,
"num_layers": 12,
"num_heads": 64,
"num_groups": 1,
"num_inner_repetitions": 1,
"embedding_dim": 128,
"hidden_dim": 4096,
"intermediate_dim": 16384,
"dropout": 0,
"max_sequence_length": 512,
"num_segments": 2,
},
"preprocessor_config": {},
"description": (
"Extra Extra Large size of ALBERT where all input is lowercased. "
"Trained on English Wikipedia + BooksCorpus."
),
"weights_url": "https://drive.google.com/uc?export=download&id=1NcmanywBpby6n1uolJw2G4QPp_V0fHlx",
"weights_hash": "5d44017f3a68b2a7e6f755c1070ad3ed",
"spm_proto_url": "https://drive.google.com/uc?export=download&id=1-0C5mWLPRxNaY3yuzdqut-Wi7VaKo2gX",
"spm_proto_hash": "73e62ff8e90f951f24c8b907913039a5",
},
}
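Each preset pairs its download URLs with md5 hashes. As a hedged illustration (not code from this PR) of how an entry is consumed, and of why the hashes matter given the GDrive issue flagged in the review thread above: `keras.utils.get_file` verifies the download against `file_hash`, so an HTML interstitial page returned for files over 200 MB fails loudly instead of being cached as weights. The helper and filename below are hypothetical.

```python
from tensorflow import keras

from keras_nlp.models.albert.albert_presets import backbone_presets


def fetch_preset_weights(preset):
    """Hypothetical sketch: download and verify one preset's weights."""
    metadata = backbone_presets[preset]
    return keras.utils.get_file(
        "model.h5",  # hypothetical filename
        metadata["weights_url"],
        cache_subdir=f"models/{preset}",
        # Verification fails if GDrive served an HTML page instead of
        # the checkpoint (observed for files over ~200 MB).
        file_hash=metadata["weights_hash"],
    )
```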
keras_nlp/models/albert/albert_presets_test.py (123 additions, 0 deletions)

@@ -0,0 +1,123 @@
# Copyright 2022 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for loading pretrained model presets."""

import pytest
import tensorflow as tf
from absl.testing import parameterized

from keras_nlp.models.albert.albert_backbone import AlbertBackbone
from keras_nlp.models.albert.albert_preprocessor import AlbertPreprocessor
from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer


@pytest.mark.large
class AlbertPresetSmokeTest(tf.test.TestCase, parameterized.TestCase):
    """
    A smoke test for ALBERT presets we run continuously.

    This only tests the smallest weights we have available. Run with:
    `pytest keras_nlp/models/albert/albert_presets_test.py --run_large`
    """

    def test_tokenizer_output(self):
        tokenizer = AlbertTokenizer.from_preset(
            "albert_base_en_uncased",
        )
        outputs = tokenizer("The quick brown fox.")
        expected_outputs = [13, 1, 438, 2231, 886, 2385, 9]
        self.assertAllEqual(outputs, expected_outputs)

    def test_preprocessor_output(self):
        preprocessor = AlbertPreprocessor.from_preset(
            "albert_base_en_uncased",
            sequence_length=4,
        )
        outputs = preprocessor("The quick brown fox.")["token_ids"]
        expected_outputs = [2, 13, 1, 3]
        self.assertAllEqual(outputs, expected_outputs)

    @parameterized.named_parameters(
        ("preset_weights", True), ("random_weights", False)
    )
    def test_backbone_output(self, load_weights):
        input_data = {
            "token_ids": tf.constant([[0, 581, 63773, 2]]),
            "padding_mask": tf.constant([[1, 1, 1, 1]]),
        }
        model = AlbertBackbone.from_preset(
            "albert_base_en_uncased", load_weights=load_weights
        )
        outputs = model(input_data)
        if load_weights:
            outputs = outputs[0, 0, :5]
            expected = [0.418, -0.116, -0.122, -1.847, -0.035]
            self.assertAllClose(outputs, expected, atol=0.01, rtol=0.01)

    @parameterized.named_parameters(
        ("albert_tokenizer", AlbertTokenizer),
        ("albert_preprocessor", AlbertPreprocessor),
        ("albert", AlbertBackbone),
    )
    def test_preset_docstring(self, cls):
        """Check we did our docstring formatting correctly."""
        for name in cls.presets:
            self.assertRegex(cls.from_preset.__doc__, name)

    @parameterized.named_parameters(
        ("albert_tokenizer", AlbertTokenizer),
        ("albert_preprocessor", AlbertPreprocessor),
        ("albert", AlbertBackbone),
    )
    def test_unknown_preset_error(self, cls):
        # Not a preset name
        with self.assertRaises(ValueError):
            cls.from_preset("albert_base_en_uncased_clowntown")


@pytest.mark.extra_large
class AlbertPresetFullTest(tf.test.TestCase, parameterized.TestCase):
    """
    Test the full enumeration of our presets.

    This tests every ALBERT preset and is only run manually.
    Run with:
    `pytest keras_nlp/models/albert/albert_presets_test.py --run_extra_large`
    """

    @parameterized.named_parameters(
        ("preset_weights", True), ("random_weights", False)
    )
    def test_load_albert(self, load_weights):
        for preset in AlbertBackbone.presets:
            model = AlbertBackbone.from_preset(
                preset, load_weights=load_weights
            )
            input_data = {
                "token_ids": tf.random.uniform(
                    shape=(1, 512),
                    dtype=tf.int64,
                    maxval=model.vocabulary_size,
                ),
                "padding_mask": tf.constant([1] * 512, shape=(1, 512)),
            }
            model(input_data)

    def test_load_tokenizers(self):
        for preset in AlbertTokenizer.presets:
            tokenizer = AlbertTokenizer.from_preset(preset)
            tokenizer("The quick brown fox.")

    def test_load_preprocessors(self):
        for preset in AlbertPreprocessor.presets:
            preprocessor = AlbertPreprocessor.from_preset(preset)
            preprocessor("The quick brown fox.")
keras_nlp/models/albert/albert_tokenizer.py (47 additions, 2 deletions)

@@ -14,11 +14,15 @@

"""ALBERT tokenizer."""

import copy
import os

from tensorflow import keras

from keras_nlp.models.albert.albert_presets import backbone_presets
from keras_nlp.tokenizers.sentence_piece_tokenizer import SentencePieceTokenizer
from keras_nlp.utils.python_utils import classproperty
from keras_nlp.utils.python_utils import format_docstring


@keras.utils.register_keras_serializable(package="keras_nlp")
@@ -84,12 +88,53 @@ def __init__(self, proto, **kwargs):

    @classproperty
    def presets(cls):
-       return {}
        return copy.deepcopy(backbone_presets)

    @classmethod
    @format_docstring(names=", ".join(backbone_presets))
    def from_preset(
        cls,
        preset,
        **kwargs,
    ):
-       raise NotImplementedError
        """Instantiate an ALBERT tokenizer from preset vocabulary.

        Args:
            preset: string. Must be one of {{names}}.

        Examples:
        ```python
        # Load a preset tokenizer.
        tokenizer = keras_nlp.models.AlbertTokenizer.from_preset(
            "albert_base_en_uncased",
        )

        # Tokenize some input.
        tokenizer("The quick brown fox tripped.")

        # Detokenize some input.
        tokenizer.detokenize([5, 6, 7, 8, 9])
        ```
        """
        if preset not in cls.presets:
            raise ValueError(
                "`preset` must be one of "
                f"""{", ".join(cls.presets)}. Received: {preset}."""
            )
        metadata = cls.presets[preset]

        spm_proto = keras.utils.get_file(
            "vocab.spm",
            metadata["spm_proto_url"],
            cache_subdir=os.path.join("models", preset),
            file_hash=metadata["spm_proto_hash"],
        )

        config = metadata["preprocessor_config"]
        config.update(
            {
                "proto": spm_proto,
            },
        )

        return cls.from_config({**config, **kwargs})
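For context, a minimal sketch of what `format_docstring` from `keras_nlp.utils.python_utils` presumably does, inferred from its usage in this diff (the real helper may differ): it returns a decorator that substitutes `{{key}}` templates, such as the `{{names}}` placeholder above, into a function's docstring.

```python
def format_docstring(**replacements):
    """Return a decorator filling `{{key}}` templates in a docstring."""

    def decorator(fn):
        doc = fn.__doc__ or ""
        for key, value in replacements.items():
            # Replace each "{{key}}" template with its supplied value.
            doc = doc.replace("{{" + key + "}}", str(value))
        fn.__doc__ = doc
        return fn

    return decorator
```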