Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow Schema class from core to be used to create TabularSequenceFeatures #638

Status: Merged (2 commits, merged on Mar 10, 2023)
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,13 @@ def tabular_core_schema(tabular_schema):
return TensorflowMetadata.from_json(tabular_schema.to_json()).to_merlin_schema()


def parametrize_tabular_schemas():
schema = tabular_testing_data.schema.remove_by_name(["session_id", "session_start", "day_idx"])
def parametrize_schemas(name):
if name == "tabular":
schema = tabular_testing_data.schema.remove_by_name(
["session_id", "session_start", "day_idx"]
)
elif name == "yoochoose":
schema = tabular_sequence_testing_data.schema

return pytest.mark.parametrize(
"schema",
Expand Down
66 changes: 37 additions & 29 deletions tests/torch/features/test_sequential.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,16 @@
#

import pytest
from merlin.schema import Schema as CoreSchema
from merlin.schema import Tags

import transformers4rec.torch as tr
from tests.conftest import parametrize_schemas


def test_sequential_and_non_seq_embedding_features(yoochoose_schema, torch_yoochoose_like):
schema = yoochoose_schema.select_by_tag(Tags.CATEGORICAL)
@parametrize_schemas("yoochoose")
def test_sequential_and_non_seq_embedding_features(schema, torch_yoochoose_like):
schema = schema.select_by_tag(Tags.CATEGORICAL)
emb_module = tr.SequenceEmbeddingFeatures.from_schema(schema)

outputs = emb_module(torch_yoochoose_like)
Expand All @@ -38,24 +41,24 @@ def test_sequential_and_non_seq_embedding_features(yoochoose_schema, torch_yooch
assert list(outputs[fname].shape) == [100, 64]


def test_sequential_tabular_features(yoochoose_schema, torch_yoochoose_like):
schema = yoochoose_schema
@parametrize_schemas("yoochoose")
def test_sequential_tabular_features(schema, torch_yoochoose_like):
tab_module = tr.TabularSequenceFeatures.from_schema(schema)

outputs = tab_module(torch_yoochoose_like)

tag_select = lambda tags: any( # noqa
t in [Tags.CONTINUOUS.value, Tags.CATEGORICAL.value] for t in tags
)
cols = schema.select_by_tag(tag_select).column_names
cols = [
c.name
for c in list(
schema.select_by_tag(Tags.CONTINUOUS) + schema.select_by_tag(Tags.CATEGORICAL)
)
]

assert set(outputs.keys()) == set(cols)


def test_sequential_tabular_features_with_feature_modules_kwargs(
yoochoose_schema, torch_yoochoose_like
):
schema = yoochoose_schema
@parametrize_schemas("yoochoose")
def test_sequential_tabular_features_with_feature_modules_kwargs(schema, torch_yoochoose_like):
EMB_DIM = 200
tab_module = tr.TabularSequenceFeatures.from_schema(
schema,
Expand All @@ -73,8 +76,8 @@ def test_sequential_tabular_features_with_feature_modules_kwargs(
assert all(v.shape[-1] == EMB_DIM for k, v in outputs.items() if k in categ_features)


def test_sequential_tabular_features_with_projection(yoochoose_schema, torch_yoochoose_like):
schema = yoochoose_schema
@parametrize_schemas("yoochoose")
def test_sequential_tabular_features_with_projection(schema, torch_yoochoose_like):
tab_module = tr.TabularSequenceFeatures.from_schema(
schema, max_sequence_length=20, continuous_projection=64
)
Expand All @@ -87,9 +90,10 @@ def test_sequential_tabular_features_with_projection(yoochoose_schema, torch_yoo
assert list(outputs["continuous_projection"].shape)[1:] == [20, 64]


def test_sequential_tabular_features_with_masking(yoochoose_schema, torch_yoochoose_like):
@parametrize_schemas("yoochoose")
def test_sequential_tabular_features_with_masking(schema, torch_yoochoose_like):
input_module = tr.TabularSequenceFeatures.from_schema(
yoochoose_schema,
schema,
max_sequence_length=20,
continuous_projection=64,
d_output=100,
Expand All @@ -103,13 +107,14 @@ def test_sequential_tabular_features_with_masking(yoochoose_schema, torch_yoocho
assert outputs.shape[1] == 20


def test_sequential_tabular_features_ignore_masking(yoochoose_schema, torch_yoochoose_like):
@parametrize_schemas("yoochoose")
def test_sequential_tabular_features_ignore_masking(schema, torch_yoochoose_like):
import numpy as np

from transformers4rec.torch.masking import CausalLanguageModeling, MaskedLanguageModeling

input_module = tr.TabularSequenceFeatures.from_schema(
yoochoose_schema,
schema,
max_sequence_length=20,
continuous_projection=64,
d_output=100,
Expand Down Expand Up @@ -140,17 +145,16 @@ def test_sequential_tabular_features_ignore_masking(yoochoose_schema, torch_yooc
assert output_inference_masking.shape[1] == output_eval_masking.shape[1] + 1


def test_tabular_features_yoochoose_direct(yoochoose_schema, torch_yoochoose_like):
continuous_module = tr.ContinuousFeatures.from_schema(yoochoose_schema, tags=["continuous"])
categorical_module = tr.SequenceEmbeddingFeatures.from_schema(
yoochoose_schema, tags=["categorical"]
)
@parametrize_schemas("yoochoose")
def test_tabular_features_yoochoose_direct(schema, torch_yoochoose_like):
continuous_module = tr.ContinuousFeatures.from_schema(schema, tags=Tags.CONTINUOUS)
categorical_module = tr.SequenceEmbeddingFeatures.from_schema(schema, tags=Tags.CATEGORICAL)

tab_seq_features = tr.TabularSequenceFeatures(
continuous_module=continuous_module,
categorical_module=categorical_module,
aggregation="concat",
schema=yoochoose_schema,
schema=schema,
)
outputs = tab_seq_features(torch_yoochoose_like)

Expand All @@ -173,12 +177,16 @@ def test_tabular_features_yoochoose_direct(yoochoose_schema, torch_yoochoose_lik
assert len(outputs.shape) == 3


def test_sequential_tabular_features_with_masking_no_itemid(yoochoose_schema):
@parametrize_schemas("yoochoose")
def test_sequential_tabular_features_with_masking_no_itemid(schema):
with pytest.raises(ValueError) as excinfo:
yoochoose_schema = yoochoose_schema.remove_by_name("item_id/list")
if isinstance(schema, CoreSchema):
schema = schema.excluding_by_name("item_id/list")
else:
schema = schema.remove_by_name("item_id/list")

tr.TabularSequenceFeatures.from_schema(
yoochoose_schema,
schema,
max_sequence_length=20,
continuous_projection=64,
d_output=100,
Expand All @@ -202,8 +210,8 @@ def test_sequential_tabular_features_with_projection_and_d_output(yoochoose_sche
assert "You cannot specify both d_output and projection at the same time" in str(excinfo.value)


def test_sequential_and_non_sequential_tabular_features(yoochoose_schema, torch_yoochoose_like):
schema = yoochoose_schema
@parametrize_schemas("yoochoose")
def test_sequential_and_non_sequential_tabular_features(schema, torch_yoochoose_like):
tab_module = tr.TabularSequenceFeatures.from_schema(schema, aggregation="concat")

outputs = tab_module(torch_yoochoose_like)
Expand Down
10 changes: 5 additions & 5 deletions tests/torch/features/test_tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@
from merlin.schema import Tags

import transformers4rec.torch as tr
from tests.conftest import parametrize_tabular_schemas
from tests.conftest import parametrize_schemas


@parametrize_tabular_schemas()
@parametrize_schemas("tabular")
def test_tabular_features(schema, torch_tabular_data):
tab_module = tr.TabularFeatures.from_schema(schema)

Expand All @@ -33,7 +33,7 @@ def test_tabular_features(schema, torch_tabular_data):
)


@parametrize_tabular_schemas()
@parametrize_schemas("tabular")
def test_tabular_features_embeddings_options(schema, torch_tabular_data):
EMB_DIM = 100
tab_module = tr.TabularFeatures.from_schema(schema, embedding_dim_default=EMB_DIM)
Expand All @@ -44,7 +44,7 @@ def test_tabular_features_embeddings_options(schema, torch_tabular_data):
assert all(v.shape[-1] == EMB_DIM for k, v in outputs.items() if k in categ_features)


@parametrize_tabular_schemas()
@parametrize_schemas("tabular")
def test_tabular_features_with_projection(schema, torch_tabular_data):
tab_module = tr.TabularFeatures.from_schema(schema, continuous_projection=64)

Expand All @@ -57,7 +57,7 @@ def test_tabular_features_with_projection(schema, torch_tabular_data):
assert list(outputs["continuous_projection"].shape)[1] == 64


@parametrize_tabular_schemas()
@parametrize_schemas("tabular")
def test_tabular_features_soft_encoding(schema, torch_tabular_data):
emb_cardinality = 10
emb_dim = 8
Expand Down
12 changes: 9 additions & 3 deletions transformers4rec/torch/features/embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

import torch
from merlin.models.utils.doc_utils import docstring_parameter
from merlin.schema import TagsType
from merlin.schema import Tags, TagsType

from merlin_standard_lib import Schema, categorical_cardinalities
from merlin_standard_lib.utils.embedding_utils import get_embedding_sizes_from_schema
Expand Down Expand Up @@ -159,8 +159,14 @@ def from_schema( # type: ignore
if tags:
schema = schema.select_by_tag(tags)

if not item_id and schema.select_by_tag(["item_id"]).column_names:
item_id = schema.select_by_tag(["item_id"]).column_names[0]
_item_id = schema.select_by_tag(Tags.ITEM_ID)
if not item_id and len(_item_id) > 0:
[Review thread on this line: marked as resolved by marcromeyn]
if len(_item_id) > 1:
raise ValueError(
"Multiple columns with tag ITEM_ID found. "
"Please specify the item_id column name."
)
item_id = list(_item_id)[0].name

embedding_dims = embedding_dims or {}

Expand Down