
Commit

Catch FF requiring squared context length
blefaudeux committed Jun 3, 2022
1 parent b19d544 commit 55119ab
Showing 3 changed files with 11 additions and 3 deletions.
5 changes: 4 additions & 1 deletion tests/test_block_factory.py
@@ -237,7 +237,10 @@ def test_xformer_decoder_block(
     )
 
     # Test different sequence lengths when encoding and decoding
-    if not decoder_block.requires_same_k_q_dimensions:
+    if (
+        not decoder_block.requires_same_k_q_dimensions
+        and not decoder_block.requires_squared_context_length
+    ):
         if not causal or not decoder_block.causal_attention:
             _ = decoder_block(inputs[:, :-16], encoded)
         else:
5 changes: 4 additions & 1 deletion tests/test_model_factory.py
@@ -199,16 +199,19 @@ def check_against_default(p):
 
 
 @pytest.mark.parametrize("weight_init", [w.value for w in xFormerWeightInit])
+@pytest.mark.parametrize("feedforward", ["MLP", "Conv2DFeedforward"])
 @pytest.mark.parametrize("deepnorm", [False, True])
 @pytest.mark.parametrize("device", DEVICES)
-def test_weight_init(weight_init, deepnorm, device):
+def test_weight_init(weight_init, feedforward, deepnorm, device):
     torch.cuda.manual_seed(42)
     torch.manual_seed(42)
 
     config = test_configs_dict
 
     if deepnorm:
         config["encoder"]["layer_norm_style"] = "deepnorm"
+        config["encoder"]["feedforward_config"]["name"] = feedforward
+
         config["decoder"]["layer_norm_style"] = "deepnorm"
 
     # Make sure that all the init methods catch all the weights
4 changes: 3 additions & 1 deletion xformers/factory/block_factory.py
@@ -275,9 +275,11 @@ def __init__(self, config: xFormerDecoderConfig, **kwargs):
         cross_mha = build_multi_head_attention(config.multi_head_config_cross)
         feedforward = build_feedforward(config.feedforward_config)
 
-        # Expose attention specific capabilities
+        # Expose attention or feedforward specific capabilities
        self.supports_attention_mask = mha.attention.supports_attention_mask
        self.requires_same_k_q_dimensions = mha.attention.requires_same_k_q_dimensions
+        self.requires_squared_context_length = feedforward.requires_squared_context
+
         self.causal_attention = (
             mha.attention.causal if hasattr(mha.attention, "causal") else False
         )
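
The new flag mirrors the attention-side capability flags: a feedforward such as Conv2DFeedforward folds the sequence into a 2D grid, so it only accepts context lengths that are perfect squares. Below is a minimal sketch (not part of this commit, and not the library's API) of how a caller could guard on the flag before passing a shortened sequence; safe_decode and its arguments are hypothetical names.

import math

def safe_decode(decoder_block, inputs, encoded):
    # Hypothetical helper: reject context lengths the block cannot handle.
    if getattr(decoder_block, "requires_squared_context_length", False):
        seq_len = inputs.shape[1]
        side = math.isqrt(seq_len)
        # Conv2DFeedforward reshapes (B, L, C) onto a 2D grid, so L must be a perfect square.
        assert side * side == seq_len, f"Expected a squared context length, got {seq_len}"
    return decoder_block(inputs, encoded)

This is the same check that lets test_xformer_decoder_block above skip the shortened-sequence case (inputs[:, :-16]) when the feedforward requires a squared context length.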
