huggingface · a-r-r-o-w · Jul 11, 2024 · Jun 5, 2024 · Jun 5, 2024 · Jun 5, 2024
diff --git a/.gitignore b/.gitignore
@@ -175,4 +175,4 @@ tags
 .ruff_cache
 
 # wandb
-wandb
+wandb
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -249,6 +249,8 @@
       title: DiTTransformer2DModel
     - local: api/models/hunyuan_transformer2d
       title: HunyuanDiT2DModel
+    - local: api/models/latte_transformer3d
+      title: LatteTransformer3DModel
     - local: api/models/lumina_nextdit2d
       title: LuminaNextDiT2DModel
     - local: api/models/transformer_temporal

diff --git a/docs/source/en/api/models/latte_transformer3d.md b/docs/source/en/api/models/latte_transformer3d.md
@@ -0,0 +1,19 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# LatteTransformer3DModel
+
+A Diffusion Transformer model for 3D data from [Latte](https://github.com/Vchitect/Latte).
+
+## LatteTransformer3DModel
+
+[[autodoc]] LatteTransformer3DModel
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
@@ -88,6 +88,7 @@
             "HunyuanDiT2DMultiControlNetModel",
             "I2VGenXLUNet",
             "Kandinsky3UNet",
+            "LatteTransformer3DModel",
             "LuminaNextDiT2DModel",
             "ModelMixin",
             "MotionAdapter",
@@ -269,6 +270,7 @@
             "KandinskyV22PriorPipeline",
             "LatentConsistencyModelImg2ImgPipeline",
             "LatentConsistencyModelPipeline",
+            "LattePipeline",
             "LDMTextToImagePipeline",
             "LEditsPPPipelineStableDiffusion",
             "LEditsPPPipelineStableDiffusionXL",
@@ -513,6 +515,7 @@
             HunyuanDiT2DMultiControlNetModel,
             I2VGenXLUNet,
             Kandinsky3UNet,
+            LatteTransformer3DModel,
             LuminaNextDiT2DModel,
             ModelMixin,
             MotionAdapter,
@@ -672,6 +675,7 @@
             KandinskyV22PriorPipeline,
             LatentConsistencyModelImg2ImgPipeline,
             LatentConsistencyModelPipeline,
+            LattePipeline,
             LDMTextToImagePipeline,
             LEditsPPPipelineStableDiffusion,
             LEditsPPPipelineStableDiffusionXL,

diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py
@@ -41,6 +41,7 @@
     _import_structure["transformers.dit_transformer_2d"] = ["DiTTransformer2DModel"]
     _import_structure["transformers.dual_transformer_2d"] = ["DualTransformer2DModel"]
     _import_structure["transformers.hunyuan_transformer_2d"] = ["HunyuanDiT2DModel"]
+    _import_structure["transformers.latte_transformer_3d"] = ["LatteTransformer3DModel"]
     _import_structure["transformers.lumina_nextdit2d"] = ["LuminaNextDiT2DModel"]
     _import_structure["transformers.pixart_transformer_2d"] = ["PixArtTransformer2DModel"]
     _import_structure["transformers.prior_transformer"] = ["PriorTransformer"]
@@ -86,6 +87,7 @@
             DiTTransformer2DModel,
             DualTransformer2DModel,
             HunyuanDiT2DModel,
+            LatteTransformer3DModel,
             LuminaNextDiT2DModel,
             PixArtTransformer2DModel,
             PriorTransformer,

diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
@@ -359,7 +359,10 @@ def __init__(
                 out_bias=attention_out_bias,
             )  # is self-attn if encoder_hidden_states is none
         else:
-            self.norm2 = None
+            if norm_type == "ada_norm_single":  # For Latte
+                self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+            else:
+                self.norm2 = None
             self.attn2 = None
 
         # 3. Feed-forward
@@ -439,7 +442,6 @@ def forward(
             ).chunk(6, dim=1)
             norm_hidden_states = self.norm1(hidden_states)
             norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
-            norm_hidden_states = norm_hidden_states.squeeze(1)
         else:
             raise ValueError("Incorrect norm used")
 
@@ -456,6 +458,7 @@ def forward(
             attention_mask=attention_mask,
             **cross_attention_kwargs,
         )
+
         if self.norm_type == "ada_norm_zero":
             attn_output = gate_msa.unsqueeze(1) * attn_output
         elif self.norm_type == "ada_norm_single":

diff --git a/src/diffusers/models/transformers/__init__.py b/src/diffusers/models/transformers/__init__.py
@@ -5,6 +5,7 @@
     from .dit_transformer_2d import DiTTransformer2DModel
     from .dual_transformer_2d import DualTransformer2DModel
     from .hunyuan_transformer_2d import HunyuanDiT2DModel
+    from .latte_transformer_3d import LatteTransformer3DModel
     from .lumina_nextdit2d import LuminaNextDiT2DModel
     from .pixart_transformer_2d import PixArtTransformer2DModel
     from .prior_transformer import PriorTransformer
-Original file line number
+Diff line change
@@ Expand Up / @@ -175,4 +175,4 @@ tags @@
     .ruff_cache
     # wandb
-    wandb
+    wandb