From f6c975ae2d7f37cbc51f8e88f62d79cd4ac9ab54 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 15 Jul 2024 06:46:33 +0530 Subject: [PATCH] default to 1024 1024. --- .../models/transformers/auraflow_transformer_2d.py | 2 +- src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/diffusers/models/transformers/auraflow_transformer_2d.py b/src/diffusers/models/transformers/auraflow_transformer_2d.py index 53ec470720b3..342373b4c11d 100644 --- a/src/diffusers/models/transformers/auraflow_transformer_2d.py +++ b/src/diffusers/models/transformers/auraflow_transformer_2d.py @@ -75,7 +75,7 @@ def forward(self, latent): ) latent = latent.permute(0, 2, 4, 1, 3, 5).flatten(-3).flatten(1, 2) latent = self.proj(latent) - return latent + self.pos_embed[:, : latent.size(1)] + return latent + self.pos_embed # Taken from the original Aura flow inference code. diff --git a/src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py b/src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py index 47c765d5cbb5..8b5a85b5ab78 100644 --- a/src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py +++ b/src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py @@ -391,8 +391,8 @@ def __call__( sigmas: List[float] = None, guidance_scale: float = 3.5, num_images_per_prompt: Optional[int] = 1, - height: Optional[int] = 512, - width: Optional[int] = 512, + height: Optional[int] = 1024, + width: Optional[int] = 1024, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.Tensor] = None, prompt_embeds: Optional[torch.Tensor] = None, @@ -415,9 +415,9 @@ def __call__( `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. This is set to 512 by default. + The height in pixels of the generated image. This is set to 1024 by default for best results. width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. This is set to 512 by default. + The width in pixels of the generated image. This is set to 1024 by default for best results. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference.