diff --git a/docs/source/en/using-diffusers/custom_pipeline_overview.md b/docs/source/en/using-diffusers/custom_pipeline_overview.md
index 341a98a5c897..17ba779b8136 100644
--- a/docs/source/en/using-diffusers/custom_pipeline_overview.md
+++ b/docs/source/en/using-diffusers/custom_pipeline_overview.md
@@ -289,9 +289,9 @@ scheduler = DPMSolverMultistepScheduler.from_pretrained(pipe_id, subfolder="sche
 3. Load an image processor:
 
 ```python
-from transformers import CLIPFeatureExtractor
+from transformers import CLIPImageProcessor
 
-feature_extractor = CLIPFeatureExtractor.from_pretrained(pipe_id, subfolder="feature_extractor")
+feature_extractor = CLIPImageProcessor.from_pretrained(pipe_id, subfolder="feature_extractor")
 ```
diff --git a/docs/source/en/using-diffusers/inference_with_tcd_lora.md b/docs/source/en/using-diffusers/inference_with_tcd_lora.md
index df49fc8475ad..d6fa61be557a 100644
--- a/docs/source/en/using-diffusers/inference_with_tcd_lora.md
+++ b/docs/source/en/using-diffusers/inference_with_tcd_lora.md
@@ -212,14 +212,14 @@ TCD-LoRA is very versatile, and it can be combined with other adapter types like
 import torch
 import numpy as np
 from PIL import Image
-from transformers import DPTFeatureExtractor, DPTForDepthEstimation
+from transformers import DPTImageProcessor, DPTForDepthEstimation
 from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline
 from diffusers.utils import load_image, make_image_grid
 from scheduling_tcd import TCDScheduler
 
 device = "cuda"
 depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to(device)
-feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")
+feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")
 
 def get_depth_map(image):
     image = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)
diff --git a/docs/source/ko/using-diffusers/loading.md b/docs/source/ko/using-diffusers/loading.md
index 39cd228af401..2106b91a68cf 100644
--- a/docs/source/ko/using-diffusers/loading.md
+++ b/docs/source/ko/using-diffusers/loading.md
@@ -307,7 +307,7 @@ print(pipeline)
 위의 코드 출력 결과를 확인해보면, `pipeline`은 [`StableDiffusionPipeline`]의 인스턴스이며, 다음과 같이 총 7개의 컴포넌트로 구성된다는 것을 알 수 있습니다.
 
-- `"feature_extractor"`: [`~transformers.CLIPFeatureExtractor`]의 인스턴스 +- `"feature_extractor"`: [`~transformers.CLIPImageProcessor`]의 인스턴스 - `"safety_checker"`: 유해한 컨텐츠를 스크리닝하기 위한 [컴포넌트](https://github.com/huggingface/diffusers/blob/e55687e1e15407f60f32242027b7bb8170e58266/src/diffusers/pipelines/stable_diffusion/safety_checker.py#L32) - `"scheduler"`: [`PNDMScheduler`]의 인스턴스 - `"text_encoder"`: [`~transformers.CLIPTextModel`]의 인스턴스 diff --git a/docs/source/ko/using-diffusers/textual_inversion_inference.md b/docs/source/ko/using-diffusers/textual_inversion_inference.md index 1b52fee923b3..39fab939a704 100644 --- a/docs/source/ko/using-diffusers/textual_inversion_inference.md +++ b/docs/source/ko/using-diffusers/textual_inversion_inference.md @@ -24,7 +24,7 @@ import PIL from PIL import Image from diffusers import StableDiffusionPipeline -from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer def image_grid(imgs, rows, cols): diff --git a/examples/community/README.md b/examples/community/README.md index 57af5a2e34b4..090bb980b221 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -1435,9 +1435,9 @@ import requests import torch from diffusers import DiffusionPipeline from PIL import Image -from transformers import CLIPFeatureExtractor, CLIPModel +from transformers import CLIPImageProcessor, CLIPModel -feature_extractor = CLIPFeatureExtractor.from_pretrained( +feature_extractor = CLIPImageProcessor.from_pretrained( "laion/CLIP-ViT-B-32-laion2B-s34B-b79K" ) clip_model = CLIPModel.from_pretrained( @@ -2122,7 +2122,7 @@ import torch import open_clip from open_clip import SimpleTokenizer from diffusers import DiffusionPipeline -from transformers import CLIPFeatureExtractor, CLIPModel +from transformers import CLIPImageProcessor, CLIPModel def download_image(url): @@ -2130,7 +2130,7 @@ def download_image(url): return PIL.Image.open(BytesIO(response.content)).convert("RGB") # Loading additional models -feature_extractor = CLIPFeatureExtractor.from_pretrained( +feature_extractor = CLIPImageProcessor.from_pretrained( "laion/CLIP-ViT-B-32-laion2B-s34B-b79K" ) clip_model = CLIPModel.from_pretrained( diff --git a/examples/community/clip_guided_images_mixing_stable_diffusion.py b/examples/community/clip_guided_images_mixing_stable_diffusion.py index 75b7df84dc77..f9a4b12ad20f 100644 --- a/examples/community/clip_guided_images_mixing_stable_diffusion.py +++ b/examples/community/clip_guided_images_mixing_stable_diffusion.py @@ -7,7 +7,7 @@ import torch from torch.nn import functional as F from torchvision import transforms -from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextModel, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPModel, CLIPTextModel, CLIPTokenizer from diffusers import ( AutoencoderKL, @@ -86,7 +86,7 @@ def __init__( tokenizer: CLIPTokenizer, unet: UNet2DConditionModel, scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler, DPMSolverMultistepScheduler], - feature_extractor: CLIPFeatureExtractor, + feature_extractor: CLIPImageProcessor, coca_model=None, coca_tokenizer=None, coca_transform=None, diff --git a/examples/community/clip_guided_stable_diffusion_img2img.py b/examples/community/clip_guided_stable_diffusion_img2img.py index 0f3de94e2cdb..91c74b9ffa74 100644 --- a/examples/community/clip_guided_stable_diffusion_img2img.py +++ b/examples/community/clip_guided_stable_diffusion_img2img.py @@ -7,7 +7,7 @@ from torch 
 from torch import nn
 from torch.nn import functional as F
 from torchvision import transforms
-from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPModel, CLIPTextModel, CLIPTokenizer
 
 from diffusers import (
     AutoencoderKL,
@@ -32,9 +32,9 @@
     import torch
     from diffusers import DiffusionPipeline
     from PIL import Image
-    from transformers import CLIPFeatureExtractor, CLIPModel
+    from transformers import CLIPImageProcessor, CLIPModel
 
-    feature_extractor = CLIPFeatureExtractor.from_pretrained(
+    feature_extractor = CLIPImageProcessor.from_pretrained(
         "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
     )
     clip_model = CLIPModel.from_pretrained(
@@ -139,7 +139,7 @@ def __init__(
         tokenizer: CLIPTokenizer,
         unet: UNet2DConditionModel,
         scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler, DPMSolverMultistepScheduler],
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
     ):
         super().__init__()
         self.register_modules(
diff --git a/examples/community/mixture_canvas.py b/examples/community/mixture_canvas.py
index 7196ee9587f2..2bb054a123d0 100644
--- a/examples/community/mixture_canvas.py
+++ b/examples/community/mixture_canvas.py
@@ -9,7 +9,7 @@
 from numpy import exp, pi, sqrt
 from torchvision.transforms.functional import resize
 from tqdm.auto import tqdm
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
 
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
@@ -275,7 +275,7 @@ def __init__(
         unet: UNet2DConditionModel,
         scheduler: Union[DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler],
         safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
     ):
         super().__init__()
         self.register_modules(
diff --git a/examples/community/mixture_tiling.py b/examples/community/mixture_tiling.py
index 7e3d592d8514..867bce0d9eb8 100644
--- a/examples/community/mixture_tiling.py
+++ b/examples/community/mixture_tiling.py
@@ -15,7 +15,7 @@
 try:
     from ligo.segments import segment
-    from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+    from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
 except ImportError:
     raise ImportError("Please install transformers and ligo-segments to use the mixture pipeline")
 
@@ -144,7 +144,7 @@ def __init__(
         unet: UNet2DConditionModel,
         scheduler: Union[DDIMScheduler, PNDMScheduler],
         safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
     ):
         super().__init__()
         self.register_modules(
diff --git a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py
index 5fc6d8af03c4..ae495979f366 100644
--- a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py
+++ b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py
@@ -189,7 +189,7 @@ class StableDiffusionXLControlNetAdapterPipeline(
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
             Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
+        feature_extractor ([`CLIPImageProcessor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
 
diff --git a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py
index 07954f013295..94ca71cf7b1b 100644
--- a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py
+++ b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py
@@ -332,7 +332,7 @@ class StableDiffusionXLControlNetAdapterInpaintPipeline(
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
             Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
+        feature_extractor ([`CLIPImageProcessor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
         requires_aesthetics_score (`bool`, *optional*, defaults to `"False"`):
             Whether the `unet` requires a aesthetic_score condition to be passed during inference. Also see the config
diff --git a/examples/community/pipeline_zero1to3.py b/examples/community/pipeline_zero1to3.py
index af1c82dfad95..95bb37ce02b7 100644
--- a/examples/community/pipeline_zero1to3.py
+++ b/examples/community/pipeline_zero1to3.py
@@ -9,7 +9,7 @@
 import PIL.Image
 import torch
 from packaging import version
-from transformers import CLIPFeatureExtractor, CLIPVisionModelWithProjection
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
 
 # from ...configuration_utils import FrozenDict
 # from ...models import AutoencoderKL, UNet2DConditionModel
@@ -87,7 +87,7 @@ class Zero1to3StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
             Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
+        feature_extractor ([`CLIPImageProcessor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
         cc_projection ([`CCProjection`]):
             Projection layer to project the concated CLIP features and pose embeddings to the original CLIP feature size.
@@ -102,7 +102,7 @@ def __init__(
         unet: UNet2DConditionModel,
         scheduler: KarrasDiffusionSchedulers,
         safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
         cc_projection: CCProjection,
         requires_safety_checker: bool = True,
     ):
diff --git a/examples/community/regional_prompting_stable_diffusion.py b/examples/community/regional_prompting_stable_diffusion.py
index cad71338faed..8a022987ba9d 100644
--- a/examples/community/regional_prompting_stable_diffusion.py
+++ b/examples/community/regional_prompting_stable_diffusion.py
@@ -3,7 +3,7 @@
 import torch
 import torchvision.transforms.functional as FF
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
 
 from diffusers import StableDiffusionPipeline
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
@@ -69,7 +69,7 @@ def __init__(
         unet: UNet2DConditionModel,
         scheduler: KarrasDiffusionSchedulers,
         safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
         requires_safety_checker: bool = True,
     ):
         super().__init__(
diff --git a/examples/community/stable_diffusion_ipex.py b/examples/community/stable_diffusion_ipex.py
index 388992a740ec..123892f6229a 100644
--- a/examples/community/stable_diffusion_ipex.py
+++ b/examples/community/stable_diffusion_ipex.py
@@ -18,7 +18,7 @@
 import intel_extension_for_pytorch as ipex
 import torch
 from packaging import version
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
 
 from diffusers.configuration_utils import FrozenDict
 from diffusers.loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
@@ -86,7 +86,7 @@ class StableDiffusionIPEXPipeline(
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
             Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
+        feature_extractor ([`CLIPImageProcessor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
""" @@ -100,7 +100,7 @@ def __init__( unet: UNet2DConditionModel, scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, + feature_extractor: CLIPImageProcessor, requires_safety_checker: bool = True, ): super().__init__() diff --git a/examples/community/stable_diffusion_tensorrt_img2img.py b/examples/community/stable_diffusion_tensorrt_img2img.py index 180879776ea9..91540d1f4159 100755 --- a/examples/community/stable_diffusion_tensorrt_img2img.py +++ b/examples/community/stable_diffusion_tensorrt_img2img.py @@ -42,7 +42,7 @@ network_from_onnx_path, save_engine, ) -from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from diffusers import DiffusionPipeline from diffusers.configuration_utils import FrozenDict, deprecate @@ -679,7 +679,7 @@ class TensorRTStableDiffusionImg2ImgPipeline(DiffusionPipeline): safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPFeatureExtractor`]): + feature_extractor ([`CLIPImageProcessor`]): Model that extracts features from generated images to be used as inputs for the `safety_checker`. """ @@ -693,7 +693,7 @@ def __init__( unet: UNet2DConditionModel, scheduler: DDIMScheduler, safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, + feature_extractor: CLIPImageProcessor, image_encoder: CLIPVisionModelWithProjection = None, requires_safety_checker: bool = True, stages=["clip", "unet", "vae", "vae_encoder"], diff --git a/examples/community/stable_diffusion_tensorrt_inpaint.py b/examples/community/stable_diffusion_tensorrt_inpaint.py index c790e5aeb1d4..b6f6711a53e7 100755 --- a/examples/community/stable_diffusion_tensorrt_inpaint.py +++ b/examples/community/stable_diffusion_tensorrt_inpaint.py @@ -42,7 +42,7 @@ network_from_onnx_path, save_engine, ) -from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from diffusers import DiffusionPipeline from diffusers.configuration_utils import FrozenDict, deprecate @@ -683,7 +683,7 @@ class TensorRTStableDiffusionInpaintPipeline(DiffusionPipeline): safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPFeatureExtractor`]): + feature_extractor ([`CLIPImageProcessor`]): Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
""" @@ -697,7 +697,7 @@ def __init__( unet: UNet2DConditionModel, scheduler: DDIMScheduler, safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, + feature_extractor: CLIPImageProcessor, image_encoder: CLIPVisionModelWithProjection = None, requires_safety_checker: bool = True, stages=["clip", "unet", "vae", "vae_encoder"], diff --git a/examples/community/stable_diffusion_tensorrt_txt2img.py b/examples/community/stable_diffusion_tensorrt_txt2img.py index 201119c6ba1d..f8761053ed1a 100755 --- a/examples/community/stable_diffusion_tensorrt_txt2img.py +++ b/examples/community/stable_diffusion_tensorrt_txt2img.py @@ -42,7 +42,7 @@ network_from_onnx_path, save_engine, ) -from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from diffusers import DiffusionPipeline from diffusers.configuration_utils import FrozenDict, deprecate @@ -595,7 +595,7 @@ class TensorRTStableDiffusionPipeline(DiffusionPipeline): safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPFeatureExtractor`]): + feature_extractor ([`CLIPImageProcessor`]): Model that extracts features from generated images to be used as inputs for the `safety_checker`. """ @@ -609,7 +609,7 @@ def __init__( unet: UNet2DConditionModel, scheduler: DDIMScheduler, safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, + feature_extractor: CLIPImageProcessor, image_encoder: CLIPVisionModelWithProjection = None, requires_safety_checker: bool = True, stages=["clip", "unet", "vae"], diff --git a/examples/research_projects/controlnet/train_controlnet_webdataset.py b/examples/research_projects/controlnet/train_controlnet_webdataset.py index 615eb834ac24..88a5d93d8edf 100644 --- a/examples/research_projects/controlnet/train_controlnet_webdataset.py +++ b/examples/research_projects/controlnet/train_controlnet_webdataset.py @@ -43,7 +43,7 @@ from torch.utils.data import default_collate from torchvision import transforms from tqdm.auto import tqdm -from transformers import AutoTokenizer, DPTFeatureExtractor, DPTForDepthEstimation, PretrainedConfig +from transformers import AutoTokenizer, DPTForDepthEstimation, DPTImageProcessor, PretrainedConfig from webdataset.tariterators import ( base_plus_ext, tar_file_expander, @@ -205,7 +205,7 @@ def __init__( pin_memory: bool = False, persistent_workers: bool = False, control_type: str = "canny", - feature_extractor: Optional[DPTFeatureExtractor] = None, + feature_extractor: Optional[DPTImageProcessor] = None, ): if not isinstance(train_shards_path_or_url, str): train_shards_path_or_url = [list(braceexpand(urls)) for urls in train_shards_path_or_url] @@ -1011,7 +1011,7 @@ def main(args): controlnet = pre_controlnet if args.control_type == "depth": - feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas") + feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas") depth_model = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas") depth_model.requires_grad_(False) else: diff --git a/examples/research_projects/gligen/demo.ipynb b/examples/research_projects/gligen/demo.ipynb index c4c6292a9cde..571f1a0323a2 100644 --- 
--- a/examples/research_projects/gligen/demo.ipynb
+++ b/examples/research_projects/gligen/demo.ipynb
@@ -45,7 +45,7 @@
     "    UniPCMultistepScheduler,\n",
     "    EulerDiscreteScheduler,\n",
     ")\n",
-    "from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer\n",
+    "from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer\n",
     "# pretrained_model_name_or_path = 'masterful/gligen-1-4-generation-text-box'\n",
     "\n",
     "pretrained_model_name_or_path = '/root/data/zhizhonghuang/checkpoints/models--masterful--gligen-1-4-generation-text-box/snapshots/d2820dc1e9ba6ca082051ce79cfd3eb468ae2c83'\n",
diff --git a/examples/research_projects/rdm/pipeline_rdm.py b/examples/research_projects/rdm/pipeline_rdm.py
index 201acb95aabd..f8093a3f217d 100644
--- a/examples/research_projects/rdm/pipeline_rdm.py
+++ b/examples/research_projects/rdm/pipeline_rdm.py
@@ -4,7 +4,7 @@
 import torch
 from PIL import Image
 from retriever import Retriever, normalize_images, preprocess_images
-from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPModel, CLIPTokenizer
 
 from diffusers import (
     AutoencoderKL,
@@ -47,7 +47,7 @@ class RDMPipeline(DiffusionPipeline, StableDiffusionMixin):
         scheduler ([`SchedulerMixin`]):
             A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
-        feature_extractor ([`CLIPFeatureExtractor`]):
+        feature_extractor ([`CLIPImageProcessor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
 
@@ -65,7 +65,7 @@ def __init__(
             EulerAncestralDiscreteScheduler,
             DPMSolverMultistepScheduler,
         ],
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
         retriever: Optional[Retriever] = None,
     ):
         super().__init__()
diff --git a/examples/research_projects/rdm/retriever.py b/examples/research_projects/rdm/retriever.py
index 6be9785a21f3..4ae4989bd8bb 100644
--- a/examples/research_projects/rdm/retriever.py
+++ b/examples/research_projects/rdm/retriever.py
@@ -6,7 +6,7 @@
 import torch
 from datasets import Dataset, load_dataset
 from PIL import Image
-from transformers import CLIPFeatureExtractor, CLIPModel, PretrainedConfig
+from transformers import CLIPImageProcessor, CLIPModel, PretrainedConfig
 
 from diffusers import logging
 
@@ -20,7 +20,7 @@ def normalize_images(images: List[Image.Image]):
     return images
 
 
-def preprocess_images(images: List[np.array], feature_extractor: CLIPFeatureExtractor) -> torch.Tensor:
+def preprocess_images(images: List[np.array], feature_extractor: CLIPImageProcessor) -> torch.Tensor:
     """
     Preprocesses a list of images into a batch of tensors.
@@ -95,14 +95,12 @@ def init_index(self):
     def build_index(
         self,
         model=None,
-        feature_extractor: CLIPFeatureExtractor = None,
+        feature_extractor: CLIPImageProcessor = None,
         torch_dtype=torch.float32,
     ):
         if not self.index_initialized:
             model = model or CLIPModel.from_pretrained(self.config.clip_name_or_path).to(dtype=torch_dtype)
-            feature_extractor = feature_extractor or CLIPFeatureExtractor.from_pretrained(
-                self.config.clip_name_or_path
-            )
+            feature_extractor = feature_extractor or CLIPImageProcessor.from_pretrained(self.config.clip_name_or_path)
             self.dataset = get_dataset_with_emb_from_clip_model(
                 self.dataset,
                 model,
@@ -136,7 +134,7 @@ def __init__(
         index: Index = None,
         dataset: Dataset = None,
         model=None,
-        feature_extractor: CLIPFeatureExtractor = None,
+        feature_extractor: CLIPImageProcessor = None,
     ):
         self.config = config
         self.index = index or self._build_index(config, dataset, model=model, feature_extractor=feature_extractor)
@@ -148,7 +146,7 @@ def from_pretrained(
         index: Index = None,
         dataset: Dataset = None,
         model=None,
-        feature_extractor: CLIPFeatureExtractor = None,
+        feature_extractor: CLIPImageProcessor = None,
         **kwargs,
     ):
         config = kwargs.pop("config", None) or IndexConfig.from_pretrained(retriever_name_or_path, **kwargs)
@@ -156,7 +154,7 @@ def from_pretrained(
 
     @staticmethod
     def _build_index(
-        config: IndexConfig, dataset: Dataset = None, model=None, feature_extractor: CLIPFeatureExtractor = None
+        config: IndexConfig, dataset: Dataset = None, model=None, feature_extractor: CLIPImageProcessor = None
     ):
         dataset = dataset or load_dataset(config.dataset_name)
         dataset = dataset[config.dataset_set]
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
index bfc54426e727..976e977d55ec 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
@@ -76,13 +76,13 @@
         >>> import numpy as np
         >>> from PIL import Image
 
-        >>> from transformers import DPTFeatureExtractor, DPTForDepthEstimation
+        >>> from transformers import DPTImageProcessor, DPTForDepthEstimation
         >>> from diffusers import ControlNetModel, StableDiffusionXLControlNetImg2ImgPipeline, AutoencoderKL
         >>> from diffusers.utils import load_image
 
         >>> depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda")
-        >>> feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")
+        >>> feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")
         >>> controlnet = ControlNetModel.from_pretrained(
         ...     "diffusers/controlnet-depth-sdxl-1.0-small",
variant="fp16", diff --git a/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py index 5b6fc2b393c0..8a2cc08dbb2b 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py @@ -23,7 +23,7 @@ from flax.jax_utils import unreplicate from flax.training.common_utils import shard from PIL import Image -from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel +from transformers import CLIPImageProcessor, CLIPTokenizer, FlaxCLIPTextModel from ...models import FlaxAutoencoderKL, FlaxControlNetModel, FlaxUNet2DConditionModel from ...schedulers import ( @@ -149,7 +149,7 @@ def __init__( FlaxDDIMScheduler, FlaxPNDMScheduler, FlaxLMSDiscreteScheduler, FlaxDPMSolverMultistepScheduler ], safety_checker: FlaxStableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, + feature_extractor: CLIPImageProcessor, dtype: jnp.dtype = jnp.float32, ): super().__init__() diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py index 701e7a3a81b2..9e91986896bd 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py @@ -16,7 +16,7 @@ from typing import Any, Callable, Dict, List, Optional, Union import torch -from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ....image_processor import VaeImageProcessor from ....loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin @@ -66,8 +66,8 @@ class StableDiffusionModelEditingPipeline( Classification module that estimates whether generated images could be considered offensive or harmful. Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details about a model's potential harms. - feature_extractor ([`~transformers.CLIPFeatureExtractor`]): - A `CLIPFeatureExtractor` to extract features from generated images; used as inputs to the `safety_checker`. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. with_to_k ([`bool`]): Whether to edit the key projection matrices along with the value projection matrices. 
         with_augs ([`list`]):
@@ -86,7 +86,7 @@ def __init__(
         unet: UNet2DConditionModel,
         scheduler: SchedulerMixin,
         safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
         requires_safety_checker: bool = True,
         with_to_k: bool = True,
         with_augs: list = AUGS_CONST,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
index ccfb2300bda5..7801b0d01dff 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
@@ -20,7 +20,7 @@
 import PIL.Image
 import torch
 from packaging import version
-from transformers import CLIPTextModel, CLIPTokenizer, DPTFeatureExtractor, DPTForDepthEstimation
+from transformers import CLIPTextModel, CLIPTokenizer, DPTForDepthEstimation, DPTImageProcessor
 
 from ...configuration_utils import FrozenDict
 from ...image_processor import PipelineImageInput, VaeImageProcessor
@@ -111,7 +111,7 @@ def __init__(
         unet: UNet2DConditionModel,
         scheduler: KarrasDiffusionSchedulers,
         depth_estimator: DPTForDepthEstimation,
-        feature_extractor: DPTFeatureExtractor,
+        feature_extractor: DPTImageProcessor,
     ):
         super().__init__()
 
diff --git a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py
index 62584beec6a9..52ccd5612776 100644
--- a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py
+++ b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py
@@ -18,7 +18,7 @@
 import PIL.Image
 import torch
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
 
 from ...image_processor import VaeImageProcessor
 from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
@@ -138,7 +138,7 @@ def __init__(
         unet: UNet2DConditionModel,
         scheduler: KarrasDiffusionSchedulers,
         safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
         requires_safety_checker: bool = True,
     ):
         super().__init__()
diff --git a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py
index 67b9b927f210..c6748ad418fe 100644
--- a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py
+++ b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py
@@ -19,7 +19,7 @@
 import PIL.Image
 import torch
 from transformers import (
-    CLIPFeatureExtractor,
+    CLIPImageProcessor,
     CLIPProcessor,
     CLIPTextModel,
     CLIPTokenizer,
@@ -193,7 +193,7 @@ def __init__(
         unet: UNet2DConditionModel,
         scheduler: KarrasDiffusionSchedulers,
         safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
         requires_safety_checker: bool = True,
     ):
         super().__init__()
diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py
index 55a8694c16e9..3cb7c26bb6a2 100644
--- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py
+++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py
@@ -19,7 +19,7 @@
 import numpy as np
 import PIL.Image
 import torch
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
 
 from ...image_processor import VaeImageProcessor
 from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
@@ -209,7 +209,7 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
             Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
+        feature_extractor ([`CLIPImageProcessor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
 
@@ -225,7 +225,7 @@ def __init__(
         adapter: Union[T2IAdapter, MultiAdapter, List[T2IAdapter]],
         scheduler: KarrasDiffusionSchedulers,
         safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
         requires_safety_checker: bool = True,
     ):
         super().__init__()
diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py
index 287298b87a73..0ea197e42e62 100644
--- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py
+++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py
@@ -237,7 +237,7 @@ class StableDiffusionXLAdapterPipeline(
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
             Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
+        feature_extractor ([`CLIPImageProcessor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
 
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
index 8fc76db311a6..838f996117aa 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
@@ -26,8 +26,8 @@
     CLIPTextModel,
     CLIPTokenizer,
     DPTConfig,
-    DPTFeatureExtractor,
     DPTForDepthEstimation,
+    DPTImageProcessor,
 )
 
 from diffusers import (
@@ -145,9 +145,7 @@ def get_dummy_components(self):
             backbone_featmap_shape=[1, 384, 24, 24],
         )
         depth_estimator = DPTForDepthEstimation(depth_estimator_config).eval()
-        feature_extractor = DPTFeatureExtractor.from_pretrained(
-            "hf-internal-testing/tiny-random-DPTForDepthEstimation"
-        )
+        feature_extractor = DPTImageProcessor.from_pretrained("hf-internal-testing/tiny-random-DPTForDepthEstimation")
         components = {
             "unet": unet,
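The whole change set is a mechanical rename: in `transformers`, the vision `FeatureExtractor` classes are deprecated subclasses of the corresponding `ImageProcessor` classes, so `CLIPImageProcessor` and `DPTImageProcessor` are drop-in replacements for `CLIPFeatureExtractor` and `DPTFeatureExtractor`. A minimal sketch of the unchanged call pattern, not part of the diff itself; the checkpoint name is illustrative:

```python
from PIL import Image
from transformers import CLIPImageProcessor

# Same API the diff migrates to: `from_pretrained` plus a call returning
# `pixel_values`, exactly as the deprecated CLIPFeatureExtractor produced.
feature_extractor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
inputs = feature_extractor(images=Image.new("RGB", (256, 256)), return_tensors="pt")
print(inputs.pixel_values.shape)  # torch.Size([1, 3, 224, 224])
```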