openvinotoolkit · samet-akcay · Feb 29, 2024 · Feb 2, 2024 · Feb 14, 2024 · Feb 15, 2024
@@ -11,15 +11,16 @@
 from pathlib import Path
 
 import albumentations as A  # noqa: N812
-import cv2
-import numpy as np
 import pandas as pd
 import torch
 from pandas import DataFrame
+from PIL import Image
 from torch.utils.data import Dataset
+from torchvision.transforms.functional import to_tensor
+from torchvision.tv_tensors import Mask
 
 from anomalib import TaskType
-from anomalib.data.utils import masks_to_boxes, read_image
+from anomalib.data.utils import masks_to_boxes
 
 _EXPECTED_COLUMNS_CLASSIFICATION = ["image_path", "split"]
 _EXPECTED_COLUMNS_SEGMENTATION = [*_EXPECTED_COLUMNS_CLASSIFICATION, "mask_path"]
@@ -117,24 +118,21 @@ def __getitem__(self, index: int) -> dict[str, str | torch.Tensor]:
         mask_path = self._samples.iloc[index].mask_path
         label_index = self._samples.iloc[index].label_index
 
-        image = read_image(image_path)
+        image = to_tensor(Image.open(image_path))
         item = {"image_path": image_path, "label": label_index}
 
         if self.task == TaskType.CLASSIFICATION:
-            transformed = self.transform(image=image)
-            item["image"] = transformed["image"]
+            item["image"] = self.transform(image)
         elif self.task in (TaskType.DETECTION, TaskType.SEGMENTATION):
             # Only Anomalous (1) images have masks in anomaly datasets
             # Therefore, create empty mask for Normal (0) images.
-
-            mask = np.zeros(shape=image.shape[:2]) if label_index == 0 else cv2.imread(mask_path, flags=0) / 255.0
-            mask = mask.astype(np.single)
-
-            transformed = self.transform(image=image, mask=mask)
-
-            item["image"] = transformed["image"]
+            mask = (
+                Mask(torch.zeros(image.shape[-2:]))
+                if label_index == 0
+                else Mask(to_tensor(Image.open(mask_path)).squeeze())
+            )
+            item["image"], item["mask"] = self.transform(image, mask)
             item["mask_path"] = mask_path
-            item["mask"] = transformed["mask"]
 
             if self.task == TaskType.DETECTION:
                 # create boxes from masks for detection task

@@ -4,13 +4,14 @@
 from abc import ABC
 
 import albumentations as A  # noqa: N812
-import cv2
-import numpy as np
 import torch
+from PIL import Image
+from torchvision.transforms.functional import to_tensor
+from torchvision.tv_tensors import Mask
 
 from anomalib import TaskType
 from anomalib.data.base.dataset import AnomalibDataset
-from anomalib.data.utils import masks_to_boxes, read_depth_image, read_image
+from anomalib.data.utils import masks_to_boxes, read_depth_image
 
 
 class AnomalibDepthDataset(AnomalibDataset, ABC):
@@ -40,25 +41,23 @@ def __getitem__(self, index: int) -> dict[str, str | torch.Tensor]:
         label_index = self._samples.iloc[index].label_index
         depth_path = self._samples.iloc[index].depth_path
 
-        image = read_image(image_path)
-        depth_image = read_depth_image(depth_path)
+        image = to_tensor(Image.open(image_path))
+        depth_image = to_tensor(read_depth_image(depth_path))
         item = {"image_path": image_path, "depth_path": depth_path, "label": label_index}
 
         if self.task == TaskType.CLASSIFICATION:
-            transformed = self.transform(image=image, depth_image=depth_image)
-            item["image"] = transformed["image"]
-            item["depth_image"] = transformed["depth_image"]
+            item["image"], item["depth_image"] = self.transform(image, depth_image)
         elif self.task in (TaskType.DETECTION, TaskType.SEGMENTATION):
             # Only Anomalous (1) images have masks in anomaly datasets
             # Therefore, create empty mask for Normal (0) images.
-            mask = np.zeros(shape=image.shape[:2]) if label_index == 0 else cv2.imread(mask_path, flags=0) / 255.0
+            mask = (
+                Mask(torch.zeros(image.shape[-2:]))
+                if label_index == 0
+                else Mask(to_tensor(Image.open(mask_path)).squeeze())
+            )
 
-            transformed = self.transform(image=image, depth_image=depth_image, mask=mask)
-
-            item["image"] = transformed["image"]
-            item["depth_image"] = transformed["depth_image"]
+            item["image"], item["depth_image"], item["mask"] = self.transform(image, depth_image, mask)
             item["mask_path"] = mask_path
-            item["mask"] = transformed["mask"]
 
             if self.task == TaskType.DETECTION:
                 # create boxes from masks for detection task

@@ -8,6 +8,7 @@
 import albumentations as A  # noqa: N812
 import torch
 from pandas import DataFrame
+from torchvision.tv_tensors import Mask
 
 from anomalib import TaskType
 from anomalib.data.base.datamodule import AnomalibDataModule
@@ -148,22 +149,14 @@ def __getitem__(self, index: int) -> dict[str, str | torch.Tensor]:
         item["original_image"] = item["image"].to(torch.uint8)
 
         # apply transforms
-        if "mask" in item and item["mask"] is not None:
-            processed_frames = [
-                self.transform(image=frame.numpy(), mask=mask)
-                for frame, mask in zip(item["image"], item["mask"], strict=True)
-            ]
-            item["image"] = torch.stack([item["image"] for item in processed_frames]).squeeze(0)
-            mask = torch.as_tensor(item["mask"])
-            item["mask"] = torch.stack([item["mask"] for item in processed_frames]).squeeze(0)
-            item["label"] = torch.Tensor([1 in frame for frame in mask]).int().squeeze(0)
+        if item.get("mask") is not None:
+            item["image"], item["mask"] = self.transform(item["image"], Mask(item["mask"]))
+            item["label"] = torch.Tensor([1 in frame for frame in item["mask"]]).int().squeeze(0)
             if self.task == TaskType.DETECTION:
                 item["boxes"], _ = masks_to_boxes(item["mask"])
                 item["boxes"] = item["boxes"][0] if len(item["boxes"]) == 1 else item["boxes"]
         else:
-            item["image"] = torch.stack(
-                [self.transform(image=frame.numpy())["image"] for frame in item["image"]],
-            ).squeeze(0)
+            item["image"] = self.transform(item["image"])
 
         # include only target frame in gt
         if self.clip_length_in_frames > 1 and self.target_frame != VideoTargetFrame.ALL:

@@ -8,9 +8,11 @@
 from typing import Any
 
 import albumentations as A  # noqa: N812
+from PIL import Image
 from torch.utils.data.dataset import Dataset
+from torchvision.transforms.functional import to_tensor
 
-from anomalib.data.utils import get_image_filenames, get_transforms, read_image
+from anomalib.data.utils import get_image_filenames, get_transforms
 
 
 class PredictDataset(Dataset):
@@ -46,8 +48,8 @@ def __len__(self) -> int:
     def __getitem__(self, index: int) -> dict[str, Any]:
         """Get the image based on the `index`."""
         image_filename = self.image_filenames[index]
-        image = read_image(path=image_filename)
-        pre_processed = self.transform(image=image)
+        image = to_tensor(Image.open(image_filename))
+        pre_processed = {"image": self.transform(image)}
         pre_processed["image_path"] = str(image_filename)
 
         return pre_processed
@@ -111,7 +111,7 @@ def generate_perturbation(
 
         # Load anomaly source image
         if anomaly_source_path:
-            anomaly_source_img = cv2.imread(anomaly_source_path)
+            anomaly_source_img = cv2.imread(str(anomaly_source_path))
             anomaly_source_img = cv2.resize(anomaly_source_img, dsize=(width, height))
         else:  # if no anomaly source is specified, we use the perlin noise as anomalous source
             anomaly_source_img = np.expand_dims(perlin_noise, 2).repeat(3, 2)

@@ -3,19 +3,23 @@
 # Copyright (C) 2022-2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-
 import logging
 from enum import Enum
 
-import albumentations as A  # noqa: N812
-from albumentations.pytorch import ToTensorV2
-from omegaconf import DictConfig
+import torch
+from torchvision.transforms import v2
 
 from anomalib.data.utils.image import get_image_height_and_width
 
 logger = logging.getLogger(__name__)
 
 
+NORMALIZATION_STATS = {
+    "imagenet": {"mean": (0.485, 0.456, 0.406), "std": (0.229, 0.224, 0.225)},
+    "clip": {"mean": (0.48145466, 0.4578275, 0.40821073), "std": (0.26862954, 0.26130258, 0.27577711)},
+}
+
+
 class InputNormalizationMethod(str, Enum):
     """Normalization method for the input images."""
 
@@ -25,18 +29,18 @@ class InputNormalizationMethod(str, Enum):
 
 
 def get_transforms(
-    config: str | A.Compose | None = None,
+    config: str | v2.Compose | None = None,
     image_size: int | tuple[int, int] | None = None,
     center_crop: int | tuple[int, int] | None = None,
     normalization: InputNormalizationMethod = InputNormalizationMethod.IMAGENET,
-    to_tensor: bool = True,
-) -> A.Compose:
+    _to_tensor: bool = True,
+) -> v2.Compose:
     """Get transforms from config or image size.
 
     Args:
-        config (str | A.Compose | None, optional):
-            Albumentations transforms.
-            Either config or albumentations ``Compose`` object. Defaults to None.
+        config (str | v2.Compose | None, optional):
+            Torchvision transforms.
+            Either config or torchvision ``Compose`` object. Defaults to None.
         image_size (int | tuple | None, optional):
             Image size to transform.
             Defaults to None.
@@ -46,126 +50,45 @@ def get_transforms(
         normalization (InputNormalizationMethod, optional):
             Normalization method for the input images.
             Defaults to InputNormalizationMethod.IMAGENET.
-        to_tensor (bool, optional):
+        _to_tensor (bool, optional):
             Boolean to convert the final transforms into Torch tensor.
             Defaults to True.
 
-    Raises:
-        ValueError: When both ``config`` and ``image_size`` is ``None``.
-        ValueError: When ``config`` is not a ``str`` or `A.Compose`` object.
-
     Returns:
-        A.Compose: Albumentation ``Compose`` object containing the image transforms.
-
-    Examples:
-        >>> import skimage
-        >>> image = skimage.data.astronaut()
-
-        >>> transforms = get_transforms(image_size=256, to_tensor=False)
-        >>> output = transforms(image=image)
-        >>> output["image"].shape
-        (256, 256, 3)
-
-        >>> transforms = get_transforms(image_size=256, to_tensor=True)
-        >>> output = transforms(image=image)
-        >>> output["image"].shape
-        torch.Size([3, 256, 256])
-
-
-        Transforms could be read from albumentations Compose object.
-
-        >>> import albumentations as A  # noqa: N812
-        >>> from albumentations.pytorch import ToTensorV2
-        >>> config = A.Compose([A.Resize(512, 512), ToTensorV2()])
-        >>> transforms = get_transforms(config=config, to_tensor=False)
-        >>> output = transforms(image=image)
-        >>> output["image"].shape
-        (512, 512, 3)
-        >>> type(output["image"])
-        numpy.ndarray
-
-        Transforms could be deserialized from a yaml file.
-
-        >>> transforms = A.Compose([A.Resize(1024, 1024), ToTensorV2()])
-        >>> A.save(transforms, "/tmp/transforms.yaml", data_format="yaml")
-        >>> transforms = get_transforms(config="/tmp/transforms.yaml")
-        >>> output = transforms(image=image)
-        >>> output["image"].shape
-        torch.Size([3, 1024, 1024])
+        v2.Compose: Torchvision ``Compose`` object containing the image transforms.
     """
-    transforms: A.Compose
-
     if config is not None:
-        if isinstance(config, DictConfig):
-            logger.info("Loading transforms from config File")
-            transforms_list = []
-
-            if "Resize" not in config and image_size is not None:
-                resize_height, resize_width = get_image_height_and_width(image_size)
-                transforms_list.append(A.Resize(height=resize_height, width=resize_width, always_apply=True))
-                logger.info("Resize %s added!", (resize_height, resize_width))
-
-            for key, value in config.items():
-                if hasattr(A, key):
-                    transform = getattr(A, key)(**value)
-                    logger.info("Transform %s added!", transform)
-                    transforms_list.append(transform)
-                else:
-                    msg = f"Transformation {key} is not part of albumentations"
-                    raise ValueError(msg)
-
-            transforms_list.append(ToTensorV2())
-            transforms = A.Compose(transforms_list, additional_targets={"image": "image", "depth_image": "image"})
-
-        # load transforms from config file
-        elif isinstance(config, str):
-            logger.info("Reading transforms from Albumentations config file: %s.", config)
-            transforms = A.load(filepath=config, data_format="yaml")
-        elif isinstance(config, A.Compose):
-            logger.info("Transforms loaded from Albumentations Compose object")
-            transforms = config
-        else:
-            msg = "config could be either ``str`` or ``A.Compose``"
-            raise TypeError(msg)
+        # Loading torchvision transforms from ``config`` is not implemented yet.
+        pass  # TODO: build ``transforms`` here; it is currently unassigned on this path, so the final return fails
     else:
         logger.info("No config file has been provided. Using default transforms.")
         transforms_list = []
 
-        # add resize transform
+        # Add resize transform
         if image_size is None:
-            msg = (
-                "Both config and image_size cannot be `None`. "
-                "Provide either config file to de-serialize transforms or image_size to get the default transformations"
-            )
+            msg = "Both config and image_size cannot be `None`."
             raise ValueError(msg)
         resize_height, resize_width = get_image_height_and_width(image_size)
-        transforms_list.append(A.Resize(height=resize_height, width=resize_width, always_apply=True))
+        transforms_list.append(v2.Resize(size=(resize_height, resize_width), interpolation=3, antialias=True))
 
-        # add center crop transform
+        # Add center crop transform
         if center_crop is not None:
             crop_height, crop_width = get_image_height_and_width(center_crop)
             if crop_height > resize_height or crop_width > resize_width:
                 msg = f"Crop size may not be larger than image size. Found {image_size} and {center_crop}"
                 raise ValueError(msg)
-            transforms_list.append(A.CenterCrop(height=crop_height, width=crop_width, always_apply=True))
-
-        # add normalize transform
-        if normalization == InputNormalizationMethod.IMAGENET:
-            transforms_list.append(A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)))
-        elif normalization == InputNormalizationMethod.CLIP:
-            transforms_list.append(
-                A.Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711)),
-            )
-        elif normalization == InputNormalizationMethod.NONE:
-            transforms_list.append(A.ToFloat(max_value=255))
-        else:
+            transforms_list.append(v2.CenterCrop(size=(crop_height, crop_width)))
+
+        # Add convert-to-float transform
+        transforms_list.append(v2.ToDtype(dtype=torch.float32, scale=True))
+
+        # Add normalize transform
+        if normalization in [InputNormalizationMethod.IMAGENET, InputNormalizationMethod.CLIP]:
+            transforms_list.append(v2.Normalize(**NORMALIZATION_STATS[normalization]))
+        elif normalization != InputNormalizationMethod.NONE:
             msg = f"Unknown normalization method: {normalization}"
             raise ValueError(msg)
 
-        # add tensor conversion
-        if to_tensor:
-            transforms_list.append(ToTensorV2())
-
-        transforms = A.Compose(transforms_list, additional_targets={"image": "image", "depth_image": "image"})
+        transforms = v2.Compose(transforms_list)
 
     return transforms
@@ -60,7 +60,7 @@ def get_item(self, idx: int) -> dict[str, Any]:
         clip_pts = self.clips[video_idx][clip_idx]
 
         return {
-            "image": clip,
+            "image": clip.permute(0, 3, 1, 2) / 255,
             "mask": self.get_mask(idx),
             "video_path": video_path,
             "frames": clip_pts,