Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add kolektor dataset #983

Merged
merged 26 commits into from
May 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
9296c68
Add dataset create function
Ravindu987 Mar 26, 2023
2ab6135
Add KolektorDataset class
Ravindu987 Mar 26, 2023
b86f0bb
Add Kolektor class
Ravindu987 Mar 26, 2023
25793c8
Add comments for dataset creation
Ravindu987 Mar 26, 2023
5843cf1
Merge branch 'main' into add-kolektor-branch
samet-akcay Apr 3, 2023
7ef792e
Merge branch 'main' into add-kolektor-branch
Ravindu987 Apr 6, 2023
8edf332
Add licensing
Ravindu987 Apr 6, 2023
8716439
Fix error message
Ravindu987 Apr 6, 2023
e42caaa
Add train split ratio as a parameter
Ravindu987 Apr 6, 2023
fb335c6
Add dataset link
Ravindu987 Apr 6, 2023
4de0f3b
Merge branch 'main' into add-kolektor-branch
Ravindu987 Apr 16, 2023
e2d7d77
Add docstrings
Ravindu987 Apr 16, 2023
268c01d
Fix warn in imageio
Ravindu987 Apr 16, 2023
eb10894
Add Prepare dataset method with custom download function
Ravindu987 Apr 16, 2023
6dac3e7
Code clean
Ravindu987 Apr 16, 2023
2cd8ec6
Merge branch 'openvinotoolkit:main' into add-kolektor-branch
Ravindu987 Apr 25, 2023
cdc251b
Update function to check mask
Ravindu987 Apr 25, 2023
0b22c16
Merge branch 'main' into add-kolektor-branch
Ravindu987 Apr 28, 2023
c3b55fe
Merge branch 'openvinotoolkit:main' into add-kolektor-branch
Ravindu987 Apr 28, 2023
5c9ed13
Update download util with filename in DownloadInfo
Ravindu987 Apr 28, 2023
0cde49e
Use updated download_and_extract in Kolektor
Ravindu987 Apr 28, 2023
d04bc99
Use opencv for reading images
Ravindu987 Apr 28, 2023
915ee82
Updated the changelog
Ravindu987 Apr 29, 2023
1a3d668
Merge branch 'main' into add-kolektor-branch
samet-akcay May 5, 2023
ae07eeb
Merge branch 'main' into add-kolektor-branch
samet-akcay May 11, 2023
97d4f5d
Merge branch 'main' into add-kolektor-branch
samet-akcay May 12, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
285 changes: 285 additions & 0 deletions src/anomalib/data/kolektor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
"""Kolektor Surface-Defect Dataset (CC BY-NC-SA 4.0).

Description:
This script contains PyTorch Dataset, Dataloader and PyTorch
Lightning DataModule for the Kolektor Surface-Defect dataset.
The dataset can be found at https://www.vicos.si/resources/kolektorsdd/

License:
Kolektor Surface-Defect dataset is released under the Creative Commons
Attribution-NonCommercial-ShareAlike 4.0 International License
(CC BY-NC-SA 4.0)(https://creativecommons.org/licenses/by-nc-sa/4.0/).

Reference:
- Tabernik, Domen, Samo Šela, Jure Skvarč, and Danijel Skočaj.
"Segmentation-based deep-learning approach for surface-defect detection."
Journal of Intelligent Manufacturing 31, no. 3 (2020): 759-776.
"""


from __future__ import annotations
Ravindu987 marked this conversation as resolved.
Show resolved Hide resolved

import logging
from pathlib import Path

import albumentations as A
import numpy as np
from cv2 import imread
from pandas import DataFrame
from sklearn.model_selection import train_test_split

from anomalib.data.base import AnomalibDataModule, AnomalibDataset
from anomalib.data.task_type import TaskType
from anomalib.data.utils import (
DownloadInfo,
InputNormalizationMethod,
Split,
TestSplitMode,
ValSplitMode,
download_and_extract,
get_transforms,
)

logger = logging.getLogger(__name__)

# Download metadata for the Kolektor Surface-Defect dataset archive.
# NOTE(review): `hash` looks like an MD5 checksum (32 hex chars) used by the
# download utility to verify the archive — confirm against utils/download.py.
DOWNLOAD_INFO = DownloadInfo(
    name="kolektor",
    url="https://go.vicos.si/kolektorsdd",
    hash="2b094030343c1cd59df02203ac6c57a0",
    filename="KolektorSDD.zip",
)


# Check if a mask shows defects
def is_mask_anomalous(path: str) -> int:
    """Check if a mask image contains any defect (non-zero) pixels.

    Args:
        path (str): Path to the mask file.

    Returns:
        int: ``1`` if the mask marks a defect (any non-zero pixel), ``0`` otherwise.

    Raises:
        FileNotFoundError: If the mask file cannot be read.
    """
    img_arr = imread(path)
    # cv2.imread returns None (instead of raising) for missing/corrupt files;
    # without this guard, `np.all(None == 0)` is False and the sample would be
    # silently labeled anomalous.
    if img_arr is None:
        raise FileNotFoundError(f"Unable to read mask image: {path}")
    if np.all(img_arr == 0):
        return 0
    return 1


def make_kolektor_dataset(
    root: str | Path,
    train_split_ratio: float = 0.8,
    split: str | Split | None = None,
) -> DataFrame:
    """Create Kolektor samples by parsing the Kolektor data file structure.

    The files are expected to follow the structure:
        image files:
            path/to/dataset/item/image_filename.jpg
            path/to/dataset/kos01/Part0.jpg
        mask files:
            path/to/dataset/item/mask_filename.bmp
            path/to/dataset/kos01/Part0_label.bmp

    This function creates a dataframe to store the parsed information based on the following format:
    |---|--------------------|--------|-------|---------|---------------------|--------------------|-------------|
    |   | path               | item   | split | label   | image_path          | mask_path          | label_index |
    |---|--------------------|--------|-------|---------|---------------------|--------------------|-------------|
    | 0 | KolektorSDD        | kos01  | test  | Bad     | /path/to/image_file | /path/to/mask_file | 1           |
    |---|--------------------|--------|-------|---------|---------------------|--------------------|-------------|

    Args:
        root (Path): Path to dataset
        train_split_ratio (float, optional): Ratio to split good images into train/test
            Defaults to 0.8 for train.
        split (str | Split | None, optional): Dataset split (Either train or test). Defaults to None.

    Raises:
        RuntimeError: If no images are found under ``root``, or if the numbers of
            images (``.jpg``) and masks (``.bmp``) differ — the image/mask
            association below is positional, so a count mismatch would silently
            pair wrong files.

    Examples:
        The following example shows how to get training samples from Kolektor Dataset:

        >>> root = Path('./KolektorSDD/')

        >>> samples = make_kolektor_dataset(root, train_split_ratio=0.8)
        >>> print(samples.head())
            path        item   split  label  image_path                   mask_path                          label_index
        0   KolektorSDD kos01  train  Good   KolektorSDD/kos01/Part0.jpg  KolektorSDD/kos01/Part0_label.bmp  0
        1   KolektorSDD kos01  train  Good   KolektorSDD/kos01/Part1.jpg  KolektorSDD/kos01/Part1_label.bmp  0

    Returns:
        DataFrame: an output dataframe containing the samples of the dataset.
    """

    root = Path(root)

    # Collect (dataset_path, item_folder, filename) triples for images and masks.
    samples_list = [(str(root),) + f.parts[-2:] for f in root.glob(r"**/*") if f.suffix == ".jpg"]
    masks_list = [(str(root),) + f.parts[-2:] for f in root.glob(r"**/*") if f.suffix == ".bmp"]

    if not samples_list:
        raise RuntimeError(f"Found 0 images in {root}")

    # Images and masks are paired positionally after sorting, so every image
    # must have exactly one mask; fail loudly instead of mis-pairing silently.
    if len(samples_list) != len(masks_list):
        raise RuntimeError(
            f"Found {len(samples_list)} images but {len(masks_list)} masks in {root}. "
            "Every image is expected to have exactly one mask."
        )

    # Create dataframes
    samples = DataFrame(samples_list, columns=["path", "item", "image_path"])
    masks = DataFrame(masks_list, columns=["path", "item", "image_path"])

    # Modify image_path column by converting to absolute path
    samples["image_path"] = samples.path + "/" + samples.item + "/" + samples.image_path
    masks["image_path"] = masks.path + "/" + masks.item + "/" + masks.image_path

    # Sort samples by image path so that images and masks line up pairwise
    samples = samples.sort_values(by="image_path", ignore_index=True)
    masks = masks.sort_values(by="image_path", ignore_index=True)

    # Add mask paths for sample images
    samples["mask_path"] = masks.image_path.values

    # Use is_mask_anomalous to configure the label_index (1 = defective, 0 = good)
    samples["label_index"] = samples["mask_path"].apply(is_mask_anomalous)
    samples.label_index = samples.label_index.astype(int)

    # Use label indexes to label data
    samples.loc[(samples.label_index == 0), "label"] = "Good"
    samples.loc[(samples.label_index == 1), "label"] = "Bad"

    # Add all 'Bad' samples to test set
    samples.loc[(samples.label == "Bad"), "split"] = "test"

    # Divide 'Good' images into train/test according to train_split_ratio.
    # random_state is fixed so the split is reproducible across calls.
    train_samples, test_samples = train_test_split(
        samples[samples.label == "Good"], train_size=train_split_ratio, random_state=42
    )
    samples.loc[train_samples.index, "split"] = "train"
    samples.loc[test_samples.index, "split"] = "test"

    # Reorder columns
    samples = samples[["path", "item", "split", "label", "image_path", "mask_path", "label_index"]]

    # assert that the right mask files are associated with the right test images
    assert (
        samples.loc[samples.label_index == 1]
        .apply(lambda x: Path(x.image_path).stem in Path(x.mask_path).stem, axis=1)
        .all()
    ), "Mismatch between anomalous images and ground truth masks. Make sure the mask files \
        follow the same naming convention as the anomalous images in the dataset (e.g. image: 'Part0.jpg', \
        mask: 'Part0_label.bmp')."

    # Get the dataframe for the required split
    if split:
        samples = samples[samples.split == split].reset_index(drop=True)

    return samples


class KolektorDataset(AnomalibDataset):
    """Kolektor dataset class.

    Args:
        task (TaskType): Task type, ``classification``, ``detection`` or ``segmentation``
        transform (A.Compose): Albumentations Compose object describing the transforms that are applied to the inputs.
        root (Path | str): Path to the root of the dataset
        split (str | Split | None): Split of the dataset, usually Split.TRAIN or Split.TEST
        train_split_ratio (float): Ratio of good images assigned to the train split when
            parsing the dataset. Defaults to 0.8, matching the previous hard-coded value.
    """

    def __init__(
        self,
        task: TaskType,
        transform: A.Compose,
        root: Path | str,
        split: str | Split | None = None,
        train_split_ratio: float = 0.8,
    ) -> None:
        super().__init__(task=task, transform=transform)

        self.root = root
        self.split = split
        # Forwarded to make_kolektor_dataset instead of hard-coding 0.8 there,
        # so callers can control the good-image train/test split.
        self.train_split_ratio = train_split_ratio

    def _setup(self) -> None:
        """Parse the dataset file structure into the samples dataframe."""
        self.samples = make_kolektor_dataset(
            self.root, train_split_ratio=self.train_split_ratio, split=self.split
        )


class Kolektor(AnomalibDataModule):
    """Kolektor Datamodule.

    Args:
        root (Path | str): Path to the root of the dataset
        image_size (int | tuple[int, int] | None, optional): Size of the input image.
            Defaults to None.
        center_crop (int | tuple[int, int] | None, optional): When provided, the images will be center-cropped
            to the provided dimensions.
        normalization (str | InputNormalizationMethod): Normalization method applied to the input images.
            Defaults to ImageNet statistics.
        train_batch_size (int, optional): Training batch size. Defaults to 32.
        eval_batch_size (int, optional): Test batch size. Defaults to 32.
        num_workers (int, optional): Number of workers. Defaults to 8.
        task (TaskType): Task type, 'classification', 'detection' or 'segmentation'
        transform_config_train (str | A.Compose | None, optional): Config for pre-processing
            during training.
            Defaults to None.
        transform_config_eval (str | A.Compose | None, optional): Config for pre-processing
            during evaluation.
            Defaults to None.
        test_split_mode (TestSplitMode): Setting that determines how the testing subset is obtained.
        test_split_ratio (float): Fraction of images from the train set that will be reserved for testing.
            Defaults to 0.2
        val_split_mode (ValSplitMode): Setting that determines how the validation subset is obtained.
        val_split_ratio (float): Fraction of train or test images that will be reserved for validation.
            Defaults to 0.5
        seed (int | None, optional): Seed which may be set to a fixed value for reproducibility.
    """

    def __init__(
        self,
        root: Path | str,
        image_size: int | tuple[int, int] | None = None,
        center_crop: int | tuple[int, int] | None = None,
        normalization: str | InputNormalizationMethod = InputNormalizationMethod.IMAGENET,
        train_batch_size: int = 32,
        eval_batch_size: int = 32,
        num_workers: int = 8,
        task: TaskType = TaskType.SEGMENTATION,
        transform_config_train: str | A.Compose | None = None,
        transform_config_eval: str | A.Compose | None = None,
        test_split_mode: TestSplitMode = TestSplitMode.FROM_DIR,
        test_split_ratio: float = 0.2,
        val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST,
        val_split_ratio: float = 0.5,
        seed: int | None = None,
    ) -> None:
        super().__init__(
            train_batch_size=train_batch_size,
            eval_batch_size=eval_batch_size,
            num_workers=num_workers,
            test_split_mode=test_split_mode,
            test_split_ratio=test_split_ratio,
            val_split_mode=val_split_mode,
            val_split_ratio=val_split_ratio,
            seed=seed,
        )

        self.root = Path(root)

        # Separate transform pipelines for train and eval: augmentation configs
        # may legitimately differ between the two stages.
        transform_train = get_transforms(
            config=transform_config_train,
            image_size=image_size,
            center_crop=center_crop,
            normalization=InputNormalizationMethod(normalization),
        )
        transform_eval = get_transforms(
            config=transform_config_eval,
            image_size=image_size,
            center_crop=center_crop,
            normalization=InputNormalizationMethod(normalization),
        )

        self.train_data = KolektorDataset(
            task=task,
            transform=transform_train,
            split=Split.TRAIN,
            root=root,
        )
        self.test_data = KolektorDataset(
            task=task,
            transform=transform_eval,
            split=Split.TEST,
            root=root,
        )

    def prepare_data(self) -> None:
        """Download the dataset if not available."""
        if self.root.is_dir():
            logger.info("Found the dataset.")
        else:
            download_and_extract(self.root, DOWNLOAD_INFO)
7 changes: 6 additions & 1 deletion src/anomalib/data/utils/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class DownloadInfo:
name: str
url: str
hash: str
filename: str | None = None


class DownloadProgressBar(tqdm):
Expand Down Expand Up @@ -227,7 +228,11 @@ def download_and_extract(root: Path, info: DownloadInfo) -> None:
root.mkdir(parents=True, exist_ok=True)

# save the compressed file in the specified root directory, using the same file name as on the server
downloaded_file_path = root / info.url.split("/")[-1]
if info.filename:
downloaded_file_path = root / info.filename
else:
downloaded_file_path = root / info.url.split("/")[-1]

if downloaded_file_path.exists():
logger.info("Existing dataset archive found. Skipping download stage.")
else:
Expand Down