forked from wanghsinwei/isic-2019
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata.py
56 lines (46 loc) · 2.66 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import pandas as pd
import numpy as np
import os
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
def load_isic_training_data(image_folder, ground_truth_file):
df_ground_truth = pd.read_csv(ground_truth_file)
# Category names
known_category_names = list(df_ground_truth.columns.values[1:9])
unknown_category_name = df_ground_truth.columns.values[9]
# Add path and category columns
df_ground_truth['path'] = df_ground_truth.apply(lambda row : os.path.join(image_folder, row['image']+'.jpg'), axis=1)
df_ground_truth['category'] = np.argmax(np.array(df_ground_truth.iloc[:,1:10]), axis=1)
return df_ground_truth, known_category_names, unknown_category_name
def load_isic_training_and_out_dist_data(isic_image_folder, ground_truth_file, out_dist_image_folder):
"""ISIC training data and Out-of-distribution data are combined"""
df_ground_truth = pd.read_csv(ground_truth_file)
# Category names
known_category_names = list(df_ground_truth.columns.values[1:9])
unknown_category_name = df_ground_truth.columns.values[9]
# Add path and category columns
df_ground_truth['path'] = df_ground_truth.apply(lambda row : os.path.join(isic_image_folder, row['image']+'.jpg'), axis=1)
df_out_dist = get_dataframe_from_img_folder(out_dist_image_folder, has_path_col=True)
for name in known_category_names:
df_out_dist[name] = 0.0
df_out_dist[unknown_category_name] = 1.0
# Change the order of columns
df_out_dist = df_out_dist[df_ground_truth.columns.values]
df_combined = pd.concat([df_ground_truth, df_out_dist])
df_combined['category'] = np.argmax(np.array(df_combined.iloc[:,1:10]), axis=1)
category_names = known_category_names + [unknown_category_name]
return df_combined, category_names
def train_validation_split(df):
df_train, df_val = train_test_split(df, stratify=df['category'], test_size=0.2, random_state=1)
return df_train, df_val
def compute_class_weight_dict(df):
"""Compute class weights for weighting the loss function on imbalanced data."""
class_weights = class_weight.compute_class_weight('balanced', np.unique(df['category']), df['category'])
class_weight_dict = dict(enumerate(class_weights))
return class_weight_dict, class_weights
def get_dataframe_from_img_folder(img_folder, has_path_col=True):
if has_path_col:
return pd.DataFrame([[Path(x).stem, x] for x in sorted(Path(img_folder).glob('**/*.jpg'))], columns=['image', 'path'], dtype=np.str)
else:
return pd.DataFrame([Path(x).stem for x in sorted(Path(img_folder).glob('**/*.jpg'))], columns=['image'], dtype=np.str)