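"""Training pipeline: load or preprocess the datasets, undersample the
overrepresented class, train a regularized model, evaluate it on the
training split, and write the test predictions to a submission CSV."""
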
import argparse

import numpy as np

from src import config, data_preprocessing, evaluation, helpers, model


def set_execution_arguments() -> argparse.Namespace:
"""
Sets all the possible execution arguments and retrieves their values
This method allows the user to customize the training parameters as
well as the seed of execution to ensure that results are deterministic.
Returns:
args: A namespace containing every execution argument and their
values
"""
parser = argparse.ArgumentParser(
description="Training model with customizable parameters."
)
parser.add_argument(
"--seed",
type=int,
default=42,
help="Seed to ensure that results are deterministic.",
)
parser.add_argument(
"--gamma", type=float, default=0.2, help="Learning rate for model training."
)
parser.add_argument(
"--max_iters",
type=int,
default=1000,
help="Maximum number of iterations for training.",
)
parser.add_argument(
"--lambda_", type=float, default=0.01, help="Regularization parameter."
)
parser.add_argument("--undersampling_ratio", type=float, default=0.25, help=".")
return parser.parse_args()
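

# Example invocation (the values shown are the defaults):
#   python run.py --seed 42 --gamma 0.2 --max_iters 1000 \
#       --lambda_ 0.01 --undersampling_ratio 0.25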
if __name__ == "__main__":
args: argparse.Namespace = set_execution_arguments()
# Set random seed to ensure that results are deterministic
np.random.seed(args.seed)
try:
# Try to load already processed datasets
print("Attempting to read already processed datasets...")
x_train = np.load(f"{config.DATA_FOLDER}/processed_x_train.npz")["arr_0"]
x_test = np.load(f"{config.DATA_FOLDER}/processed_x_test.npz")["arr_0"]
y_train = np.load(f"{config.DATA_FOLDER}/processed_y_train.npz")["arr_0"]
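        # The cached .npz files store only the feature/label arrays, so the
        # sample ids are regenerated as sequential indices, with test ids
        # continuing after the train ids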
train_ids = np.arange(x_train.shape[0])
test_ids = np.arange(x_test.shape[0]) + x_train.shape[0]
print("Success!")
except FileNotFoundError:
print("Failed - Reading original datasets...")
# If they do not exist, load the original datasets
# Load and preprocess the data, ready for modelling
x_train, x_test, y_train, train_ids, test_ids = (
data_preprocessing.preprocess_data(data_dir=config.DATA_FOLDER)
)
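        # Note: data_preprocessing.preprocess_data is assumed to also write
        # the processed_*.npz files consumed by the branch above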
    # Since the healthy class is overrepresented, undersampling it helps
    # improve predictions on the unhealthy class
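    # (undersampling_ratio is assumed to control the fraction of healthy
    # samples kept by model.undersample)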
x_train_undersampled, y_train_undersampled = model.undersample(
x_train, y_train, undersampling_ratio=args.undersampling_ratio
)
    # Hold out a validation set so training can be monitored for overfitting
x_tr, x_val, y_tr, y_val = model.split_data(
x_train_undersampled, y_train_undersampled, val_size=0.2
)
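    # The threshold argument is assumed to be a convergence tolerance:
    # training presumably stops once the loss improvement drops below it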
w, loss = model.train(
x_tr,
y_tr,
x_val=x_val,
y_val=y_val,
gamma=args.gamma,
max_iters=args.max_iters,
lambda_=args.lambda_,
threshold=1e-6,
)
# Perform predictions over the train and test datasets
y_pred_train = model.predict_labels(w, x_train)
y_pred = model.predict_labels(w, x_test)
    # Obtain classification metrics to assess model quality; test labels are
    # not available, so evaluation is performed on the training predictions
    evaluation.evaluate_predictions(y_train, y_pred_train)
    # Map the predicted labels from {0, 1} back to the original {-1, 1} range
    y_pred = y_pred * 2 - 1
# Save the predictions
helpers.create_csv_submission(test_ids, y_pred, "submission.csv")