# tune_hyper_params_optuna.py
import pandas as pd
import numpy as np
import json
import logging
from sklearn.model_selection import StratifiedKFold, cross_validate
from xgboost import XGBClassifier
from tabulate import tabulate
import optuna
import copy
import config
from etl import load_train_test_data

# Load the train/test split once at module level so the Optuna objective can reuse it.
X_train, X_test, y_train, y_test = load_train_test_data(selected_feats_only=True)


def objective_xgb(trial):
    """Objective function to tune an `XGBClassifier` model."""
    params_trial = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.30, log=True),
        'max_depth': trial.suggest_int('max_depth', 2, 9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.20, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.01, 5, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 20, log=True),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 20, log=True),
    }
    # Start from the default model parameters and overwrite only the tuned ones.
    params_model = copy.deepcopy(config.PARAMS_MODEL_DEFAULT)
    params_model.update(params_trial)
    model = XGBClassifier(**params_model)
    skf = StratifiedKFold(
        n_splits=config.TUNE_N_SPLITS,
        shuffle=True,
        random_state=config.CV_RANDOM_STATE,
    )
    cv = cross_validate(
        estimator=model,
        X=X_train,
        y=y_train,
        scoring=config.CV_METRIC,
        cv=skf.split(X_train, y_train),
        n_jobs=-1,
        verbose=10,
        return_estimator=True,
        return_train_score=True,
    )
    # Optuna maximizes the mean cross-validated test score.
    return np.mean(cv['test_score'])  # , np.mean(cv['train_score'])


def tune_hyper_params_w_optuna():
    """Run the Optuna study, log the results, and save the CV results and best parameters."""
    logger = logging.getLogger('tune_hyper_params_optuna.py')
    logger.info('START - Tune hyper-parameters with optuna')

    study = optuna.create_study(direction='maximize')
    study.optimize(objective_xgb, n_trials=100)

    # One row per trial: the mean CV test score plus the sampled parameters.
    metric_names = ['mean_test_score']
    df_cv_results = pd.DataFrame(
        [dict(zip(metric_names, trial.values), **trial.params) for trial in study.trials]
    )
    df_cv_results.sort_values(by=['mean_test_score'], ascending=False, inplace=True)

    logger.info(' - Best parameters found: \n' + json.dumps(study.best_params, indent=2))
    logger.info(' - CV results: \n' + tabulate(df_cv_results.head(25), headers=df_cv_results.columns, showindex=False))
    logger.info(
        f'Save artifacts: \n'
        f' - CV results to: {config.FILE_TUNE_ALL_PARAMS_COMBS} \n'
        f' - Best params to: {config.FILE_TUNE_PARAMS_BEST}'
    )
    config.make_dir_for_artifacts()
    df_cv_results.to_csv(config.FILE_TUNE_ALL_PARAMS_COMBS, index=False, float_format='%.4f')
    with open(config.FILE_TUNE_PARAMS_BEST, 'w') as f:
        json.dump(study.best_params, f, indent=2)
    logger.info('END - Tune hyper-parameters')


def plot_parallel_optuna_res():
    """Plot the saved tuning results as a parallel-coordinates chart, colored by mean CV test score."""
    import plotly.offline as py
    import plotly.express as px

    df_res_tune = pd.read_csv(config.FILE_TUNE_ALL_PARAMS_COMBS)
    fig = px.parallel_coordinates(
        df_res_tune,
        color='mean_test_score',
        dimensions=['mean_test_score'] + list(config.TUNE_PARAMS_GRID.keys()),
        color_continuous_scale=['red', 'yellow', 'green'],
        color_continuous_midpoint=np.median(df_res_tune['mean_test_score']),
    )
    # py.iplot(fig)
    py.plot(fig)
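

# A minimal follow-up sketch, not part of the original script: it assumes the artifacts
# written by `tune_hyper_params_w_optuna` exist and shows how the saved best parameters
# could be reloaded, merged into the project's default parameters, and used to refit an
# XGBClassifier on the training split. The function name is hypothetical; treat this as
# an illustration of consuming the tuning output, not the project's training entry point.
def refit_best_model_example():
    """Hypothetical helper: reload `config.FILE_TUNE_PARAMS_BEST` and refit on the training data."""
    with open(config.FILE_TUNE_PARAMS_BEST) as f:
        params_best = json.load(f)
    params_model = copy.deepcopy(config.PARAMS_MODEL_DEFAULT)
    params_model.update(params_best)
    model = XGBClassifier(**params_model)
    model.fit(X_train, y_train)
    return model
    # Example usage (assumption): model = refit_best_model_example(); model.predict(X_test)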


if __name__ == "__main__":
    tune_hyper_params_w_optuna()