# -*- coding: utf-8 -*-
"""
Created on Fri May 20 09:39:32 2022
@author: snaff
"""
import os
import datetime
import numpy as np
import pandas as pd
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
from Covid19_newcase_analysis_module import ExploratoryDataAnalysis
from Covid19_newcase_analysis_module import ModelCreation
from Covid19_newcase_analysis_module import ModelEvaluation
TRAIN_DATASET = os.path.join(os.getcwd(), 'Data', 'cases_malaysia_train.csv')
TEST_DATASET = os.path.join(os.getcwd(), 'Data', 'cases_malaysia_test.csv')
MODEL_SAVE_PATH = os.path.join(os.getcwd(), 'Saved_path', 'model.h5')
LOG_PATH = os.path.join(os.getcwd(), 'Log')
# Constants
window_size = 30  # number of past days used to predict the next day
#%% EDA
# Step 1) Load data
train_df = pd.read_csv(TRAIN_DATASET)
test_df = pd.read_csv(TEST_DATASET)
# Step 2) Analyse the loaded data
train_df.describe()
train_df.info()
train_df.duplicated().sum()
train_df.isnull().mean()
# About 50% of the values in every cluster column are missing
# The date and cases_new columns are of object dtype
# No duplicated rows found
test_df.describe()
test_df.info()
test_df.duplicated().sum()
# There is 1 missing value in one column
# Only the date column is of object dtype
# No duplicated rows found
# Discussion:
# Since we are only going to analyse the 'cases_new' column,
# we can ignore all cluster columns.
# Step 3) Data cleaning
# Casting the cases_new column to int raised an error. Inspecting the column
# shows it contains the symbol '?' and missing values that are not detected
# as NaN, so strip the stray characters first
train_df['cases_new'] = train_df['cases_new'].str.replace(r'[^0-9a-zA-Z:,]+', '', regex=True)
train_df['cases_new'] = train_df['cases_new'].replace(r'^\s*$', np.nan, regex=True)
train_df['cases_new'] = train_df['cases_new'].astype('float')
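# e.g. a cell containing only '?' becomes '' after the character strip, and the
# empty-string replace above then maps it to NaN so it can be imputed later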
train_df.info()
# Apply the same cleaning to test_df
test_df['cases_new'] = test_df['cases_new'].replace(r'[^0-9a-zA-Z:,]+', '', regex=True)
test_df['cases_new'] = test_df['cases_new'].replace(r'^\s*$', np.nan, regex=True)
# The cases_new column now has 12 missing values; the cluster columns are ~50% NaN
# Impute the missing values with a KNN imputer
eda = ExploratoryDataAnalysis()
train_df_clean = eda.knn_imputer(train_df, n_neighbors=24)
test_df_clean = eda.knn_imputer(test_df, n_neighbors=24)
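# A minimal sketch of what a knn_imputer helper like this could wrap, assuming
# it uses sklearn's KNNImputer (the real implementation lives in
# Covid19_newcase_analysis_module; illustration only, not executed):
# from sklearn.impute import KNNImputer
# imputer = KNNImputer(n_neighbors=24)
# train_df_clean = imputer.fit_transform(train_df.select_dtypes(include='number'))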
# Step 4) Feature selection
# Step 5) Data preprocessing
# In this analysis we only use the 'cases_new' column, which is at index 0
scaled_train_df = eda.mm_scaler(train_df_clean, index_column=0)
scaled_test_df = eda.mm_scaler(test_df_clean, index_column=0)
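# A minimal sketch of the scaling step, assuming mm_scaler wraps sklearn's
# MinMaxScaler and returns the selected column scaled to [0, 1]
# (illustration only, not executed):
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# column = np.asarray(train_df_clean)[:, 0].reshape(-1, 1)
# scaled_train_df = scaler.fit_transform(column)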
# Build windowed sequences: each sample is window_size (30) consecutive days
# and its label is the value on the following day
# train data
#eda.train_process_window(train_df, scaled_train_df)
X_train = []
Y_train = []
for i in range(window_size, len(train_df)):
    X_train.append(scaled_train_df[i - window_size:i, 0])
    Y_train.append(scaled_train_df[i, 0])
X_train = np.array(X_train)
Y_train = np.array(Y_train)
# test data: prepend the last window_size days of the training set so that
# every test day has a full 30-day history
temp = np.concatenate((scaled_train_df, scaled_test_df))
length_window = window_size + len(scaled_test_df)
temp = temp[-length_window:]
#eda.train_process_window(temp, window_size=30)
X_test = []
Y_test = []
for i in range(window_size, len(temp)):
    X_test.append(temp[i - window_size:i, 0])
    Y_test.append(temp[i, 0])
X_test = np.array(X_test)
Y_test = np.array(Y_test)
# expand dimensions
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)
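# X_train and X_test now have shape (samples, window_size, 1), matching the
# (batch, timesteps, features) layout that Keras recurrent layers expect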
#%% Model creation
mc = ModelCreation()
model = mc.lstm_layer(X_train, nodes=64, dropout=0.2, output=1)
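# A minimal sketch of an architecture such an lstm_layer call could build,
# assuming a single LSTM layer with dropout and a dense output (the real
# implementation lives in Covid19_newcase_analysis_module; illustration only,
# not executed):
# from tensorflow.keras import Sequential
# from tensorflow.keras.layers import LSTM, Dropout, Dense
# model = Sequential([
#     LSTM(64, input_shape=(X_train.shape[1], 1)),
#     Dropout(0.2),
#     Dense(1),
# ])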
model.compile(optimizer='adam', loss='mse', metrics=['mse'])
# callback
log_dir = os.path.join(LOG_PATH, datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
tensorboard_callback = TensorBoard(log_dir=log_dir)
early_stopping_callback = EarlyStopping(monitor='loss', patience=15)
hist = model.fit(X_train, Y_train, epochs=200, batch_size=200,
                 validation_data=(X_test, Y_test),
                 callbacks=[tensorboard_callback, early_stopping_callback])
print(hist.history.keys())
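# The training curves logged above can be inspected with: tensorboard --logdir Log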
#%% Model deployment
predicted = []
for test in X_test:
    predicted.append(model.predict(np.expand_dims(test, axis=0)))
predicted = np.array(predicted)
y_true = Y_test
y_pred = predicted.reshape(len(predicted), 1)
#%% Model Analysis
me = ModelEvaluation()
me.model_report(y_true, y_pred)
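# A minimal sketch of the kind of metrics model_report presumably prints,
# using sklearn (the real implementation lives in
# Covid19_newcase_analysis_module; illustration only, not executed):
# from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
# print('MAE :', mean_absolute_error(y_true, y_pred))
# print('MAPE:', mean_absolute_percentage_error(y_true, y_pred))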
#%% Save model
model.save(MODEL_SAVE_PATH)