-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprediction.py
94 lines (78 loc) · 3.2 KB
/
prediction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import numpy as np
import pandas as pd
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, FunctionTransformer, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import SGDRegressor, Ridge, Lasso, ElasticNet
from sklearn.cluster import KMeans
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Prefer the new
# location and fall back to the old one only on very old installs.
try:
    from sklearn.model_selection import train_test_split as tts
except ImportError:
    from sklearn.cross_validation import train_test_split as tts
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
### We're given the task of predicting total charges resulting from inpatient procedures
### Data includes 12 features across 163k samples

### Load in the data
x = pd.read_csv('./data/inpatientCharges.csv')
x.head()  # result is discarded when run as a script; only useful interactively

### We'll be predicting the average covered charges
# NOTE: the surrounding spaces are part of the column label used throughout
# this script (presumably mirroring the CSV header - verify against the file).
predicted_value = ' Average Covered Charges '

### Transform all of the currency columns into floats
currency_columns = [
    predicted_value,
    ' Average Total Payments ',
    'Average Medicare Payments',
]
for column in currency_columns:
    # Strip '$' and thousands separators before casting. The pattern is a raw
    # string so that '\$' is not parsed as an (invalid) string escape sequence.
    x[column] = x[column].replace(r'[\$,]', '', regex=True).astype(float)

# Separate the regression target from the feature frame.
y = x.loc[:, predicted_value]
x = x.drop(predicted_value, axis=1)
#
# encoder = OneHotEncoder()
# result = encoder.fit_transform(x[['Provider Id']])
#
# print(result.__class__.__name__)
# print(result.feature_indices)
### Set up a DataFrameMapper to perform all the necessary transformations per column
# Selector shape matters in sklearn_pandas: a bare string selector hands the
# transformer a 1-D array, while a one-element list hands it a 2-D column
# vector.
mapper = DataFrameMapper([
    # Text / categorical columns are turned into numeric features by hashing.
    # NOTE(review): with a bare-string selector each sample reaching
    # FeatureHasher is a single str; with input_type='string' that looks like
    # it would be consumed character by character (and may be rejected by
    # newer scikit-learn releases) - confirm this is the intended encoding.
    ('Provider Name', FeatureHasher(input_type='string')),
    ('Provider City', FeatureHasher(input_type='string')),
    ('Provider State', FeatureHasher(n_features=50, input_type='string')),
    ('DRG Definition', FeatureHasher(input_type='string')),
    ('Hospital Referral Region Description', FeatureHasher(input_type='string')),
    # StandardScaler expects 2-D input, so these columns use list selectors.
    (['Average Medicare Payments'], StandardScaler()),
    ([' Average Total Payments '], StandardScaler()),
    ([' Total Discharges '], StandardScaler()),
    # A None transformer keeps the column in the output without transforming it.
    ('Provider Id', None),
    ('Provider Zip Code', None),
])

print('Tranforming...')
# Rebinds x from the cleaned DataFrame to the mapper's transformed output.
x = mapper.fit_transform(x)
print('Finished transforming')
print(x)
print("Splitting train/test sets...")
# Hold out 20% of the samples for evaluation. The seed is fixed so that the
# error figures printed at the end of the script are reproducible run to run.
x_train, x_test, y_train, y_test = tts(x, y, test_size=0.2, random_state=0)
print("Finished splitting train/test sets")
# Model selection notes - alternatives that were tried, with the error recorded
# for each where one was noted:
#   regressor = SGDRegressor()           --> mae on the order of 1e16
#   regressor = ElasticNet()             --> mae ~22.4%
#   regressor = SVR(kernel='linear')
#   regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
#                                         max_depth=1, random_state=0, loss='ls')
# Currently selected model:              --> mae ~21.5%
regressor = Ridge(alpha=0.5)

# Optional feature-selection step, currently disabled:
#   selector = SelectKBest()
#   print('Selecting features...')
#   x_train = selector.fit_transform(x_train, y_train)
#   print(x_train)
#   print('Finished selecting features')

# Fit on the training split, then predict the held-out split.
print('Performing estimation...')
estimator = regressor.fit(x_train, y_train)
prediction = estimator.predict(x_test)
print('Finished performing estimation')

# Report, in order: the mean of the held-out targets, the mean absolute error
# of the predictions, and that error expressed as a percentage of the mean.
average = np.mean(y_test, axis=0)
print(average)

error = mean_absolute_error(y_test, prediction)
print(error)

percentage = error / average * 100
print(percentage)