-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtrain.scala
100 lines (78 loc) · 3.88 KB
/
train.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
/* Trains machine-learning regression models on the combined data of three datasets: Airbnb, Yelp and Toronto crime. */
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.model.DecisionTreeModel
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
import org.apache.spark.mllib.util.MLUtils
// val finalData = sc.wholeTextFiles("project/airbnbPredictionData").flatMap(_._2.split("\\n")).map(_.split(",")).filter(_.length==6).filter(x=> !(x(4).contains('$') || x(5).contains('$')) )
// Input columns (0-based index): 0 airbnbid, 1 latitude, 2 longitude, 3 bathrooms, 4 bedrooms, 5 beds, 6 review_scores_rating, 7 review_scores_location, 8 num_businesses_in_neighborhood, 9 num_crimes_in_neighborhood, 10 price, 11 avg_business_rating
// Read the joined prediction data as comma-separated rows and keep only the
// rows that are complete enough to be turned into training examples.
val finalData = sc.textFile("project_data/airbnbPredictionData").
  flatMap(line => line.split("\\n")).
  map(line => line.split(",")).
  filter { fields =>
    // A usable row has exactly 12 columns, no blank column, and a price
    // (column 10) that does not carry a '$' sign.
    fields.length == 12 &&
      fields.forall(_.nonEmpty) &&
      !fields(10).contains('$')
  }
// Convert each cleaned row into an MLlib LabeledPoint:
//   label    = price (column 10), left unscaled
//   features = eight numeric columns, each divided by a fixed scale factor
// Rows containing a value that cannot be parsed as a number are dropped.
val mllibData = {
  // (column index -> divisor), listed in feature-vector order. The divisors are
  // the per-column maxima recorded in this script's comments.
  val featureScales = Array(
    3 -> 12.5,     // bathrooms
    4 -> 13.0,     // bedrooms
    5 -> 16.0,     // beds
    6 -> 100.0,    // review_scores_rating
    7 -> 10.0,     // review_scores_location
    8 -> 3045.0,   // num_businesses_in_neighborhood
    9 -> 28133.0,  // num_crimes_in_neighborhood
    11 -> 4.5      // avg_business_rating
  )
  finalData.flatMap { row =>
    val example: Option[LabeledPoint] =
      try {
        val features = featureScales.map { case (column, scale) => row(column).toDouble / scale }
        Some(LabeledPoint(row(10).toDouble, Vectors.dense(features)))
      } catch {
        // An unparsable field (for example a header line or "N/A") would
        // otherwise fail the whole Spark job as soon as an action runs; skip
        // just the offending row instead.
        case _: NumberFormatException => None
      }
    example
  }
}
// Randomly divide the examples 80% / 20% into training and test sets, using a
// fixed seed for the split.
val splits = mllibData.randomSplit(weights = Array(0.8, 0.2), seed = 11L)
// The training set is cached because it is read more than once: for fitting
// the model and again for the training-error pass.
val training = splits.head.cache()
val test = splits.last
/*
val numIterations = 100000
val stepSize = 0.00000006
val model = LinearRegressionWithSGD.train(training, numIterations, stepSize)
*/
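// Per-column statistics of the input data. The maximum values are the divisors
// used to scale the corresponding features.
// bathrooms: 18 categories, max value 12.5
// bedrooms: 14 categories, max value 13
// beds: 23 categories, max value 16
// review_scores_rating: max value 100
// review_scores_location: max value 10
// num_businesses_in_neighborhood: max value 3045
// num_crimes_in_neighborhood: max value 28133
// avg_business_rating: max value 4.5
// price: max value 998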
// Fit a single regression decision tree on the training split.
val maxDepth = 5
val maxBins = 32
val impurity = "variance"
// Left empty: no feature is declared as categorical.
val categoricalFeaturesInfo = Map.empty[Int, Int]
val model = DecisionTree.trainRegressor(
  training,
  categoricalFeaturesInfo,
  impurity,
  maxDepth,
  maxBins
)
/*
// Train a RandomForest model.
// Empty categoricalFeaturesInfo indicates all features are continuous.
val numClasses = 2
val categoricalFeaturesInfo = Map[Int, Int]()
val numTrees = 30 // Use more in practice.
val featureSubsetStrategy = "auto" // Let the algorithm choose.
val impurity = "variance"
val maxDepth = 4
val maxBins = 32
val model = RandomForest.trainRegressor(training, categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)
*/
/*
// Train a GradientBoostedTrees model.
// The defaultParams for Regression use SquaredError by default.
val boostingStrategy = BoostingStrategy.defaultParams("Regression")
boostingStrategy.numIterations = 10 // Note: Use more iterations in practice.
boostingStrategy.treeStrategy.maxDepth = 5
// Empty categoricalFeaturesInfo indicates all features are continuous.
boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()
val model = GradientBoostedTrees.train(training, boostingStrategy)
*/
// Mean squared error on the training split: pair each true label with the
// model's prediction, then average the squared differences.
val valuesAndPreds = training.map(example => (example.label, model.predict(example.features)))
val MSE = valuesAndPreds.map { case (actual, predicted) =>
  val residual = actual - predicted
  math.pow(residual, 2)
}.mean()
// Mean squared error on the held-out test split, computed the same way.
val valuesAndPredsTest = test.map(example => (example.label, model.predict(example.features)))
val MSE_test = valuesAndPredsTest.map { case (actual, predicted) =>
  val residual = actual - predicted
  math.pow(residual, 2)
}.mean()
// Persist the trained model so it can be loaded again later.
// NOTE(review): the save presumably fails if this path already exists - confirm before re-running.
val modelOutputPath = "project/AirbnbMLModel"
model.save(sc, modelOutputPath)