# Intro to Data Science with R
# ----------------------------------------------------------- PART 1 ------------------------------------------------------------------
# Source: https://www.youtube.com/watch?v=32o0DnuRjfg
# Load raw data
train <- read.csv("train Titanic.csv", header = TRUE, sep = ";")
test <- read.csv("test Titanic.csv", header = TRUE, sep = ";")
head(train)
# Add a "Survived" variable to test set to allow for combining data sets
test.survived <- data.frame(Survived = rep("None", nrow(test)), test[,])
# Combine data sets
data.combined <- rbind(train, test.survived)
# A bit about R data types (e.g., factors)
str(data.combined)
data.combined$Survived <- as.factor(data.combined$Survived)
data.combined$Pclass <- as.factor(data.combined$Pclass)
# Factor = categorical variable = a data type that can only take a limited set of discrete values (similar to drop-downs)
# Pclass is a factor and not an int because it represents the class of the passenger (a proxy for socio-economic status)
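# A minimal illustration of factors on a toy vector (my addition, not from the Titanic data):
toy <- factor(c("low", "high", "low"))
levels(toy)     # "high" "low" - levels are sorted alphabetically by default
as.integer(toy) # underlying integer codes: 2 1 2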
# Take a look at gross survival rate
table(data.combined$Survived)
# Distribution across classes
table(data.combined$Pclass)
# More people in lower class than in upper class
# Would expect more people in second class than in first, but that is not the case (could be interesting from an analysis perspective)
# For a machine learning algo, there are rules/patterns to learn from the data
# For human beings, visualisations are easier to grasp, so let's look at some graphs:
# Load up ggplot2 package to use for visualizations
library(ggplot2)
# Hypothesis - Rich folks survived at a higher rate (upper class cabins closer to the life boats)
train$Pclass <- as.factor(train$Pclass) # we only know whether you survived in the train set
ggplot(train, aes(x = Pclass, fill = factor(Survived))) + geom_bar(width = 0.5) + # aes = aesthetic; geom_bar since Pclass is a discrete factor
  xlab("Pclass") +
  ylab("Total Count") +
  labs(fill = "Survived") # label for the fill
# Hypothesis seems to be confirmed
# Examine the first few names in the training data set (general sense of the data)
head(as.character(train$Name)) # on the fly, don't consider the names as factor and just give me the first few
# How many unique names are there across both train & test?
length(unique(as.character(data.combined$Name)))
# We obtain 1307 unique names while there are 1309 rows, i.e., two possibly duplicated names
# First, get the duplicate names and store them as vector
dup.names <- as.character(data.combined[which(duplicated(as.character(data.combined$Name))), "Name"])
# Take the duplicates and convert them to character
# which() = like the "WHERE" clause in SQL
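# Toy example (my addition): which() returns the indices where the condition is TRUE
which(c(10, 20, 30) > 15) # returns 2 3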
# Next, take a look at the records in the combined data set
data.combined[which(data.combined$Name %in% dup.names),]
# Appears all good: just two pairs of distinct people who happen to share a name (i.e., no anomaly)
# Personal Note: looks like some Embarked information is missing
# data.combined[which(as.character(data.combined$Embarked) == "" ),]
# What is up with the "Miss." and "Mr." thing?
library(stringr)
# Any correlation with other variables (e.g., sibsp)?
# Do the titles have any predictive power?
misses <- data.combined[which(str_detect(data.combined$Name, "Miss.")),]
misses[1:5,] # equivalent to head(misses,5)
# Personal note: how many were 1st class vs. other classes?
# table(misses$Pclass)
# Question: different from remaining population or expected distribution?
# t.test(as.numeric(data.combined$Pclass), as.numeric(misses$Pclass))
# p-value > 0.05, i.e., can't reject H0, i.e., the means are the same (come from the same population)
# Personal note: let's extract the titles from the Name column
# TempName <- str_extract(as.character(data.combined$Name), '\\b[^,]+$')
# str(TempName)
# http://stackoverflow.com/questions/14790253/character-extraction-from-string
# data.combined$Title <- sapply(strsplit(TempName, "\\."), `[[`, 1)
# table(data.combined$Title)
# Correlations?:
# Out of the 5 records, 4 survived (80%)
# Out of the 5 records, 4 were third class (interesting, knowing that third class was seen earlier to survive less)
# Out of the 5 records, 4 had no sibling/spouse (the remaining one had 4)
# Out of the 5 records, 4 had no parents (the 4-year-old girl was travelling with a sibling and a parent)
# Seems to suggest that "Miss." is an unmarried woman, generally speaking (maybe some predictive power?)
# Misses also tend to be younger, which makes sense knowing that at this time most women were married past a certain age
# Hypothesis - Name titles correlate with age
mrses <- data.combined[which(str_detect(data.combined$Name, "Mrs.")),]
head(mrses, 5)
# Same general pattern: husband's name first and then their maiden name (in parentheses)
# In general older females; most travel with spouse; high survival %
# Check out males to see if pattern continues
males <- data.combined[which(data.combined$Sex == "male"),] # data.combined$Sex (not train$Sex) so we cover all 1309 rows
males[1:5,]
# One "Master." is aged 2, i.e., it's possible that this title indicates a young boy
# Expand upon the relationship between "Survived" and "Pclass" by adding the new "Title" variable to the data set
# and then explore a potential 3-dimensional relationship
# Create a utility function to help with title extraction
extractTitle <- function(Name) {
  Name <- as.character(Name)
  if (length(grep("Miss.", Name)) > 0) { # ?grep: if "Miss." is found within the name, then return "Miss."
    return("Miss.")
  } else if (length(grep("Master.", Name)) > 0) {
    return("Master.")
  } else if (length(grep("Mrs.", Name)) > 0) {
    return("Mrs.")
  } else if (length(grep("Mr.", Name)) > 0) {
    return("Mr.")
  } else {
    return("Other")
  }
}
Titles <- NULL
for (i in 1:nrow(data.combined)) {
  Titles <- c(Titles, extractTitle(data.combined[i, "Name"]))
}
data.combined$Title <- as.factor(Titles)
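# A more idiomatic, vectorized alternative to the loop above (a sketch giving the same result - not from the video):
# data.combined$Title <- as.factor(sapply(data.combined$Name, extractTitle))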
# Since we only have Survived labels for the train set, only use the first 891 rows
ggplot(data.combined[1:891,], aes(x = Title, fill = Survived)) + # aesthetic: x-axis is the title & color-coded by survival
  geom_bar(width = 0.5) +
  facet_wrap(~ Pclass) + # pivot the data by passenger class
  ggtitle("Pclass") +
  xlab("Title") +
  ylab("Total Count") +
  labs(fill = "Survived")
# If you are a "Mr." in third class, life is not so good for you
# If you are a "Mrs." in third class, you are about 50/50
# "Women and children first"
# Title gives an indication of both sex and age (acts as a proxy) - correlation between those variables
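# A quick cross-tab (my addition) to back up the proxy claim - titles split cleanly by sex:
table(data.combined$Title, data.combined$Sex)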
# ----------------------------------------------------------- PART 2 ------------------------------------------------------------------
# source: https://www.youtube.com/watch?v=u6sahb7Hmog
# Algorithms are great, but nothing beats knowing your data, i.e., do the data analysis upfront
# Data analysis will save you a lot of time (training a model takes a lot of time)
# Eventually, your data will speak to you:
# What's likely predictive and what's not
# Brainstorming new scenarios (i.e., feature engineering)
# What's the distribution of females to males across train & test?
table(data.combined$Sex)
# 2-to-1 ratio of males to females, and males were far more likely to perish than females
# Usually, models tend to optimize themselves on the data they are presented
# Thus it is not out of the realm of possibility to build a model that basically says:
# "if you are male, you perish, and if you are female, you don't"
# That rule alone doesn't get you far in terms of accuracy
# Visualize the 3-way relationship of sex, pclass and survival; compare to the analysis of title
ggplot(data.combined[1:891,], aes(x = Sex, fill = Survived)) + # aesthetic: x-axis is the sex & color-coded by survival
  geom_bar(width = 0.5) +
  facet_wrap(~ Pclass) + # pivot the data by passenger class
  ggtitle("Pclass") +
  xlab("Sex") +
  ylab("Total Count") +
  labs(fill = "Survived")
# Strangely, the survival rate for second class is not much better than third class, which is interesting
# Nothing here invalidates the previous plot; in fact, it confirms that we are on the right track with titles
# Ok, age and sex seem pretty important as derived from analysis of title,
# let's take a look at the dist. of age over the entire data set.
summary(data.combined$Age)
# 263/1309 NAs i.e. missing values
# Multiple ways of handling missing values:
# 1. Collect them (scientific world), but cost-prohibitive or hardly feasible
# 2. Infer a value (e.g., replace an NA by the median or mean for the category considered - see the sketch after this list)
# 3. Imputation, i.e., create a predictive model on the data to predict what the value should be (e.g., a linear regression model)
#    using the rest of the values as input
# 4. Find a proxy that mimics the value so that you don't need to worry about it
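# A minimal sketch of option 2 (median imputation) on a throwaway copy - not applied here, since we look for a proxy (Title) instead:
# age.fixed <- data.combined$Age
# age.fixed[is.na(age.fixed)] <- median(age.fixed, na.rm = TRUE)
# summary(age.fixed) # no more NAs, at the cost of distorting the Age distribution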
# Correlation between title and age
summary(data.combined[1:891,"Age"]) # first 891 rows only
# Not only do we have missing values, but most of them are in the training data (bad): 177/263 = 67%
# Reiterates the need to find some variable to act as a proxy rather than inferring or imputing values
# (lots of missing values, i.e., an issue for the accuracy of the model)
# Just to be thorough, take a look at survival rates broken out by sex, pclass and age
ggplot(data.combined[1:891,], aes(x = Age, fill = Survived)) + # aesthetic: x-axis is the age & color-coded by survival
  facet_wrap(~ Sex + Pclass) + # pivot the data by passenger class and sex
  geom_histogram(binwidth = 10) +
  ggtitle("Pclass") +
  xlab("Age") +
  ylab("Total Count")
# Females in 1st class = very good survival rate
# Females in 2nd class = if young (< 25 y.o.), then a good chance to survive
# Females in 3rd class = approx. 50/50
# Males in 3rd class = the older you get, the worse your survival rate
# Males in 2nd class = the younger you are, the more you survive (corresponds to "Master.")
# Males in 1st class = odds decline quickly with age, even in first class
# More potential evidence toward "women and children first"
# Validate that "Master" is a good proxy for male children
boys <- data.combined[which(data.combined$Title == "Master."),] # give whichever is true; equivalent to "where" in SQL
summary(boys$Age)
# Reasonable proxy: max = 14.5 y.o.; min = 0.33 & only 8 missing values (out of the 263 NAs)
# Conversely, "Mr." is a good proxy for adult males, so we don't need to worry about the age variable for males in this dataset
# We know that "Miss." is more complicated; let's examine further
misses <- data.combined[which(data.combined$Title == "Miss."),]
summary(misses$Age)
head(misses,5)
# 50 (out of the 263) missing values
# Min = 0.17; Max = 63
# Mean < median = slightly left-skewed, with the bulk at adult ages, i.e., most Misses are actually adults (not necessarily female children)
ggplot(misses[misses$Survived != "None",], aes(x = Age, fill = Survived)) +
  facet_wrap(~ Pclass) +
  geom_histogram(binwidth = 5) +
  ggtitle("Age for Miss. by Pclass") +
  xlab("Age") +
  ylab("Total Count")
# In general, if titled "Miss." and in 1st class, very good chance of survival
# 2nd & 3rd class are more mixed, but in 3rd class, the older the Miss, the lower the chance of survival
# Want to know if adult or child since, especially in 3rd class, there seems to be some predictive power there
# Ok, appears female children may have different survival rate,
# could be a candidate for feature engineering later
misses.alone <- misses[which(misses$SibSp == 0 & misses$Parch == 0),]
summary(misses.alone)
length(which(misses.alone$Age <= 14.5)) # 14.5 because max(Age) for boys
# If Misses travel alone, they usually are adults
# Rule of thumb: if your title is "Miss." and you travel alone, there's a good chance you are an adult (only 4/150 are children)
# Move on to the sibsp variable, summarize the variable
summary(data.combined$SibSp)
# No NAs
# Mean only 0.4989 (highly skewed toward 0)
# Median = 0, i.e., at least 50% travel with no siblings/spouse aboard
# Can we treat as a factor?
length(unique(data.combined$SibSp))
# Across the dataset there are only 7 unique values, so turning it into a factor is reasonable
data.combined$SibSp <- as.factor(data.combined$SibSp)
# We believe title is predictive; visualize survival rates by SibSp, Pclass and Title
ggplot(data.combined[1:891,], aes(x = SibSp, fill = Survived)) + # aesthetic: x-axis is SibSp & color-coded by survival
  geom_bar() + # geom_bar, since SibSp is now a factor
  facet_wrap(~ Pclass + Title) +
  ggtitle("Pclass, Title") +
  xlab("SibSp") +
  ylab("Total Count") +
  ylim(0, 300) +
  labs(fill = "Survived")
# Pretty definitive pattern:
# Master & Miss in 1st class = good to go
# No big pattern for Mr. in 1st class
# Mrs. survived in 1st class
# Mr. in 2nd & 3rd class: not good (but better odds of survival with fewer siblings)
# Master in 3rd: better chance of survival if traveling with a smaller contingent of siblings
# Miss in 3rd class: if you are a child, you have better odds of survival with fewer siblings
# Treat the Parch variable as a factor and visualize
data.combined$Parch <- as.factor(data.combined$Parch)
ggplot(data.combined[1:891,], aes(x = Parch, fill = Survived)) + # aesthetic: x-axis is Parch & color-coded by survival
  geom_bar() +
  facet_wrap(~ Pclass + Title) +
  ggtitle("Pclass, Title") +
  xlab("Parch") +
  ylab("Total Count") +
  ylim(0, 300) +
  labs(fill = "Survived")
# Adult males in 1st class: higher chance of survival if travelling without a parent or a child
# Mrs. in 3rd class: the more children, the lower the chance of survival
# Let's try some feature engineering. What about creating a family size feature?
temp.SibSp <- c(train$SibSp, test$SibSp)
temp.Parch <- c(train$Parch, test$Parch)
data.combined$Family.size <- as.factor(temp.SibSp + temp.Parch + 1)
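# Quick sanity check (my addition) on the distribution of the engineered feature:
table(data.combined$Family.size)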
# Visualize it to see if it is predictive
ggplot(data.combined[1:891,], aes(x = Family.size, fill = Survived)) + # aesthetic: x-axis is family size & color-coded by survival
  geom_bar() +
  facet_wrap(~ Pclass + Title) +
  ggtitle("Pclass, Title") +
  xlab("Family.size") +
  ylab("Total Count") +
  ylim(0, 300) +
  labs(fill = "Survived")
# Stair-stepping: survival drops as the family gets bigger
# Maybe some predictive power
# ----------------------------------------------------------- PART 3 ------------------------------------------------------------------
# source: https://www.youtube.com/watch?v=aMV_6LmCs4Q
# Take a look at the ticket variable
str(data.combined$Ticket) # structure of the data
# Based on the huge number of levels ticket really isn't a factor variable. It is a string.
# Convert it and display the first 20
data.combined$Ticket <- as.character(data.combined$Ticket)
data.combined$Ticket[1:20]
# There's no immediately apparent structure in the data; let's see if we can find some
# We'll start with taking a look at just the first char for each
Ticket.first.char <- ifelse(data.combined$Ticket == "", " ", substr(data.combined$Ticket,1,1))
unique(Ticket.first.char)
# Ok we can make a factor for analysis purposes and visualize
data.combined$Ticket.first.char <- as.factor(Ticket.first.char)
# First, a high-level plot of the data
ggplot(data.combined[1:891,], aes(x = Ticket.first.char, fill = Survived)) +
  geom_bar() +
  ggtitle("Survivability by Ticket.first.char") +
  xlab("Ticket.first.char") +
  ylab("Total Count") +
  ylim(0, 300) +
  labs(fill = "Survived")
# On the surface this is some signal: maybe there is structure in that data that helps decide whether someone survived or not
# Not always intuitive. Oftentimes you can find something that doesn't match intuition but is surprisingly effective
# Ticket seems like it might be predictive; drill down a bit (stay skeptical)
ggplot(data.combined[1:891,], aes(x = Ticket.first.char, fill = Survived)) +
  geom_bar() +
  facet_wrap(~ Pclass) +
  ggtitle("Pclass") +
  xlab("Ticket.first.char") +
  ylab("Total Count") +
  # ylim(0,150) +
  labs(fill = "Survived")
# Tells a more interesting story: most folks in first class had tickets starting with "1" & disproportionately seem to survive
# Most second class had a ticket starting with "2"; maybe there is a pattern there?
# But the pattern breaks in third class + we don't learn anything new; we already knew that people in third class died more often
# Most of the tickets will be turquoise in first class anyway
# Lastly, see if we get a pattern when using the combination of pclass & title
ggplot(data.combined[1:891,], aes(x = Ticket.first.char, fill = Survived)) +
  geom_bar() +
  facet_wrap(~ Pclass + Title) + # the most predictive variables we have seen so far are the combination of Pclass & Title
  ggtitle("Pclass & Title") +
  xlab("Ticket.first.char") +
  ylab("Total Count") +
  ylim(0, 200) +
  labs(fill = "Survived")
# Ticket is not a predictive variable; not much structure. Wouldn't use it in the model (at least not at the beginning)
# Occam's razor: all things being equal, prefer simple models to complex ones
# True for the nature of the algo:
# ex: logit vs. a deep learning network for classification
# If both have the same accuracy, tend to prefer logit because it will generalize better
# Simpler models tend to perform better on unseen future data than more complicated models (in general)
# Same principle with data: prefer the smaller dataset if it gives the same level of performance
# Next up - The fare Titanic passenger paid
summary(data.combined$Fare)
str(data.combined$Fare)
# 25% of the tickets cost 7.896 pounds or less (first quartile)
# Mean is double the median, so the distribution is expected to be skewed toward the high end (evidence: max value > 500)
length(unique(data.combined$Fare))
# Sometimes, if a numeric variable has a small number of unique values in the dataset, you could potentially transform it
# into a factor (may be useful for survey analysis or for binning the x-axis) - a sketch follows below
# In this case, way too many values
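# A hedged sketch of turning a numeric into a factor with cut() - hypothetical breaks, not used here:
# fare.binned <- cut(data.combined$Fare, breaks = c(0, 10, 20, 30, 600), include.lowest = TRUE)
# table(fare.binned)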
# Can't make fare a factor, treat as a numeric & visualize with histogram
ggplot(data.combined[1:891,], aes(x = Fare)) +
  geom_histogram(binwidth = 5) +
  ggtitle("Combined Fare Distribution") +
  xlab("Fare") +
  ylab("Total Count") +
  ylim(0, 200)
# One outlier way above (around 500 pounds)
# The long right tail of high fares most likely corresponds to first class
# First class tickets would cost much more than third class
# let's check to see if fare has predictive power
ggplot(data.combined[1:891,], aes(x = Fare, fill = Survived)) +
  geom_histogram(binwidth = 5) +
  facet_wrap(~ Pclass + Title) + # the most predictive variables we have seen so far are the combination of Pclass & Title
  ggtitle("Pclass, Title") +
  xlab("Fare") +
  ylab("Total Count") +
  ylim(0, 200) +
  labs(fill = "Survived")
# Risk of overfitting (perfect on the training data but doesn't generalize)
# Analysis of the cabin variable
str(data.combined$Cabin)
# Expected to look a lot like Pclass: a nomenclature that indicates which deck, i.e., which class you are in
# Cabin isn't really a factor; make it a string and then display the first 100
data.combined$Cabin <- as.character(data.combined$Cabin)
data.combined$Cabin[1:100]
# The de facto random forest algo in R can't handle factors with more than 32 levels
# The letter probably corresponds to the deck (potentially predictive: the higher the deck, the richer and the closer to the lifeboats)
# Could the empty values correspond to people in third class?
# The most indicative information should be the letter; the cabin number would be prone to overfitting
# Replace empty cabins with a "U"
data.combined[which(data.combined$Cabin == ""), "Cabin"] <- "U"
# Take a look at just the first char as a factor
Cabin.first.char <- as.factor(substr(data.combined$Cabin, 1,1))
str(Cabin.first.char)
levels(Cabin.first.char)
# Add to combined data set and plot
data.combined$Cabin.first.char <- Cabin.first.char
# High level plot
ggplot(data.combined[1:891,], aes(x = Cabin.first.char, fill = Survived)) +
  geom_bar() +
  ggtitle("Survivability by Cabin.first.char") +
  xlab("Cabin.first.char") +
  ylab("Total Count") +
  ylim(0, 750) +
  labs(fill = "Survived")
# Nothing jumps out. We already know that the letters correspond to the class (i.e., nothing new)
# Ex: fewer people in A (first class?)
# Could have some predictive power; drill into it
ggplot(data.combined[1:891,], aes(x = Cabin.first.char, fill = Survived)) +
  geom_bar() +
  facet_wrap(~ Pclass) +
  ggtitle("Survivability by Cabin.first.char") +
  xlab("Cabin.first.char") +
  ylab("Total Count") +
  ylim(0, 500) +
  labs(fill = "Survived")
# Relatively few people in first class
# Big shift: the majority of people with no recorded cabin ("U") are in second and third class
# Strong correspondence between the cabin letter, the deck and the class: cabin is not going to be very useful
# Personal Note: head(data.combined[which(data.combined$Cabin == "U"),],10)
# Does this feature improve upon Pclass + Title?
ggplot(data.combined[1:891,], aes(x = Cabin.first.char, fill = Survived)) +
  geom_bar() +
  facet_wrap(~ Pclass + Title) +
  ggtitle("Pclass, Title") +
  xlab("Cabin.first.char") +
  ylab("Total Count") +
  ylim(0, 500) +
  labs(fill = "Survived")
# No real trend/signal
# What about folks with multiple cabins? (Note: more rooms = richer?)
data.combined$Cabin.multiple <- as.factor(ifelse(str_detect(data.combined$Cabin, " "), "Y", "N"))
# A space separates the multiple cabin numbers (and previously replaced NAs with "U")
ggplot(data.combined[1:891,], aes(x = Cabin.multiple, fill = Survived)) +
  geom_bar() +
  facet_wrap(~ Pclass + Title) +
  ggtitle("Pclass, Title") +
  xlab("Cabin.multiple") +
  ylab("Total Count") +
  ylim(0, 500) +
  labs(fill = "Survived")
# Nothing new
# Does survivability depend on where you got onboard the Titanic?
str(data.combined$Embarked)
levels(data.combined$Embarked)
# Intuitively: people of a certain socio-economic situation could have disproportionately embarked at a specific location
# But this could be tenuous
# Plot data for analysis
ggplot(data.combined[1:891,], aes(x = Embarked, fill = Survived)) +
  geom_bar() +
  facet_wrap(~ Pclass + Title) +
  ggtitle("Pclass, Title") +
  xlab("Embarked") +
  ylab("Total Count") +
  ylim(0, 300) +
  labs(fill = "Survived")
# Not seeing anything that jumps out at us
# Confirms the intuition that embarkation location doesn't have a particular impact
# The most powerful conclusion to be made is that if you embarked at Queenstown ("Q"), you were disproportionately in 3rd class
# ----------------------------------------------------------- PART 4 ------------------------------------------------------------------
# Source: https://www.youtube.com/watch?v=UHJH7w9q4Lc&list=PLTJTBoU5HOCRrTs3cJK-PbHM39cwCU0PF&index=4
# Many machine learning algos can rate the "importance" of training variables/features, either explicitly or implicitly ("feature selection")
# Scenario 1: lots of features -> exploratory modelling assists in identifying "high value" candidate features
# Scenario 2: data analysis done -> exploratory modelling can provide additional insight/confirmation
# Scenario 3: feature engineering done -> can gain insight into the efficacy of these features
# Exploratory models should be simple (no extensive hyperparameter tuning), fast (train as fast as possible) and effective (powerful enough)
# The more model runs we can execute in a given amount of time, the more value we can generate
# Random forests:
# King of exploratory models
# Meet all the intuitive criteria above:
# Feature selection: a byproduct of the algo (i.e., tree training will avoid features without predictive power)
# Simple: can handle numeric, categorical, and correlated variables without preprocessing
# Fast: excellent compromise between speed of training and the other benefits
# Effective: one of the most powerful general-purpose algos available
# Great resources for learning Random Forests on YouTube
# Random forest tutorial from Kaggle: https://www.youtube.com/watch?v=kwt6XEh7U3g&list=PL6XpIcmAkEKLqAapoA1_sXZ7wQTBLdQQ2 (40:00)
library(randomForest)
# Train a Random Forest with the default parameters using Pclass & Title
rf.train.1 <- data.combined[1:891, c("Pclass", "Title")] # let's explore with Pclass and Title
rf.label <- as.factor(train$Survived)
set.seed(1234) # Random Forests are random, so setting the seed is important for reproducibility; otherwise you will receive
# slightly different results each time (by design)
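# Quick illustration (my addition, commented out so it doesn't advance the RNG state before training):
# set.seed(1234); runif(3) # re-running these two lines always yields the same three numbers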
rf.1 <- randomForest(x = rf.train.1, y = rf.label, importance = TRUE, ntree = 1000) # trains an instance of a random forest for us
# importance = TRUE: as it trains all the decision trees, keep track of the importance of the features/variables
# ntree = how many individual trees are in the forest (default = 500, which is reasonable)
rf.1
# "No. of variables tried at each split" = the mtry hyperparameter
# OOB = "out of bag": the RF draws a random sample of rows (and a random subset of columns) and does that iteratively (each draw grows a tree)
# Since rows are sampled with replacement, some get selected more than once, i.e., some rows never get selected for a given tree
# Those unselected ("out of bag") rows can be used to test how accurate the RF (trained on the sampled data) is
# That is what the model does by providing the OOB error rate estimate (very useful for data exploration, but not enough to test the model)
# We get nearly (100 - 20.76)% accuracy with only two variables, which is very good
# The 2 variables together are very predictive of whether you survived or died (confirms our intuition)
# http://stats.stackexchange.com/questions/30691/how-to-interpret-oob-and-confusion-matrix-for-random-forest
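# The OOB estimate can also be pulled programmatically: randomForest stores the per-tree error in err.rate
rf.1$err.rate[nrow(rf.1$err.rate), "OOB"] # final OOB error rate (as a fraction, not a %)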
# Confusion matrix: broken down by our labels, what were the results of this RF? (rows = actual class, columns = predicted class)
# The model correctly predicted that 538 records perished when in fact they did perish
# It also predicted that 11 people survived when in fact they perished (error rate of 0.02003643 * 100 = 2% on the perished class - very good)
table(train$Survived)
# However, the data is skewed: most people perished, so that's not a surprising result. This model is pretty good at predicting death
# How good are we at predicting if somebody survived?
# Predicted that 174 perished when they in fact survived, and that 168 survived when they did survive (error rate of 51% - not very good!)
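# The same breakdown is stored on the model object (the class.error column gives the per-class error rate):
rf.1$confusion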
varImpPlot(rf.1)
# Not super interesting here because there are only two variables, but with multiple variables you can see how important each variable is
# The further to the right, the better
# As we can see here, and as expected from the data analysis, Title is actually more important than Pclass in determining whether you
# survived in this model (more predictive)
# As we did our analysis, we were able to throw out a bunch of variables as not being particularly predictive (cabin, embarked, age...)
# But what we identified as potentially predictive are the SibSp and Parch variables
# Train Random Forest using pclass, title & sibsp
rf.train.2 <- data.combined[1:891, c("Pclass", "Title", "SibSp")]
set.seed(1234)
rf.2 <- randomForest(x = rf.train.2, y = rf.label, importance = TRUE, ntree = 1000)
rf.2
varImpPlot(rf.2)
# When we add a third variable, we get an OOB error of 19.75% (a 1-point increase in accuracy)
# Seems minimal but is actually huge in machine learning applications
# Confirms the intuition that SibSp is predictive
# Dramatically increases accuracy for those who survived (error: 33% vs. 50%)
# But it had a negative impact on accuracy for non-survival (though in far smaller proportion than the gain on the survived class)
# So it's OK (but bold) in this context (Kaggle), where overall accuracy of the model is what gets scored
# Train Random Forest using pclass, title & parch
rf.train.3 <- data.combined[1:891, c("Pclass", "Title", "Parch")]
set.seed(1234)
rf.3 <- randomForest(x = rf.train.3, y = rf.label, importance = TRUE, ntree = 1000)
rf.3
varImpPlot(rf.3)
# Both Parch and Sibsp have predictive power but one is better than the other (sibsp > parch) as illustrated by OOB error rates.
# Why not use them both?
# Train Random Forest using pclass, title, sibsp & parch
rf.train.4 <- data.combined[1:891, c("Pclass", "Title", "SibSp", "Parch")]
set.seed(1234) # setting the seed makes sure that any difference comes from the variables we put in
rf.4 <- randomForest(x = rf.train.4, y = rf.label, importance = TRUE, ntree = 1000)
rf.4
varImpPlot(rf.4)
# The combination of SibSp & Parch improves the model again (18.52% OOB, i.e., about 2 percentage points better than rf.1)
# Telling us that the number of siblings/spouses/children and parents probably matters
# "Women & children first" holds up
# "Richest survived most" holds up
# More children in second and third class most probably meant less chance of making it to the top
# Let's use the family variable we created
# Train Random Forest using pclass, title & family.size (combine Parch & SibSp)
rf.train.5 <- data.combined[1:891, c("Pclass", "Title", "Family.size")]
set.seed(1234)
rf.5 <- randomForest(x = rf.train.5, y = rf.label, importance = TRUE, ntree = 1000)
rf.5
varImpPlot(rf.5)
# The algo can't understand the concept of family size by itself simply by looking at Parch & SibSp (it looks at them separately)
# Interesting result: the total error rate is now 18.41%, a small but still relevant improvement
# Validates the intuition that family size matters and that the algo is not efficient at combining Parch & SibSp itself. It works better - not by a lot, but still
100 - 18.41 # Estimate of accuracy around 81.59% with just 3 features (cool)
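# A minimal, hedged sketch of scoring the held-out test rows (892:1309) with rf.5, using the same three features
# (the variable names below are my own):
test.submit.df <- data.combined[892:1309, c("Pclass", "Title", "Family.size")]
rf.5.preds <- predict(rf.5, test.submit.df)
table(rf.5.preds)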