-
Notifications
You must be signed in to change notification settings - Fork 2
/
log_regression_hanna_csv_Rproject.Rmd
480 lines (313 loc) · 15.2 KB
/
log_regression_hanna_csv_Rproject.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
---
title: "logistic_regression"
author: "Hanna Buechi"
date: "2/28/2019"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
###Load packages (incl. for sentiment analysis)
```{r}
library(tidyverse)
#library(googledrive)
#library(purrr)
library(readr)
#library(stringr)
#library(gsubfn)
#library(devtools)
library(dplyr)
#library(tidytext)
#library(broom)
#library(data.table)
#library(ggplot2)
#library(dotwhisker)
#library(jtools)
#library(sjPlot)
#library(jtools)
#library(ggeffects)
library(pscl)
library(car)
library(InformationValue)
library(boot)
ventyx_df_google_Sen <- read_csv("ventyx_df_google_Sen_022219.csv")
## Make sure Impact is coded correctly
# I don't have the updated CSV that Margaux put on the Bren G: drive
ventyx_df_google_Sen <- ventyx_df_google_Sen %>%
mutate(H_L_W = ifelse(lr_tif > "0", "Low", "High"))
```
###Logistic regression
####Organize factors and levels for categorical variables
```{r organize_for_regressions}
ventyx_df_google_Sen$H_L_W <- as.factor(ventyx_df_google_Sen$H_L_W)
ventyx_df_google_Sen$H_L_W <- relevel(ventyx_df_google_Sen$H_L_W, ref = "High")
ventyx_df_google_Sen$PopDensity <- as.numeric(ventyx_df_google_Sen$PopDensity)
ventyx_df_google_Sen$Household_ <- as.numeric(ventyx_df_google_Sen$Household_)
ventyx_df_google_Sen$View_Score <- as.numeric(ventyx_df_google_Sen$View_Score)
ventyx_df_google_Sen$Sign_Google <- as.factor(ventyx_df_google_Sen$Sign_Google)
ventyx_df_google_Sen$OnlyCancel <- as.factor(ventyx_df_google_Sen$OnlyCancel)
ventyx_df_google_Sen$State <- as.factor(ventyx_df_google_Sen$State)
## Set reference state to Texas because I don't have the state abbrev. column
ventyx_df_google_Sen$State <- relevel(ventyx_df_google_Sen$State, ref = "Texas")
### adding normalization to df
#
# ventyx_df_NU_google_Sen_norm <- ventyx_df_NU_google_Sen %>%
# select(-starts_with("X")) %>%
# mutate(Capacity_2 = normalize(Capacity),
# View_Score_2 = normalize(View_Score),
# Google_Sentiment_2 = normalize(Google_Sentiment),
# NU_Sentiment_2 = normalize(NU_Sentiment),
# PopDensity_mi_2 = normalize(PopDensity_mi),
# Household_MedianIncome_2 = normalize(Household_MedianIncome),
# members_env_2 = normalize(members_env))
```
###Finalize data for regression
####Make publicity NAs = 0
```{r finalize_ventyx_for_regression}
# Make NA scores 0
ventyx_df_google_Sen <- ventyx_df_google_Sen %>%
mutate(Google_Sentiment = ifelse(is.na(Google_Sentiment),0,Google_Sentiment)) %>%
mutate(Canceled = ifelse(OnlyCancel == "Yes",1,0))
ventyx_df_google_Sen <- ventyx_df_google_Sen %>%
filter(Capacity >= 0) %>%
filter(TimelineDa > 0) %>%
filter(!lr_tif < 0) %>%
filter(!View_Score < 0)
### For descriptive stats
#write.csv(ventyx_df_NU_google_Sen, "~/Desktop/ventyx_df_NU_google_Sen_01.18.19.csv")
```
```{r prep}
## For some kind of visualization?
renamed_variables <- c(View_Score = "View Score",
Google_Sentiment = "Google News Publicity score",
NU_Sentiment = "Nexis Uni Publicity score",
PopDensity_mi = "Population Density per m2",
Household_MedianIncome = "Household Median Income",
Capacity = "Capacity",
H_L_WLow = "High vs. low impact area"
# H_L_W*NU_Sentiment = "site vs. sentiment interaction"
)
## Subset data
ventyx_Operating <- ventyx_df_google_Sen %>%
filter(OnlyCancel == "No" & Artificial == "No" & OperatingD == "No") # why do we take OperatingD out?
ventyx_Operating_Canceled <- ventyx_df_google_Sen %>%
filter(Artificial == "No" & OperatingD == "No")
ventyx_Cancelled <- ventyx_df_google_Sen %>%
filter(OnlyCancel == "Yes" & OperatingD == "No" & Artificial == "No")
Ventyx_Operating_NoZeros <- ventyx_df_google_Sen %>%
filter(OnlyCancel == "No" & Artificial == "No" & OperatingD == "No") %>%
filter(Google_Sentiment != 0)
Ventyx_Operating_Canceled_NoZeros <- ventyx_df_google_Sen %>%
filter(Artificial == "No" & OperatingD == "No") %>%
filter(Google_Sentiment != 0) %>%
filter(State != "Arkansas")
#ventyx_Operating_Canceled$OnlyCancel <- relevel(ventyx_Operating_Canceled$OnlyCancel, ref = "No")
# write_csv(ventyx_df_google_Sen, "G:/Data/VENTYX_DATASET/ventyx_df_google_Sen_022219_2.csv")
```
###LOGISTIC REGRESSION
```{r log_regression}
#For the purposes of this regression, we first want to subset our data into a section that we used to train the model, and a section for testing the predicted probabilities later on.
train <- ventyx_Operating_Canceled[1:800,]
test <- ventyx_Operating_Canceled[801:871,]
#To test the reversal of values if we switch "High" impact areas to "Low" and vice verse
test_reversed <- test %>%
mutate(H_L_W = ifelse(H_L_W == "High", "Low","High"))
#Next, we run the logistic regression
logistic_reg_all <- glm(Canceled ~ TimelineDa + View_Score + Google_Sentiment + PopDensity + Household_ + Capacity + H_L_W + State + members_env + Google_Sentiment*H_L_W, family = binomial, data = train)
#View model results
summary(logistic_reg_all) #coefficients represent log odds
exp(coef(logistic_reg_all)) #exponentiated coefficients
#Run anova to compare null deviance vs. residual deviance. The greater the difference the better.
anova(logistic_reg_all, test="Chisq")
#Diagnostic plots
glm.diag.plots(logistic_reg_all)
#To view the logistic model equivalent of R-squared
pR2(logistic_reg_all)
#Check for multicollinearity
vif(logistic_reg_all) #low multicollinearity
#Now we predict the probability of the test projects being canceled
pred_probs <- predict(logistic_reg_all, newdata=test, type='response', se.fit = TRUE)
pred_probs_df <- data.frame(test, pred_probs$fit, pred_probs$se.fit)
pred_probs_df <- pred_probs_df %>%
rename(Probability = pred_probs.fit) %>%
rename(Publicity = Google_Sentiment)
#Now predict probabilities for reversed values
pred_probs_reversed <- predict(logistic_reg_all, newdata=test_reversed, type='response', se.fit = TRUE)
pred_probs_df_reversed <- data.frame(test, pred_probs_reversed$fit, pred_probs_reversed$se.fit)
pred_probs_df_reversed <- pred_probs_df_reversed %>%
rename(Probability = pred_probs_reversed.fit) %>%
rename(Publicity = Google_Sentiment)
#Probability means
mean(pred_probs_df$Probability)
mean(pred_probs_df_reversed$pred_probs_reversed.fit)
# pred_low <- pred_probs_df %>%
# filter(H_L_W == "Low")
# pred_high <- pred_probs_df %>%
# filter(H_L_W == "High")
# mean(pred_low$Probability)
# mean(pred_high$Probability)
#To evaluate the misclassification of predicted cancelations and actual cancelations -- essentially, how accurate are the predictions?
optCutOff <- optimalCutoff(test$Canceled, pred_probs$fit)
misClassError(test$Canceled, pred_probs$fit, threshold = optCutOff) #~14% misclassification error, so overall pretty accurate
#Plot ROC, which traces the percentage of true positives accurately predicted by the model as the prediction probability cutoff is lowered from 1 to 0.
plotROC(test$Canceled, pred_probs$fit) #the graph has a steep curve, which indicated a good quality model
#Measure concordance, which measures how well high probability scores match up with cancelation values, and vice versa.
Concordance(test$Canceled, pred_probs$fit) #a concordance of ~0.9 is a high quality model
```
###Calculate probability of cancellation of other projects
```{r}
ventyx_Truncated <- ventyx_df_google_Sen %>%
filter(Artificial == "Yes")
# same workflow as Alex's predictions above
# Timelines ARE included
pred_probs_trunc <- predict(logistic_reg_all, newdata=ventyx_Truncated, type='response', se.fit = TRUE)
pred_probs_trunc_df <- data.frame(ventyx_Truncated, pred_probs_trunc$fit, pred_probs_trunc$se.fit)
pred_probs_trunc_df <- pred_probs_trunc_df %>%
rename(Probability = pred_probs_trunc.fit) %>%
rename(Publicity = Google_Sentiment)
```
### Mean Wind Project: Probability of Cancellation
####Part 1. Define the average wind project
For numeric variables, I used the mean. For categorical variables, I used the most common (ie. Texas and high-risk).
```{r}
# summary(ventyx_Operating_Canceled)
mean(ventyx_Operating_Canceled$TimelineDa) # 1149.821
sd(ventyx_Operating_Canceled$TimelineDa) # 827.1744
mean(ventyx_Operating_Canceled$View_Score) # 65.11366
sd(ventyx_Operating_Canceled$View_Score) # 50.58458
mean(ventyx_Operating_Canceled$Google_Sentiment) # 0.2926
sd(ventyx_Operating_Canceled$Google_Sentiment) # 0.5677
mean(ventyx_Operating_Canceled$PopDensity) # 57.90
sd(ventyx_Operating_Canceled$PopDensity) # 189.07
mean(ventyx_Operating_Canceled$Household_, na.rm = TRUE) # 50516.88, Pine Ridge Wind Farm does not have a household income
sd(ventyx_Operating_Canceled$Household_, na.rm = TRUE) # 9514.333
mean(ventyx_Operating_Canceled$Capacity) # 117.08
sd(ventyx_Operating_Canceled$Capacity) # 155.12
mean(ventyx_Operating_Canceled$members_env) # 1346.125
sd(ventyx_Operating_Canceled$members_env) # 2853.518
# repeat for canceled and operating projects
# Mean project:
# Canceled ~ TimelineDa + View_Score + Google_Sentiment + PopDensity + Household_ + Capacity + H_L_W + State + members_env + Google_Sentiment*H_L_W
mean_project <- c(1149.821, 65.11366, 0.2926, 57.90, 50516.88, 117.08, "High", "Texas", 1346.125)
mean_project <- as.data.frame(mean_project)
average_project <- mean_project # change name of df so that row and df aren't the same name
average_project <- t(average_project)
# rename columns
colnames(average_project) <- c("TimelineDa", "View_Score", "Google_Sentiment", "PopDensity", "Household_", "Capacity", "H_L_W", "State", "members_env")
# prep df for regression: it ends up as a matrix after wrangling above, variables need to be correct classes
average_project <- as.data.frame(average_project)
average_project$TimelineDa <- as.numeric(average_project$TimelineDa)
average_project$View_Score <- as.numeric(average_project$View_Score)
average_project$Google_Sentiment <- as.numeric(average_project$Google_Sentiment)
average_project$PopDensity <- as.numeric(average_project$PopDensity)
average_project$Household_ <- as.numeric(average_project$Household_)
average_project$Capacity <- as.numeric(average_project$Capacity)
average_project$H_L_W <- as.factor(average_project$H_L_W)
average_project$State <- as.factor(average_project$State)
average_project$members_env <- as.numeric(average_project$members_env)
# canceled_mean
# operating_mean
```
Repeat for median project.
```{r}
median(ventyx_Operating_Canceled$TimelineDa) # 989 (mean = 1149.821)
median(ventyx_Operating_Canceled$View_Score) # 54 (mean = 65.11366)
median(ventyx_Operating_Canceled$Google_Sentiment) # 0 includings NAs = 0, (mean = 0.2926)
median(Ventyx_Operating_Canceled_NoZeros$Google_Sentiment) # 0.899816 no 0's
median(ventyx_Operating_Canceled$PopDensity) # 16.7 (mean = 57.90)
median(ventyx_Operating_Canceled$Household_, na.rm = TRUE) # 49693.5 (mean = 50516.88), Pine Ridge Wind Farm does not have a household income
median(ventyx_Operating_Canceled$Capacity) # 95 (mean = 117.08)
median(ventyx_Operating_Canceled$members_env) # 222 (mean = 1346.125)
med_project <- c(989, 54, 0, 16.7, 49693.5, 95, "High", "Texas", 222)
med_project <- as.data.frame(med_project)
median_project <- med_project # change name of df so that row and df aren't the same name
median_project <- t(median_project)
# rename columns
colnames(median_project) <- c("TimelineDa", "View_Score", "Google_Sentiment", "PopDensity", "Household_", "Capacity", "H_L_W", "State", "members_env")
# prep df for regression: it ends up as a matrix after wrangling above, variables need to be correct classes
median_project <- as.data.frame(median_project)
median_project$TimelineDa <- as.numeric(median_project$TimelineDa)
median_project$View_Score <- as.numeric(median_project$View_Score)
median_project$Google_Sentiment <- as.numeric(median_project$Google_Sentiment)
median_project$PopDensity <- as.numeric(median_project$PopDensity)
median_project$Household_ <- as.numeric(median_project$Household_)
median_project$Capacity <- as.numeric(median_project$Capacity)
median_project$H_L_W <- as.factor(median_project$H_L_W)
median_project$State <- as.factor(median_project$State)
median_project$members_env <- as.numeric(median_project$members_env)
```
Part 2. Use logistic model to predict probability of cancellation
```{r}
mean_pred_probs <- predict(logistic_reg_all, newdata=average_project, type='response', se.fit = TRUE)
mean_pred_probs_df <- data.frame(average_project, mean_pred_probs$fit, mean_pred_probs$se.fit)
mean_pred_probs_df <- mean_pred_probs_df %>%
rename(Probability = mean_pred_probs.fit) %>%
rename(Publicity = Google_Sentiment)
# The average wind project, based on our data, has a 2% chance of being canceled
med_pred_probs <- predict(logistic_reg_all, newdata=median_project, type='response', se.fit = TRUE)
med_pred_probs_df <- data.frame(median_project, med_pred_probs$fit, med_pred_probs$se.fit)
med_pred_probs_df <- med_pred_probs_df %>%
rename(Probability = med_pred_probs.fit) %>%
rename(Publicity = Google_Sentiment)
# same result
```
The average wind project, based on our data, has a 2% change of being canceled. This seems low?
### Distribution visualizations
```{r publicity_score}
# operating and canceled
ggplot(ventyx_Operating_Canceled, aes(x = Google_Sentiment)) + # with 0s
geom_histogram() +
xlim(-1.5, 2) +
ylim(0, 610) +
geom_vline(aes(xintercept = mean(Google_Sentiment)),col='red') +
geom_vline(aes(xintercept = median(Google_Sentiment)),col='green')
ggplot(filter(ventyx_Operating_Canceled, Google_Sentiment != 0), aes(x = Google_Sentiment)) + # without 0s
geom_histogram() +
xlim(-1.5, 2)
# just canceled
ggplot(ventyx_Cancelled, aes(x = Google_Sentiment)) + # with 0s
geom_histogram() +
xlim(-1.5, 2)
ggplot(filter(ventyx_Cancelled, Google_Sentiment != 0), aes(x = Google_Sentiment)) + # without 0s
geom_histogram() +
xlim(-1.5, 2)
# just operating
ggplot(ventyx_Operating, aes(x = Google_Sentiment)) + # with 0s
geom_histogram() +
xlim(-1.5, 2)
ggplot(filter(ventyx_Operating, Google_Sentiment != 0), aes(x = Google_Sentiment)) + # without 0s
geom_histogram() +
geom_vline(aes(xintercept = mean(Google_Sentiment)),col='red') +
xlim(-1.5, 2)
```
```{r timeline}
# operating and canceled
ggplot(ventyx_Operating_Canceled, aes(x = TimelineDa)) +
geom_histogram()
# just canceled
ggplot(ventyx_Cancelled, aes(x = TimelineDa)) +
geom_histogram() +
xlim(0, 4500) +
ylim(0, 100)
# just operating
ggplot(ventyx_Operating, aes(x = TimelineDa)) +
geom_histogram() +
xlim(0, 4500) +
ylim(0, 100)
```
####Distribution of publicity scores
I'm thinking about how to tell the publicity story to developers. How do our results impact their process? How can we help them choose locations?
Where are the projects with good publicity and bad publicity?
- Spatial distribution of publicity by county and state (operating projects and cancelled projects)
```{r}
# group by state --> summary statistics
op_by_state <- ventyx_Operating_Canceled %>%
group_by(State) %>%
tally()
# summarise(mean_pub = mean(Google_Sentiment),
# SE = sd(Google_Sentiment))
can_by_state <- ventyx_Operating_Canceled %>%
group_by(State) %>%
summarise(mean_pub = mean(Google_Sentiment),
SE = sd(Google_Sentiment))
```