forked from ihmeuw/pcv-impact
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathits.r
178 lines (140 loc) · 7.93 KB
/
its.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# ---------------------------------------------------------------------------------------------------------------------------------------------
# David Phillips
#
# 3/1/2016
# Function that carries out interrupted time series analysis
# Inputs:
# * data - data table object in 'prepped' format (see below)
# * outcome - character. name of the outcome variable
# * cutpoint - date object containing the time point or points (up to 2) of intervention
# * slope - logical. TRUE indicates that an interaction term (or terms) should be used to estimate a different slope before/after intervetion
# * newEffectDate - date. date for which effect size should be calculated. defaults to cutoff if null, over-rides the regression coef if not null
# Outputs (in a list):
# * data - the input data object with six new columns: [outcome]_pred, [outcome]_pred_upper, [outcome]_pred_lower, [outcome]_cf, [outcome]_cf_upper, [outcome]_cf_lower,
# * outcome - character. name of the outcome variable
# * cutpoint - date object containing the time point or points (up to 2) of intervention
# * effect size - a data frame containing the intercept shift associated with intervention, including uncertainty
# * gof - goodness of fit based on BIC
# ---------------------------------------------------------------------------------------------------------------------------------------------
# To do
# - return all NA if the model fails
# Define function
its = function(data=NULL, outcome=NULL, cutpoint=NULL, slope=NULL, newEffectDate=NULL) {
# ------------------------------------------------------------------------------
# Handle inputs
# test
for(arg in c('data', 'outcome', 'cutpoint', 'slope')) {
if (is.null(get(arg))) stop(paste('Must provide', arg))
}
# essential variables
formulaVars = c(outcome, 'moyr')
if (!all(formulaVars %in% names(data))) stop('Essential variables not in data')
# cutpoint(s)
C = length(cutpoint)
if (C>2) stop('ITS not set up to handle more than 2 cutpoints')
if (C==1) start = end = cutpoint
if (C==2) {
start = cutpoint[1]
end = cutpoint[2]
duration = (end-start)[[1]]
}
# ------------------------------------------------------------------------------
# -------------------------------------------------------------------------------------------
# Set up/run regression
# generate ITS variables
data[, preIntervention:=moyr<=start]
data[, postIntervention:=moyr>=end]
data[, daysPostIntervention:=moyr-end]
data[daysPostIntervention<0, daysPostIntervention:=0]
if (C==2) data[, duringIntervention:=moyr>start & moyr<end]
if (C==2) data[, daysDuringIntervention:=moyr-start]
if (C==2) data[daysDuringIntervention<0, daysDuringIntervention:=0]
if (C==2) data[postIntervention==TRUE, daysDuringIntervention:=duration]
# make sure it's sorted
data = data[order(moyr)]
# store formula
if (!slope & C==1) f = paste(paste(formulaVars, collapse=' ~ '), '+ postIntervention')
if (slope & C==1) f = paste(paste(formulaVars, collapse=' ~ '), '+ daysPostIntervention')
if (!slope & C==2) f = paste(paste(formulaVars, collapse=' ~ '), '+ duringIntervention + postIntervention')
if (slope & C==2) f = paste(paste(formulaVars, collapse=' ~ '), '+ daysDuringIntervention + daysPostIntervention')
f = as.formula(f)
# run regression
fit = suppressWarnings(glm.nb(f, data))
# -------------------------------------------------------------------------------------------
# ---------------------------------------------------------------------
# Forecast exposure for XRCP
forecastFit = glm.nb(xrcp_exposure ~ moyr*month(moyr)*year(moyr), data)
data[, forecast:=exp(predict(forecastFit, newdata=data))]
data[is.na(xrcp_exposure), xrcp_exposure:=forecast]
data$forecast = NULL
# ---------------------------------------------------------------------
# -----------------------------------------------------------------------------------------------
# Predict
# store predictions
preds = predict(fit, type='link', se.fit=TRUE, newdata=data)
# exponentiate/include uncertainty and add to data
data[, (paste0(outcome,'_pred')):=exp(preds$fit)]
data[, (paste0(outcome,'_pred_upper')):=exp(preds$fit+1.95996*preds$se.fit)]
data[, (paste0(outcome,'_pred_lower')):=exp(preds$fit-1.95996*preds$se.fit)]
data[, (paste0(outcome,'_pred_se')):=preds$se.fit]
# linearly interpolate intervention period if two cutpoints are specified but slope isn't
if(!slope & C==2) {
# expected value
lmFit = lm(as.formula(paste0(outcome, '_pred ~ moyr')), data[moyr==start | moyr==end])
interpolation = predict(lmFit, newdata=data[duringIntervention==TRUE])
data[duringIntervention==TRUE, (paste0(outcome,'_pred')):=interpolation]
# upper
lmFit = lm(as.formula(paste0(outcome, '_pred_upper ~ moyr')), data[moyr==start | moyr==end])
interpolation = predict(lmFit, newdata=data[duringIntervention==TRUE])
data[duringIntervention==TRUE, (paste0(outcome,'_pred_upper')):=interpolation]
# lower
lmFit = lm(as.formula(paste0(outcome, '_pred_lower ~ moyr')), data[moyr==start | moyr==end])
interpolation = predict(lmFit, newdata=data[duringIntervention==TRUE])
data[duringIntervention==TRUE, (paste0(outcome,'_pred_lower')):=interpolation]
}
# -----------------------------------------------------------------------------------------------
# ---------------------------------------------------------------------------------
# Predict counterfactual
# modify coefficients so effect of intervention is zero
cfFit = copy(fit)
cfFit$coefficients[3:length(cfFit$coefficients)] = 0
# store counterfactual predictions
cfPreds = predict(cfFit, type='link', se.fit=TRUE, newdata=data)
# exponentiate/include uncertainty and add to data
data[, (paste0(outcome,'_pred_cf')):=exp(cfPreds$fit)]
data[, (paste0(outcome,'_pred_cf_upper')):=exp(cfPreds$fit+1.95996*cfPreds$se.fit)]
data[, (paste0(outcome,'_pred_cf_lower')):=exp(cfPreds$fit-1.95996*cfPreds$se.fit)]
# ---------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------------------------
# Store effect size and goodness of fit
# effect size
if (!slope) effect_size = data.table('effect'=cbind(coef(fit), confint(fit), sqrt(diag(vcov(fit))))['postInterventionTRUE',])
# effect size if slope or new date specified (fix me)
# note: this is based on the prediction/prediction interval, not the regression coefficients
if (slope | !is.null(newEffectDate)) {
refDate = start
if (!is.null(newEffectDate)) refDate = newEffectDate # use counterfactual as reference date
effect = log(data[[paste0(outcome,'_pred')]][data$moyr==refDate]) - log(data[[paste0(outcome,'_pred_cf')]][data$moyr==refDate])
effect_lower = log(data[[paste0(outcome,'_pred_upper')]][data$moyr==refDate]) - log(data[[paste0(outcome,'_pred_cf')]][data$moyr==refDate])
effect_upper = log(data[[paste0(outcome,'_pred_lower')]][data$moyr==refDate]) - log(data[[paste0(outcome,'_pred_cf')]][data$moyr==refDate])
effect_se = (effect_upper-effect)/1.95996
effect_size = data.table('effect'=c(effect, effect_upper, effect_lower, effect_se))
}
# GoF
gof = data.table(bic=BIC(fit), rmse=sqrt(mean((data[[paste0(outcome,'_pred')]]-data[[outcome]])^2, na.rm=TRUE)))
# ----------------------------------------------------------------------------------------------------------------
# -------------------------------------------
# Clean up
# remove ITS variables
data$preIntervention = NULL
data$postIntervention = NULL
data$daysPostIntervention = NULL
if (C==2) data$duringIntervention = NULL
if (C==2) data$daysDuringIntervention = NULL
# -------------------------------------------
# -----------------------------------------------------------------------------
# Return output
return(list('data'=data, 'outcome'=outcome, 'cutpoint'=cutpoint,
'effect_size'=effect_size, 'gof'=gof, newEffectDate=newEffectDate))
# -----------------------------------------------------------------------------
}