-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathchi-square-funs.R
177 lines (135 loc) · 6.54 KB
/
chi-square-funs.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
data2RTQ = function(data) {
#Convert empirical data from a single condition to quantiles for diffusion
#model analysis.
#Assumes data are organized as an N x 2 matrix, where N indexes the number
#of trials in this condition. Column 1 of the data denotes choice (usually
#correct or error). Column 2 of the data denotes RT (currently in seconds).
#We can use the "choiceRT" output from simDiff as our data.
Qs = c(.1, .3, .5, .7, .9) #RT quantiles of interest for diffusion modeling
#STEP 1: Setting up the data
#The first thing we need to do with the data is to separate correct and
#error responses, as the RT distributions for these will be different.
#Loop that goes through the data and separates correct and error responses.
dataC = numeric(0)
dataE = numeric(0)
for (i in 1:dim(data)[1]) {
#for Trial #1 - #N
if (data[i, 1] == 1) {
#Correct response
dataC = c(dataC,data[i, 2]) #only take the second column entry, since we no longer need accuracy information
} else {
#Error response
dataE = c(dataE,data[i, 2])
}
}
#STEP 2: Identifying empirical RT quantiles for each type of response
#We now need to figure out where the [0.1, 0.3, 0.5, 0.7, and 0.9] RT
#quantiles are along the time axis for both Correct and Error responses.
#These quantiles will form the edges of RT "bins" in the data, which are
#used for parameter estimation using the chi-square method.
#Matlab has an in-built function for extracting quantiles, which I use
#here, but you can also do this manually by sorting RTs from fastest to
#slowest (for, say, corrects), and then figuring out where the 10th, 30th,
#50th, 70th, and 90th percentiles of the data are along the time axis.
binedge_c = quantile(dataC, Qs)
binedge_e = quantile(dataE, Qs)
#STEP 3: Compute the proportion of observations within each RT bin
#Estimating parameters using the chi-square method means that we need a way
#of aggregating the data so that we can compare predictions against
#observations.
#Due to how quantiles are defined, we know the proportion of observations
#that are within each bin. For example, the proportion of observations
#that are between the 0.1 and 0.3 quantiles is equal to 0.2 (i.e., 0.3 -
#0.1 = 0.2). It follows that the data for any experiment, when binned
#according to the RT quantiles we are interested in, will be:
#[0.1, 0.2, 0.2, 0.2, 0.2, 0.1]
#However, we need to scale these values by the observed correct/error rates
#so that the proportion of observations sum to 1.
pC = length(dataC) / dim(data)[1] #proportion of corrects
pE = length(dataE) / dim(data)[1] #proportion of errors
bindata_c = pC * c(.1, .2, .2, .2, .2, .1)
bindata_e = pE * c(.1, .2, .2, .2, .2, .1)
#What we now need to do is compare diffusion model predictions to our
#empirical data. To do so, we will need to compare the predicted proportion
#of responses that fall within the empirically defined RT quantiles (i.e.,
#the bin edges defined by "binedge_c" and "binedge_e"). If the model
#perfectly matches the data, the expected proportions will equal the
#observed proportions (i.e., the data vectors defined by "bindata_c" and
#"bindata_e").
return(list(binedge_c, binedge_e, bindata_c, bindata_e))
}
preds2RTQ = function(preds, binedge_c, binedge_e) {
#Convert model predictions for a single condition into RT Quantiles
#Assumes preds are organized as an N x 2 matrix, where N indexes the number
#of simulated trials in this condition. Conventions are otherwise the same
#as for "data2RTQ"
#We can use the "choiceRT" output from simDiff as our model predictions.
Qc = binedge_c
#Empirical RT quantiles for correct responses.
Qe = binedge_e
#Empirical RT quantiles for error responses.
#We now need to convert our model predictions (here, based on simulation),
#to the same RT-binned format as the data. The procedure is similar to how
#we dealt with data, but with a few changes.
#Loop that goes through the predictions and separates correct and error
#trials.
predC = numeric(0)
predE = numeric(0)
for (i in 1:dim(preds)[1]) {
#for Trial #1 - #N
if (preds[i, 1] == 1) {
#Correct response
predC = c(predC,preds[i, 2]) #only take the second column entry, since we no longer need accuracy information
} else {
#Error response
predE = c(predE,preds[i, 2])
}
}
#Convert frequencies to proportions (for later).
predPC = length(predC) / dim(preds)[1] #Predicted P(correct)
predPE = length(predE) / dim(preds)[1] #Predicted P(error)
#We now identify the proportion of *predicted* trials that fall within the
#*empirically-defined* bin edges for both correct and error trials.
#First, we need to sort the predicted RTs to make life easier.
predC = sort(predC)
predE = sort(predE)
#Now we see how many simulated trials fell between each of the bin edges.
#This is a matter of comparing the numbers of trials that fall between
#successive bin edges.
bindata_pc = c(
sum(predC <= Qc[1]),
sum(predC <= Qc[2]) - sum(predC <= Qc[1]),
sum(predC <= Qc[3]) - sum(predC <= Qc[2]),
sum(predC <= Qc[4]) - sum(predC <= Qc[3]),
sum(predC <= Qc[5]) - sum(predC <= Qc[4]),
sum(predC > Qc[5]) / dim(preds)[1]
)
bindata_pe = c(
sum(predE <= Qe[1]),
sum(predE <= Qe[2]) - sum(predE <= Qe[1]),
sum(predE <= Qe[3]) - sum(predE <= Qe[2]),
sum(predE <= Qe[4]) - sum(predE <= Qe[3]),
sum(predE <= Qe[5]) - sum(predE <= Qe[4]),
sum(predE > Qe[5]) / dim(preds)[1]
)
return(list(bindata_pc, bindata_pe))
}
chisqFit = function(data, preds, N) {
#Computes a chi-square statistic comparing diffusion model predictions
#against data. N denotes the number of trials per condition.
#The data are assumed to be an N x 12 matrix, where N denotes the number of
#experimental conditions (we're just doing 1 condition for our demo), and
#the RTQ-binned proportions of observations for correct responses followed
#by error responses.
#Model predictions ("preds") are assumed to be formatted in the same way.
#The simulated output from the Diffusion Model loop can be converted to
#quantiles using the "data2RTQ" function (just as we did for the data).
#I have included a double-summation to allow for the possibility of
#multiple conditions.
#Set minimum value of predicted counts to a small constant
#to avoid dividing by 0
preds = pmax(preds,1e-10)
#calculate chi-square
x2 = N * sum( sum( ( (data - preds ) ^ 2) / preds ) )
return(x2)
}