-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnpmvtolregion(error-when-m-greater-than-0).py
286 lines (259 loc) · 12 KB
/
npmvtolregion(error-when-m-greater-than-0).py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
import pandas as pd
import numpy as np
import scipy.stats as st
from scipy.spatial.distance import mahalanobis
def length(x):
if type(x) == float or type(x) == int or type(x) == np.int32 or type(x) == np.float64 or type(x) == np.float32 or type(x) == np.int64:
return 1
return len(x)
def npmvtolregion(x, depthfn, alpha = None, P = None, Beta = None, adjust = "no",
type1 = ["central"], slower = None, scenter = None, supper = None,
L = -np.inf, U = np.inf):
'''
Nonparametric Multivariate Hyperrectangular Tolerance Regions
Description
Provides depth-based multivariate central or semi-space nonparametric
tolerance regions. These can be calculated for any continuous multivariate
data set. Either (P, 1-alpha) tolerance regions or beta-expectation
tolerance regions can be specified.
Usage
npmvtol.region(x, alpha = None, P = None, Beta = None, depthfn,
adjust = ["no", "floor", "ceiling"],
type1 = ["central", "semispace"],
slower = None, center = None, upper = None),
L = -np.inf, U = np.inf)
Parameters
----------
x: dataframe
An nxp matrix of data assumed to be drawn from a p-dimensional
multivariate distribution. n pertains to the sample size.
alpha: float
The level chosen such that 1-alpha is the confidence level. Note that
if a (P, 1-alpha) tolerance region is required, then both alpha and P
must be specified, but Beta must be set to None.
P: float
The proportion of the population to be covered by this tolerance
interval. Note that if a (P, 1-alpha) tolerance region is required,
then both alpha and P must be specified, but Beta must be set to None.
Beta: float
The confidence level for a beta-expectation tolerance region. Note
that if a beta-expectation tolerance region is required, then Beta
must be specified, but both alpha and P must be set to None.
depthfn: function
The data depth function used to perform the ordering of the multivariate data. Thus function must be coded in such a way that the first argument is multivariate data for which to calculate the depth values and the second argument is the original multivariate sample, x. For the purpose of this tolerance region calculation, these two arguments should both be the original multivariate sample.
adjust: string, optional
Whether an adjustment should be made during an intermediate
calculation for determining the number of points that need to be
included in the multivariate region. If adjust = "no", the default,
then no adjustment is made during the intermediate calculation. If
adjust = "floor", then the intermediate calculation is rounded down to
the next nearest integer. If adjust = "ceiling", then the intermediate
calculation is rounded up to the next nearest integer.
type1: string, optional
The type of multivariate hyperrectangular region to calculate. If
type = "central", then two-sided intervals are reported for each
dimension of the data x. If type = "semispace", then a combination of
one-sided intervals and two-sided intervals are reported for the
dimensions of x. Which interval is calculated for each dimension in
this latter setting is dictated by the semi.order argument.
supper, scenter, slower: 1,2, or 3, unique
If type = "semispace", then this argument must be specified. Each
element gives the indices of the dimensions of x for which the type of
interval should be calculated. Indices specified for the element of
lower will return one-sided lower limits for those dimensions, indices
specified for the element of center will return two-sided intervals
for those dimensions, and indices specified for the element of upper
will return one-sided upper limits for those dimensions.
L: float, optional
If type = "semispace", these are the lower limits for any dimensions
for which one requests one-sided upper limits.
U: float, optional
If type = "semispace", these are the upper limits for any dimensions
for which one requests one-sided lower limits.
Returns
-------
npmvtolregion returns a px2 matrix where the columns give the lower and
upper limits, respectively, of the multivariate hyperrectangular tolerance
region.
References
Young, D. S. and Mathew, T. (2020+), Nonparametric Hyperrectangular
Tolerance and Prediction Regions for Setting Multivariate Reference
Regions in Laboratory Medicine, Submitted.
Examples
x = np.array([[1,4,10,10.45],[9,4,11,11.1],[4,7,9,9.8]]).T
x = pd.DataFrame(x)
npmvtolregion(x, depthfn=1,alpha = 0.05, P = 0.99, type1 ='semispace',
adjust = 'no',slower = 2, supper = 1, scenter = 3)
'''
if not ((alpha is None and P is None and Beta is not None) or (alpha is not None and P is not None and Beta is None)):
print("Either alpha and P, (exclusive) or Beta must be specified!")
if adjust not in ['no','floor','ceiling']:
return "adjust must be one of 'no','floor', or 'ceiling'."
n = length(x.iloc[:,0])
p = length(x.iloc[0])
typevec = [type1,]*p
semiorder = np.array([slower,scenter,supper])
slower -=1
scenter -=1
supper -=1
if type1 == 'semispace':
if length(list(set(semiorder))) != length(semiorder):
return "All indices must be uniquely specified in the list semiorder!"
if 1 not in semiorder:
return "slower, supper, and scenter must any permutation of 1,2,3"
if 2 not in semiorder:
return "slower, supper, and scenter must any permutation of 1,2,3"
if 3 not in semiorder:
return "slower, supper, and scenter must any permutation of 1,2,3"
typevec[slower] = 'lower'
typevec[scenter] = 'central'
typevec[supper] = 'upper'
if p<2:
return "This procedure only works when x has two or more dimensions"
if type1 == 'central':
side = int(2*p)
else:
side = sum(np.array(list(map(length,semiorder)))*[1,2,1])
if Beta is None:
rn= n*P + st.norm.ppf(1-alpha)*np.sqrt(n*(P*(1-P)))
rn = [np.floor(rn),np.ceil(rn)]
if adjust == 'no':
if n+1-rn[0] == 0:
n1rn0 = n+1-rn[0]+1e-14
else:
n1rn0 = n+1-rn[0]
if n+1-rn[1] == 0:
n1rn1 = n+1-rn[1]+1e-14
else:
n1rn1 = n+1-rn[1]
Pb = np.array([st.beta.sf(P, rn[0], n1rn0),st.beta.sf(P, rn[1], n1rn1)])
m = n - (rn[np.max(np.where(np.abs((1-alpha)-Pb)))])-side
elif adjust == 'floor':
m = n - rn[0] - side
else:
m = n - rn[1] - side
else:
rn = (n+1)*Beta
rn = np.array([np.floor(rn),np.ceil(rn)])
if adjust == 'no':
m = n - rn[int(np.min(np.abs(Beta-rn/(n+1))))]
elif adjust == 'floor':
m = n - rn[0]
else:
m = n - rn[1]
#when m > 0, there's unknown issues.
if m > 0:
dx = np.array(depthfn(x,x))
dx = pd.DataFrame(dx)
maxdepth = dx[dx == max(dx.iloc[:,0])]
maxdepth = maxdepth.dropna().index
y = pd.DataFrame(x.iloc[maxdepth]).T
y.columns = range(length(maxdepth))
y = np.array(y)
yy = []
for i in range(length(y[0])):
for j in range(length(y)):
yy.append(y[j][i])
y = pd.DataFrame(y)
yy = np.array(yy).reshape(length(y.iloc[:,0]),length(y.iloc[0])).T
yy = pd.DataFrame(yy)
centx = []
for i in range(length(yy.iloc[0])):
centx.append(np.mean(yy.iloc[:,i]))
euclidean = pd.DataFrame(np.sqrt((scale(x, c = centx, sc=False)**2).T.sum().values))
allx = []
for i in range(p):
temp = pd.concat([x,dx,pd.DataFrame(range(1,n+1)),euclidean],axis = 1)
emp = ['']*p
tcolumns = [emp,'dx','range','euclidean']
tcolumns = flatten(tcolumns)
temp.columns = tcolumns
if typevec[i] == 'lower':
idxs = (-x.iloc[:,i]).argsort()
else:
idxs = (x.iloc[:,i]).argsort()
temp.index = idxs
temp = temp.sort_index()
allx.append(temp)
newallxred = []
allxred = []
#fix this###########################################################
for i in range(int(m-1)):
for j in range(p):
idxx = []
if (typevec[j] == 'central'):
idxx.append(allx[j].iloc[0*(typevec[j]=='central')])
idxx.append(allx[j].iloc[n-(i+1)])
allxred.append((pd.concat(idxx,axis = 1).T))
else:
idxx.append(allx[j].iloc[n-(i+1)])
allxred.append(pd.concat(idxx,axis = 1).T)
####################################################################
allxred = pd.concat(allxred)
for j in range(int(length(allxred)/(m-1))):
newallxred.append(pd.concat([allxred.iloc[j*int(m-1):int(m-1)*(j+1)]]))
#print(newallxred)
for i in range(length(newallxred)):
tmpind = np.where(newallxred[i].iloc[:,p] == min(newallxred[i].iloc[:,p]))[0]
if length(tmpind) > 1:
newallxred[i] = newallxred[i].iloc[tmpind]
ind2remove = newallxred[i].iloc[np.max(np.where(newallxred[i].iloc[:,p+2])),p+1]
else:
ind2remove = newallxred[i].iloc[np.min(np.where(newallxred[i].iloc[:,p])),p+1]
for j in range(p):
which = np.where(allx[j].iloc[:,p+1] == ind2remove)[0]
try:
allx[j] = allx[j].drop(which)
except:
break
#print(allx)
LIMITS = allx[0].iloc[:,:p].T
#print(LIMITS)
else:
LIMITSmin = []
LIMITSmax = []
for i in range(length(x.iloc[0])):
LIMITSmin.append(min(x.iloc[:,i]))
LIMITSmax.append(max(x.iloc[:,i]))
LIMITSmin = pd.DataFrame(LIMITSmin)
LIMITSmax = pd.DataFrame(LIMITSmax)
LIMITS = pd.concat([LIMITSmin,LIMITSmax],axis=1)
LIMITS.columns = ['Lower','Upper']
if type1 == 'semispace':
LIMITS.iloc[supper,0] = L
LIMITS.iloc[slower,1] = U
Lim_idx = []
for i in range(length(LIMITS.iloc[:,0])):
Lim_idx.append(f'X{i+1}')
LIMITS.index = [Lim_idx]
return LIMITS
def flatten(A):
rt = []
for i in A:
if isinstance(i,list): rt.extend(flatten(i))
else: rt.append(i)
return rt
def scale(y, c=True, sc=True):
x = y.copy()
if c is True:
x -= x.mean()
if type(c) == list:
x = x.apply(lambda x: x-c, axis = 1)
if sc and c:
x /= x.std()
elif sc:
x /= np.sqrt(x.pow(2).sum().div(x.count() - 1))
return x
def mahalanobisR(X,meanCol,IC):
m = []
for i in range(X.shape[0]):
m.append(mahalanobis(X.iloc[i,:],meanCol,IC) ** 2)
return m
def mdepth(pts,x):
return mahalanobisR(pts, meanCol=np.array([0,0,0]), IC=np.diag([1,1,1]))
# x = np.array([[1,4,10,10.45],[9,4,11,11.1],[4,7,9,9.8]]).T
# #x = np.array([[1,4,10.45],[9,4,11.1],[4,7,9.8]]).T
# x = pd.DataFrame(x)
# #print(npmvtolregion(x, depthfn=mdepth, Beta = -.1, type1 ='semispace', adjust = 'no',slower = 2, supper = 1, scenter = 3))
# print(npmvtolregion(x, depthfn=1,alpha = 0.05, P = 0.99, type1 ='semispace', adjust = 'no',slower = 2, supper = 1, scenter = 3))
print("There is an error in this function when m > 0.")