Skip to content

Commit

Permalink
add optional --residfor argument to accept user specified variable(s)…
Browse files Browse the repository at this point in the history
… from demographics matrix to residualize input for
  • Loading branch information
alyssadai committed Jan 4, 2022
1 parent 146728c commit 7da36ae
Showing 1 changed file with 44 additions and 9 deletions.
53 changes: 44 additions & 9 deletions vertex/define_splits.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from scipy.io import savemat, loadmat
import sklearn
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import linear_model
import argparse

parser=argparse.ArgumentParser(
Expand All @@ -35,12 +36,30 @@
group.add_argument(
"--norm", help='z scoring direction', default='all', choices=['all','vertex','subject'])

group.add_argument(
"--residfor", help='measures to residualize for, must be in demographic spreadsheet', metavar='list', nargs='*')

args=parser.parse_args()

def save_mat(x,key,fname):
print("Saving ", np.shape(x), key, "to", fname)
scipy.io.savemat(fname, {'X': x})

# mx_raw = vertex x subject matrix, x = subject x variables of interest matrix ??
def residualize_mx(mx_raw, x):
mx_raw = mx_raw.transpose() # result = n_subjects x n_vertices
n_subjects, n_vertex = np.shape(mx_raw)
mx_resid = np.zeros_like(mx_raw)
regr = linear_model.LinearRegression()
for vertex in range(0,n_vertex):
y = mx_raw[:,vertex].reshape(n_subjects,1)
regr.fit(x,y)
predicted=regr.predict(x)
resid=y-predicted
mx_resid[:,vertex] = resid.flatten() # collapse into 1D
mx_resid = mx_resid.transpose()
return mx_resid

#lookup dictionary for normalization, used to set axis in z scoring
norm_lookup = {
'all' : None,
Expand All @@ -59,6 +78,13 @@ def save_mat(x,key,fname):
demo_vars.append(x)
demo = df_sorted[demo_vars].values

# create matrix with variables to residualize for
resid_vars = []
#resid_vars.append(args.id_col)
for x in args.residfor:
resid_vars.append(x)
# resid_vars = df_sorted[resid_vars].values - not really needed

#define train data as subj ids (x)
#define categorical vars as vars to stratify by (y, ie labels)
X = demo[:,0:1]
Expand All @@ -78,7 +104,7 @@ def save_mat(x,key,fname):
for train_index, test_index in sss.split(X, y):
Asplits_indices[str(iter)] = train_index;
Bsplits_indices[str(iter)] = test_index;

ID_list = []
s = train_index[0]
ID_list.append(df_sorted[args.id_col].iloc[s])
Expand Down Expand Up @@ -118,16 +144,26 @@ def save_mat(x,key,fname):
data_all = data_dict[metric]
#get data_a and data_b, containing ct data for A indicies and B indices
data_a = data_all[:,Asplits_indices[str(split)]]; data_b = data_all[:,Bsplits_indices[str(split)]]
#z score each
a_mx_wb = scipy.stats.zscore(data_a,axis=norm_lookup[args.norm])
b_mx_wb = scipy.stats.zscore(data_b,axis=norm_lookup[args.norm])

#repeat for each metric
resid_vars_a = resid_vars[Asplits_indices[str(split)],:]; resid_vars_b = resid_vars[Bsplits_indices[str(split)],:]
#z score each
if args.residfor is not None:
a_mx_wb = scipy.stats.zscore(residualize_mx(data_a, resid_vars_a),axis=norm_lookup[args.norm])
b_mx_wb = scipy.stats.zscore(residualize_mx(data_b, resid_vars_b),axis=norm_lookup[args.norm])
else:
a_mx_wb = scipy.stats.zscore(data_a,axis=norm_lookup[args.norm])
b_mx_wb = scipy.stats.zscore(data_b,axis=norm_lookup[args.norm])

#repeat for each metric
for metric in input_list[1:]:
data_all = data_dict[metric]
data_a = data_all[:,Asplits_indices[str(split)]]; data_b = data_all[:,Bsplits_indices[str(split)]]
data_a_z = scipy.stats.zscore(data_a,axis=norm_lookup[args.norm]) #zscore
data_b_z = scipy.stats.zscore(data_b,axis=norm_lookup[args.norm])
resid_vars_a = resid_vars[Asplits_indices[str(split)],:]; resid_vars_b = resid_vars[Bsplits_indices[str(split)],:]
if args.residfor is not None:
data_a_z = scipy.stats.zscore(residualize_mx(data_a, resid_vars_a),axis=norm_lookup[args.norm])
data_b_z = scipy.stats.zscore(residualize_mx(data_b, resid_vars_b),axis=norm_lookup[args.norm])
else:
data_a_z = scipy.stats.zscore(data_a,axis=norm_lookup[args.norm]) #zscore
data_b_z = scipy.stats.zscore(data_b,axis=norm_lookup[args.norm])
a_mx_wb = np.concatenate((a_mx_wb,data_a_z),axis=1) #append z scored data for this metric to the rest
b_mx_wb = np.concatenate((b_mx_wb,data_b_z),axis=1)

Expand All @@ -138,4 +174,3 @@ def save_mat(x,key,fname):
#write out
save_mat(a_mx_shift_wb, 'split a_' + str(split), out_dir + "a_" + str(split) + ".mat")
save_mat(b_mx_shift_wb, 'split b_' + str(split), out_dir + "b_" + str(split) + ".mat")

0 comments on commit 7da36ae

Please sign in to comment.