-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathevaluate_the_impact_of_hvg.py
66 lines (56 loc) · 2.51 KB
/
evaluate_the_impact_of_hvg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from myUtil import *
import scanpy as sc
import warnings
import anndata as ad
from scipy import sparse
from scipy import stats
import signal
import multiprocessing
import pickle
from tqdm import tqdm
warnings.filterwarnings('ignore')
def cal_result(dirName):
os.chdir(dirName)
if not os.path.isfile('mixscape_filter.h5ad'): return
adata1 = sc.read_h5ad('mixscape_filter.h5ad')
adata1.uns['log1p']["base"] = None
sc.pp.highly_variable_genes(adata1, n_top_genes=10000)
all_hvgs = adata1[:, adata1.var['highly_variable']].var_names
perts = adata1.obs['gene'].unique()
perts = [i for i in perts if i !='CTRL']
with open('missingDEG_ratio1.tsv', 'w') as fout:
fout.write('Pertb\tMissingNums\ttotalNums\tMissingRatio\n')
for pert in tqdm(perts):
mylist = []
adata3 = adata1[adata1.obs['gene'].isin(['CTRL', pert])]
adata3.uns['log1p']["base"] = None
sc.pp.highly_variable_genes(adata3, subset=True, n_top_genes=4000)
sc.tl.rank_genes_groups(adata3, 'gene', groups=[pert], reference='CTRL')
for pvals_adj, geneName, logfoldchanges in zip(adata3.uns['rank_genes_groups']['pvals_adj'], adata3.uns['rank_genes_groups']['names'], adata3.uns['rank_genes_groups']['logfoldchanges']):
if pvals_adj[0] <= 0.05 and abs(logfoldchanges[0]) >=1:
mylist.append(geneName[0])
tmp = [i for i in mylist if i not in all_hvgs]
if len(mylist) !=0:
fout.write('{}\t{}\t{}\t{:.3f}\n'.format(pert, len(tmp), len(mylist) ,len(tmp)/ len(mylist)))
else:
fout.write('{}\t{}\t{}\t{:.3f}\n'.format(pert, len(tmp), 0, 0))
### analyze data
def analysis1():
mylist = []
dat1 = pd.read_excel('Sheet10_modif.xlsx')
dat1 = dat1[((dat1['QC'] == 'Pass') & (dat1['Modality'] !='ATAC'))]
dat1.sort_values('Filter_PerturbationNums', ascending=True, inplace=True)
for dirName in tqdm(dat1['Datapath']):
os.chdir(dirName)
if os.path.isfile('missingDEG_ratio.tsv'):
dat = pd.read_csv('missingDEG_ratio.tsv', sep='\t')
dat['PertNums']= dat.shape[0]
mylist.append(dat)
datAll = pd.concat(mylist)
if __name__== '__main__':
dat = pd.read_excel('Sheet10_modif.xlsx')
dat = dat[((dat['QC'] == 'Pass') & (dat['Modality'] !='ATAC'))]
dat.sort_values('Filter_PerturbationNums', ascending=True, inplace=True)
for dirName in tqdm(dat['Datapath']):
print (dirName)
cal_result(dirName)