-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreProcessing_methodology.r
80 lines (56 loc) · 2.18 KB
/
preProcessing_methodology.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# Required packages
require(phyloseq)
require(FCBF)
setwd('~/git/MIE_Metagenomics')
source('utils.r')
## --- Create Phyloseq object --- ##
# Defining input and outdir directories
inputDir = 'data/raw/'
outDir = 'data/processed/'
# Defining biom, sample and tree files (100nt) --> trimmed sequencing
biom_file = paste0(inputDir, 'otu_table_json.biom')
tree_file = paste0(inputDir, '97_otus.tree')
sampleDir = paste0(inputDir, 'clinical_SelectFeatures_unique_patients_fecal.rds')
# Reading the three files into phyloseq compatible objects
print('Importing OTU table ... ')
otu = import_biom(biom_file)
print('Reading tree file ...')
tree = read_tree(tree_file)
print('Reading sample data ...')
sample = readRDS(sampleDir)
sample_cohort = sample_data(sample)
phyloseq = merge_phyloseq(otu, tree, sample_cohort)
saveRDS(phyloseq, file = paste0(outDir, 'phyloseq.rds'))
## --- Glomerating, outlier analysis, labeling and balancing --- ##
# Load phyloseq object
filePath = paste0(outDir, 'phyloseq.rds')
taxLevel = 'Rank6' # genera
pseq = readRDS(filePath)
print(paste0('Glomerating OTUs by ', taxLevel, ' ...'))
pseqGlom = tax_glom(pseq, taxLevel)
pats = isoForest(pseqGlom)
pseq2 = prune_samples(pats, pseqGlom)
targets = c('DIET_TYPE')
for (i in seq_along(targets)) {
res = labeling(pseq2, targets[i])
saveRDS(res, file = paste0(outDir, 'phyloseq_', targets[i], '.rds'))
}
## --- Split --- ##
set.seed(1993)
pseq = readRDS(paste0(outDir), 'phyloseq_COUNTRY.rds')
smp_size = floor(0.85 * nsamples(pseq))
train_ind = sample(seq_len(nsamples(pseq)), size = smp_size)
trainPats = sample_names(pseq)[train_ind]
testPats = sample_names(pseq)[-train_ind]
train = subset_samples(pseq, sample_names(pseq) %in% trainPats)
test = subset_samples(pseq, sample_names(pseq) %in% testPats)
saveRDS(test, file = paste0(outDir, 'test_COUNTRY.rds'))
print('Done split!'))
## --- Feature Selectino (with FCBF) --- ##
otu = as.data.frame(t(otu_table(train)))
otu = apply(otu, 2, function(x) log2(x + 1))
otu = cbind.data.frame(otu, target = get_variable(data, 'COUNTRY')
names(otu) = make.names(names(otu))
fcbf = fast.cor.FS(otu, thresh = 0.0025)
saveRDS(fcbf, file = paste0(outDir, 'fcbf_COUNTRY'))
print('Done FCBF!')