-
Notifications
You must be signed in to change notification settings - Fork 8
/
cluster_files
executable file
·158 lines (132 loc) · 5.19 KB
/
cluster_files
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env python2
import argparse
import sys
import os
import shutil
import glob
import re
import logging
args = None
def parse_args():
logging.basicConfig(level=logging.INFO,
format="[%(levelname)s] %(message)s")
parser = argparse.ArgumentParser(
description='clustering files by regular expression [V3.0]',
epilog="https://github.com/shenwei356/easy_qsub")
parser.add_argument('indir', type=str, help='source directory')
parser.add_argument('-o',
'--outdir',
type=str,
help='out directory [<indir>.cluster]')
parser.add_argument(
'-p',
'--pattern',
type=str,
help='pattern (regular expression) of files in indir. ' +
'if not given, it will be the longest common substring of the files.' +
'GROUP (parenthese) should be in the regular expression. ' +
'Captured group will be the cluster name. e.g. "(.+?)_\d\.fq\.gz"')
parser.add_argument('-k',
'--keep',
action='store_true',
help='keep original dir structure')
parser.add_argument('-m',
'--mv',
action='store_true',
help='moving files instead of creating symbolic links')
parser.add_argument(
"-f",
"--force",
action="store_true",
help='force file overwriting, i.e. deleting existed out directory')
args = parser.parse_args()
args.indir = os.path.normpath(args.indir)
if not args.outdir:
args.outdir = os.path.normpath(args.indir) + '.cluster'
args.outdir = os.path.normpath(args.outdir)
return args
def longest_common_substring(s1, s2):
m = [[0] * (1 + len(s2)) for i in range(1 + len(s1))]
longest, x_longest = 0, 0
for x in range(1, 1 + len(s1)):
for y in range(1, 1 + len(s2)):
if s1[x - 1] == s2[y - 1]:
m[x][y] = m[x - 1][y - 1] + 1
if m[x][y] > longest:
longest = m[x][y]
x_longest = x
else:
m[x][y] = 0
return s1[x_longest - longest:x_longest]
if __name__ == '__main__':
args = parse_args()
targets = list()
if args.pattern:
try:
pattern = re.compile(args.pattern)
except:
logging.error("illegal regular expression: {}".format(
args.pattern))
sys.exit(1)
if not ('(' in args.pattern and ')' in args.pattern):
logging.error(
'GROUP (parenthese) should be in the regular expression. ' +
'Captured group will be the cluster name. e.g. "(.+?)_\d\.fq\.gz"')
sys.exit(1)
def walk_func(_, dir, files):
basenames = [os.path.basename(file) for file in files
if not file.startswith('.')] # ignore .file
if not args.pattern:
lcs = basenames[0]
for file in basenames[1:]:
lcs = longest_common_substring(lcs, file)
lcs = lcs.lstrip('.')
if lcs == '':
return
clusters = set([lcs])
files = [os.path.join(dir, file) for file in files if lcs in file]
else:
clusters = set(pattern.findall(file)[0] for file in basenames
if pattern.search(file))
if len(clusters) == 0:
return
files = [os.path.join(dir, file) for file in files
if pattern.search(file)]
targets.append([dir, clusters, files])
os.path.walk(args.indir, walk_func, ())
if len(targets) == 0:
logging.error('no files match pattern: {}'.format(args.pattern))
sys.exit(1)
if os.path.exists(args.outdir):
if args.force:
shutil.rmtree(args.outdir)
else:
logging.info("update existed directory: {}".format(args.outdir))
for dir, clusters, files in targets:
for cluster in clusters:
if args.keep:
splits = os.path.split(dir)
if not splits[0] == '':
newdir = os.path.join(splits[1:])[0]
else: # no subdir in args.indir
newdir = ''
else:
newdir = ''
outdir = os.path.join(args.outdir, newdir, cluster)
if os.path.exists(outdir):
if args.force:
shutil.rmtree(outdir)
else:
logging.info("ignore existed directory: {}".format(outdir))
continue
logging.info("create new directory: {}".format(outdir))
os.makedirs(outdir)
for file in files:
if pattern.findall(os.path.basename(file))[0] != cluster:
continue
if args.mv: # moving file
shutil.move(file, outdir)
else: # creating symbolic links
link_name = os.path.join(outdir, os.path.basename(file))
source = os.path.relpath(os.path.abspath(file), outdir)
os.symlink(source, link_name)