forked from plantismash/plantismash
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_antismash.py
executable file
·1932 lines (1713 loc) · 86.5 KB
/
run_antismash.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
#
# Copyright (C) 2010-2012 Marnix H. Medema
# University of Groningen
# Department of Microbial Physiology / Groningen Bioinformatics Centre
#
# Copyright (C) 2011,2012 Kai Blin
# University of Tuebingen
# Interfaculty Institute of Microbiology and Infection Medicine
# Div. of Microbiology/Biotechnology
#
# License: GNU Affero General Public License v3 or later
# A copy of GNU AGPL v3 should have been included in this software package in LICENSE.txt.
"""Run the antiSMASH pipeline"""
import sys
import os
import random
# On Windows and macOS the distribution apparently ships bundled "exec" and
# "python" directories next to the working directory; expose them through
# environment variables and prepend them to PATH so the bundled tools are
# found before any system-wide installations.
# NOTE(review): this uses os.getcwd(), so it assumes the script is launched
# from the installation directory -- TODO confirm.
if sys.platform == ('win32') or sys.platform == ('darwin'):
    # Directory holding bundled external executables.
    os.environ['EXEC'] = os.getcwd() + os.sep + "exec"
    # Directory holding the bundled Python runtime.
    os.environ['PYTHON'] = os.getcwd() + os.sep + "python"
    # Make the bundled site-packages importable.
    sys.path.append(os.sep.join([os.getcwd(), "python", "Lib", "site-packages"]))
    os.environ['PATH'] = os.pathsep + os.environ['PYTHON'] + os.pathsep + os.environ['PATH']
    os.environ['PATH'] = os.pathsep + os.environ['EXEC'] + os.pathsep + os.environ['PATH']
import logging
import argparse
from os import path
import multiprocessing
import straight.plugin
from helperlibs.bio import seqio
from antismash.config import load_config, set_config
from antismash import utils
from antismash.output_modules import xls
from antismash.generic_modules import check_prereqs as generic_check_prereqs
from antismash.generic_genome_modules import check_prereqs as ggm_check_prereqs
from antismash.generic_modules import (
hmm_detection,
genefinding,
fullhmmer,
clusterfinder,
smcogs,
clusterblast,
subclusterblast,
knownclusterblast,
active_site_finder,
coexpress,
ecpredictor,
gff_parser,
subgroup
)
try:
from antismash.db.biosql import get_record
from antismash.db.extradata import getExtradata
USE_BIOSQL = True
except ImportError:
USE_BIOSQL = False
from numpy import array_split, array
import urllib2
from urllib2 import URLError
import httplib
import time
from collections import defaultdict
from Bio import SeqIO
from Bio.Alphabet import generic_dna
from Bio.Alphabet import generic_protein
from Bio.Seq import Seq
from Bio.Alphabet import NucleotideAlphabet
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
from datetime import datetime
def ValidateDetectionTypes(detection_models):
    """Build an argparse.Action that validates --models values.

    detection_models: list of valid detection model names.

    The returned Action splits a comma- or semicolon-separated value
    string, expands the special value "all" to every supported model,
    stores the resulting list on the destination attribute, and exits
    with status 1 (after printing the valid choices) when an unknown
    model name is given.
    """
    class Validator(argparse.Action):
        def __call__(self, parser, args, values, option_string=None):
            if values == "all":
                # Expand "all" to the full list of supported models.
                values = ",".join(detection_models).replace(";", ",").split(",")
            else:
                # Accept both ',' and ';' as separators.
                values = values.replace(";", ",").split(",")
            try:
                for value in values:
                    if value not in detection_models:
                        raise ValueError('invalid detection models {s!r}'.format(s=value))
            except ValueError as e:
                # sys.stdout.write reproduces the exact output of the former
                # Python 2 print statements while also working on Python 3.
                sys.stdout.write("\nInput error: %s \n\n" % e)
                sys.stdout.write("Please choose from the following list:\n %s\n" % "\n".join(detection_models))
                sys.exit(1)
            setattr(args, self.dest, values)
    return Validator
def ValidateClusterTypes(clustertypes):
    """Build an argparse.Action that validates --enable cluster types.

    clustertypes: list of valid secondary-metabolite cluster type names.

    The returned Action splits a comma- or semicolon-separated value
    string, stores the resulting list on the destination attribute, and
    exits with status 1 (after printing the valid choices plus a usage
    example) when an unknown cluster type is given.
    """
    #TODO : list according to selected detection models
    class Validator(argparse.Action):
        def __call__(self, parser, args, values, option_string=None):
            # Accept both ',' and ';' as separators.
            values = values.replace(";", ",").split(",")
            try:
                for value in values:
                    if value not in clustertypes:
                        raise ValueError('invalid clustertype {s!r}'.format(s=value))
            except ValueError as e:
                # sys.stdout.write reproduces the exact output of the former
                # Python 2 print statements while also working on Python 3.
                sys.stdout.write("\nInput error: %s \n\n" % e)
                sys.stdout.write("Please choose from the following list:\n %s \n\nExample: --enable t1pks,nrps,other\n" % "\n".join(clustertypes))
                sys.exit(1)
            setattr(args, self.dest, values)
    return Validator
def main():
"Actually run the pipeline"
multiprocessing.freeze_support()
# We print the logging debug message as soon as the logging object is initialized after parsing the cmdline
start_time = datetime.now()
# First load the output plugins so we can present appropriate options
output_plugins = load_output_plugins()
detection_models = hmm_detection.get_supported_detection_models()
clustertypes = hmm_detection.get_supported_cluster_types()
parser = utils.getArgParser()
#positional arguments
parser.add_argument('sequences',
metavar='sequence',
nargs="*",
help="GenBank/EMBL/FASTA file(s) containing DNA.")
#optional non-grouped arguments
parser.add_argument('-h', '--help',
dest='help',
action='store_true',
default=False,
help="Show this help text.")
parser.add_argument('--help-showall',
dest='help_showall',
action='store_true',
default=False,
help="Show full lists of arguments on this help text.")
parser.add_argument('-c', '--cpus',
dest='cpus',
type=int,
default=multiprocessing.cpu_count(),
help="How many CPUs to use in parallel. (default: %(default)s)")
## grouped arguments
group = parser.add_argument_group('Basic analysis options', '', basic=True)
group.add_argument('--input-type',
dest='input_type',
default='nucl',
choices=['nucl', 'prot'],
help="Determine input type: amino acid sequence(s) or nucleotide sequence(s). (default: %(default)s)")
group.add_argument('--taxon',
dest='taxon',
default='bacteria',
choices=['bacteria', 'fungi', 'plants'],
help="Determine the taxon from which the sequence(s) came from. (default: bacteria)")
group.add_argument('--hmmsearch-chunk',
dest='hmmsearch_chunk',
default=10000,
type=int,
help="Number of sequences per hmm_search run (set lower for CPU with less RAM). (default: %(default)s)")
group = parser.add_argument_group('Additional analysis',
'Use -h along with the corresponding options to show '
'additional analysis specific parameters. (e.g.: '
'"{prog} -h --clusterblast --smcogs")'.format(prog=parser.prog),
basic=True)
group.add_argument('--cdhit',
dest='enable_cdhit',
action='store_true',
default=False,
help="Use CD-HIT to filter tandem arrays on hmm_detection.")
group.add_argument('--clusterblast',
dest='clusterblast',
action='store_true',
default=False,
help="Compare identified clusters against a database of antiSMASH-predicted clusters.")
group.add_argument('--update_clusterblast',
dest='update_clusterblast',
action='store_true',
default=False,
help="update the database of antiSMASH-predicted clusters (plantgeneclusters.txt, plantgeneclusterprots.fasta, plantgeneclusterprots.dmnd) using this time found clusters.")
group.add_argument('--clusterblastdir',
dest='clusterblastdir',
type=str,
default="",
help="the clusterblastdir contain the database of antiSMASH-predicted clusters (plantgeneclusters.txt, plantgeneclusterprots.fasta).")
group.add_argument('--subclusterblast',
dest='subclusterblast',
action='store_true',
default=False,
help="Compare identified clusters against known subclusters responsible for synthesising precursors.")
group.add_argument('--knownclusterblast',
dest='knownclusterblast',
action='store_true',
default=False,
help="Compare identified clusters against known gene clusters from the MIBiG database.")
group.add_argument('--coexpress',
dest='coexpress',
action='store_true',
default=False,
help="Compare identified clusters against a prepared Expression data.")
group.add_argument('--disable_subgroup',
dest='disable_subgroup',
action='store_true',
default=False,
help="disable identifying the subgroup.")
group.add_argument('--disable_treesvg',
dest='disable_treesvg',
action='store_true',
default=False,
help="disable making svg pictures of subgroupping tree to save time.")
group.add_argument('--subgroup_inputpath',
dest="subgroup_inputpath",
type=str,
default="",
help="give path of folder with the structure same to subgroup folder in antismash/generic_modules.")
group.add_argument('--disable_specific_modules',
dest='disable_specific_modules',
action='store_true',
default=False,
help="disable specific modules.")
group.add_argument('--smcogs',
dest='smcogs',
action='store_true',
default=False,
help="Look for sec. met. clusters of orthologous groups.")
group.add_argument('--inclusive',
dest='inclusive',
action='store_true',
default=False,
help="Use inclusive ClusterFinder algorithm for additional cluster detection.")
group.add_argument('--borderpredict',
dest='borderpredict',
action='store_true',
default=False,
help="Use ClusterFinder algorithm to predict gene cluster borders.")
group.add_argument('--full-hmmer',
dest='full_hmmer',
action='store_true',
default=False,
help="Run a whole-genome HMMer analysis.")
group.add_argument('--asf',
dest='run_asf',
action='store_true',
default=False,
help='Run active site finder module.')
group.add_argument('--ecpred',
dest='ecpred',
default='none',
choices=['none', 'kegg', 'eficaz'],
help="Run EC prediction. kegg = query KEGG REST API (Internet connection required!) "
"eficaz = use EFICAZ2.5a EC predictor. (default: %(default)s)")
group.add_argument('--modeling',
dest="modeling",
default='none',
choices=['none', 'sco', 'eco'],
help="Run homology based metabolic modeling pipeline against template model "
"eco or sco. (default: %(default)s)")
group = parser.add_argument_group('Advanced options')
group.add_argument('--limit',
dest="limit",
type=int,
default=1000,
help="Only process the first <limit> records (default: %(default)s). -1 to disable")
group.add_argument('--from',
dest='start',
type=int,
default=-1,
help="Start analysis at nucleotide specified.")
group.add_argument('--to',
dest='end',
type=int,
default=-1,
help="End analysis at nucleotide specified")
group.add_argument('--pfamdir',
dest='pfamdir',
default=argparse.SUPPRESS,
help="Directory the Pfam-A.hmm file is located in.")
group.add_argument('--models',
metavar="TYPES",
dest='enabled_detection_models',
action=ValidateDetectionTypes(detection_models),
default=["default"],
help="Select user-defined hmm detection models to be used.")
group.add_argument('--enable',
metavar="TYPES",
dest='enabled_cluster_types',
action=ValidateClusterTypes(clustertypes),
default=clustertypes,
help="Select sec. met. cluster types to search for.")
group.add_argument('--fix-id-line',
dest='fix_id_line',
action='store_true',
default=False,
help="Try to fix invalid sequence file ID lines.")
group.add_argument('--min-domain-number',
dest='min_domain_number',
type=int,
default=0,
help="Minimum unique domains required for minimum_rules hmm detection.")
group.add_argument('--dynamic-cutoff',
dest='enable_dynamic_cutoff',
action='store_true',
default=False,
help="Use dynamic cutoff multiplier based on local intergenic distances.")
group.add_argument('--cutoff-multiplier',
dest='cutoff_multiplier',
type=float,
default=1.00,
help="Multiplier for tuning hmm_detection cutoff globally.")
group.add_argument('--gene-num-cutoff',
dest='gene_num_cutoff',
type=int,
default=0,
help="Accept core genes within kb distance or having separated "
"by equal to or less than this number of intervening genes.")
group.add_argument('--gene-num-cutoff-only',
dest='gene_num_cutoff_only',
action='store_true',
default=False,
help="Only use number of intervening genes for the cutoff.")
group.add_argument('--ignore-short-aa',
dest='ignore_short_aa',
action='store_true',
default=True,
help="In hmm detection, ignore short AA without significant pfam hits (affect dynamic cutoffs).")
group = parser.add_argument_group('Gene finding options (ignored when ORFs are annotated)')
group.add_argument('--genefinding',
dest='genefinding',
default='none',
choices=['glimmer', 'prodigal', 'prodigal-m', 'none'],
help="Specify algorithm used for gene finding: Glimmer, "
"Prodigal, or Prodigal Metagenomic/Anonymous mode. (default: %(default)s).")
group.add_argument('--eukaryotic',
dest='eukaryotic',
action='store_true',
default=False,
help="DNA is of eukaryotic origin.")
group.add_argument('--all-orfs',
dest='all_orfs',
action='store_true',
default=False,
help="Use all ORFs > 60 nt instead of running gene finding.")
group.add_argument('--gff3',
dest='gff3',
default=False,
help="Specify GFF3 file to extract features from.")
group.add_argument('--use_phase',
dest='use_phase',
default=False,
action='store_true',
help="This flag forces CDS coordinates in the GFF3 to be modified by phase before translation. Otherwise this field of the GFF3 file will be ignored. (Phytozome GFF3s require applying phase before translation, whereas NCBI, augustus and glimmerhmm GFF3s do not.)")
group.add_argument('--glimmerhmm-train_folder',
dest='glimmerhmm_train_folder',
default='crypto',
help="Training models for the glimmerHMM (see antismash/generic_modules/genefinding/).")
group = parser.add_argument_group('CD-HIT specific options', '',
param=["--cdhit"])
group.add_argument('--cdh-memory',
dest='cdh_memory',
type=str,
default="2000",
help="memory of using CD-HIT, default 2000M.")
group.add_argument('--cdh-cutoff',
dest='cdh_cutoff',
default=0.5,
help="Cutoff for CD-HIT filtering, lower means more strict filtering. (0.2 - 1.0)")
group.add_argument('--cdh-display-cutoff',
dest='cdh_display_cutoff',
default=0.2,
help="Display percentage identities for cluster of core genes passing this cutoff value. (0.2-1.0)")
group = parser.add_argument_group('ClusterBlast specific options', '',
param=["--clusterblast", "--subclusterblast", "--knownclusterblast"])
group.add_argument('--nclusters',
dest='nclusters',
type=int,
default=10,
help="Number of clusters from ClusterBlast to display.")
group.add_argument('--seed',
dest='seed',
type=int,
default=0,
help="Random number seed for ClusterBlast coloring.")
group.add_argument('--homologyscalelimit',
dest='homologyscalelimit',
type=float,
default=0.0,
help="If positive float number greater than 1, limit horizontal shrinkage "
"of the graphical display of the query BGC in ClusterBlast results to "
"this ratio. Warning: some homologous genes may no longer be visible!")
group.add_argument('--dbgclusterblast',
dest='dbgclusterblast',
default="",
help='Retrieve the resuts from previous antiSMASH run stored in the specified directory.')
group = parser.add_argument_group('CoExpress specific options', '',
param=["--coexpress"])
group.add_argument('--coexpress-soft_file',
dest='coexpress_soft_file',
default="",
help="GEO soft formatted file (.soft) from either GEO Series (GSE) or GEO Dataset (GDS)."
"Use ',' to separate multiple input files.")
group.add_argument('--coexpress-csv_file',
dest='coexpress_csv_file',
default="",
help="CSV formatted gene expression file."
"Use ',' to separate multiple input files.")
group.add_argument('--coexpress-min_MAD',
dest='coexpress_min_MAD',
default="",
help="By default, genes with a Mean Absolute Deviation (MAD) of 0 get filtered out of any correlation calculation. "
"A minimum MAD value can be defined to filter out genes with values below the specified one. "
"If option is set to 0, the feature is turned off entirely as MAD is never below 0.")
group = parser.add_argument_group('ClusterFinder specific options', '',
param=["--inclusive", "--borderpredict"])
group.add_argument('--cf_cdsnr',
dest='cdsnr',
type=int,
default=5,
help="Minimum size of a ClusterFinder cluster, in number of CDS.")
group.add_argument('--cf_threshold',
dest='cf_prob_thres',
type=float,
default=0.6,
help="ClusterFinder mean probability threshold.")
group.add_argument('--cf_npfams',
dest='cf_npfams',
type=int,
default=5,
help="Minimum number of biosynthetic Pfam domains in a ClusterFinder cluster.")
group = parser.add_argument_group('Output options', '', basic=True)
group.add_argument('--outputfolder',
dest='outputfoldername',
default=argparse.SUPPRESS,
help="Directory to write results to.")
group.add_argument('--db',
dest='use_db',
action='store_true',
default=False,
help="write results to BioSQL database.")
group.add_argument('--dboverwrite',
dest='db_overwrite',
action="store_true",
default=False,
help="Re-run antiSMASH analysis and overwrite results with same accession number in BioSQL database.")
for plugin in output_plugins:
group.add_argument('--disable-%s' % plugin.name,
dest=plugin.name,
action='store_false',
default=argparse.SUPPRESS,
help="Disable %s" % plugin.short_description)
group = parser.add_argument_group("Debugging & Logging options", '', basic=True)
group.add_argument('-v', '--verbose',
dest='verbose',
action='store_true',
default=False,
help="Print verbose status information to stderr.")
group.add_argument('-d', '--debug',
dest='debug',
action='store_true',
default=False,
help="Print debugging information to stderr.")
group.add_argument('--logfile',
dest='logfile',
default=argparse.SUPPRESS,
help="Also write logging output to a file.")
group.add_argument('--statusfile',
dest='statusfile',
default=argparse.SUPPRESS,
help="Write the current status to a file.")
group.add_argument('--list-plugins',
dest='list_plugins',
action='store_true',
default=False,
help="List all available sec. met. detection modules.")
group.add_argument('--check-prereqs',
dest='check_prereqs_only',
action='store_true',
default=False,
help="Just check if all prerequisites are met.")
group.add_argument('-V', '--version',
dest='version',
action='store_true',
default=False,
help="Display the version number and exit.")
## endof grouped arguments
#if --help, show help texts and exit
if (list(set(["-h", "--help", "--help-showall"]) & set(sys.argv))):
parser.print_help(None, "--help-showall" in sys.argv)
sys.exit(0)
#Parse arguments, removing hyphens from the beginning of file names to avoid conflicts with argparse
infile_extensions = ('.fasta', '.fas', '.fa', '.gb', '.gbk', '.emb', '.embl')
sys.argv = [arg.replace("-","< > HYPHEN < >") if (arg.endswith(infile_extensions) and arg[0] == "-") else arg for arg in sys.argv]
options = parser.parse_args()
options.sequences = [filename.replace("< > HYPHEN < >","-") for filename in options.sequences]
#Remove ClusterFinder cluster types when ClusterFinder is not turned on
if not options.inclusive:
options.enabled_cluster_types = [cl_type for cl_type in options.enabled_cluster_types if not cl_type.startswith("cf_")]
# Logging is useful for all the following code, so make sure that is set up
# right after parsing the arguments.
setup_logging(options)
#preset options according to taxon
apply_taxon_preset(options)
#Remove cluster types not included in selected detection models
temp_ct = []
for cl_type in options.enabled_cluster_types:
if len(cl_type.split("/")) > 1:
if cl_type.split("/")[0] in options.enabled_detection_models:
temp_ct.append(cl_type)
elif "default" in options.enabled_detection_models :
temp_ct.append(cl_type)
options.enabled_cluster_types = temp_ct
#if -V, show version texts and exit
if options.version:
print "antiSMASH %s" % utils.get_version()
sys.exit(0)
logging.info("starting antiSMASH {version}, called with {cmdline}".format(version=utils.get_version(), cmdline=" ".join(sys.argv)))
logging.debug("antismash analysis started at %s", str(start_time))
options.run_info = {}
options.run_info["ver"] = utils.get_version()
options.run_info["param"] = " ".join(sys.argv[1:-1])
options.run_info["start"] = str(start_time)
if options.nclusters > 50:
logging.info("Number of clusters (%d) is too large. Reducing to 50.", options.nclusters)
options.nclusters = 50
logging.info("Number of clusters = %d", options.nclusters)
if options.seed != 0:
random.seed(options.seed)
if options.enable_cdhit:
cdh_cutoff = float(options.cdh_cutoff)
if not 0.2 <= cdh_cutoff <= 1.0:
logging.error('cd-hit cutoff is on wrong value (use 0.2 - 1.0)')
sys.exit(1)
cdh_display_cutoff = float(options.cdh_display_cutoff)
if not 0.2 <= cdh_display_cutoff <= 1.0:
logging.error('cd-hit display cutoff is on wrong value (use 0.2 - 1.0)')
sys.exit(1)
if options.input_type == 'prot' and (
options.clusterblast or
options.knownclusterblast or
options.subclusterblast or
options.inclusive or
options.full_hmmer):
logging.error("Protein input option is not compatible with --clusterblast, --subclusterblast, " \
"--knownclusterblast, --inclusive, and --full-hmmer")
sys.exit(2)
if options.input_type == 'prot' and (options.start != -1 or options.end != -1):
logging.error("Protein input option is not compatible with --start and --end.")
sys.exit(2)
if options.coexpress:
# check if soft file exist
if (options.coexpress_soft_file == "" and options.coexpress_csv_file == ""):
logging.error("Required .soft or .csv not found (set with --coexpress-soft_file PATH or --coexpress-csv_file PATH).")
sys.exit(2)
options.coexpress_soft_files = []
options.coexpress_csv_files = []
if options.coexpress_soft_file:
options.coexpress_soft_file = path.abspath(options.coexpress_soft_file)
soft_files = options.coexpress_soft_file.split(",")
for fname in soft_files:
if fname == "" or not path.isfile(fname):
logging.error("Required .soft not found (%s)." % fname)
sys.exit(2)
else:
options.coexpress_soft_files.append(path.abspath(fname))
if options.coexpress_csv_file:
options.coexpress_csv_file = path.abspath(options.coexpress_csv_file)
csv_files = options.coexpress_csv_file.split(",")
for fname in csv_files:
if fname == "" or not path.isfile(fname):
logging.error("Required .csv not found (%s)." % fname)
sys.exit(2)
else:
options.coexpress_csv_files.append(path.abspath(fname))
if options.coexpress_min_MAD != '':
try:
float(options.coexpress_min_MAD)
except ValueError:
logging.error("coexpress_min_MAD value must be a float.")
sys.exit(2)
if not USE_BIOSQL:
logging.info("BioSQL not loaded. Turning off DB access")
options.use_db = False
#Load configuration data from config file
load_config(options)
set_config(options)
# set up standard DB namespace
options.dbnamespace = options.BioSQLconfig.dbgenomenamespace
#Load and filter plugins
utils.log_status("Loading detection plugins")
if not options.disable_specific_modules:
plugins = load_specific_modules()
else:
plugins = []
if options.list_plugins:
list_available_plugins(plugins, output_plugins)
sys.exit(0)
filter_plugins(plugins, options, clustertypes)
filter_outputs(output_plugins, options)
#Check prerequisites
if check_prereqs(plugins, options) > 0:
logging.error("Not all prerequisites met")
sys.exit(1)
if options.check_prereqs_only:
logging.info("All prerequisites are met")
sys.exit(0)
if not options.sequences:
parser.error("Please specify at least one sequence file")
if options.gff3 and not os.path.exists(options.gff3):
logging.error('No file found at %r', options.gff3)
sys.exit(1)
if 'outputfoldername' not in options:
options.outputfoldername = path.splitext(path.basename(options.sequences[0]))[0]
if not os.path.exists(options.outputfoldername):
os.mkdir(options.outputfoldername)
options.full_outputfolder_path = path.abspath(options.outputfoldername)
#Remove old clusterblast files
clusterblast_files = ["subclusterblastoutput.txt", "clusterblastoutput.txt", "knownclusterblastoutput.txt"]
for clusterblast_file in clusterblast_files:
clusterblastoutput_path = os.path.join(options.full_outputfolder_path, clusterblast_file)
if os.path.exists(clusterblastoutput_path):
os.remove(clusterblastoutput_path)
if options.debug and os.path.exists(options.dbgclusterblast):
logging.debug("Using %s instead of computing Clusterblasts and variantes!", options.dbgclusterblast)
if ("train_%s" % options.glimmerhmm_train_folder) not in genefinding.get_supported_training_models():
logging.error("Specified glimmerHMM training models not found (%s)!", options.glimmerhmm_train_folder)
sys.exit(1)
# Structure for options.extrarecord
#
# options.extrarecord[seq_record_id]=Namespace()
# options.extrarecord[seq_record_id].extradata[extradataID] can contain the different (deserialized) objects,
# e.g. ...extradata['SubClusterBlastData'] will contain the storage object for subclsuterblast results
options.extrarecord = {}
#Parse input sequence
try:
utils.log_status("Parsing the input sequence(s)")
seq_records = parse_input_sequences(options)
if options.input_type == 'nucl':
seq_records = [record for record in seq_records if len(record.seq) > 1000]
if len(seq_records) == 0:
logging.error("Input does not contain contigs larger than minimum size of 1000 bp.")
sys.exit(1)
except IOError as e:
logging.error(str(e))
sys.exit(1)
if len(seq_records) < 1:
logging.error("Sequence file is incorrectly formatted or contains no sequences of sufficient quality.")
sys.exit(1)
# parse coexpress input file
if options.coexpress:
logging.info("Parsing CoExpress input file...")
options.geo_dataset = coexpress.parse_geofiles(options.coexpress_soft_files)
options.geo_dataset.extend(coexpress.parse_csvfiles(options.coexpress_csv_files))
options.gene_expressions = []
geo_ids = []
for geo in options.geo_dataset:
options.gene_expressions.append({})
for seq_record in seq_records:
options.gene_expressions[-1][seq_record.id] = {}
geo_id = geo["info"]["id"]
i = 0
while geo_id in geo_ids:
i += 1
geo_id = "%s_%d" % (geo["info"]["id"], i)
if i > 0:
logging.warning("Duplicated GEO rec_id found, renaming to %s..." % geo_id)
geo["info"]["id"] = geo_id
geo_ids.append(geo_id)
# calculate samples ranges
logging.info("Finding CoExpress locus tag columns...")
for geo in options.geo_dataset:
coexpress.find_col_id(geo, seq_records)
# calculate samples ranges
logging.info("Calculating CoExpress sample ranges...")
for geo in options.geo_dataset:
coexpress.calc_sample_ranges(geo)
features_to_match = {}
logging.info("Doing whole genome expression matching...")
for seq_record in seq_records:
features_to_match[seq_record.id] = utils.get_cds_features(seq_record)
for i in xrange(0, len(options.geo_dataset)):
logging.debug("Matching expression for dataset (%d/%d).." % (i + 1, len(options.geo_dataset)))
for seq_record in seq_records:
logging.debug("Seq_record %s.." % (seq_record.id))
options.gene_expressions[i][seq_record.id] = coexpress.match_exp_to_genes(features_to_match[seq_record.id], options.geo_dataset[i])
# filter out gene_expressions in case of physical overlaps,
# choose 1) the one with pfam hits (assuming has been filtered in hmm_detection filtering)
# and/or 2) the one with highest avg expression values
overlaps = utils.get_overlaps_table(seq_record)[0]
prefix = "%s:" % seq_record.id.replace(":", "_")
gene_expressions = {}
all_gene_expressions = options.gene_expressions[i][seq_record.id]
for overlap in overlaps:
chosen_gene = -1
best_exp = 0
for j in xrange(0, len(overlap)):
feature = overlap[j]
gene_id = utils.get_gene_id(feature)
if gene_id in all_gene_expressions:
if (prefix + gene_id) in options.hmm_results:
chosen_gene = j
break # this assumes one overlap cluster only have 1 gene with hits
exps = [all_gene_expressions[gene_id]["exp"][sample] for sample in all_gene_expressions[gene_id]["exp"]]
avg_exp = sum(exps) / len(exps)
if chosen_gene < 0 or best_exp < avg_exp:
chosen_gene = j
if chosen_gene >= 0:
gene_id = utils.get_gene_id(overlap[chosen_gene])
gene_expressions[gene_id] = all_gene_expressions[gene_id]
options.gene_expressions[i][seq_record.id] = gene_expressions
# The current implementation of the modeling pipeline works on seq_record level; to support multi-contig sequences, we have to modify it
# to work with seq_records instead and be called outside the "for seq_record in seq_records" loop
# if (not options.modeling == "none") and (len(seq_records) > 1):
# logging.error('Metabolic modeling is currently only available for finished genome sequences, i.e. sequences subitted as 1 contig')
# options.modeling = "none"
options.record_idx = 1
options.orig_record_idx =1
temp_seq_records = []
for seq_record in seq_records:
old_seq_id = seq_record.id
utils.log_status("Analyzing record %d" % options.record_idx)
logging.info("Analyzing record %d", options.record_idx)
utils.sort_features(seq_record)
strip_record(seq_record)
utils.fix_record_name_id(seq_record, options)
logging.debug("record name/id: %s", seq_record.id)
if (seq_record.id != old_seq_id):
# update seq_record id in dictionaries to match new seq_record id
# this assume utils.fix_record_name_id retained the unique ids naming
new_hmm_results = {}
for composite_id in options.hmm_results:
new_id = composite_id
if (composite_id.split(":")[0] == old_seq_id):
new_id = ":".join([seq_record.id.replace(":", "_"), composite_id.split(":")[-1]])
new_hmm_results[new_id] = options.hmm_results[composite_id]
options.hmm_results = new_hmm_results
if options.coexpress:
for i in xrange(0, len(options.gene_expressions)):
if old_seq_id in options.gene_expressions[i]:
options.gene_expressions[i][seq_record.id] = options.gene_expressions[i][old_seq_id]
del options.gene_expressions[i][old_seq_id]
if options.use_db and not options.db_overwrite:
if utils.check_if_dbrecord_exists(seq_record.name, options):
logging.warn("A record with accession number %s exits in antiSMASH-DB; " \
"using this record instead of re-calculating everything\n" \
"If a recalculation is desired, please add --dboverwrite parameter to command line", seq_record.name)
try:
utils.log_status("retrieving record")
#TODO: this reassignment is NOT transferred back to seq_records; only the local loop variable is rebound
seq_record = get_record(seq_record.name, options)
except Exception, e:
logging.exception("Uncaptured error %s when reading entries from antiSMASH-DB. This should not have happened :-(", e)
sys.exit(1)
# set from_database flag to prevent updating the record in the output module
temp_seq_records.append(seq_record)
options.from_database = True
# and change options for database export
options.clusterblast = None
options.subclusterblast = None
options.knownclusterblast = None
options.modeling = "none"
options.smcogs="TRUE"
options.enabled_cluster_types = ValidateClusterTypes(clustertypes)
options.extrarecord[seq_record.id] = argparse.Namespace()
# Get extradata from DB
extradataHash = getExtradata(options, seq_record.id)
options.extrarecord[seq_record.id].extradata = extradataHash
if 'ClusterBlastData' in options.extrarecord[seq_record.id].extradata:
options.clusterblast = True
if 'SubClusterBlastData' in options.extrarecord[seq_record.id].extradata:
options.subclusterblast = True
if 'KnownClusterBlastData' in options.extrarecord[seq_record.id].extradata:
options.knownclusterblast = None
if 'MetabolicModelDataObj' in options.extrarecord[seq_record.id].extradata:
options.modeling = "db"
options.metabolicmodeldir = options.outputfoldername + os.sep + "metabolicModel"
# There must be a nicer solution...
else:
run_analyses(seq_record, options, plugins)
utils.sort_features(seq_record)
options.from_database = False
temp_seq_records.append(seq_record)
else:
run_analyses(seq_record, options, plugins)
utils.sort_features(seq_record)
options.from_database = False
temp_seq_records.append(seq_record)
options.record_idx += 1
options.orig_record_idx += 1
logging.debug("The record %s is originating from db %s", seq_record.name, options.from_database)
# update coexpress data to calculate all possible high correlations between genes in clusters
if options.coexpress:
for i in xrange(0, len(options.geo_dataset)):
coexpress.update_dist_between_clusters(seq_records, options.gene_expressions[i], options.geo_dataset[i])
#check CoExpress inter-cluster relations
if options.coexpress:
options.coexpress_inter_clusters = {}
for geo_id in utils.get_all_geo_rec_ids(seq_records):
options.coexpress_inter_clusters[geo_id] = coexpress.get_inter_cluster_relation(seq_records,geo_id)
#overwrite seq_records with newly assembled temp_seq_records
seq_records = temp_seq_records
del temp_seq_records
# execute whole genome-scope modules
run_generic_genome_modules(seq_records, options)
options.run_info["end"] = str(datetime.now())
#before writing the output, remove all hmm_detection subdir prefixes from the cluster types
for seq_record in seq_records:
for cluster in utils.get_cluster_features(seq_record):
prod_names = []
for prod in cluster.qualifiers['product']:
prod_name = []
for name in prod.split('-'):
prod_name.append(name.split('/')[-1])
prod_names.append("-".join(prod_name))
cluster.qualifiers['product'] = prod_names
for cds in utils.get_cds_features(seq_record):
if 'sec_met' in cds.qualifiers:
temp_qual = []
for row in cds.qualifiers['sec_met']:
if row.startswith('Type: '):
clustertypes = [(ct.split('/')[-1]) for ct in row.split('Type: ')[-1].split('-')]
temp_qual.append('Type: ' + "-".join(clustertypes))
elif row.startswith('Domains detected: '):
cluster_results = []
for cluster_result in row.split('Domains detected: ')[-1].split(';'):
cluster_results.append(cluster_result.split(' (E-value')[0].split('/')[-1] + ' (E-value' + cluster_result.split(' (E-value')[-1])
temp_qual.append('Domains detected: ' + ";".join(cluster_results))
else:
temp_qual.append(row)
cds.qualifiers['sec_met'] = temp_qual
#for plant genomes, remove the "plant" clustertype from hybrid types, and replace a single
#"plant" clustertype with "putative"
if options.taxon == "plants":
for seq_record in seq_records:
for cluster in utils.get_cluster_features(seq_record):
prod_names = []
for prod in cluster.qualifiers['product']:
prod_name = list(set(prod.split('-')))
if (len(prod_name) > 1) and ("plant" in prod_name):
prod_name.remove("plant")
elif prod_name == ["plant"]:
prod_name = ["putative"]
prod_names.append("-".join(prod_name))
cluster.qualifiers['product'] = prod_names
for cds in utils.get_cds_features(seq_record):
if 'sec_met' in cds.qualifiers:
temp_qual = []
for row in cds.qualifiers['sec_met']:
if row.startswith('Type: '):
clustertypes = list(set(row.split('Type: ')[-1].split('-')))
if (len(clustertypes) > 1) and ("plant" in clustertypes):
clustertypes.remove("plant")
elif clustertypes == ["plant"]:
clustertypes = ["putative"]
temp_qual.append('Type: ' + "-".join(clustertypes))
else:
temp_qual.append(row)
cds.qualifiers['sec_met'] = temp_qual
# run subgroup identification
# output family sequences fasta files and hmmscan results txt in the output folder, and update seq_records
if not options.disable_subgroup:
logging.info("identificating subgroup")
subgroup.subgroup_identification(seq_records,path.abspath(options.outputfoldername), options)
else:
logging.info("subgroup identification is disabled")
if options.update_clusterblast:
logging.info("Updating ClusterBlast database use this time results")
# make sure the clusterblastdir and its files exist
if options.clusterblastdir == "":
options.clusterblastdir = clusterblast.where_is_clusterblast()
if not os.path.exists(options.clusterblastdir):
os.makedirs(options.clusterblastdir)
clusteblast_txt = path.join(options.clusterblastdir, "plantgeneclusters.txt")
clusteblast_fasta = path.join(options.clusterblastdir, "plantgeneclusterprots.fasta")
if not path.exists(clusteblast_fasta) or not path.exists(clusteblast_txt):
with open(clusteblast_fasta, "w") as clusteblast_file:
pass
with open(clusteblast_txt, "w") as clusteblast_file:
pass
# write the plantgeneclusterprots.fasta
clusterblast.make_geneclusterprots(seq_records, options)
outputname =path.join(path.abspath(options.outputfoldername), "plantgeneclusterprots.fasta")
with open(clusteblast_fasta, "a") as clusteblast_file:
with open(outputname, "r") as fastafile:
clusteblast_file.write(fastafile.read())
# write the plantgeneclusters.txt
xls.write(seq_records, options)
with open(clusteblast_txt, "a") as clusteblast_file:
with open(path.join(options.full_outputfolder_path, "plantgeneclusters.txt"), "r") as txtfile:
clusteblast_file.write(txtfile.read())
with open(path.join(options.full_outputfolder_path, "plantgeneclusters.txt"), "r") as txtfile:
non_empty_line_count = 0
for line in txtfile:
# Strip leading and trailing whitespace characters
if line.strip():
non_empty_line_count += 1
logging.info("Numbers of clusters\t{}\t{}".format(options.sequences, non_empty_line_count) )
#Write results
options.plugins = plugins
utils.log_status("Writing the output files")
logging.debug("Writing output for %s sequence records", len(seq_records))
write_results(output_plugins, seq_records, options)
zip_results(seq_records, options)
end_time = datetime.now()
running_time = end_time-start_time
logging.debug("antiSMASH calculation finished at %s; runtime: %s", str(end_time), str(running_time))
utils.log_status("antiSMASH status: SUCCESS")
logging.debug("antiSMASH status: SUCCESS")
def strip_record(seq_record):
features = utils.get_cds_features(seq_record)
for feature in features:
if 'sec_met' in feature.qualifiers:
del feature.qualifiers['sec_met']
# remove aStorage note qualifiers
if 'note' in feature.qualifiers:
note_list=[]
for notetext in feature.qualifiers['note']:
if "aSstorage" not in notetext: