-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathAnnotateClusters.py
165 lines (116 loc) · 6.54 KB
/
AnnotateClusters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/local/bin/python
#Created on 3/26/13
__author__ = 'Juan A. Ugalde'
def annotate_cluster(protein_list, feature_type, annotation):
from collections import defaultdict
top_hit = None
total_conflicts = None
unresolved_conflicts = None
summary_annotation = defaultdict(int)
for protein_id in protein_list:
#First check that the protein is in the annotation
if not protein_id in annotation:
continue
else:
#Check that it has COG annotation
if annotation[protein_id].has_key(feature_type):
protein_info = annotation[protein_id][feature_type]
summary_annotation[protein_info] += 1
if len(summary_annotation) == 0:
pass
elif len(summary_annotation) == 1:
top_hit = summary_annotation.keys()[0]
else:
total_conflicts = summary_annotation
#Get the total number of values
total_annotations = sum(summary_annotation.itervalues())
for hit in summary_annotation:
if summary_annotation[hit] / float(total_annotations) >= float(0.5):
top_hit = hit
#If no decision was made
if top_hit is None:
unresolved_conflicts = summary_annotation
return top_hit, total_conflicts, unresolved_conflicts
if __name__ == '__main__':
import os
import argparse
from collections import defaultdict
from Common import ClusterTools, AnnotationTools
from Annotation import COG
program_description = "This script annotates a cluster generated by the SummarizeOrthoMCLResults script"
parser = argparse.ArgumentParser(description=program_description)
#Arguments
parser.add_argument("-l", "--genome_list_index", type=str,
help="File with the genome list. Format GenomeID, FullName, ShortName", required=True)
parser.add_argument("-a", "--annotation_folder", type=str,
help="Folder with the annotation files from JGI", required=True)
parser.add_argument("-c", "--cluster_file", type=str,
help="Cluster file", required=True)
parser.add_argument("-o", "--output_directory", type=str,
help="Output folder", required=True)
args = parser.parse_args()
#Create the output directory
if not os.path.exists(args.output_directory):
os.makedirs(args.output_directory)
#####Read the genome list
genome_id_dictionary, genome_count = ClusterTools.read_genome_list(args.genome_list_index)
###Read the annotation information
protein_annotation, function_definitions = AnnotationTools.parse_annotation_folder(genome_id_dictionary.keys(), args.annotation_folder)
##Read the cluster information
total_clusters = ClusterTools.get_cluster_information(args.cluster_file)
##Print log file
logfile = open(args.output_directory + "/logfile.txt", 'w')
##Total number of clusters
logfile.write("Total number of analyzed clusters: %d" % len(total_clusters) + "\n")
features_to_annotate = ["COG", "KO", "PFAM", "Product"]
#Get the COG definitions
cog_one_letter, desc_cog_letter, desc_cog_number = COG.cog_definitions()
#Create the output files
for feature in features_to_annotate:
output_top_hit = open(args.output_directory + "/" + feature + "_clusters.txt", 'w')
output_conflicts = open(args.output_directory + "/" + feature + "_conflicts.txt", 'w')
for cluster in total_clusters:
protein_id_list = [id_tag.split("|")[1] for id_tag in total_clusters[cluster].split(",")]
function_description = None
#Get the annotations
top_hit, total_conflicts, unresolved_conflicts = annotate_cluster(protein_id_list, feature, protein_annotation)
#Print the output file
if total_conflicts is None and unresolved_conflicts is None and top_hit is not None:
if feature == "Product":
output_top_hit.write(cluster + "\t" + top_hit + "\n")
else:
if top_hit in function_definitions:
function_description = function_definitions[top_hit]
if feature == "COG":
letter = cog_one_letter[top_hit]
definition = desc_cog_letter[letter[0]]
output_top_hit.write(cluster + "\t" + top_hit + "\t" + str(function_description)
+ "\t" + letter[0] + "\t" + definition + "\n")
else:
output_top_hit.write(cluster + "\t" + top_hit + "\t" + str(function_description) + "\n")
elif total_conflicts is not None and top_hit is not None and unresolved_conflicts is None:
if feature == "Product":
output_top_hit.write(cluster + "\t" + top_hit + "\t" +
" ; ".join(["%s=%s" % (k, v) for k, v in total_conflicts.items()]) + "\n")
else:
if top_hit in function_definitions:
function_description = function_definitions[top_hit]
if feature == "COG":
letter = cog_one_letter[top_hit]
definition = desc_cog_letter[letter[0]]
output_top_hit.write(cluster + "\t" + top_hit + "\t" + function_description +
"\t" + letter + "\t" + definition + "\t"
+ " ; ".join(["%s=%s" % (k, v) for k, v in total_conflicts.items()]) + "\n")
else:
output_top_hit.write(cluster + "\t" + top_hit + "\t" + str(function_description) + "\t" +
" ; ".join(["%s=%s" % (k, v) for k, v in total_conflicts.items()]) + "\n")
elif top_hit is None and total_conflicts is not None and unresolved_conflicts is not None:
if feature == "Product":
output_conflicts.write(cluster + "\t" +
" ; ".join(["%s=%s" % (k, v) for k, v in unresolved_conflicts.items()]) + "\n")
else:
output_conflicts.write(cluster + "\t" +
" ; ".join(["%s, %s=%s" % (k, function_definitions[k], v) if k in function_definitions
else "%s, None=%s" % (k, v) for k, v in unresolved_conflicts.items()]) + "\n")
output_top_hit.close()
output_conflicts.close()