-
Notifications
You must be signed in to change notification settings - Fork 8
/
showTree.py
executable file
·608 lines (564 loc) · 25.3 KB
/
showTree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
#!/usr/bin/python
'''
Visualization of gene trees produced by Carsten Kemenas geneSearch and
calcTree tools. Next to the gene trees, a MSA including protein domains
is displayed.
Works with python2.7 and python3.3+ :)
by Lukas PM Kremer, 2016/17
'''
from __future__ import print_function
import argparse
import os
from ete3 import Tree, TreeStyle, SeqMotifFace, NodeStyle
from Bio import SeqIO
COLORS = [
'Khaki', 'DarkSeaGreen', 'LightSteelBlue', 'orange', 'yellow',
'cyan', 'pink', 'magenta', 'green', 'blue', 'Moccasin', 'brown',
'gray', 'black', 'white',
] * 3
class DomTreeMaker(object):
def __init__(self, msa_path, tree_path=None, config_path=None, root=None,
out=False, scale_factor=1.0, highlight=None, hide_nodes=False,
domain_annotation=None, gff_path=None, lineseq=False):
self.msa_path = msa_path
self.gff_path = gff_path
self.domain_annotation = domain_annotation
self.config_path = config_path
self.tree_path = tree_path
self.out = out
self.scale_factor = scale_factor
self.termnodes_to_highlight = self.parse_highlight_option(highlight)
self.outgroup_node_for_rooting = root
self.hide_nodes = hide_nodes
if lineseq:
print("line")
self.seq_style = 'line'
self.gap_style = 'blank'
else:
print("compactseq")
self.seq_style = 'compactseq'
self.gap_style = 'line'
return
def parse_highlight_option(self, hl_list):
'''
parse the --highlight option to determine
which nodes / clades should be marked in
which colour.
'''
termnodes_to_highlight = []
if hl_list:
for hl_str in hl_list:
comma_sep = ',' in hl_str
plus_sep = '+' in hl_str
if plus_sep:
sep = '+'
elif comma_sep:
sep = ','
elif comma_sep and plus_sep:
raise Exception(
'Cannot use "+" and "," in the same highlight option.'
)
else:
sep = 'doesnt_matter'
colon_split = hl_str.split(':')
bg_color = colon_split[0]
prot_ids = set(colon_split[1].split(sep))
assert len(colon_split) == 2 and '' not in prot_ids, (
'\n Invalid argument format for parameter '
'-hl/--highlight\n '
'The correct format is for instance:\n '
'--highlight yellow:prot01,prot02,prot03 red:prot04\nor\n'
'--highlight yellow:prot01+prot02+prot03 red:prot04')
termnodes_to_highlight.append(
(bg_color, prot_ids, plus_sep)
)
return termnodes_to_highlight
def get_domain_annot_paths(self):
'''
get the paths to the domain annotation(s) from the geneSearch
config file, and/or directly from the --domain_annotation param.
'''
self.dom_files = set()
if self.config_path:
with open(self.config_path, 'r') as cfg:
for line in cfg:
if line.startswith('#') or \
line.startswith('ortho') or \
line.startswith('augustus') or \
line.startswith('!targets')or \
not line.strip():
continue
dom_file_path = line.strip().split()[2]
self.dom_files.add(dom_file_path)
print('Found these domain annotations:')
print('\t' + '\n\t'.join(self.dom_files) + '\n')
if self.domain_annotation:
print('You manually specified the domain annotation(s):')
for dom_annot in self.domain_annotation:
print('\t{}'.format(dom_annot))
self.dom_files.add(dom_annot)
print()
if not self.domain_annotation and not self.config_path:
print('\nYou did not specify a geneSearch config or '
'domain annotation file, so domains will not be '
'displayed.\n')
return
def get_gene_list(self):
'''
extracts the protein IDs from the fasta alignment file
or from the tree (whatever is available). Otherwise,
the domain annotation itself will be used to get the IDs.
'''
if not hasattr(self, '_gene_list'):
self._gene_list = set()
if self.msa_path:
with open(self.msa_path, 'r') as msa_file:
for line in msa_file:
if line.startswith('>'):
ID = line.split()[0][1:]
self._gene_list.add(ID)
elif self.tree_path:
t = Tree(self.tree_path)
self._gene_list = set()
for leaf in t:
self._gene_list.add(leaf.name)
elif self.domain_annotation:
if self.domain_dict:
self._gene_list = self.domain_dict.keys()
else:
self._gene_list = None
else:
raise Exception('This should never happen')
return self._gene_list
def parse_msa(self):
'''
store the MSA multi-fasta as a dict with header as keys and
sequence as values.
'''
self.msa_fasta_dict = {}
with open(self.msa_path, 'rU') as fa:
for record in SeqIO.parse(fa, 'fasta'):
header = str(record.id)
seq = str(record.seq)
seq_id = header.split()[0]
self.msa_fasta_dict[seq_id] = seq
return
def collect_domain_info(self):
'''
parse all domain files that were gathered from the config file
and collect relevant domain info.
'''
self.domain_dict = {}
self.domains = set()
for dom_annot_path in self.dom_files:
self.parse_domain_annotation(dom_annot_path)
self.domains = list(self.domains)
return
def parse_domain_annotation(self, dom_annot_path):
'''
Parses a pfam_scan domain annotation file.
Collects domain names and start/end borders for all genes that
are contained in the MSA file.
'''
this_annot = set()
gene_set = self.get_gene_list() # will be None if neither MSA nor tree were specified
with open(dom_annot_path, 'r') as df:
for line in df:
if line.startswith('#') or not line.strip():
continue
fields = line.split()
gene_id = fields[0].replace(':', '_')
if gene_id in self.domain_dict and gene_id not in this_annot:
print('\033[31m Warning! The ID {} is listed in '
'multiple domain annotations!\033[0m'.format(gene_id))
this_annot.add(gene_id)
if not gene_set or gene_id in gene_set: # if no MSA or tree were specified, all pfam_scan entries will be plotted
ungapped_start, ungapped_stop = \
int(fields[1]), int(fields[2])
ungapped_start -= 1 # adjusting borders to Pythons 0-indexing
ungapped_stop -= 1 # (Pfam domains are 1-indexed)
domname = str(fields[6])
self.domains.add(domname)
if self.msa_fasta_dict:
gapped_seq = self.msa_fasta_dict[gene_id]
try:
start, stop = self.correct_borders_for_gaps(
gapped_seq, ungapped_start, ungapped_stop
)
except KeyError:
raise DomainIndexError(
dom=domname, annot=dom_annot_path,
plen=len(gapped_seq.replace('-', '')),
b1=ungapped_start, b2=ungapped_stop,
prot=gene_id
)
if gene_id not in self.domain_dict:
self.domain_dict[gene_id] = []
self.domain_dict[gene_id].append(
(domname, start, stop))
else:
if gene_id not in self.domain_dict:
self.domain_dict[gene_id] = []
self.domain_dict[gene_id].append(
(domname, 1, 1))
return
def parse_gff(self, gff_path):
''' parse a GFF file to determine intron positions '''
if not hasattr(self, 'cds_length_dict'):
self.cds_length_dict = {}
gene_list = self.get_gene_list()
found = False
n_weird_lines = 0 # how many lines could not be parsed/are not in GFF format
with open(gff_path, 'r') as gff_file:
for line in gff_file:
if line.startswith('#') or not line.strip():
continue
values = line.strip().split('\t')
if not is_valid_gff_line(values):
n_weird_lines += 1
continue
feature = values[2]
if feature.upper() != 'CDS':
continue
gff_comment = values[8]
for query_gene in gene_list:
if query_gene in gff_comment:
if query_gene not in self.cds_length_dict:
self.cds_length_dict[query_gene] = []
cds_len = (int(values[4]) - int(values[3])) / 3.0
if cds_len > 3: # some GFFs have CDSs of length 0, why?!
self.cds_length_dict[query_gene].append(cds_len)
found = True
if n_weird_lines:
print('\nWarning! The GFF {} contained {} lines that could not be '
'interpreted! Check the file.\n'.format(gff_path, n_weird_lines))
assert found, '\n\nDidn\'t find any relevant proteins in {}.\n\n'\
'Looked for these:\n{}'.format(gff_path, '\t'.join(gene_list))
return
def add_background_color_to_nodes(self, t, whole_clade=True):
for bg_color, prot_ids, whole_clade in self.termnodes_to_highlight:
for prot_id in prot_ids:
assert prot_id in t, (
'The node "{}" that you want to '
'highlight was not found in your tree!'.format(prot_id)
)
nstyle = NodeStyle()
nstyle['bgcolor'] = bg_color
if not whole_clade or len(prot_ids) == 1:
# don't colour the whole clade containing the proteins,
# only color the proteins individually:
for prot_id in prot_ids:
(t & prot_id).set_style(nstyle)
else:
# colour the whole clade that contains the proteins:
node = t.get_common_ancestor(*prot_ids)
node.set_style(nstyle)
return
def root_tree(self, t):
if '+' in self.outgroup_node_for_rooting:
node_ids = self.outgroup_node_for_rooting.split('+')
ancestor = t.get_common_ancestor(*node_ids)
t.set_outgroup(ancestor)
print('Rooted tree at the clade that contains {}'
'.\n'.format(' and '.join(node_ids)))
else:
t.set_outgroup(t & self.outgroup_node_for_rooting)
print('Rooted tree at node "{}"'
'.\n'.format(self.outgroup_node_for_rooting))
return
def show_dom_MSA_tree(self):
'''
Plot the gene tree with a MSA+domains aligned next to it,
or alternatively dump the tree to a file (if self.out was defined)
'''
if not self.tree_path:
# if no tree was specified, use a "star-shaped" newick tree
# as a substitute (all species have the same relatedness)
newick_star = '({});'.format(','.join(sorted(self.get_gene_list())))
t = Tree(newick_star)
ts = TreeStyle()
ts.show_branch_length = False
ts.show_branch_support = False
ts.branch_vertical_margin = 10
ts.scale = 0 # 0 pixels per branch length unit
# make the tree disappear:
nstyle = NodeStyle()
nstyle['size'] = 0
nstyle['hz_line_color'] = 'white'
nstyle['vt_line_color'] = 'white'
for n in t.traverse():
n.set_style(nstyle)
else:
t = Tree(self.tree_path)
ts = TreeStyle()
ts.show_branch_length = True
ts.show_branch_support = True
ts.scale = 120 # 120 pixels per branch length unit
ts.show_leaf_name = True
ts.show_scale = False
if self.outgroup_node_for_rooting:
assert self.tree_path, ('Rooting doesn\'t make sense if '
'you don\'t specify a tree file!')
self.root_tree(t)
if self.msa_path:
self.add_background_color_to_nodes(t)
self.add_msa_with_domains_to_tree(t)
else:
self.add_background_color_to_nodes(t)
self.add_domains_to_tree(t)
if self.hide_nodes:
for word2hide in self.hide_nodes:
nodes_hidden = []
print('\nRemoving these terminal nodes because they contain the '
'word "{}":'.format(word2hide))
for leaf in t.traverse():
if word2hide in leaf.name:
nodes_hidden.append(leaf.name)
leaf.delete()
print(', '.join(nodes_hidden))
if self.out:
file_ext = os.path.splitext(self.out.upper())[1]
if file_ext not in ('.SVG', '.PDF', '.PNG'):
raise Exception('Output image file name must end with '
'".SVG", ".PDF" or ".PNG".')
t.render(self.out, tree_style=ts)
print('Wrote output image to', self.out)
else:
t.show(tree_style=ts)
return
def add_domains_to_tree(self, t):
'''
displaying domains without a sequence / MSA
'''
for leaf in t:
gene_id = leaf.name
domains = self.domain_dict.get(gene_id, [])
# if no domains are annotated, 'domains' is an empty list and
# no motifs are added to the sequence (for loop won't iterate)
motifs = []
dom_n = 0
for domain in domains:
domname, start, stop = domain
start = dom_n * 55
stop = dom_n * 55 + 50
dom_n += 1
color_n = self.domains.index(domname)
try:
dom_color = COLORS[color_n]
except IndexError:
dom_color = 'gray'
motifs.append([
start, stop, '()', None, 10, None,
'rgradient:{}'.format(dom_color),
'arial|6|black|{}'.format(domname)
])
if len(motifs) > 0:
domface = SeqMotifFace(None, motifs=motifs, gap_format='line')
(t & gene_id).add_face(domface, column=0, position='aligned')
return
def add_msa_with_domains_to_tree(self, t):
'''
iterating over all sequences in the tree and adding
the sequence, domain and intron position visualizations.
'''
for leaf in t:
gene_id = leaf.name
gapped_seq = self.msa_fasta_dict[gene_id]
domains = self.domain_dict.get(gene_id, [])
# if no domains are annotated, 'domains' is an empty list and
# no motifs are added to the sequence (for loop won't iterate)
motifs = []
for domain in domains:
domname, start, stop = domain
color_n = self.domains.index(domname)
try:
dom_color = COLORS[color_n]
except IndexError:
dom_color = 'gray'
motifs.append([
start, stop, '()', None, 10, None,
'rgradient:{}'.format(dom_color),
'arial|6|black|{}'.format(domname)
]) # domain markers
if hasattr(self, 'cds_length_dict'):
if gene_id not in self.cds_length_dict:
print('Warning: No GFF entry found for {}'.format(gene_id))
else:
cds_lengths = self.cds_length_dict[gene_id]
current_pos = 1
for cds_len in cds_lengths[:-1]:
current_pos += cds_len
gapped_seq = self.msa_fasta_dict[gene_id]
try:
gapped_pos, __ = self.correct_borders_for_gaps(
gapped_seq, int(round(current_pos)), 0
)
except KeyError:
raise Exception('The protein sequence of {} is shorter '
'in the MSA than in the GFF!'.format(gene_id))
motifs.append([
(gapped_pos - 1), (gapped_pos + 1),
'[]', None, 10, None, 'black', None
]) # black line that marks the intron positions
seqface = SeqMotifFace(gapped_seq, gapcolor='gray', motifs=motifs,
seq_format=self.seq_style, gap_format=self.gap_style,
scale_factor=self.scale_factor)
(t & gene_id).add_face(seqface, column=0, position='aligned')
return
def map_gapped_to_ungapped_position(self, seq):
position_map = {}
corrected_pos = 0
for original_pos, char in enumerate(seq):
position_map[corrected_pos] = original_pos
if char == '-':
continue
corrected_pos += 1
position_map[corrected_pos] = original_pos
assert corrected_pos == len(seq.replace('-', ''))
return position_map
def correct_borders_for_gaps(self, seq, start, stop):
'''
seq: sequence with gaps, e.g. A----VTIAGISVML---HMVSPMSQ---
start/stop: start/stop position of a domain, but according to
an ungapped sequence e.g. AVTIAGISVMLHMVSPMSQ
returns: corrected start/stop (according to the gapped sequence)
'''
gapped_to_ungapped_pos = self.map_gapped_to_ungapped_position(seq)
corrected_start = gapped_to_ungapped_pos[start]
corrected_stop = gapped_to_ungapped_pos[stop]
dom_seq = seq.replace('-', '')[start:stop+1]
dom_seq_from_new_borders = \
seq[corrected_start:corrected_stop+1]
if dom_seq_from_new_borders.endswith('-'):
corrected_stop = gapped_to_ungapped_pos[stop]
dom_seq_from_new_borders = \
seq[corrected_start:corrected_stop+1]
assert dom_seq == dom_seq_from_new_borders.replace('-', ''), 'Could '\
'not determine the domain borders for the gapped MSA'
return corrected_start, corrected_stop
class DomainIndexError(Exception):
def __init__(self, **kwargs):
self.__dict__.update(kwargs)
e_msg = '''\n
The domain annotation
{annot}
lists domain borders that are out of range
of the annotated protein {prot} :
protein length: {plen}
domain: {dom}
domain borders: {b1} - {b2}
'''.format(**kwargs)
Exception.__init__(self, e_msg)
def is_valid_gff_line(splitline):
''' check if a file line (split) is in valid GFF format '''
if not len(splitline) == 9:
return False
try:
feat_len = int(splitline[4]) - int(splitline[3])
assert feat_len >= 0
except (ValueError, AssertionError):
return False
return True
def main(msa_path, config_path, tree_path, gff_paths=None,
out=False, scale_factor=1.0, highlight=None,
root=None, hide_nodes=False, domain_annotation=None,
lineseq=False):
d = DomTreeMaker(
msa_path=msa_path,
config_path=config_path,
tree_path=tree_path,
gff_path=gff_paths,
out=out,
scale_factor=scale_factor,
highlight=highlight,
root=root,
hide_nodes=hide_nodes,
domain_annotation=domain_annotation,
lineseq=lineseq,
)
d.get_domain_annot_paths()
if msa_path is not None:
d.parse_msa()
else: # why was this required again? ...
d.msa_fasta_dict = None
d.collect_domain_info()
if gff_paths is not None:
for gff_path in gff_paths:
d.parse_gff(gff_path)
d.show_dom_MSA_tree()
return
if __name__ == '__main__':
parser = argparse.ArgumentParser()
main_args = parser.add_argument_group('main arguments')
add_args = parser.add_argument_group('additional optional arguments')
main_args.add_argument('-m', '--msa', default=None,
help='Path to an untrimmed multiple sequence '
'alignment in FASTA format (the one produced '
'by calcTree usually ends with "_aln.fa")')
main_args.add_argument('-t', '--tree', default=None,
help='Path to a gene tree, i.e. the best-'
'scoring gene tree with bootstrap values '
'(not as branch labels) produced by RAxML, '
'usually the file name starts with '
'"RAxML_bipartitions."')
main_args.add_argument('-d', '--domain_annotations', help='Path(s) '
'to one or more Pfam_scan domain annotation(s) '
'of the proteins (standard method to display '
'protein domains)', nargs='+', default=[])
main_args.add_argument('-c', '--config', default=None,
help='Path to the geneSearch/calcTree '
'configuration file (alternative method to '
'display protein domains)')
add_args.add_argument('-g', '--gffs', default=None,
help='Path to one or more GFF files '
'containing CDS features of the proteins. '
'Used to mark intron positions. Only works '
'if the 9th (=last) column of the GFF contains '
'the protein IDs', nargs='+')
add_args.add_argument('-o', '--output_path', default=False,
help='Path to the output image file. Must end with '
'.PDF, .PNG or .SVG. Tree will be shown in a window '
'if omitted')
add_args.add_argument('-s', '--scale_factor', default=1.0, type=float,
help='Horizontal scaling factor of the MSA '
'(default: 1.0). '
'Decrease this value if your image is too wide!')
add_args.add_argument('-hl', '--highlight', nargs='*',
help='Highlight specific clades or terminal nodes '
'with a background color. '
'Comma-separated IDs will colour each terminal '
'node separately. '
'Plus-separated IDs will colour the whole clade '
'that contains those IDs. '
'Example: --highlight red:prot01,prot02,prot03 '
'green:prot04 lightyellow:prot05+prot06')
add_args.add_argument('-r', '--root', help='Set an outgroup node at which '
'the tree will be rooted. If multiple IDs are '
'specified (plus-separated), the whole clade that '
'contains these IDs will be used for rooting')
add_args.add_argument('-hn', '--hide_nodes', help='Hide terminal nodes '
'that contain the specified substring; e.g. use '
'"FBpp" to hide all Drosophila proteins', nargs='+')
add_args.add_argument('--lineseq', action='store_true',
help='Draw a simple line instead of the amino acid / '
'nucleotide sequence')
args = parser.parse_args()
assert any([args.msa, args.tree, args.domain_annotations]), '''
\nFor visualization, you need to specify at least one of these files:
- a multiple sequence alignment (--msa)
- a gene/protein tree (--tree)
- one or more pfam_scan protein domain annotations (--domain_annotations OR --config)
Read the help (--help) for further information.'''
main(
msa_path=args.msa,
config_path=args.config,
tree_path=args.tree,
domain_annotation=args.domain_annotations,
gff_paths=args.gffs,
out=args.output_path,
scale_factor=args.scale_factor,
highlight=args.highlight,
root=args.root,
hide_nodes=args.hide_nodes,
lineseq=args.lineseq,
)