-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgenes2contigs.py
executable file
·95 lines (81 loc) · 3.53 KB
/
genes2contigs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#! /usr/bin/env python
"""
given list of gene ids (e.g. KEGG IDs), write table with each contig that has each gene id
usage:
python genes2contigs.py list-of-gene-ids.txt prokka.gff newfilename
optionally, can also find corresponding ESOM data point IDs if provided a .names file
in this case, usage:
python genes2contigs.py list-of-gene-ids.txt prokka.gff newfilename esom.names
Copyright:
genes2contigs write table with each contig that has each gene id
Copyright (C) 2016 William Brazelton
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
# define files
import sys
gene_file = sys.argv[1]
gff_file = sys.argv[2]
newfilename = sys.argv[3]
status = 'no'
try:
esom_file = sys.argv[4]
status = 'yes'
except: pass
# if esom.names file is provided, make dictionary with contig name as key and data point(s) as values
if status == 'yes':
e = {}
with open(esom_file) as esom:
for line in esom:
cols = line.split('\t')
if len(cols) > 1:
contig = cols[1]
contig = contig.split('_')
contig = contig[0] + '_' + contig[1]
if contig in e: e[contig].append(cols[0]) # add this data point to an existing list of data points as the value (for contigs that are broken up into multiple data points)
else: e[contig] = list(tuple([cols[0]])) # create a new key with this contig and its first data point as the value
# create list of gene ids
l = []
with open(gene_file) as genes:
for gene in genes:
geneL = gene.split('\t')
if len(geneL) > 1: gene = geneL[0]
gene = gene.strip()
l.append(gene.strip('\n'))
# create dictionary from prokka gff file with each KEGG ID as a key and a list of contigs that have that KEGG ID as the value
d = {}
with open(gff_file) as gff:
for line in gff:
if '##FASTA'in line: break
if line[0] == '#': pass
elif 'gene_id=' in line:
contig_id = line.split('\t')
contig_id = contig_id[0].replace('-','_')
gene_id = line.split('gene_id=')
gene_id = gene_id[1].split(';')
gene_id = gene_id[0]
if gene_id in d: d[gene_id].append(contig_id) # add this contig_id to an existing list of contig_ids as the value
else: d[gene_id] = list(tuple([contig_id])) # create a new key with this kegg_id and its host contig as the value
# iterate through list of gene ids, writing each one to file with corresponding contig
with open(newfilename, 'w') as newfile:
for id in l:
if id in d:
newfile.write(id + '\t')
for contig in d[id]:
newfile.write(contig + ',') # for each contig that has this ID, write it to the file, separated by commas. last one has an orphan comma. sorry, deal with it
if status == 'yes':
newfile.write('\t')
for contig in d[id]: # iterate through contigs again to find esom data points if esom.names file provided
if contig in e:
for data_point in e[contig]: newfile.write(data_point + ',')
else: newfile.write('not in ESOM,')
newfile.write('\n')
else: newfile.write(id + '\t' + '\n')