-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathblast_e5_write_subjs_aa.py
executable file
·102 lines (86 loc) · 2.78 KB
/
blast_e5_write_subjs_aa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#! /usr/bin/env python
"""
outputs FASTA sequence corresponding to subject start and subject end in m8 file
if subject has match over E value cutoff
some subjects will be written multiple times, depending on how many hits they have
Copyright:
blast_e5_write_subjs_aa.py Output FASTA seqs as aa from BLAST alignments
Copyright (C) 2016 William Brazelton
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import sys
fastafilename = sys.argv[1]
blastfilename = sys.argv[2]
outfilename = blastfilename + '.besthits.fasta'
outfile = open(outfilename, 'a')
blast_file = open(blastfilename)
query_list = []
subject_list = []
count = 0
for line in blast_file:
columns = line.split('\t')
query = columns[0]
subject = columns[1]
Evalue = columns[10]
try:
Evalue = Evalue.split('e-')
Evalue = Evalue[1]
if int(Evalue) > 4: # this is where you can change the evalue
count = count + 1
query_list.append(query)
subject_list.append(subject)
except:
if Evalue == '0':
query_list.append(query)
subject_list.append(subject)
print count,
print ' total hits above E value cutoff'
query_set = set(query_list)
subject_set = set(subject_list)
print len(query_set),
print ' unique queries with hits above E value cutoff'
print len(subject_set),
print ' unique subjects with hits above E value cutoff'
l=[] # keep track of subjects already found
blast_file = open(blastfilename)
for line in blast_file:
columns = line.split('\t')
subject = columns[1]
if subject in l: pass # don't look for subjects already found
else:
start = int(columns[8])
end = int(columns[9])
Evalue = columns[10]
if Evalue == '0': Evalue = 99
elif 'e-' in Evalue:
Evalue = Evalue.split('e-')
Evalue = Evalue[1]
else: Evalue = 1
fasta_file = open(fastafilename)
status = 'stop'
if int(Evalue) > 4:
for line in fasta_file:
if line[0] == '>':
status = 'stop'
header = line.split(' ')
header = header[0].replace('>','')
header = header.replace('\n','')
if subject == header:
print subject
outfile.write('>')
outfile.write(header)
outfile.write('\n')
l.append(header)
status = 'go'
elif status == 'go':
outfile.write(line)
outfile.close()