-
Notifications
You must be signed in to change notification settings - Fork 0
/
clusters2fasta.py
executable file
·44 lines (36 loc) · 1.16 KB
/
clusters2fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/env python
import sys, os, Bio
def usage():
    """Print the command-line synopsis for this script to standard output."""
    # The parenthesised single-argument form prints the same text on both
    # Python 2 and Python 3; the bare print statement only parses on Python 2.
    print("Usage: clusters2fasta.py [clusterfile] [fastafile]")
def parse_clusters(text):
    """Split cluster-file text into ``(line_number, gene_names)`` pairs.

    Each non-blank line is one cluster whose gene names are separated by
    tabs.  ``line_number`` is the 1-based position of the line in the file
    and is used to name the output file, so numbering stays stable even when
    blank lines are skipped.

    Blank (empty or whitespace-only) lines are dropped; in particular the
    empty string left after a trailing newline no longer becomes a cluster.
    """
    # Normalise "\r\n" and "\r" to "\n" by hand so the result does not depend
    # on the mode the file was opened with.
    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
    return [
        (number, line.split("\t"))
        for number, line in enumerate(lines, 1)
        if line.strip()
    ]


def index_records_by_name(records):
    """Group records by their ``name`` attribute, keeping input order.

    Returns a dict mapping each name to the list of records carrying it, so
    records that share a name are all retained.
    """
    by_name = {}
    for record in records:
        by_name.setdefault(record.name, []).append(record)
    return by_name


def records_for_cluster(genes, by_name):
    """Return the records for ``genes``, in gene order.

    Gene names with no entry in ``by_name`` are silently skipped; a gene
    listed twice contributes its records twice.
    """
    selected = []
    for gene in genes:
        selected.extend(by_name.get(gene, ()))
    return selected


def main(argv):
    """Write one FASTA file per cluster into the ``fastaclusters`` directory.

    ``argv`` is the full argument vector: ``argv[1]`` is the cluster file and
    ``argv[2]`` the FASTA file.  The cluster on line N of the cluster file is
    written to ``fastaclusters/clusterN``.

    Returns the process exit status: 1 after printing the usage text when the
    argument count is wrong, 0 on success.  ``os.mkdir`` raises ``OSError``
    if ``fastaclusters`` already exists; this happens before any output file
    is written.
    """
    if len(argv) != 3:
        usage()
        return 1

    # Local import: this statement sat in the middle of the original script,
    # so the block has to bring SeqIO into scope itself.
    from Bio import SeqIO

    with open(argv[1]) as cluster_handle:
        clusters = parse_clusters(cluster_handle.read())

    # SeqIO.parse is given the path so Biopython opens and closes the file.
    by_name = index_records_by_name(SeqIO.parse(argv[2], "fasta"))

    # Files are written straight into the output directory rather than being
    # moved afterwards with a "cluster*" shell glob, which would also pick up
    # any unrelated file in the working directory matching that pattern.
    out_dir = "fastaclusters"
    os.mkdir(out_dir)
    for number, genes in clusters:
        out_path = os.path.join(out_dir, "cluster" + str(number))
        selected = records_for_cluster(genes, by_name)
        with open(out_path, "w") as out_handle:
            SeqIO.write(selected, out_handle, "fasta")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))