-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_sitega_scan.py
executable file
·85 lines (69 loc) · 2.57 KB
/
parse_sitega_scan.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
'''
Copyright © 2018 Anton Tsukanov. Contacts: [email protected]
License: http://www.gnu.org/licenses/gpl.txt
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
'''
import argparse
import sys
import re
import pandas as pd
def parse_sitega(path, length=30):
    """Parse a SiteGA scan file into a list of BED-like hit records.

    The file is read line by line. A line starting with '>' is a header:
    after dropping the '>' it is split on ':' and field 0 is taken as the
    sequence name, field 2 as the chromosome and field 3 as the
    'start-end' coordinates (anything after the numbers, e.g. a strand
    suffix, is ignored).
    NOTE(review): this looks like a 'name::chr:start-end(strand)' header
    layout -- confirm against the tool that produces the scan file.

    Every other non-blank line is a hit belonging to the most recent
    header, with whitespace-separated fields:
    offset, score, strand, site sequence.

    Args:
        path: path to the SiteGA scan file.
        length: site length used to compute the end coordinate
            (end = start + length). Defaults to 30.

    Returns:
        A list of dicts with the keys 'chr', 'start', 'end', 'name',
        'score', 'strand' and 'site', in file order. The coordinates are
        the header start plus the hit offset; the site is upper-cased.

    Raises:
        ValueError: if a hit line appears before any header line, or a
            numeric field cannot be converted.
        IndexError: if a header or hit line has fewer fields than expected.
    """
    sitega = []
    # State carried over from the most recent header line.
    name = None
    chromosome = None
    start = None
    with open(path, 'r') as file:
        for line in file:
            if line.startswith('>'):
                header = line[1:].strip().split(':')
                name = header[0]
                chromosome = header[2]
                coordinates_strand = header[3]
                # Only the start coordinate is needed for the hit positions.
                span = re.findall(r'\d*-\d*', coordinates_strand)[0]
                start = int(span.split('-')[0])
                continue

            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no hit.
                continue
            if start is None:
                raise ValueError(
                    'hit line found before any ">" header line in '
                    '{}'.format(path))

            start_site = start + int(fields[0])
            sitega.append({
                'chr': chromosome,
                'start': start_site,
                'end': start_site + length,
                'name': name,
                'score': float(fields[1]),
                'strand': fields[2],
                'site': fields[3].upper(),
            })
    return sitega
def parse_args():
    """Read the command-line arguments of the script.

    Two positional arguments are declared: 'sitega' (input scan file) and
    'bed' (output file). When the script is started with no arguments at
    all, the help text is printed to stderr and the process exits with
    status 1.

    Returns:
        The argparse namespace with the attributes 'sitega' and 'bed'.
    """
    cli = argparse.ArgumentParser()
    positionals = (
        ('sitega', 'path to sitega scan file'),
        ('bed', 'path to write file in bed format'),
    )
    for arg_name, arg_help in positionals:
        cli.add_argument(arg_name, action='store', help=arg_help)

    # Bare invocation: show usage instead of argparse's terse error.
    invoked_without_arguments = len(sys.argv) == 1
    if invoked_without_arguments:
        cli.print_help(sys.stderr)
        sys.exit(1)

    namespace = cli.parse_args()
    return namespace
def main():
    """Convert a SiteGA scan file into a tab-separated BED-like file.

    Reads the input and output paths from the command line, parses the
    scan file and writes one row per hit with the columns
    chr, start, end, name, score, strand, site -- without a header row and
    without the DataFrame index.
    """
    args = parse_args()
    path_in = args.sitega
    path_out = args.bed

    columns = ['chr', 'start', 'end', 'name', 'score', 'strand', 'site']
    records = parse_sitega(path_in)
    # Passing the columns to the constructor fixes the column order and,
    # unlike selecting them afterwards, also works when there are no hits
    # (an empty list would otherwise give a frame without these columns
    # and the selection would raise KeyError).
    table = pd.DataFrame(records, columns=columns)
    table.to_csv(path_out, sep="\t", index=False, header=False)
# Run the conversion only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()