forked from ding-lab/VirusScan
-
Notifications
You must be signed in to change notification settings - Fork 0
/
BLASTn_RefGenome_parser.pl
149 lines (119 loc) · 4.12 KB
/
BLASTn_RefGenome_parser.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/perl -w
use strict;
use Bio::SearchIO;
my $Usage = '
This script accepts a BLASTn output file that were blasted against Reference
genome, find out whether the best hit has a e value lower than the cutoff. If
yes, output query information. If no, the sequence will be kept for further analysis.
perl script <dir><blast output file><Ref genome Taxonomy>
<dir> = directory that blast output file resides in, without last "/"
<blast output file> = name of the blastn output file
<Ref genome Taxonomy> = Bacteria, Homo, Phage, Fungi, Mus, other
';
die $Usage unless scalar @ARGV == 3;
my ($dir, $blastout, $RefGenomeTaxonomy) = @ARGV;
# cutoff value for having a good hit, 1e-10 is a value that gives reasonable confidence
my $E_cutoff = 1e-10;
# create ouput file
my $outFile = $blastout;
$outFile =~ s/RefGblast\.out/RefGblast.parsed/;
$outFile = $dir."/".$outFile;
open (OUT, ">$outFile") or die "can not open file $outFile!\n";
my @keep = (); # query should be kept for further analysis
my @known = (); # queries that are significantly similar to Reference sequences
my $total_records = 0;
#print "parsing blast output files...\n\n";
my $input_file = $dir."/".$blastout;
my $report = new Bio::SearchIO(-format => 'blast', -file => $input_file, -report_type => 'blastn');
# Go through BLAST reports one by one
while(my $result = $report->next_result) {# next query output
# print "\\", $result->query_name, "\\ input\n\n";
if ($result->query_name eq "") { # deals with situation where blast 1st report is empty
next;
}
$total_records++;
my $haveHit = 0;
my $keep = 1;
while(my $hit = $result->next_hit) {
$haveHit = 1;
# check whether the query should be kept for further analysis
if ($hit->significance <= $E_cutoff) {
$keep = 0;
# print $result->query_name, " similar to known, output information!\n\n";
print OUT $result->query_name, "\t", $result->query_length, "\t$RefGenomeTaxonomy\t$RefGenomeTaxonomy\t".$hit->name."\t".$hit->significance,"\n";
}
last; # only need to look at the first hit
}
if ($haveHit) {
if ($keep) {
push @keep, $result->query_name;
# print $result->query_name, " keep!\n\n";
}
else {
push @known, $result->query_name;
}
}
else { # does not have a hit, keep for further analysis
push @keep, $result->query_name;
# print $result->query_name, " keep!\n\n";
}
}
print OUT "# Summary: ", scalar @keep, " out of $total_records ", scalar @keep/$total_records, " is saved for BLASTN analysis.\n";
close OUT;
# generate a fasta file that contains all the non-Reference sequences
# read in blastn input sequences
my $file = $blastout;
$file =~ s/\.RefGblast\.out//;
$file = $dir."/".$file.".fa";
my %seq = &read_FASTA_data($file);
$outFile = $blastout;
$outFile =~ s/\.RefGblast\.out//;
$outFile = $dir."/".$outFile.".RefGfiltered.fa";
open (OUT2, ">$outFile") or die "can not open file $outFile!\n";
foreach my $seq_name (@keep) {
if ($seq_name eq "") { # deals with situation where blast 1st report is empty
next;
}
print OUT2 ">$seq_name\n";
print OUT2 $seq{$seq_name}, "\n";
}
close OUT2;
exit;
############################################################################
# subroutines
sub read_FASTA_data () {
my $fastaFile = shift @_;
#keep old read seperator and set new read seperator to ">"
my $oldseperator = $/;
$/ = ">";
my %fastaSeq;
open (FastaFile, $fastaFile) or die "Can't Open FASTA file: $fastaFile";
while (my $line = <FastaFile>){
# Discard blank lines
if ($line =~ /^\s*$/) {
next;
}
# discard comment lines
elsif ($line =~ /^\s*#/) {
next;
}
# discard the first line which only has ">", keep the rest
elsif ($line ne ">") {
chomp $line;
my @rows = ();
@rows = split (/\s/, $line);
my $contigName = shift @rows;
my $contigSeq = join("", @rows);
$contigSeq =~ s/\s//g; #remove white space
$fastaSeq{$contigName} = $contigSeq;
}
}
# check for correctness
# foreach my $key (keys %fastaSeq){
# print "$key \t $fastaSeq{$key}\n";
# }
#reset the read seperator
$/ = $oldseperator;
close FastaFile;
return %fastaSeq;
}