forked from ding-lab/VirusScan
-
Notifications
You must be signed in to change notification settings - Fork 0
/
check_Blast_parsed_file.pl
76 lines (62 loc) · 2.08 KB
/
check_Blast_parsed_file.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/perl
use strict;
use warnings;
my $usage = "
This script will check all .blastn.parsed or .blastx.parsed files
to make sure parsing blast output file is finished for each file.
perl $0 <blast parsed file>
";
exit( 10 ) unless scalar @ARGV == 1;
my ( $PARSED ) = @ARGV;
my $HOME = $ENV{HOME};
my $finished = &check_blastnParsed_output($PARSED);
exit ($finished);
sub check_blastnParsed_output {
my ( $in_file ) = @_;
my $have_summary_line = 0;
my $line_count = 0;
my $total_seq = 0;
my $saved_seq = 0;
my $num_undefined_taxon = 0;
open (TEMP, "<$in_file") or return 10;
while (my $line = <TEMP>) {
$line_count++;
if ($line =~ /# Summary: (\d+) out of (\d+)/) {
$saved_seq = $1;
$total_seq = $2;
$have_summary_line = 1;
}
if ($line =~ /undefined taxon/) {
$num_undefined_taxon++;
}
}
close TEMP;
if (!$have_summary_line) {
return 10;
}
# taxonomy record has to be equal or greater than the number of sequences get
# successful phylotyped because some sequence could be assigned multiple taxonomy
# categories. Should have at least $num_phylotyped + 1 lines
my $num_phylotyped = $total_seq - $saved_seq;
if ( $num_phylotyped == 0 ) { # every sequence is unassigned
#print "every sequence is unassigned\n";
return 1;
}
# deal with situation where all records showed as undefined taxon and relative
# to humber of phylotyped sequences
elsif ( $num_phylotyped <= $num_undefined_taxon) {
# print "every sequence is undefined taxon\n";
return 10; #changed from 0 to 10, the system default $? is 0, avoid the same value, the same reason below
}
if ( ($line_count - 1) == $num_undefined_taxon) { # deal with situation where all records showed as undefined taxon
# print "every sequence is un defined taxon\n";
return 10;
}
# deal with old situation where some reads were not recorded because of no
# entry of gi-taxon record in the database
if ($num_phylotyped > ($line_count -1 ) ) {
#print "record number less than num phylotyped\n";
return 10;
}
return 1;
}