-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilter_RNASeq
159 lines (137 loc) · 4.72 KB
/
filter_RNASeq
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/perl -w
use warnings;
use strict;
#this script was developd by A.J.Amaral 20/11/2008
#and is published in Amaral AJ, Brito FF, Chobanyan T, Yoshikawa S, Yokokura T, Van Vactor D, et al.
#Quality assessment and control of tissue specific RNA-seq libraries of Drosophila transgenic RNAi models.
#Front Genet. 2014 Mar 5;5.
#this script is to filter sequences from *online.fastq files
#sequences with Ns are counted and printed to an output file Ns.txt
#sequences with homopymers 15_30 bp are counted and printed to an output file *homo.txt
# a files with all the counts is created *counts_N_homo.txt
#a file with the filtered sequences is created *_filtered.txt
#USAGE: perl filter_oneline_N_homo.pl <name *oneline:fastq 15 bp file> .. <name *oneline:fastq 30 bp file>
#open error log file
open (STDERR,">standard_error6.txt");
my @files=($ARGV[0]); #,$ARGV[1],$ARGV[2],$ARGV[3],$ARGV[4],$ARGV[5],$ARGV[6],$ARGV[7],$ARGV[8],$ARGV[9], $ARGV[10],$ARGV[11],$ARGV[12],$ARGV[13],$ARGV[14],$ARGV[15]);
#print "infile: $infile\n";
### debugging ###
#print "@files $ARGV[0] $ARGV[1] $ARGV[2] $ARGV[3]\n";
### debugging ###
foreach my $files(@files){
#count variables
my $dotsum15=0;
my $Nsum15=0;
my $homoA15=0;
my $homoC15=0;
my $homoT15=0;
my $homoG15=0;
my $lowQS15=0;
my $filtered15=0;
my $infile15=$files;
### debugging ###
#print "$ARGV[1]\n";
#print "$infile15\n";
#print "_counts_N_homo.txt\n";
#print "$ARGV[1] $infile15 _counts_N_homo.txt\n";
### debugging ###
open (QUAL15,"<$infile15") or die "Can't open $infile15: $!\n";
#open output files for reads 15 bp
# files with counts of Ns and homo
my $countfile15=$ARGV[1].$infile15."_counts_N_homo.txt";
open(COUNT15,">$countfile15") or die "Can't open $countfile15: $!\n";
#open file with read 15 bp with Ns
my $N_file15=$ARGV[1].$infile15."_with_Ns.txt";
open(N15,">$N_file15")or die "Can't open $N_file15: $!\n";
#open file with read 15 bp with homopolymers
my $homo15=$ARGV[1].$infile15."_with_homo.txt";
open(HOMO15,">$homo15") or die "Can't open $homo15: $!\n";
#open file to write 15 bp reads that pass the filters
my $filtered15=$ARGV[1].$infile15."_filtered.txt";
open(FILTER15, ">$filtered15") or die "Can't open $filtered15: $!\n";
#in this loop presence of sequences with Ns and homos is checked.
#these sequences are writen to a separate file
#a file with the counts of these sequences is created
# a file with the sequence passing these filters plus the transformation qs into numbers and the average of qs in the sequence
### debugging ###
while (defined (my $line15=<QUAL15>)){
$line15 =~ s/ +/ /;
my @fastq15=split(" ",$line15);
# print "$line15\n";
my $id15=$fastq15[0];
my $seq15=$fastq15[1];
my $qs15=$fastq15[2];
# print "id= $id15 \n";
# print "seq= $seq15 \n";
# print "qs= $qs15 \n";
### debugging ###
#sequence is checked for bases not called("." or N), and homopolymers A,C,T,G (minimum half of sequence length)
#the sums of these ocurrences are also calculated for statistics report
if ($seq15=~/\.{1}/){
++$dotsum15;
next;
}
if ($seq15=~/N{1}/){
++$Nsum15;
print N15 "$line15\n";
next;
}
if ($seq15=~/A{45,90}/){
++$homoA15;
print HOMO15 "$line15\n";
next;
}
if ($seq15=~/C{45,90}/){
++$homoC15;
print HOMO15 "$line15\n";
next;
}
if ($seq15=~/T{45,90}/){
++$homoT15;
print HOMO15 "$line15\n";
next;
}
if ($seq15=~/G{45,90}/){
++$homoG15;
print HOMO15 "$line15\n";
next;
}
#if sequence starts with motif and is free of long homopolymers, QS are translated from ascii into numbers
else {
my @QS= split('', $qs15);
my @phred=();
my $total=0;
foreach (my $j=0;$j<@QS;++$j){
my $value=ord($QS[$j])-64;
$total+=$value;
push(@phred, $value);
#results are printed to first output file: id"\t"seq"\t"id"\t"QSascii"\t"QSnumbers
}
my $length=@phred;
#print @phred;
my $average_b=$total/$length;
if ($average_b>=30){
#print "$id15\n";
#print "$id15\n";
#print FILTER15 "$id15\t$seq_trim\t$qs15_trim\t@phred_trim\t$average\t"."trimmed\n";
++$filtered15;
print FILTER15 "$id15\t$seq15\t$qs15\t$average_b\n";
} else {++$lowQS15;}
}
}
print COUNT15 "Uncalled bases: $dotsum15 '.' \t or $Nsum15 'N'
Homopolymers:
\t\t AAA $homoA15
\t\t CCC $homoC15
\t\t TTT $homoT15
\t\t GGG $homoG15
Low QS: $lowQS15
Filtered: $filtered15\n";
close N15;
close HOMO15;
close FILTER15;
close COUNT15;
close QUAL15;
}
close STDERR;
exit;