forked from andand/DEGEPRIME
-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathTrimAlignment.pl
270 lines (237 loc) · 8.12 KB
/
TrimAlignment.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
#!/usr/bin/perl -w
=head1 NAME
TrimAlignment.pl - Trims and formats a multiple sequence alignment for use as input to DegePrime.pl
=head1 USAGE
perl TrimAlignment.pl -i <ALIGNMENT_FILE> -o <OUTPUT_FILE> [-min <MIN_OCCUPANCY>] [-max_trail <MAX_TRAILING>] [-trailgap] [-ref <REF_SEQUENCE>] [-h]
Specify <MIN_OCCUPANCY> (and <MAX_TRAILING>) if you want to remove columns of low occupancy, or <REF_SEQUENCE> if you want to remove columns that are not occupied by the reference sequence.
The output alignment file will contain upper and lower case nucleotide letters, the lower case letters indicate that the succeding column(s) have been removed. (If the input alignment file contains any lower case letters these will first be changed to upper case.) Any U:s in the input file will also be replaced by T:s in the output.
If neither <MIN_OCCUPANCY>, <MAX_TRAILING> nor <REF_SEQUENCE> is specified, all columns will be kept (and all nucleotide letters will be upper case). We recommend that you process your alignment file like this even if you don't wish to remove columns, in order to remove unwanted lower case letters before running DegePrime.
=head1 POSITIONAL ARGUMENTS
-i <ALIGNMENT_FILE> Specify alignment file (fasta format)
-o <OUTPUT_FILE> Specify output file name (overwrites existing file with same name)
=head1 OPIONAL ARGUMENTS
-min <MIN_OCCUPANCY> Specify minimum fraction of sequences (in the range 0 to 1) with a nucleotide for a column to be included, default 0. The fraction is calculated by dividing the number of sequences with a nucleotide by the number of sequences that do not have a trailing character (".")(default), or by the total number of sequences, if -trailgap is used.
-max_trail <MAX_TRAILING> Specify maximum fraction of sequences (in the range 0 to 1) with a trailing character (".") for a column to be included, default 1. Will be ignored if -trailgap is used.
-trailgap If used, trailing characters (".") are counted as gaps ("-")(default is to not do so).
-ref <REF_SEQUENCE> Specify id of a reference sequence in the alignment file. Only columns where the reference has a nucleotide will be included. (-min, -max_trailing and -trailgap will be ignored if this is used)
-h Prints this help message
[Press q to close this help message]
=cut
use Getopt::Long;
$alignment_file = undef;
$outfile = undef;
$cutoff = undef;
$cutoff_trailing = 1;
$reference = undef;
$count_trailing_as_gap = undef;
&GetOptions('i=s' => \$alignment_file, 'o=s' => \$outfile, 'min=f' => \$cutoff, 'max_trailing=f' => \$cutoff_trailing, 'ref=s' => \$reference, 'trailgap!' => \$count_trailing_as_gap, 'h!' => \$help);
if (!$alignment_file or !$outfile or $help) {
system ('perldoc', $0);
exit;
}
if ($cutoff) {
if ($cutoff < 0 || $cutoff > 1) {
print "\n Error: allowed minimum occupancy (-min) range: 0 to 1 \n";
print "\n perl degeprime.pl -h for more help \n\n";
exit;
}
}
if ($cutoff and $reference) {
print "\n Error: not possible to specify both minimum occupancy and reference sequence \n";
print "\n perl degeprime.pl -h for more help \n\n";
exit;
}
if (!$cutoff and !$reference) {
$cutoff = 0; # the default value of $cutoff
}
#$reference = "S001099426"; # E. coli K12 GenBank J01695
#$reference = "S000497753"; # Sulfolobus solfataricus P2; AE006720
#$aligned_file = "eukaryotic/arb-silva.de_2011-06-01_id2791.fasta";
#$aligned_file = "23S_bact/arb-silva.de_2011-06-20_id5497.fasta";
####################
print"\nRunning TrimAlignment\n";
if ($reference) {
print"Getting positions occupied by reference sequence\n";
&get_ref_positions;
} else {
print"Calculating occupancies\n";
&get_rich_positions;
}
print"Printing trimmed alignment\n";
&get_short_seqs;
print"Finnished TrimAlignment succesfully\n\n";
####################
sub get_rich_positions {
open (INFILE, $alignment_file) || die ("Can't open $alignment_file");
$id = "NA";
$seq = "";
$total = 0;
@counts = ();
@trailing_counts = ();
while (<INFILE>) {
chomp;
$row = $_;
if (substr($_, 0, 1) eq ">") {
if ($seq ne "") {
$total++;
$seq =~ s/\s+//g;
$l = length($seq);
$lengths{$l} = 1;
if ($count_trailing_as_gap) {
$seq =~ s/\./\-/g;
}
for ($i = 0; $i < length($seq); $i++) {
if (substr($seq, $i, 1) ne "-") {
$counts[$i]++;
}
if (substr($seq, $i, 1) eq ".") {
$trailing_counts[$i]++;
}
}
}
substr($row, 0, 1) = "";
@fields = split(/\s+/, $row);
$id = $fields[0];
$seq = "";
} else {
$seq = $seq.$_;
}
}
$total++;
$seq =~ s/\s+//g;
$l = length($seq);
$lengths{$l} = 1;
if ($count_trailing_as_gap) {
$seq =~ s/\./\-/g;
}
for ($i = 0; $i < length($seq); $i++) {
if (substr($seq, $i, 1) ne "-") {
$counts[$i]++;
}
if (substr($seq, $i, 1) eq ".") {
$trailing_counts[$i]++;
}
}
close(INFILE);
if ((keys %lengths) > 1) {
print "\nError: Aligned sequences have different lengths! \n\n"; die;
}
@comp_pos = ();
for ($i = 0; $i < @counts; $i++) {
if (!defined $counts[$i]) {
$counts[$i] = 0;
}
if (!defined $trailing_counts[$i]) {
$trailing_counts[$i] = 0;
}
if (($trailing_counts[$i]/$total) <= $cutoff_trailing) {
if ($total == $trailing_counts[$i]) {
push(@comp_pos, $i);
} elsif ((($counts[$i] - $trailing_counts[$i])/($total - $trailing_counts[$i])) >= $cutoff) {
push(@comp_pos, $i);
}
}
}
}
sub get_ref_positions {
open (INFILE, $alignment_file) || die ("Can't open $alignment_file");
$id = "NA";
$seq = "";
$ref_found = 0;
while (<INFILE>) {
chomp;
$row = $_;
if (substr($_, 0, 1) eq ">") {
if ($seq ne "") {
$seq =~ s/\s+//g;
if ($id eq $reference) {
$ref_found = 1;
@comp_pos = ();
for ($i = 0; $i < length($seq); $i++) {
#if ( substr($seq, $i, 1) =~ m/[A-Z]/ ) { # Only capital
if ( substr($seq, $i, 1) =~ m/[A-Z]/i ) { # All
push(@comp_pos, $i);
}
}
last;
}
}
substr($row, 0, 1) = "";
@fields = split(/\s+/, $row);
$id = $fields[0];
$seq = "";
} else {
$seq = $seq.$_;
}
}
$seq =~ s/\s+//g;
if ($id eq $reference) {
$ref_found = 1;
@comp_pos = ();
for ($i = 0; $i < length($seq); $i++) {
#if ( substr($seq, $i, 1) =~ m/[A-Z]/) { # Only capital
if ( substr($seq, $i, 1) =~ m/[A-Z]/i ) { # All
push(@comp_pos, $i);
}
}
}
close(INFILE);
if ($ref_found == 0) {
print "\n Error: The reference sequence was not found. Note: if the sequence ids are split by spaces it's only the part before the first space that is considered. \n\n";
die;
}
}
sub get_short_seqs {
open (INFILE, $alignment_file) || die ("Can't open $alignment_file");
open (OUT, ">$outfile");
$id = "NA";
$seq = "";
while (<INFILE>) {
chomp;
$row = $_;
if (substr($row, 0, 1) eq ">") {
if ($seq ne "") {
$seq =~ s/\s+//g;
$seq =~ tr/[a-z]/[A-Z]/;
$seq =~ tr/U/T/;
$seqshort = "";
for ($i = 0; $i < (@comp_pos - 1); $i++) {
$nt = substr($seq, $comp_pos[$i], 1);
$gap = substr($seq, ($comp_pos[$i] + 1), ($comp_pos[$i+1] - $comp_pos[$i] - 1));
if ($gap =~ m/\w/) {
$nt =~ tr/[A-Z]/[a-z]/;
}
$seqshort = $seqshort.$nt;
}
$nt = substr($seq, $comp_pos[-1], 1);
$seqshort = $seqshort.$nt;
print OUT ">$id\n$seqshort\n";
#print ">$id\n$seqshort\n";
}
substr($row, 0, 1) = "";
@fields = split(/\s+/, $row);
$id = $fields[0];
$seq = "";
} else {
$seq = $seq.$_;
}
}
$seq =~ s/\s+//g;
$seq =~ tr/[a-z]/[A-Z]/;
$seq =~ tr/U/T/;
$seqshort = "";
for ($i = 0; $i < (@comp_pos - 1); $i++) {
$nt = substr($seq, $comp_pos[$i], 1);
$gap = substr($seq, ($comp_pos[$i] + 1), ($comp_pos[$i+1] - $comp_pos[$i] - 1));
if ($gap =~ m/\w/) {
$nt =~ tr/[A-Z]/[a-z]/;
}
$seqshort = $seqshort.$nt;
}
$nt = $nt = substr($seq, $comp_pos[-1], 1);
$seqshort = $seqshort.$nt;
print OUT ">$id\n$seqshort\n";
#print ">$id\n$seqshort\n";
close(INFILE);
close(OUT);
}