-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetvar_mastervar_bygeno.pl
210 lines (175 loc) · 5.2 KB
/
getvar_mastervar_bygeno.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!perl
#
# Description: Get all fully called, non-complex variants from a specified mastervar file matching a particular genotype class.
# Can get variants within a given genomic coordinate or throughout the entire genome
# Haven't really determined what happens at positions where there are two different variants/alleles
#
#
#
# Created by Jessica X Chong on 2012-03-26
use strict;
use warnings;
use Getopt::Long;
my ($mastervarfile, $desiredgeno, $targetchr, $targetstart, $targetend, $outputfile, $desiredqual);
GetOptions(
'in=s' => \$mastervarfile,
'genotype=s' => \$desiredgeno,
'qual=s' => \$desiredqual,
'chr:s' => \$targetchr,
'start:i' => \$targetstart,
'end:i' => \$targetend,
'out=s' => \$outputfile,
);
if (!defined $mastervarfile) {
optionUsage("option -i not defined\n");
} elsif (!defined $desiredgeno) {
optionUsage("option -genotype not defined\n");
} elsif (!defined $outputfile) {
optionUsage("option -o not defined\n");
} elsif (defined $targetchr && (!defined $targetstart || !defined $targetend)) {
optionUsage("--chr provided but start or stop not defined\n");
} elsif (!defined $desiredqual) {
optionUsage("--qual not defined\n");
}
sub optionUsage {
my $errorString = $_[0];
print STDERR "$errorString";
print STDERR "perl $0 \n";
print STDERR "\t--i\tinput masterVarBeta-ASM* file\n";
print STDERR "\t--genotype\tgenotype desired, either: homref / het / homalt / all\n";
print STDERR "\t--qual\tquality requirement, either: VQHIGH / any\n";
print STDERR "\t--chr\tchromosome (optional)\n";
print STDERR "\t--start\tstart position (optional)\n";
print STDERR "\t--end\tend position (optional)\n";
print STDERR "\t--o\toutput file\n";
die;
}
if ($mastervarfile !~ m/masterVarBeta/) {
print STDERR "input file ($mastervarfile) is not a mastervar file\n";
die;
}
my $targetchrnum;
if (defined $targetchr) {
if ($targetchr !~ m/chr/) {
$targetchr = 'chr'.$targetchr;
}
$targetchrnum = $targetchr;
$targetchrnum =~ s/chr//;
}
my %allowedvartypes = (
'snp' => 1,
'ins' => 1,
'del' => 1,
'sub' => 1,
'ref' => 1,
);
my %allowedzygosity = (
'hap' => 1,
'hom' => 1,
'het-ref' => 1,
'het-alt' => 1,
);
my $currchr = 'NA';
my @keepfields = (2..9, 14, 15, 18, 19, 25..28);
open (OUT, ">$outputfile");
if ($mastervarfile =~ /.bz2/) {
open (BZ, "bzcat $mastervarfile |") or die "Cannot read mastervarfile $mastervarfile: $!\n";
} else {
open (BZ, "$mastervarfile") or die "Cannot read mastervarfile $mastervarfile: $!\n";
}
while (<BZ>) {
my $nextline = $_;
if ($nextline =~ m/^#/ || $nextline =~ m/^\s*$/) { # skip header lines
next;
} elsif ($_ =~ m/^\>/) {
$nextline =~ s/\s+$//; # Remove line endings
my @headerline = split("\t", $nextline);
print OUT join("\t", @headerline[@keepfields])."\n";
} else {
$nextline =~ s/\s+$//; # Remove line endings
my @line = split ("\t", $nextline);
my ($thischr, $thisstart, $thisend, $zygosity, $vartype, $refallele, $a1, $a2) = @line[2..9];
my $a1quality = $line[14];
my $a2quality = $line[15];
if ($currchr ne $thischr) {
print STDERR "At $thischr";
$currchr = $thischr;
if (defined $targetchr) {
print STDERR ": looking for $targetchr:$targetstart-$targetend";
}
print STDERR "\n";
}
# if ($thischr eq 'chr2') { # DEBUG
# exit;
# }
if ($desiredqual eq 'VQHIGH' && ($a1quality && $a2quality)) {
if ($a1quality ne 'VQHIGH' || $a2quality ne 'VQHIGH') {
next;
}
}
if (defined $targetchr && defined $targetstart && defined $targetend) {
my $thischrnum = $thischr;
$thischrnum =~ s/chr//;
if ($thischrnum > $targetchrnum || (($thischr eq $targetchr ) && ($thisend > $targetend))) {
last;
}
if ($thischr eq $targetchr && $thisstart>=$targetstart && $thisend<=$targetend) {
if (exists $allowedvartypes{$vartype} && exists $allowedzygosity{$zygosity} && testGenoClass($desiredgeno, $zygosity, $vartype) == 1) {
for (my $i=0; $i<=$#keepfields; $i++) {
my $fieldnum = $keepfields[$i];
if (defined $line[$fieldnum]) {
print OUT "$line[$fieldnum]\t";
} else {
print OUT "\t";
}
}
print OUT "\n";
}
}
} else { # no chr coordinates specified, so look at all variants in genome
if (exists $allowedvartypes{$vartype} && exists $allowedzygosity{$zygosity} && testGenoClass($desiredgeno, $zygosity, $vartype) == 1) {
for (my $i=0; $i<=$#keepfields; $i++) {
my $fieldnum = $keepfields[$i];
if (defined $line[$fieldnum]) {
print OUT "$line[$fieldnum]\t"; # rs11934749
} else {
print OUT "\t";
}
}
print OUT "\n";
}
}
}
}
close BZ;
close OUT;
print STDERR "done\n";
sub testGenoClass {
my ($testtype, $zygosity, $vartype) = @_;
my $testresult;
if ($testtype eq 'het') {
if ($zygosity eq 'het-ref' && $vartype ne 'ref') {
$testresult = 1;
} else {
$testresult = 0;
}
}
if ($testtype eq 'homalt') {
if (($zygosity eq 'hom' || $zygosity eq 'het-alt') && $vartype ne 'ref') {
$testresult = 1;
} else {
$testresult = 0;
}
}
if ($testtype eq 'homref') {
if ($zygosity eq 'hom' && $vartype eq 'ref') {
$testresult = 1;
} else {
$testresult = 0;
}
}
if ($testtype eq 'all') {
$testresult = 1;
}
return $testresult;
}