-
Notifications
You must be signed in to change notification settings - Fork 0
/
createPlinkFiles.pl
executable file
·120 lines (110 loc) · 2.96 KB
/
createPlinkFiles.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/perl -w
use strict;
use Getopt::Std;
my %options=();
getopts("i:d:m", \%options);
# Will create a set of plink files from a testvariants output for only SNPs.
#This assumes unrelated people. Change the output'd tped file if this is not the case.
# modified by Jessica to correctly provide basepair positions and sex (assuming Hutterite findivs)
#check for input
if (!defined $options{i}){
usage("option -i not defined\n");
}elsif(!defined $options{d}){
usage("option -d not defined\n");
}
open(IN, $options{i}) or die "Can't open $options{i}: $!\n";
my $tped = $options{d} . "/plink.tped";
open(OUT, ">$tped") or die "Can't open $options{d}/plink.tped: $!\n";;
my $count = 0;
while(<IN>){
chomp;
if($count == 0){
processHeader($_);
$count++;
next;
}
if($count % 1000000 == 0){
print "Working on line $count\n";
}
my @line = split("\t", $_);
if($line[4] ne "snp"){
next;
}elsif($line[1] =~ "chrX|chrY|chrM"){
next;
}
my $ref = $line[5];
my $alleleSeq = $line[6];
my $missing_flag = 0;
my $genotypes = ();
#print "$#line\n";
for(my $i = 8; $i<= $#line; $i++){
if($line[$i] =~ "N"){
#exclude missing by default
if(!$options{m}){
$missing_flag=1;
last;
}else{
# print "$line[$i]\n";
$genotypes .= " 0 0";
}
}elsif($line[$i] =~ "00"){
$genotypes .= " $ref $ref";
}elsif($line[$i] =~ "01|10"){
$genotypes .= " $ref $alleleSeq";
}else{
$genotypes .= " $alleleSeq $alleleSeq";
}
}
if(!$missing_flag){
$line[1] =~ s/chr//;
my $snpname = "chr".$line[1]."_$line[3]";
if($line[7] ne ""){
my @rsNums = split("\;",$line[7]);
#print "@rsNums\n";
my ($dbVer,$rsId) = split("\:",$rsNums[0]);
if($rsId =~ "rs"){
print OUT "$line[1] $rsId 0 $line[3]";
}else{
print OUT "$line[1] $snpname 0 $line[3]";
}
}else{
print OUT "$line[1] $snpname 0 $line[3]";
}
print OUT "$genotypes\n";
}
$count++;
}
close(OUT);
sub processHeader{
my @header=split("\t",$_);
my @ids = splice(@header,8);
my $tfam = $options{d} . "/plink.tfam";
open(HEAD, ">$tfam") or die "Can't open plink.ped: $!\n";
my $countId=0;
foreach my $id (@ids){
$countId++;
my $sex = 0;
print HEAD "$countId\t$id\t0\t0\t$sex\t-9\n";
}
close(HEAD);
}
sub usage {
#print the error message
my $errorString = shift;
print "$errorString";
print "perl createPlink.pl -i -d\n";
print "\t-i\tA test variant file\n";
print "\t-d\toutput directory\n";
print "\t-m\tflag to include missing data (these will be encoded as NN even is it is a half-call\n";
print "\nWritten by Jason M Laramie\n\n";
die;
}
sub getTime{
#get a timestamp
my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
my @weekDays = qw(Sun Mon Tue Wed Thu Fri Sat Sun);
my ($second, $minute, $hour, $dayOfMonth, $month, $yearOffset, $dayOfWeek, $dayOfYear, $daylightSavings) = localtime();
my $year = 1900 + $yearOffset;
my $theTime = "$hour:$minute:$second, $weekDays[$dayOfWeek] $months[$month] $dayOfMonth, $year";
print "$theTime\n";
}