upload

wk8910 · Sep 27, 2018 · 2039d15 · 2039d15
1 parent aa69c6b
commit 2039d15
Show file tree

Hide file tree

Showing 31 changed files with 22,738 additions and 1 deletion.
diff --git a/00.basic_scripts/gff2cds.pl b/00.basic_scripts/gff2cds.pl
@@ -0,0 +1,124 @@
+#! /usr/bin/perl
+use strict;
+use warnings;
+use Bio::SeqIO;
+use Bio::Seq;
+
+sub translate_nucl{
+    my $seq=shift;
+    my $seq_obj=Bio::Seq->new(-seq=>$seq,-alphabet=>'dna');
+    my $pro=$seq_obj->translate;
+    $pro=$pro->seq;
+    return($pro);
+}
+
+my ($gff,$genome_fa,$cds)=@ARGV;
+die "Usage: $0 <gff file> <genome file> <output>\n" if(@ARGV<3);
+
+my %gff;
+my %len;
+open(I,"< $gff");
+my $no=0;
+while(<I>){
+    chomp;
+    next if(/^#/);
+    next if(/^\s*$/);
+    my @a=split(/\s+/);
+    next unless($a[2] eq "CDS");
+    $no++;
+    my ($chr,$source,$start,$end,$strand,$phase,$name)=($a[0],$a[1],$a[3],$a[4],$a[6],$a[7],$a[8]);
+    # $chr=~s/chr//g;
+    $name=~/Parent=([^;]+)/;
+    $name=$1;
+    if(!exists $len{$name}){
+        $len{$name}{chr}=$chr;
+        $len{$name}{source}=$source;
+        $len{$name}{start}=$start;
+        $len{$name}{end}=$end;
+        $len{$name}{strand}=$strand;
+    }
+    else {
+        if($start < $len{$name}{start}){
+            $len{$name}{start}=$start;
+        }
+        if($end > $len{$name}{end}){
+            $len{$name}{end} = $end;
+        }
+    }
+    $len{$name}{cds}.=$_."\n";
+    $gff{$chr}{$name}{$no}{start}=$start;
+    $gff{$chr}{$name}{$no}{end}=$end;
+    $gff{$chr}{$name}{$no}{strand}=$strand;
+    $gff{$chr}{$name}{$no}{phase}=$phase;
+}
+close I;
+
+my $fa=Bio::SeqIO->new(-file=>$genome_fa,-format=>'fasta');
+
+my %whitelist;
+open(O,"> $cds.cds");
+open(E,"> $cds.pep");
+while(my $seq=$fa->next_seq){
+    my $chr=$seq->id;
+    my $seq=$seq->seq;
+    #print ">$id\n$seq\n";
+    next unless(exists $gff{$chr});
+    foreach my $name(keys %{$gff{$chr}}){
+        my $strand="NA";
+        my $line="";
+        my $light=0;
+        foreach my $no(sort { $gff{$chr}{$name}{$a}{start} <=> $gff{$chr}{$name}{$b}{start} } keys %{$gff{$chr}{$name}}){
+            if($strand eq "NA"){
+	$strand=$gff{$chr}{$name}{$no}{strand};
+            }
+            my $start=$gff{$chr}{$name}{$no}{start};
+            my $end=$gff{$chr}{$name}{$no}{end};
+            my $len=$end-$start+1;
+            my $subline=substr($seq,$start-1,$len);
+            $line.=$subline;
+        }
+        if($strand eq "+"){
+            my $pep=translate_nucl($line);
+            $pep=~/^(.*).$/;
+            my $stop_test=$1;
+            next if($stop_test=~/\*/);
+            $light=1;
+            print O ">$name\n$line\n";
+            print E ">$name\n$pep\n";
+        }
+        elsif($strand eq "-"){
+            $line=reverse($line);
+            $line=~tr/ATCGatcg/TAGCtagc/;
+            my $pep=translate_nucl($line);
+            $pep=~/^(.*).$/;
+            my $stop_test=$1;
+            next if($stop_test=~/\*/);
+            $light=1;
+            print O ">$name\n$line\n";
+            print E ">$name\n$pep\n";
+        }
+        if($light==1){
+            $whitelist{$name}=1;
+            print STDERR "$chr\t$name\n";
+        }
+    }
+}
+close O;
+close E;
+
+open O,"> $cds.noStopCodon.gff";
+foreach my $name(sort keys %whitelist){
+    my $chr=$len{$name}{chr};
+    my $source=$len{$name}{source};
+    my $start=$len{$name}{start};
+    my $end=$len{$name}{end};
+    my $strand=$len{$name}{strand};
+    my @gene_line=($chr,$source,"gene",$start,$end,".",$strand,".","ID=$name;Parent=$name;");
+    my $gene_line=join "\t",@gene_line;
+    print O "$gene_line\n";
+    my @mrna_line=($chr,$source,"mRNA",$start,$end,".",$strand,".","ID=$name;Parent=$name;");
+    my $mrna_line=join "\t",@mrna_line;
+    print O "$mrna_line\n";
+    print O "$len{$name}{cds}";
+}
+close O;
diff --git a/00.basic_scripts/translateCDS2Protein.pl b/00.basic_scripts/translateCDS2Protein.pl
@@ -0,0 +1,24 @@
+#! /usr/bin/perl
+use strict;
+use warnings;
+use Bio::SeqIO;
+use Bio::Seq;
+
+my $file=shift;
+
+my $fa=Bio::SeqIO->new(-file=>$file,-format=>'fasta');
+
+while(my $seq_obj=$fa->next_seq){
+    my $id=$seq_obj->id;
+    my $seq=$seq_obj->seq;
+    $seq=&translate_nucl($seq);
+    print ">$id\n$seq\n";
+}
+
+sub translate_nucl{
+    my $seq=shift;
+    my $seq_obj=Bio::Seq->new(-seq=>$seq,-alphabet=>'dna');
+    my $pro=$seq_obj->translate;
+    $pro=$pro->seq;
+    return($pro);
+}
diff --git a/01.mpest/01.generate_control_file.pl b/01.mpest/01.generate_control_file.pl
@@ -0,0 +1,30 @@
+#! /usr/bin/env perl
+use strict;
+use warnings;
+
+# my $all_tree="all.gene_tree";
+my $all_tree=shift;
+# my $out="control_file";
+
+open I,"< $all_tree";
+my %name;
+my $count=0;
+while(<I>){
+    chomp;
+    while(/([^(),]+):/g){
+        my $id=$1;
+        next if(length($id) == 0);
+        $name{$id}=1;
+    }
+    $count++;
+}
+close I;
+my $species_number=keys %name;
+
+# open O,"> $out";
+print "$all_tree\n0\n3141592653589793\n20\n$count\t$species_number\n";
+foreach my $id(sort keys %name){
+    print "$id\t1\t$id\n";
+}
+print "0\n";
+# close O;
diff --git a/01.mpest/02.generate_bootstrap_input.pl b/01.mpest/02.generate_bootstrap_input.pl
@@ -0,0 +1,39 @@
+#! /usr/bin/env perl
+use strict;
+use warnings;
+
+my $dir="genes";
+my $outdir="bootstrap";
+`mkdir $outdir` if(!-e $outdir);
+
+my @genes=<$dir/RAxML_bootstrap.*.phb>;
+
+my $bootstrap_number=100;
+
+my %genes;
+my $sn=0;
+foreach my $genes(@genes){
+    open I,"< $genes";
+    my $tmp=0;
+    while(<I>){
+        chomp;
+        $genes{$sn}{$tmp}=$_;
+        $tmp++;
+    }
+    close I;
+    $sn++;
+}
+
+for(my $i=0;$i<$bootstrap_number;$i++){
+    `mkdir $outdir/$i`;
+    open O,"> $outdir/$i/all.tre";
+    for(my $i=0;$i<$sn;$i++){
+        my $select=int(rand($sn));
+        my $tmp_num=keys %{$genes{$select}};
+        my $tmp=int(rand($tmp_num));
+        # print "$select\t$tmp\n";
+        my $tree=$genes{$select}{$tmp};
+        print O "$tree\n";
+    }
+    close O;
+}
diff --git a/01.mpest/03.run_bootstrap.pl b/01.mpest/03.run_bootstrap.pl
@@ -0,0 +1,14 @@
+#! /usr/bin/env perl
+use strict;
+use warnings;
+
+my $dir="bootstrap";
+
+my @bootstrap=<$dir/*>;
+
+open O,"> $0.sh";
+foreach my $bootstrap(@bootstrap){
+    print O "perl 01.generate_control_file.pl $bootstrap/all.tre > $bootstrap/control_file; ~/software/MP-EST/mpest_1.5/src/mpest $bootstrap/control_file\n";
+}
+close O;
+
diff --git a/01.mpest/04.convertAndCollect.pl b/01.mpest/04.convertAndCollect.pl
@@ -0,0 +1,13 @@
+#! /usr/bin/env perl
+use strict;
+use warnings;
+
+my $dir="bootstrap";
+
+my @file=<$dir/*/all.tre.tre>;
+
+foreach my $file(@file){
+    `perl convertNEXUS2newick.pl $file`;
+}
+
+`cat $dir/*/all.tre.tre.nwk > bootstrap.tre`;
diff --git a/02.count_tpm/01.count.pl b/02.count_tpm/01.count.pl
@@ -0,0 +1,18 @@
+#! /usr/bin/env perl
+use strict;
+use warnings;
+
+my $bamdir="bam";
+my $outdir="count";
+`mkdir $outdir` if(!-e $outdir);
+my @bam=<$bamdir/*.bam>;
+# my $bed="regions.bed";
+my $tool="scripts/sam2reads_count.pl";
+
+open O,"> $0.sh";
+foreach my $bam(@bam){
+    $bam=~/([^\/]+)$/;
+    my $file=$1;
+    print O "samtools view $bam | perl $tool > $outdir/$file.count\n";
+}
+close O;
diff --git a/02.count_tpm/02.generate_tpm.pl b/02.count_tpm/02.generate_tpm.pl
@@ -0,0 +1,86 @@
+#! /usr/bin/env perl
+use strict;
+use warnings;
+
+my $dir="count";
+my $len="cds.len";
+my $rpkm_out="$0.tpm";
+my $count_out="$0.count";
+
+my %len;
+open I,"< $len";
+while (<I>) {
+    chomp;
+    my @a=split(/\s+/);
+    my ($id,$len,$e_len)=@a;
+    $len{$id}=$e_len;
+}
+close I;
+
+my @txt=<$dir/*.count>;
+my %count;
+my %rpkm;
+my @sample_id;
+foreach my $txt(@txt){
+    $txt=~/\/([^\/]+).count/;
+    my $sample_id=$1;
+    push @sample_id,$sample_id;
+    my $total=0;
+    open I,"< $txt";
+    while (<I>) {
+        chomp;
+        my @a=split(/\s+/);
+        my ($id,$count)=@a;
+        next if(!exists$len{$id});
+        my $len=$len{$id};
+        if($len==0){
+            #print STDERR "$id\n";
+            next;
+        }
+        my $rate=$count/$len;
+        $total+=$rate;
+        $count{$sample_id}{$id}=$count;
+    }
+    foreach my $id(keys %len){
+        next if(!exists$len{$id});
+        my $len=$len{$id};
+        next if($len==0);
+        my $count=0;
+        if(exists $count{$sample_id}{$id}){
+            $count=$count{$sample_id}{$id};
+        }
+        my $rate=$count/$len;
+        # my $rpkm=($count*1e9)/($total*$len);
+        my $tpm=($rate/$total)*1e6;
+        $rpkm{$id}{$sample_id}{rpkm}=$tpm;
+        $rpkm{$id}{$sample_id}{count}=$count;
+    }
+    close I;
+}
+
+open R,"> $rpkm_out";
+open C,"> $count_out";
+my $head=join "\t",@sample_id;
+print R "\t$head\n";
+print C "\t$head\n";
+foreach my $id(sort keys %rpkm){
+    my @rpkm=($id);
+    my @count=($id);
+    foreach my $sample_id(@sample_id){
+        my ($rpkm,$count)=(0,0);
+        if(exists $rpkm{$id}{$sample_id}{rpkm}){
+            $rpkm = $rpkm{$id}{$sample_id}{rpkm};
+        }
+        if(exists $rpkm{$id}{$sample_id}{count}){
+            $count = $rpkm{$id}{$sample_id}{count};
+        }
+        push @rpkm,$rpkm;
+        push @count,$count;
+    }
+    my $rpkm_line=join "\t",@rpkm;
+    my $count_line=join "\t",@count;
+    print R "$rpkm_line\n";
+    print C "$count_line\n";
+}
+close R;
+close C;
diff --git a/02.count_tpm/scripts/sam2reads_count.pl b/02.count_tpm/scripts/sam2reads_count.pl
@@ -0,0 +1,19 @@
+#! /usr/bin/env perl
+use strict;
+use warnings;
+
+my %hash;
+my $control=0;
+while(<>){
+    chomp;
+    my @a=split(/\s+/);
+    my $flag=$a[1];
+    next if($flag>256);
+    next if($flag & 4);    my $id=$a[2];
+    $hash{$id}++;
+    # last if($control++>1000);
+}
+
+foreach my $id(sort {$hash{$b} <=> $hash{$a}} keys %hash){
+    print "$id\t$hash{$id}\n";
+}