Skip to content

Latest commit

 

History

History
256 lines (213 loc) · 10.6 KB

readme.org

File metadata and controls

256 lines (213 loc) · 10.6 KB

Primer Design

  1. Find highly conserved regions in all marine dolphin mitos
    • bwa fastmap (v0.7.17) [PMID:22569178] finds exact matches between marine dolphin mitos and the bottlenose dolphin mito
    • awk converts the fastmap outptu to 6 column bed
    • bedtools genomecov (v2.28.0) [PMID:20110278] counts the overlap of conserved regions
    • awk filters for conserved regions in all dolphins and removes regions shorter than 24bp
    ROOT=$(git rev-parse --show-toplevel)
    ml singularity
    
    bwa () {
        singularity exec -B$ROOT /apps/singularity-3/bwa/bwa-0.7.17.sif \
            bwa "$@"
    }
    
    bedtools () {
        singularity exec -B $ROOT /apps/singularity-3/bedtools/bedtools-2.28.0.sif \
            bedtools "$@"
    }
    
    bwa index $ROOT/0-ref/bottlenose.mito.fa
    
    bwa fastmap $ROOT/0-ref/bottlenose.mito.fa \
                $ROOT/0-ref/marine_dolphin.refseq.fasta |
        awk '$1 == "SQ" {sq = $2}
             $1 == "EM" { print $5, $6, $6 + $3 - $2 - 1, sq, $2, $3 }'\
                 FS="[\t:+]+" OFS="\t" |
        bedtools genomecov -g $ROOT/0-ref/bottlenose.mito.fa.fai  -i - -bga |
        awk '$4 == 25 && $3-$2 >= 24 { print $1,$2,$3, "primer" ++c, $3-$2 }' \
            > primers.bed
    
        
  2. Get primer sequences
    ROOT=$(git rev-parse --show-toplevel)
    ml singularity
    
    bedtools () {
        singularity exec -B $ROOT /apps/singularity-3/bedtools/bedtools-2.28.0.sif \
            bedtools "$@"
    }
    
    bedtools getfasta \
        -bed $DIR/primers.bed \
        -fi $ROOT/0-ref/bottlenose.mito.fa \
        -fo $DIR/primers.fa \
        -name
    
        
  3. (Not Needed) Find genes within the conserved regions and conserved regions without genes
    bedtools intersect -loj -b bottlenose.anno.gff3 -a primers.bed |
        awk '$8 == "exon" { sub(".*product=", "", $14); print $1,$2,$3,$4,$5,$14 }' FS='\t' OFS='\t' > primer.exons.bed
    
    
    bedtools intersect -v -b bottlenose.exons.gff3 -a primers.bed > primer.exons.none.bed
    cat primer.exons.bed primer.exons.none.bed | sort -V -k4,4 | sed -e 's/ID=.*\?\(Note\|product\)=//' -e 's/;[^\t]*//' > primers.add-exons
    
        
  4. (Not Needed) Check for NUMTs
    PATH=$PATH:/usr/local/igbb/samtools-1.9/bin/
    
    awk '/^Primer/ { p=$3 }
        /Seque/    {contig = $2}
        /hits forward strand/ { start = $6 }
        /Amplimer length/ { printf "%s:%d-%d\n", contig, start,   start+$03;
                            if(start - $3 > 0) printf "%s:%d-%d\n", contig, start-$3,start }' \
            OFS="\t" selected.primers.dolphin.no-miss |
        xargs samtools faidx  GCF_011762595.1_mTurTru1.mat.Y_genomic.fna |
        blastn -subject primer.pairs.seqs.fa -outfmt '6 std qcovs' -max_target_seqs 1
        
    use strict;
    use warnings;
    
    use JSON::PP;
    
    my $lengths = {};
    open(INDEX, 'GCF_011762595.1_mTurTru1.mat.Y_genomic.fna.fai');
    while(<INDEX>){
        chomp;
        $_ = [split];
        $lengths->{$_->[0]} = $_->[1];
    }
    close INDEX;
    
    open(PRIMER, 'selected.primers.dolphin.no-miss' );
    
    my $labels = {};
    my $primers = [];
    my ($primer, $amplimer, $sequence, $start, $stop, $length);
    
    while(<PRIMER>){
        chomp;
        if(/^Primer name (\S*)/){
            $primer = $1;
            next;
        }
        if(/^Amplimer (\d*)/){
            $amplimer = $1-1;
            next;
        }
        if(/Sequence: (\S*)/){
            $sequence = $1;
            $_ = readline(PRIMER);
            /chromosome (\S*),/;
            $labels->{$sequence} = "Chr $1";
            next;
        }
        if(/hits forward strand at (\d*)/){
            $start = 0+$1;
            next;
        }
        if(/hits reverse strand at \[(\d*)\]/){
            $stop = $lengths->{$sequence} - $1;
            next;
        }
        if(/Amplimer length: (\d*)/){
            ($start, $stop) = sort {$a <=> $b} ($start, $stop);
            warn "$stop-$start (".($stop-$start).") =/= $1"  unless($stop-$start == $1-2);
            next if $sequence eq "NC_012059.1";
            push(@$primers, {block_id => $sequence, start=>$start, end=>$stop, name=>$primer, len=>$1+0});
        }
    }
    
     my $json = JSON::PP->new->ascii->allow_nonref;
    
    print $json->encode( $labels );
    print $json->encode( $primers );
    
    
        
  5. Aided by Primer-BLAST, the locations were then manually screened, modified, and paired to produce ten candidate primer pairs (Table 1). P4, P6, P7, and P10 were chosen after laboratory testing and verification as the best candidates to retrieve the entire mitochondrial genome.
Primer pairsforward primerLengthTmGC%Self complementaritySelf 3’ complementarityreverse primer (still in the same strand)LengthTmGC%Self complementaritySelf 3’ complementarityAmplicon lengthCoordinatesreverse primer in complementary strand
P1CAGCCCAAAACTCAAAGGACTTGGCGG2768.4355.5642AAACTGGCTTCAATCTACTTCTCCCGCCGC3070.2353.33534638571-5209GCGGCGGGAGAAGTAGATTGAAGCCAGTTT
P2GAGCCTGGTGATAGCTGGTTGTCCAA2666.5553.8544CAAGAAAGGAAGGAATCGAACC2257.3145.454254771453-6930GGTTCGATTCCTTCCTTTCTTG
P3GCTACTTCAGTCTATATACCGCC2358.0247.8362AAACTGGCTTCAATCTACTTCTCCCGCCGC3070.2353.33534540669-5209GCGGCGGGAGAAGTAGATTGAAGCCAGTTT
P4GTCCTACGTGATCTGAGTTCAGACCGG2766.0955.5684GACTTCCAATCAGTTAGTTTCGG2357.4743.483170052481-9486CCGAAACTAACTGATTGGAAGTC
P5GCTAAATCCTCACTAGATTGGAGGG2560.514842CTTCGTCTTATACCAACGCCTGAGCCC2767.0355.565266805100-13015GGGCTCAGGCGTTGGTATAAGACGAAG
P6GACTTCCAATCAGTTAGTTTCGG2357.4743.4831ATGCCGCGTGAAACCAGCAACCCGCT2673.4161.544163659460-15825AGCGGGTTGCTGGTTTCACGCGGCAT
P7CCTCACCACCAACACCCAAAGCTGG2567.886042GCTAAATCCTCACTAGATTGGAGGG2560.514842586215418-16143; 1-5137CCCTCCAATCTAGTGAGGATTTAGC
P8CAAGAAAGGAAGGAATCGAACC2257.3145.4542CCTCACCACCAACACCCAAAGCTGG2567.88604285466906-15452CCTCACCACCAACACCCAAAGCTGG
P9GACTTCCAATCAGTTAGTTTCGG2357.4743.4831GAGCCTGGTGATAGCTGGTTGTCCAA2666.5553.854446379460-16143; 1-1479TTGGACAACCAGCTATCACCAGGCTC
P10CAAGAAAGGAAGGAATCGAACC2257.3145.4542CTTCGTCTTATACCAACGCCTGAGCCC2767.0355.565261096906-13015GGGCTCAGGCGTTGGTATAAGACGAAG
PairusedStartEnd
P1F5715209
P2F14536930
P3F6695209
P4T24819486
P5F510013015
P6T946015825
P7T1541816388
P7T15137
P8F690615452
P9F946016388
P9F11479
P10T690613015
library(tidyverse)
library(scales)
library(ggtext)

colnames(locs) <- c('Pair', 'used', 'Start', 'End')

locs <- locs %>%
  arrange(, Pair) %>%
  mutate(pos = (Start+End) / 2,
         Pair = factor(Pair, levels = paste0('P', 1:10))) %>%
  mutate(level = as.numeric(Pair)) %>%
  mutate(ymin=level,
         ymax=level + as.numeric(used)*0.25 + 0.5)

locs$Pair <- select(locs, Pair, used) %>%
  unique %>%
  pull(used, name=Pair) %>%
  ifelse(.,
         sprintf("**%s**", names(.)),
         names(.)) %>%
  setNames(names(.), .) %>%
  fct_recode(locs$Pair, !!!.)

#colorbrewer pastel for primers not used
#colorbrewer set1 for primers used
cols <- c(
  "P1" = "#b3e2cddd",
  "P2" = "#fdcdacdd",
  "P3" = "#cbd5e8dd",
  "P5" = "#f4cae4dd",
  "P8" = "#e6f5c9dd",
  "P9" = "#fff2aedd",
  "**P4**" = "#e41a1cff",
  "**P6**" = "#377eb8ff",
  "**P7**" = "#4daf4aff",
  "**P10**"= "#984ea3ff"
)

label <- function(x){
  l <- ifelse(x ==1 , "1/16.3Kbp",
              label_number(scale_cut=cut_si('bp'))(x))
  str(l)
  return(l)
}

ggplot(locs,
       aes(xmax=Start, xmin=End,
           ymax=ymax, ymin=level,
           fill=Pair )) +
  annotate(geom='rect', xmin=1, xmax=Inf,
           ymin=-Inf, ymax = 0 , fill='white') +
   geom_rect() +
  scale_y_continuous(limits = c(-15, 11), expand = c(0,0)) +
  scale_fill_manual(values=cols)+
  scale_alpha_manual(values=c(0.28, 1)) +
  scale_x_continuous(
    breaks=c(1, seq(2000, 14000, 2000)),
    labels=label) +
  coord_polar(clip='off') +
  theme_minimal() +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        legend.text=element_markdown())