Skip to content

Latest commit

 

History

History
702 lines (577 loc) · 20.6 KB

readme.org

File metadata and controls

702 lines (577 loc) · 20.6 KB

Assembly

Contig Assembly

Kokia drynarioides JFW-HI

  • Remove lambda and remove reads shorter than 4kb from nanopore data
    RAW=$(realpath $DIR/../0-raw/)
    
    printf "$RAW/%s/fastq_pass.fq.gz\n" "${runs[@]}" |
        xargs $DIR/apps/devour -t 20 -l 4000 |
        gzip > $DIR/1-assemble/kd/clean.fq.gz
        
  • Assemble contigs using canu
    ml canu/2.1
    canu \
        -p canu -d $DIR/1-assemble/kd \
        genomeSize=590m \
        gridOptions="-A191400-327070 --time=48:00:00" \
        useGrid=true \
        -nanopore $DIR/1-assemble/kd/clean.fq.gz
    
    
        
    canu/2.1
    n2264
    n:5002264
    L5054
    min1197
    N751201645
    N502757100
    N254885516
    E-size4190213
    max22.95e6
    sum551.1e6
  • clean canu directory
    pushd $DIR/1-assemble/kd
    rm -fr canu-logs/ \
        canu-scripts/ \
        canu.seqStore/ \
        correction/ \
        trimming/ \
        unitigging/ \
        canu.out
    
    gzip *.fasta canu.contigs.layout.*
    
    git add canu.report canu.seqStore.err
    
    git annex add canu.contigs.fasta.gz \
        canu.contigs.layout.readToTig.gz \
        canu.contigs.layout.tigInfo.gz \
        canu.correctedReads.fasta.gz \
        canu.trimmedReads.fasta.gz \
        canu.unassembled.fasta.gz
    
        

Kokia kauaiensis S9

  • Create combined FASTQ file
    ROOT=$(git rev-parse --show-toplevel)
    
    for file in "${runs[@]}"; do
        tar -Oxf $ROOT/wgs/0-raw/$file
    done | zcat -f > $DIR/1-assemble/kk/combined.fq
        
  • Assemble with hifiasm
    ROOT=$(git rev-parse --show-toplevel)
    PATH=$DIR/apps/hifiasm-0.18.5-r499/:$PATH
    
    RAW=$ROOT/wgs/0-raw
    
    hifiasm -t $CPUS -o $DIR/1-assemble/kk/genome \
        --h1 $RAW/Kk/hi-c/Kk_HiC_CKDL220020123-1A_HCWYNDSX5_L1_1.fq.gz \
        --h2 $RAW/Kk/hi-c/Kk_HiC_CKDL220020123-1A_HCWYNDSX5_L1_2.fq.gz \
        $DIR/1-assemble/kk/combined.fq
    
    awk '/^S/{print ">"$2;print $3}' \
        $DIR/1-assemble/kk/genome.hic.p_ctg.gfa \
        > $DIR/1-assemble/kk/genome.hic.p_ctg.fa
    awk '/^S/{print ">"$2;print $3}' \
        $DIR/1-assemble/kk/genome.hic.hap1.p_ctg.gfa \
        > $DIR/1-assemble/kk/genome.hic.hap1.p_ctg.fa
    awk '/^S/{print ">"$2;print $3}' \
        $DIR/1-assemble/kk/genome.hic.hap2.p_ctg.gfa \
        > $DIR/1-assemble/kk/genome.hic.hap2.p_ctg.fa
        

Kokia cookei C69

  • Create combined FASTQ file
    ROOT=$(git rev-parse --show-toplevel)
    
    for file in "${runs[@]}"; do
        tar -Oxf $ROOT/wgs/0-raw/$file
    done | zcat -f > $DIR/1-assemble/kc/combined.fq
        
  • Assemble with hifiasm
    ROOT=$(git rev-parse --show-toplevel)
    PATH=$DIR/apps/hifiasm-0.18.5-r499/:$PATH
    
    RAW=$ROOT/wgs/0-raw
    
    hifiasm -t $CPUS -o $DIR/1-assemble/kc/genome \
        --h1 $RAW/Kc/hi-c/Kc_HiC_CKDL220020122-1A_HCWYNDSX5_L1_1.fq.gz \
        --h2 $RAW/Kc/hi-c/Kc_HiC_CKDL220020122-1A_HCWYNDSX5_L1_2.fq.gz \
        $DIR/1-assemble/kc/combined.fq
    
    awk '/^S/{print ">"$2;print $3}' \
        $DIR/1-assemble/kc/genome.hic.p_ctg.gfa \
        > $DIR/1-assemble/kc/genome.hic.p_ctg.fa
    awk '/^S/{print ">"$2;print $3}' \
        $DIR/1-assemble/kc/genome.hic.hap1.p_ctg.gfa \
        > $DIR/1-assemble/kc/genome.hic.hap1.p_ctg.fa
    awk '/^S/{print ">"$2;print $3}' \
        $DIR/1-assemble/kc/genome.hic.hap2.p_ctg.gfa \
        > $DIR/1-assemble/kc/genome.hic.hap2.p_ctg.fa
        

Polish

ROOT=$(git rev-parse --show-toplevel)

ml singularity/3.8.3

# clean up singularity medaka command. take the first element as
# subcommand, then remove it from stack. Pass the remaining stack to
# medaka cmd.
medaka () {
    cmd=medaka_$1
    shift;
    singularity exec -B $ROOT $DIR/apps/medaka_v1.7.1.sif $cmd $@;
}

for KOKIA in kc kk; do
    medaka consensus -t $CPUS -m r104_e81_sup_g5015 \
        -i $DIR/1-assemble/$KOKIA/combined.fq \
        -d $DIR/1-assemble/$KOKIA/genome.hic.p_ctg.fa \
        -o $DIR/2-polish/$KOKIA/
done

Scaffolding

NamePolsihed Genomes
Kc2-polish/kc/consensus.fasta
Kd1-assemble/kd/canu.contigs.fasta
Kk2-polish/kk/consensus.fasta
NameForwardReverse
KcKc/hi-c/Kc_HiC_CKDL220020122-1A_HCWYNDSX5_L1_1.fq.gzKc/hi-c/Kc_HiC_CKDL220020122-1A_HCWYNDSX5_L1_2.fq.gz
KkKk/hi-c/Kk_HiC_CKDL220020123-1A_HCWYNDSX5_L1_1.fq.gzKk/hi-c/Kk_HiC_CKDL220020123-1A_HCWYNDSX5_L1_2.fq.gz
Kdkd/hi-c/kokia_S3HiC_R1.fastq.gzkd/hi-c/kokia_S3HiC_R2.fastq.gz
  • Create bwa database for assembly
    ROOT=$(git rev-parse --show-toplevel)
    PATH=$DIR/apps/bwa-0.7.17:$PATH
    
    line=$(sed -n ${SLURM_ARRAY_TASK_ID}p <<<"$genomes")
    read name fasta <<<"$line"
    
    bwa index -a bwtsw -p $DIR/3-scaffolding/0-db/$name $DIR/$fasta
        
  • Align Hi-C data to assembly
    ROOT=$(git rev-parse --show-toplevel)
    RAW=$(realpath $ROOT/wgs/0-raw/)
    
    PATH=$DIR/apps/bwa-0.7.17:$PATH
    PATH=$DIR/apps/samtools-1.17/bin:$PATH
    PATH=$PATH:$DIR/apps/samblaster-v.0.1.26/
    
    line=$(sed -n ${SLURM_ARRAY_TASK_ID}p <<<"$data")
    read name fwd rev <<<"$line"
    
    bwa mem -5SP -t 48 $DIR/3-scaffolding/0-db/$name $RAW/$fwd $RAW/$rev |
        samblaster |
        samtools view -bS -F 2316 |
        samtools sort -m 60G -o $DIR/3-scaffolding/1-bwa/$name.bam
    
    samtools index $DIR/3-scaffolding/1-bwa/$name.bam
        
  • Run yahs
    ROOT=$(git rev-parse --show-toplevel)
    PATH=$DIR/apps/yahs/:$PATH
    
    line=$(sed -n ${SLURM_ARRAY_TASK_ID}p <<<"$genomes")
    read name fasta <<<"$line"
    
    yahs $DIR/$fasta $DIR/3-scaffolding/1-bwa/$name.bam \
         -o $DIR/3-scaffolding/2-yahs/$name
        

Splitting combined chrs

Based on the dotplot to kirkii, scaffold_1 for both Kc and Kk and scaffold_2 and scaffold_7 in Kc contain multiple chrs. Using hic contact map to split those scaffolds.

  • Get regions
    for name in Kc Kk; do
        awk '$1 == "scaffold_1" && $5 == "W" {printf "%s:%d-%d\n", $6, $7, $8}' \
            3-scaffolding/2-yahs/${name}_scaffolds_final.agp \
            > 3-scaffolding/3-split/$name.scaff1.regions
    done
    
    name=Kc
    awk '$1 == "scaffold_2" && $5 == "W" {printf "%s:%d-%d\n", $6, $7, $8}' \
        3-scaffolding/2-yahs/${name}_scaffolds_final.agp \
        > 3-scaffolding/3-split/$name.scaff2.regions
    
    name=Kc
    awk '$1 == "scaffold_7" && $5 == "W" {printf "%s:%d-%d\n", $6, $7, $8}' \
        3-scaffolding/2-yahs/${name}_scaffolds_final.agp \
        > 3-scaffolding/3-split/$name.scaff7.regions
    
        
  • Plot
        ml gd/
    
    #    for scaff in Kk.scaff1 Kc.scaff1 Kc.scaff2 Kc.scaff7; do
        for scaff in Kc.scaff7; do
            name=${scaff:0:2}
            $DIR/apps/hic-viz/hic-viz -m 600 -b 300 -s 5 \
                                      -r $DIR/3-scaffolding/3-split/$scaff.regions \
                                      $DIR/3-scaffolding/1-bwa/$name.bam \
                                      > $DIR/3-scaffolding/3-split/$scaff.png
    
        done
        
  • split scaffolds
    ml python
    
    awk '$1 == "scaffold_1" && $6 == "ptg000024l" && $7==1 ;
         $1 == "scaffold_1" && $6 == "ptg000120l" && $7==1 ;
         $1 == "scaffold_1" && $6 == "ptg000004l" && $7==4001 ;
         $1 == "scaffold_1" && $6 == "ptg000153l" && $7==375001 ;
         $1 == "scaffold_1" && $6 == "ptg000592l" && $7==8001 ;
         $1 == "scaffold_2" && $6 == "ptg000137l" && $7==1 ; 
         $1 == "scaffold_2" && $6 == "ptg000330l" && $7==1
         $1 == "scaffold_7" && $6 == "ptg000740l" && $7==11001
         $1 == "scaffold_7" && $6 == "ptg000242l" && $7==1
         $1 == "scaffold_7" && $6 == "ptg000337l" && $7==3001
         $1 == "scaffold_7" && $6 == "ptg000285l" && $7==1041001
         ' \
             FS="\t" OFS="\t" \
             $DIR/3-scaffolding/2-yahs/Kc_scaffolds_final.agp |
        awk '{_[$1] = _[$1] "," ($2-1)}
             END {for( i in _ ) {
                     sub("^,", "", _[i]);
                     print i, _[i]
                     }}' OFS="\t" |
        agptools split /dev/stdin \
                 $DIR/3-scaffolding/2-yahs/Kc_scaffolds_final.agp \
                 > $DIR/3-scaffolding/Kc.agp
    
    
    awk '$1 == "scaffold_1" && $6 == "ptg000053l" && $7==1 ;' \
        $DIR/3-scaffolding/2-yahs/Kk_scaffolds_final.agp |
        awk '{_[$1] = _[$1] "," ($2-1)}
             END {for( i in _ ) {
                     sub("^,", "", _[i]);
                     print i, _[i]
                     }}' OFS="\t" |
        agptools split /dev/stdin \
                 $DIR/3-scaffolding/2-yahs/Kk_scaffolds_final.agp \
                 > $DIR/3-scaffolding/Kk.agp
        
    
        
  • make fasta
    ml python
    
    cat  <<<"$genomes" |
        while read name fasta; do
            agptools assemble $DIR/$fasta \
                     $DIR/3-scaffolding/$name.agp \
                     > $DIR/3-scaffolding/$name.fa
        done
        

Arrange

  • Align to kirkii
    ml minimap2/2.17
    
    name=$(sed -n ${SLURM_ARRAY_TASK_ID}p <<<"$genomes")
    
    minimap2 -x asm5 -t 48 \
        $DIR/0-ref/kirkii.fa \
        $DIR/3-scaffolding/$name.fa \
        > $DIR/4-arrange/$name.paf
        
  • Dotplot
    name=$(sed -n ${SLURM_ARRAY_TASK_ID}p <<<"$genomes")
    cd $DIR/4-arrange
    Rscript $DIR/apps/pafCoordsDotPlotly.R \
         -i $DIR/4-arrange/$name.paf \
         -o $name \
         -m 20000 \
         -p 12 \
         -x -s -t -l
        
  • Orient and arrange using kirkii
    ROOT=$(git rev-parse --show-toplevel)
    name=$(sed -n ${SLURM_ARRAY_TASK_ID}p <<<"$genomes")
    ml singularity
    
    ragtag () {
        singularity exec --no-home -B$ROOT \
                    $DIR/apps/ragtag_2.1.0--pyhb7b1952_0.sif \
                    ragtag.py "$@"
        
    }
    
    ragtag scaffold -t 48 -q 60 -i 0.75 \
           -o $DIR/4-arrange/3-ragtag/$name \
           $DIR/0-ref/kirkii.fa \
           $DIR/3-scaffolding/$name.fa
        
  • Join yahs scaffolds based on ragtag arrangment
    ml python
    name=$(sed -n ${SLURM_ARRAY_TASK_ID}p <<<"$genomes")
    
    awk '$5 == "W" { _[$1] = _[$1] "," $9 $6; }
             END {for( i in _ ) {
                     sub("^,", "", _[i]);
                     print _[i], i;
                     }}' OFS="\t" \
                         $DIR/4-arrange/3-ragtag/$name/ragtag.scaffold.agp |
        sed  -e "s/KI_/${name}_/" -e "s/_RagTag//"|
        sort -k2,2 |
        agptools join -n 100 -t scaffold -e align_xgenus \
                 /dev/stdin $DIR/3-scaffolding/$name.agp \
                 > $DIR/4-arrange/$name.agp
        

Curate

  • Plot
    PATH=$DIR/apps/hic-viz/:$PATH
    PATH=$DIR/apps/agp-curator/:$PATH
    
    scaff=$1
    name=${scaff:0:2}
    
    if [ ! -e $DIR/4-arrange/$name.$scaff.png ]; then
        awk '$1 == chr && $5 == "W" {
                sub("+", "", $9);
                printf "%s%s:%d-%d\n", $9, $6, $7, $8
                        }' chr=$scaff $DIR/4-arrange/$name.agp |
            hic-viz -m 600 -b 300 -s 8 -r /dev/stdin  \
                    -f /usr/share/fonts/dejavu-sans-fonts/DejaVuSans.ttf \
                    -o $DIR/4-arrange/$name.$scaff.png \
                    $DIR/3-scaffolding/1-bwa/$name.bam
    fi
    
    magpie $DIR/5-curate/$name.magpie \
           $DIR/4-arrange/$name.agp \
           > $DIR/5-curate/$name.agp
    awk '$1 == chr && $5 == "W" {
            sub("+", "", $9);
            printf "%s%s:%d-%d\n", $9, $6, $7, $8
            }' chr=$scaff $DIR/5-curate/$name.agp |
        hic-viz -m 600 -b 300 -s 8 -r /dev/stdin  \
                -f /usr/share/fonts/dejavu-sans-fonts/DejaVuSans.ttf \
                -o $DIR/5-curate/$name.$scaff.png \
                $DIR/3-scaffolding/1-bwa/$name.bam
    
        
  • Assemble and align curated genomes to kirkii
    ml python  minimap2/2.14
    DIR=/home/maa146/Projects/Kokia/wgs/1-assembly
    
    name=$(sed -n ${SLURM_ARRAY_TASK_ID}p <<<"$genomes")
    
    ~/.local/bin/agptools assemble \
                          $DIR/2-polish/$name.fa \
                          $DIR/5-curate/$name.agp \
                          > $DIR/5-curate/$name.fa
    
    minimap2 -x asm5 \
             $DIR/0-ref/kirkii.fa.gz \
             $DIR/5-curate/$name.fa \
             > $DIR/5-curate/$name.paf
    
    cd $DIR/5-curate    
    
    Rscript $DIR/apps/pafCoordsDotPlotly.R \
            -i $DIR/5-curate/$name.paf \
            -o $name \
            -m 20000 \
            -p 20 \
            -x -s -t -l
        

Hic

  • Align hic to curated genomes
    • Create bwa database for assembly
      ml python
      ROOT=$(git rev-parse --show-toplevel)
      PATH=$DIR/apps/bwa-0.7.17:$PATH
      PATH=$DIR/apps/agp-curator/:$PATH
      
      name=$(sed -n ${SLURM_ARRAY_TASK_ID}p <<<"$genomes")
      
      magpie $DIR/5-curate/$name.magpie \
             $DIR/4-arrange/$name.agp \
             > $DIR/5-curate/$name.agp
      
      agptools assemble \
               $DIR/2-polish/$name.fa \
               $DIR/5-curate/$name.agp \
               > $DIR/5-curate/$name.fa
      
      bwa index -a bwtsw \
          -p $DIR/5-curate/hic/0-db/$name \
           $DIR/5-curate/$name.fa
              
    • Align Hi-C data to assembly
      ROOT=$(git rev-parse --show-toplevel)
      RAW=$(realpath $ROOT/wgs/0-raw/)
      
      PATH=$DIR/apps/bwa-0.7.17:$PATH
      PATH=$DIR/apps/samtools-1.17/bin:$PATH
      PATH=$PATH:$DIR/apps/samblaster-v.0.1.26/
      
      line=$(sed -n ${SLURM_ARRAY_TASK_ID}p <<<"$data")
      read name fwd rev <<<"$line"
      
      bwa mem -5SP -t 48 \
          $DIR/5-curate/hic/0-db/$name $RAW/$fwd $RAW/$rev |
          samblaster |
          samtools view -bS -F 2316 |
          samtools sort -m 60G \
                   -o $DIR/5-curate/hic/1-bwa/$name.bam
      
      samtools index $DIR/5-curate/hic/1-bwa/$name.bam
              

Finialize

yahs split some contigs that are now adjacent in the agp file. Finding and combining all those to reduce gaps.

ml python
PATH=$DIR/apps/agp-curator/:~/.local/bin:$PATH

cat <<<"$genomes" |
    while read name; do
        magpie --simplify \
               $DIR/5-curate/$name.magpie \
               $DIR/4-arrange/$name.agp \
               > $DIR/5-curate/$name.agp

        agptools assemble \
                 $DIR/2-polish/$name.fa \
                 $DIR/5-curate/$name.agp |
            gzip > $DIR/5-curate/$name.fa.gz
    done

Kd

Kd was curated manually. Only thing needed is to orient chrs based on kirkii

DIR=.

PATH=$DIR/apps/samtools-1.17/:$PATH

Kd=$DIR/Kokia_drynarioides_JFW-HI/Kokia_drynarioides.fasta
#  minimap2 -x asm5 $DIR/0-ref/kirkii.fa.gz $Kd |
cat $DIR/5-curate/Kd.paf |
      awk '$11 >= 100000 {
         i = 1;
         if($5 == "-") i = -1;
         _[$1] += i; }
       END{for(i in _) print i, _[i];}' OFS="\t" |
      while read chr dir; do
          flags="--mark-strand no"
          if [ $dir -lt 0 ]; then
              flags="-i $flags"
          fi

          samtools faidx $flags $Kd $chr
      done |
      gzip > $DIR/5-curate/Kd.fa.gz

NCBI Submission

Mostly ONT adapter contamination found on the end of internal contigs; however, since I’ve uploaded gapped fasta files, the contamination check won’t remove them.

use strict;
use warnings;

my ($contam_file, $genome_file) = @ARGV;
my $fh;

my $contam = {};
open($fh, $contam_file);
my $headers;
while(<$fh>){
    chomp;

    #skip empty lines
    next unless length $_;

    if($_ eq "Exclude:" || $_ eq "Trim:"){
        $_ = readline $fh;
        chomp;

        $headers=[split ", "];
        next;
    }

    #skip header lines
    next unless defined $headers;

    my %d;
    @d{@$headers} = split "\t";

    $contam->{$d{"Sequence name"}} ||= [];
    push(@{$contam->{$d{"Sequence name"}}}, $d{"span(s)"});
}
close($fh);

open($fh, $genome_file);
my $curpos;
while(<$fh>) {
    next unless s/^>//;
    chomp;

    my $h = $_;
    my $s ="";
    for ($curpos = tell($fh); $_ = readline($fh); $curpos = tell($fh)) {
        if(/^>/){
            seek($fh, $curpos, 0);
            last;
        }
        chomp;
        $s .= $_;
    }

    if($contam->{$h}){

        # trim
        if(defined($contam->{$h}->[0])){
            foreach my $range (@{$contam->{$h}}){
                my ($start, $end) = split "[.][.]", $range, 2;
                $start -= 1;
                my $length = $end - $start;

                my $type;
                if($start == 0 ||
                   substr($s, $start-1, 1) eq "N" ||
                   substr($s, $end, 1) eq "N" ||
                   substr($s, $end, 1) eq "") {
                    $type = "Extending";
                } else {
                    $type = "Adding";
                }

                print STDERR join "\t", $h, $range, $length, $type,
                    substr($s, $start, $length);
                print STDERR "\n";

                my $check = length($s);
                substr($s, $start, $length, "X"x$length);
                die "$check != " . length($s)
                    unless $check == length($s);

                print STDERR substr($s, $start, $length);
                print STDERR "\n"
            }
        # exclude
        }else{
            print STDERR join "\t", $h, "Exclude";
            print STDERR "\n";
            next;
        }
    }

    my $gap = "N"x100;
    printf ">%s\n%s\n", $h,
        join($gap, grep {length($_) >= 200} split "[NXnx]+", $s);
}
close($fh);

First Pass

for i in Kc Kd Kk; do
    ./6-ncbi/remove.contam.pl \
        6-ncbi/first/Contamination_$i.txt \
        5-curate/$i.fa \
        2> 6-ncbi/first/$i.report |
        fold -w60 > 6-ncbi/first/$i.fa;
done

Second Pass.

for i in Kc Kk; do
    gunzip 6-ncbi/$i.fa.gz
    ./6-ncbi/remove.contam.pl \
        6-ncbi/second/Contamination_$i.2.txt \
        6-ncbi/$i.fa \
        2> 6-ncbi/second/$i.report |
        fold -w60 > 6-ncbi/second/$i.fa;
    ./6-ncbi/seq-lengh.pl 6-ncbi/second/$i.fa \
        > 6-ncbi/second/$i.len
done
use strict;
use warnings;

use List::Util qw/min max/;

my ($genome_file) = @ARGV;
my $fh;

open($fh, $genome_file);
my $curpos;
while(<$fh>) {
    next unless s/^>//;
    chomp;

    my $h = $_;
    my $s ="";
    for ($curpos = tell($fh); $_ = readline($fh); $curpos = tell($fh)) {
        if(/^>/){
            seek($fh, $curpos, 0);
            last;
        }
        chomp;
        $s .= $_;
    }

    my @contigs = map {length} split 'NN*', $s;

    print join "\t", $h, length($s), scalar(@contigs),
        min(@contigs), max(@contigs);
    print "\n";
}
close($fh);