Skip to content

Methods for assembling and annotating four Campylobacter jejuni strains

Notifications You must be signed in to change notification settings

IGBB/campylobacter_jejuni

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

4 Commits
 
 

Repository files navigation

Assembly and Annotation of Four C. jejuni Strains

Raw Data

StrainAccessionType
MS2167SRR20020687Illumina
MS2074SRR20020688Illumina
MS2058SRR20020689Illumina
MS2005SRR20020690Illumina
MS2167SRR20020691Nanopore
MS2074SRR20020692Nanopore
MS2058SRR20020693Nanopore
MS2005SRR20020694Nanopore
  • Download SRA data
    ROOT=$(git rev-parse --show-toplevel)
    
    for SRA in "${sra[@]}"; do
    
        URL=ftp://ftp.sra.ebi.ac.uk/vol1/fastq/${SRA:0:6}/
    
        if [ ${#SRA} -gt 9 ]; then
            TMP=00${SRA:9}
            URL+=${TMP: -3}/
        fi
    
        URL+=$SRA
    
        wget -O $ROOT/0-raw/${SRA}_1.fastq.gz $URL/${SRA}_1.fastq.gz
        wget -O $ROOT/0-raw/${SRA}_2.fastq.gz $URL/${SRA}_2.fastq.gz
    
    done
    
        

Assemble

  1. Canu
    ROOT=$(git rev-parse --show-toplevel)
    cd $ROOT
    
    PATH=$ROOT/apps/canu-1.9/bin:$PATH
    
    name=${names[$SLURM_ARRAY_TASK_ID]}
    
    canu -d $ROOT/1-assemble/1-canu/$name -p "$name" \
        genomeSize=2m useGrid=false \
        -nanopore-raw $ROOT/0-raw/${data[$name]}_1.fastq.gz
    
    
        
  2. Trim
    ROOT=$(git rev-parse --show-toplevel)
    DIR=$ROOT/1-assemble
    cd $DIR
    
    name=${names[$SLURM_ARRAY_TASK_ID]}
    assembly=$DIR/1-canu/$name/$name.contigs.fasta
    
    ml igbb samtools blast
    ml singularity
    
    mkdir $DIR/2-trim/$name
    
    grep '>' -A10 --no-group-separator $assembly |
        blastn -query - -subject $assembly -outfmt '6 std qcovs' -out $DIR/2-trim/$name/blast.out
    
    awk '$1 == $2 && $3 >= 95 && $13 >= 95 && $9 > 1000 && $9 < $10 && $1 != last {
         printf "%s:1-%d\n", $1, $9; last=$1}' $DIR/2-trim/$name/blast.out \
             > $DIR/2-trim/$name/trim.lst
    
    xargs samtools faidx $assembly \
        < $DIR/2-trim/$name/trim.lst \
        > $DIR/2-trim/$name/trim.fa
    
    sed 's/:.*//' $DIR/2-trim/$name/trim.lst |
        grep -f - -v $assembly.fai |
        cut -f 1 > $DIR/2-trim/$name/unchanged.txt
    
    cat $DIR/2-trim/$name/unchanged.txt |
        xargs samtools faidx $assembly >> $DIR/2-trim/$name/trim.fa
    
        
  3. Fix start
    >NC_000913.3:c3883729-3882326 Escherichia coli str. K-12 substr. MG1655, complete genome
    GTGTCACTTTCGCTTTGGCAGCAGTGTCTTGCCCGATTGCAGGATGAGTTACCAGCCACAGAATTCAGTA
    TGTGGATACGCCCATTGCAGGCGGAACTGAGCGATAACACGCTGGCCCTGTACGCGCCAAACCGTTTTGT
    CCTCGATTGGGTACGGGACAAGTACCTTAATAATATCAATGGACTGCTAACCAGTTTCTGCGGAGCGGAT
    GCCCCACAGCTGCGTTTTGAAGTCGGCACCAAACCGGTGACGCAAACGCCACAAGCGGCAGTGACGAGCA
    ACGTCGCGGCCCCTGCACAGGTGGCGCAAACGCAGCCGCAACGTGCTGCGCCTTCTACGCGCTCAGGTTG
    GGATAACGTCCCGGCCCCGGCAGAACCGACCTATCGTTCTAACGTAAACGTCAAACACACGTTTGATAAC
    TTCGTTGAAGGTAAATCTAACCAACTGGCGCGCGCGGCGGCTCGCCAGGTGGCGGATAACCCTGGCGGTG
    CCTATAACCCGTTGTTCCTTTATGGCGGCACGGGTCTGGGTAAAACTCACCTGCTGCATGCGGTGGGTAA
    CGGCATTATGGCGCGCAAGCCGAATGCCAAAGTGGTTTATATGCACTCCGAGCGCTTTGTTCAGGACATG
    GTTAAAGCCCTGCAAAACAACGCGATCGAAGAGTTTAAACGCTACTACCGTTCCGTAGATGCACTGCTGA
    TCGACGATATTCAGTTTTTTGCTAATAAAGAACGATCTCAGGAAGAGTTTTTCCACACCTTCAACGCCCT
    GCTGGAAGGTAATCAACAGATCATTCTCACCTCGGATCGCTATCCGAAAGAGATCAACGGCGTTGAGGAT
    CGTTTGAAATCCCGCTTCGGTTGGGGACTGACTGTGGCGATCGAACCGCCAGAGCTGGAAACCCGTGTGG
    CGATCCTGATGAAAAAGGCCGACGAAAACGACATTCGTTTGCCGGGCGAAGTGGCGTTCTTTATCGCCAA
    GCGTCTACGATCTAACGTACGTGAGCTGGAAGGGGCGCTGAACCGCGTCATTGCCAATGCCAACTTTACC
    GGACGGGCGATCACCATCGACTTCGTGCGTGAGGCGCTGCGCGACTTGCTGGCATTGCAGGAAAAACTGG
    TCACCATCGACAATATTCAGAAGACGGTGGCGGAGTACTACAAGATCAAAGTCGCGGATCTCCTTTCCAA
    GCGTCGATCCCGCTCGGTGGCGCGTCCGCGCCAGATGGCGATGGCGCTGGCGAAAGAGCTGACTAACCAC
    AGTCTGCCGGAGATTGGCGATGCGTTTGGTGGCCGTGACCACACGACGGTGCTTCATGCCTGCCGTAAGA
    TCGAGCAGTTGCGTGAAGAGAGCCACGATATCAAAGAAGATTTTTCAAATTTAATCAGAACATTGTCATC
    GTAA
    
        
    ROOT=$(git rev-parse --show-toplevel)
    DIR=$ROOT/1-assemble
    cd $DIR
    
    ml singularity
    
    name=${names[$SLURM_ARRAY_TASK_ID]}
    assembly=$DIR/2-trim/$name/trim.fa
    
    mkdir $DIR/3-fixstart/$name
    
    singularity exec -B$ROOT --cleanenv $ROOT/apps/circlator.simg \
        circlator fixstart \
        --genes_fa $DIR/3-fixstart/dnaa.fa \
        --ignore $DIR/2-trim/$name/unchanged.txt \
        $assembly \
        $DIR/3-fixstart/$name/fixstart
    
        
  4. Filter
    ROOT=$(git rev-parse --show-toplevel)
    DIR=$ROOT/1-assemble
    cd $DIR
    
    ml igbb samtools bwa
    
    name=${names[$SLURM_ARRAY_TASK_ID]}
    assembly=$DIR/3-fixstart/$name/fixstart.fasta
    
    mkdir $DIR/4-filter/$name
    
    bwa index $assembly
    
    bwa mem -t $SLURM_CPUS_PER_TASK $assembly \
        $ROOT/raw/${data[$name]}_{1,2}.fastq.gz |
        samtools view -bS -F4 - |
        samtools sort -o $DIR/4-filter/$name/frags.bam -T $DIR/4-filter/$name/frags.tmp -
    
    samtools index $DIR/4-filter/$name/frags.bam
    
        
    ROOT=$(git rev-parse --show-toplevel)
    DIR=$ROOT/1-assemble
    cd $DIR
    
    ml igbb samtools bwa
    
    name=${names[$SLURM_ARRAY_TASK_ID]}
    assembly=$DIR/3-fixstart/$name/fixstart.fasta
    
    ml igbb samtools
    bedtools () { $ROOT/apps/bedtools-2.29.2 $@; }
    
    samtools faidx $assembly
    
    bedtools genomecov -ibam $DIR/4-filter/$name/frags.bam > $DIR/4-filter/$name/genome.cov
    
    awk '$1 != "genome" && $2 == 0 && $5 == 1 {s[$1]++}
         {_[$1]++}
         END {
           s["genome"]++;
           for(k in _)
             if(!s[k])
               print k
         }' $DIR/4-filter/$name/genome.cov > $DIR/4-filter/$name/filter.lst
    
    
    xargs samtools faidx $assembly \
        < $DIR/4-filter/$name/filter.lst\
        > $DIR/4-filter/$name/filtered.fa
    
        
  5. Pilon
    ROOT=$(git rev-parse --show-toplevel)
    DIR=$ROOT/1-assemble
    cd $DIR
    
    ml igbb samtools bwa
    
    name=${names[$SLURM_ARRAY_TASK_ID]}
    assembly=$DIR/4-filter/$name/filtered.fa
    
    ml igbb bwa samtools
    
    mkdir $DIR/5-pilon/$name
    
    java -Xmx16G -jar $ROOT/apps/pilon-1.23.jar \
        --genome $assembly \
        --frags $DIR/4-filter/$name/frags.bam \
        --outdir $DIR/5-pilon/$name/ \
        --output pilon \
        --changes
    
        
  6. Coverage
    ROOT=$(git rev-parse --show-toplevel)
    DIR=$ROOT/1-assemble
    cd $DIR
    
    ml igbb samtools bwa minimap2
    ml singularity
    bedtools () { $ROOT/apps/bedtools-2.29.2 $@; }
    
    name=${names[$SLURM_ARRAY_TASK_ID]}
    assembly=$DIR/5-pilon/$name/pilon.fasta
    
    mkdir $DIR/6-coverage/$name
    
    minimap2 -x map-ont -d $assembly.mmi $assembly
    bwa index $assembly
    samtools faidx $assembly
    
    minimap2 -ax map-ont -t $SLURM_CPUS_PER_TASK -a --sam-hit-only \
        $assembly.mmi $ROOT/0-raw/${nanopore[$name]}_1.fastq.gz |
        samtools sort -m10G -o $DIR/6-coverage/$name/nanopore.bam \
            -T $DIR/6-coverage/$name/nanopore.tmp -
    
    bwa mem -t $PBS_NUM_PPN -R "@RG\tID:$name\tPL:illumina\tSM:$name\tLB:$name" \
        $assembly $ROOT/0-raw/${illumina[$name]}_{1,2}.fastq.gz |
        samtools fixmate -O bam - - |
        samtools sort -o $DIR/6-coverage/${name}/illumina.bam \
            -T $DIR/6-coverage/${name}/illumina.tmp -
    
    samtools index $DIR/6-coverage/${name}/illumina.bam
    
    for type in nanopore illumina; do
        bedtools genomecov -ibam $DIR/6-coverage/$name/$type.bam > $DIR/6-coverage/$name/$type.gencov
        bedtools genomecov -bga -ibam $DIR/6-coverage/$name/$type.bam > $DIR/6-coverage/$name/$type.perbase
    
        bedtools makewindows -g $assembly.fai -n 1 |
            bedtools coverage -sorted -a - -b $DIR/6-coverage/$name/$type.bam -g $assembly.fai \
                > $DIR/6-coverage/$name/$type.hist
    
        bedtools makewindows -g $assembly.fai -w 100 |
            bedtools coverage -sorted -a - -b $DIR/6-coverage/$name/$type.bam -g $assembly.fai \
                > $DIR/6-coverage/$name/$type.coverage
    done
        
    for file in 6-coverage/MS*/*.gencov ; do
        echo $file;
        awk '{t[$1]+=$2*$3; c[$1]+=$3;} END {for(i in t) print i, t[i]/c[i];}' $file;
    done
        
MS2005IlluminaNanopore
genome587.654732.671
tig00000001:1-1748626_pilon571.423736.11
tig00000002:1-5175_pilon6746.58404.858
tig00000005_pilon0.060029815.5255
MS2058IlluminaNanopore
genome885.409465.478
tig00000002:1-40652_pilon1085.76231.062
tig00000006:1-1750266_pilon889.547472.265
tig00000007_pilon520.681409.633
MS2074IlluminaNanopore
genome986.434690.612
tig00000001:1-1621348_pilon986.434690.612
MS2169IlluminaNanopore
genome1007.31950.819
tig00000001:1-1702611_pilon1007.31950.819

Annotation

  1. Prokka
    ROOT=$(git rev-parse --show-toplevel)
    DIR=$ROOT/2-annotation
    cd $DIR
    
    
    ml singularity
    prokka () { singularity exec -B $ROOT $ROOT/apps/prokka-v1.13.sif prokka $@;}
    
    for name in "${names[@]]}"; do
        assembly=$ROOT/1-assembly/5-pilon/$name/pilon.fasta
    
        prokka --cpus 12 \
            --outdir $DIR/1-prokka/$name \
            --prefix $name \
            --locustag $name \
            $assembly
    done
        

About

Methods for assembling and annotating four Campylobacter jejuni strains

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published