Skip to content

Latest commit

 

History

History
610 lines (465 loc) · 20.1 KB

readme.org

File metadata and controls

610 lines (465 loc) · 20.1 KB

Whole Genome Sequencing Samples

ROOT=$(git rev-parse --show-toplevel)
DIR=$ROOT/wgs/0-raw/

PATH=$DIR/ont-guppy/bin/:$PATH
id="${flowcells[$SLURM_ARRAY_TASK_ID]}"

SCRATCH=/local/scratch/tony.arick/$SLURM_JOB_ID

cp $DIR/$id/fast5_all.tar $SCRATCH

mkdir $SCRATCH/fast5/ $SCRATCH/super
tar -C $SCRATCH/fast5 -xvf $SCRATCH/fast5_all.tar

guppy_basecaller -c dna_r10.4.1_e8.2_260bps_sup.cfg \
    --compress_fastq \
    --device "cuda:1:100%" \
    --input_path $SCRATCH/fast5 \
    --save_path $SCRATCH/super

cp -r $SCRATCH/super $DIR/$id/

Kokia kauaiensis S9

Nanopore

The machine crashed several times on the last couple run (suspected software issues).

IDDirectory
FAT17276Kk/20220401_1133_X3_FAT17276_1c6df87c
FAT09348Kk/20220401_1507_X4_FAT09348_18327be0
FAU07188Kk/20220610_1415_X4_FAU07188_e2f8daf7
FAU08590Kk/20220610_1415_X5_FAU08590_f1023326
FAU93705.1Kk/20230207_2137_X3_FAU93705_83076433
FAU93705.2Kk/20230208_1718_X3_FAU93705_32b3c63f
FAU92850.1Kk/20230215_2215_X5_FAU92850_879f24da
FAU92850.2Kk/20230217_1422_X5_FAU92850_880260d2
FAU92850.3Kk/20230218_0141_X5_FAU92850_5e177b5a

First two runs were super basecalled on the GridION. This took 9 days while using the V100s takes 8hrs. So, the second set of runs (20220610) were super basecalled on the V100s, making the names slightly different, hence the new table.

IDDirectory
FAT17276Kk/20220401_1133_X3_FAT17276_1c6df87c/fastq_pass.tar
FAT09348Kk/20220401_1507_X4_FAT09348_18327be0/fastq_pass.tar
FAU07188Kk/20220610_1415_X4_FAU07188_e2f8daf7/super.pass.tar
FAU08590Kk/20220610_1415_X5_FAU08590_f1023326/super.pass.tar
FAU93705.1Kk/20230207_2137_X3_FAU93705_83076433/super.pass.tar
FAU93705.2Kk/20230208_1718_X3_FAU93705_32b3c63f/super.pass.tar
FAU92850.1Kk/20230215_2215_X5_FAU92850_879f24da/super.pass.tar
FAU92850.2Kk/20230217_1422_X5_FAU92850_880260d2/super.pass.tar
FAU92850.3Kk/20230218_0141_X5_FAU92850_5e177b5a/super.pass.tar

tar data

tar‘ing fastq and fast5 files to simplify transfer. Both pass and fail fast5 files are added together. The file name labels the data as pass/fail. Running md5sum pre- and post- tar to validate.

check () {
    error=$1
    if [ $error -eq 0 ]; then
        echo "[ OK ]"
    else
        echo "[FAIL]"
    fi
}



for id in "${flowcells[@]}"; do
    pushd $DIR/$id/

    printf "compressing summary ............ "
    gzip sequencing_summary_*.txt
    check $?

    for dir in fastq_{fail,pass} fast5_pass; do
        printf "pre-tar md5sum [$dir] .... "
        find $dir -type f -execdir md5sum {} \; > $dir.pre.md5
        check $?

        printf "tar files [$dir] ......... "
        tar -C $dir -cf $dir.tar ./
        check $?

        printf "post-tar md5sum [$dir] ... "
        tar --to-command=md5sum -vxf $dir.tar |
            awk 'NR > 2{printf "%s  ./%s\n", $2, $1}' \
                RS='./' FS="[\n ]" > $dir.post.md5
        check $?

        printf "compare md5sum [$dir] .... "
        cmp --quiet $dir.{pre,post}.md5
        check $?
    done

    dir=fast5_fail
    mv fast5_pass.tar fast5_all.tar

    printf "pre-tar md5sum [$dir] .... "
    find $dir -type f -execdir md5sum {} \; |
        cat fast5_pass.pre.md5 - > fast5_all.pre.md5
    check $?

    printf "tar files [$dir] ......... "
    tar -C $dir -rf fast5_all.tar ./
    check $?

    printf "post-tar md5sum [$dir] ... "
    tar --to-command=md5sum -vxf $dir.tar |
        awk 'NR > 2{printf "%s  ./%s\n", $2, $1}' \
            RS='./' FS="[\n ]" > fast5_all.post.md5
    check $?

    printf "compare md5sum [$dir] .... "
    cmp --quiet fast5_all.{pre,post}.md5
    check $?



    popd
done

Super Basecalling

<<basecaller>>
  • tar sup basecalled reads
    check () {
        error=$1
        if [ $error -eq 0 ]; then
            echo "[ OK ]"
        else
            echo "[FAIL]"
        fi
    }
    
    for id in "${flowcells[@]}"; do
        pushd $DIR/$id/
    
        printf "compressing summary ............ "
        gzip -c super/sequencing_summary.txt > super.sequencing_summary.txt.gz
        check $?
    
        for dir in pass fail; do
            printf "pre-tar md5sum [$dir] .... "
            find super/$dir -type f -execdir md5sum {} \; > super.$dir.pre.md5
            check $?
    
            printf "tar files [$dir] ......... "
            tar -C super/$dir -cf super.$dir.tar ./
            check $?
    
            printf "post-tar md5sum [$dir] ... "
            tar --to-command=md5sum -vxf super.$dir.tar |
                awk 'NR > 2{printf "%s  ./%s\n", $2, $1}' \
                    RS='./' FS="[\n ]" > super.$dir.post.md5
            check $?
    
            printf "compare md5sum [$dir] .... "
            cmp --quiet super.$dir.{pre,post}.md5
            check $?
        done
        popd
    done
    
        

Hi-C

Hi-C library made by Phase-Genomics plant kit

nameforwardreverse
kk-hicKk/hi-c/Kk_HiC_CKDL220020123-1A_HCWYNDSX5_L1_1.fq.gzKk/hi-c/Kk_HiC_CKDL220020123-1A_HCWYNDSX5_L1_2.fq.gz

Kokia cookei C69

Nanopore

Kc runs follow the same patter as Kk above.

IDDirectory
FAT09344Kc/20220401_1133_X1_FAT09344_ed52cffd/
FAT12726Kc/20220401_1507_X2_FAT12726_12343096/
FAU08661Kc/20220610_1415_X1_FAU08661_325fe942/
FAU08677Kc/20220610_1415_X2_FAU08677_f6a3b4f3/
FAU95766.1Kc/20230207_2137_X1_FAU95766_25130a77/
FAU95766.2Kc/20230208_1718_X1_FAU95766_6dbf6786/
FAU97335.1Kc/20230215_2215_X4_FAU97335_633c4b27
FAU97335.2Kc/20230217_1422_X4_FAU97335_1ba4d19b
FAU97335.3Kc/20230218_0141_X4_FAU97335_37bfe488
IDDirectory
FAT09344Kc/20220401_1133_X1_FAT09344_ed52cffd/fastq_pass.tar
FAT12726Kc/20220401_1507_X2_FAT12726_12343096/fastq_pass.tar
FAU08661Kc/20220610_1415_X1_FAU08661_325fe942/super.pass.tar
FAU08677Kc/20220610_1415_X2_FAU08677_f6a3b4f3/super.pass.tar
FAU95766.1Kc/20230207_2137_X1_FAU95766_25130a77/super.pass.tar
FAU95766.2Kc/20230208_1718_X1_FAU95766_6dbf6786/super.pass.tar
FAU97335.1Kc/20230215_2215_X4_FAU97335_633c4b27/super.pass.tar
FAU97335.2Kc/20230217_1422_X4_FAU97335_1ba4d19b/super.pass.tar
FAU97335.3Kc/20230218_0141_X4_FAU97335_37bfe488/super.pass.tar
tar data
see Kk
  • Super Basecalling
    <<basecaller>>
        
    • tar sup basecalled reads
      <<backup-sup>>
              
  • Combine fastq data
    for dir in "${flowcells[@]}"; do
        tar -Oxf $DIR/$dir/fastq_pass.tar > $DIR/$dir/fastq_pass.fq.gz
    done
        

Quality

Plot quality heatmap using sequencing_summary. Limited the x and y axis to 3 IQR range.

library(tidyverse)
library(data.table)

outlier_value <- function(x) {
quantile(x, probs=0.75) + 3* IQR(x)
}

files <- list.files('Kc', pattern="sequencing_summary_.*.txt", recursive = T, full.names = T)
names(files) <- sub('Kc/(.*)/.*', '\\1', files)

data <- lapply(files, fread) %>%
  bind_rows(.id='library') %>%
  select(library, sequence_length_template, mean_qscore_template) %>%
  mutate(length_outlier = sequence_length_template > outlier_value(sequence_length_template),
         score_outlier = mean_qscore_template > outlier_value(mean_qscore_template)) %>%
  mutate(x = ifelse(length_outlier, outlier_value(sequence_length_template), sequence_length_template),
         y = ifelse(score_outlier, outlier_value(mean_qscore_template), mean_qscore_template),
         outlier = length_outlier | score_outlier)


ggplot(data, aes(x, y, shape=outlier)) +
  geom_bin2d(binwidth=c(100, 0.5)) +
  scale_x_continuous(expand=c(0,0)) +
  scale_y_continuous(expand=c(0,0)) +
  facet_grid(rows='library') +
  theme_minimal()

Hi-C

Hi-C library made by Phase-Genomics plant kit

nameforwardreverse
kc-hicKc/hi-c/Kc_HiC_CKDL220020122-1A_HCWYNDSX5_L1_1.fq.gzKc/hi-c/Kc_HiC_CKDL220020122-1A_HCWYNDSX5_L1_2.fq.gz

Kokia drynarioides JFW-HI

Biosample: SAMN07559306

Nanopore

QCQCRunRun
IDPoresDatePoresDate
FAL2934215732019-08-1315362019-08-26
FAL2295311812019-08-1311612019-08-26
FAL1843512822019-08-1312532019-08-26
FAL1864712702019-08-1312102019-08-26
FAL2955813682019-08-1313622019-08-26
IDDirectory
FAL29342kd/nanopore/20190826_1854_GA10000_FAL29342_89e34370/
FAL22953kd/nanopore/20190826_1854_GA20000_FAL22953_79be28df/
FAL18435kd/nanopore/20190826_1854_GA30000_FAL18435_98887108/
FAL18647kd/nanopore/20190826_1854_GA40000_FAL18647_98f13006/
FAL29558kd/nanopore/20190826_1854_GA50000_FAL29558_fd9e83e6/
ROOT=$(git rev-parse --show-toplevel)

ml singularity
abyss-fac () {
    singularity exec -B$ROOT \
        /apps/singularity-3/abyss/abyss-v2.1.5-7-deb_cv1.sif \
        /usr/lib/abyss/abyss-fac $@
}

for name in "${!flowcells[@]}"; do
     abyss-fac $DIR/${flowcells[$name]}/fastq_pass.fq.gz
done
namenn:500L50minN75N50N25E-sizemaxsum
FAL2934233093503211137713370500411577201200686838196416.96e9
FAL2295320136541945000438810500475685691268993907415411.17e9
FAL1843526301472566886604515500444679961176287328507514.27e9
FAL1864731392793066621832101500458569879652757715891516.37e9
FAL2955828748912794287696116500437171301020277738426914.35e9

Super Accurate Basecalling

Two runs (0 and 2) had an issue, copied below, with read splitting. Removed the flag and reran just those.

[guppy/error] pipeline::ThreadedNode::worker_function: Exception thrown in dataHandlerNode worker thread: Basecall telemetry received for non-existent run segment of run ID

ml singularity
ROOT=$(git rev-parse --show-toplevel)

guppy () {
    singularity exec -B$ROOT ~/guppy-gpu.sif guppy_basecaller "$@"
}


id="${flowcells[$SLURM_ARRAY_TASK_ID]}"

SCRATCH=/local/scratch/tony.arick/$SLURM_JOB_ID

cp $DIR/$id/fast5_pass.tar $DIR/$id/fast5_fail.tar $SCRATCH

mkdir $SCRATCH/fast5/ $SCRATCH/super
tar -C $SCRATCH/fast5 -xvf $SCRATCH/fast5_pass.tar
tar -C $SCRATCH/fast5 -xvf $SCRATCH/fast5_fail.tar

if [ -e $DIR/$id/fast5_skip.tar ]; then
    cp $DIR/$id/fast5_skip.tar $SCRATCH
    tar -C $SCRATCH/fast5 -xvf $SCRATCH/fast5_skip.tar
fi

guppy -c dna_r9.4.1_450bps_sup_plant.cfg \
    --compress_fastq \
    --device "cuda:all:100%" \
    --input_path $SCRATCH/fast5 \
    --save_path $SCRATCH/super

cp -r $SCRATCH/super $DIR/$id/sup
  • tar files as usual
    datadir=("sup/super/" "sup/" "sup/super/" "sup/" "sup/")
    
    for i in {0..4}; do
        id="${flowcells[$i]}"
        pushd $DIR/$id/
    
        printf "Compressing summary    "
        gzip -c ${datadir[$i]}/sequencing_summary.txt > sup.sequencing_summary.txt.gz
        echo "[ OK ]"
        for dir in pass fail; do
            printf "Pre tar md5sum         "
            find ${datadir[$i]}/$dir -type f -execdir md5sum {} \; > sup.$dir.pre.md5 && echo "[ OK ]" || echo "[FAIL]"
    
            printf "Tar files              "
            tar -C ${datadir[$i]}/$dir -cf sup.$dir.tar ./ && echo "[ OK ]" || echo "[FAIL]"
    
            printf "Post tar md5sum        "
            tar --to-command=md5sum -vxf sup.$dir.tar |
                awk 'NR > 2{printf "%s  ./%s\n", $2, $1}' \
                    RS='./' FS="[\n ]" > sup.$dir.post.md5 && echo "[ OK ]" || echo "[FAIL]"
            printf "Check md5sum           "
            cmp --quiet sup.$dir.{pre,post}.md5 && echo "[ OK ]" || echo "[FAIL]"
        done
    
        printf "Create fastq file      "
        tar -Oxf sup.pass.tar > sup.pass.fq.gz && echo "[ OK ]" || echo "[FAIL]"
    
       popd
    done
        
  • Summary stats
    ml singularity
    ROOT=$(git rev-parse --show-toplevel)
    
    abyss-fac () {
     /apps/singularity-3/abyss/abyss-v2.1.5-7-deb_cv1.sif /usr/lib/abyss/abyss-fac "$@"
    }
    
    abyss-fac $(printf "$DIR/%s/sup.pass.fq.gz " "${flowcells[@]}")
        
namenn:500L50minN75N50N25E-sizemaxsum
FAL2934223990802335266524164500402375231174084596254312.16e9
FAL2295315880781496354334293500418376281151084142399827.665e9
FAL18435210145920448494840965004337783711525850512283611.17e9
FAL186472853813274729872873250040976425897369197067913.24e9
FAL295582622225250922561713050039226512942271056351311.7e9
Total115646551113299226884154112718510634788023998255.94e9

cleanup/backup

  • Since lustre doesn’t like lots of small files, tar the fast5 and fastq directories. Run md5sum pre- and post- tar for paranoia. Not gzip‘ing since the data being tar‘d is already compressed. Consolidate duplex run into pass/fail fastq files.

    Base-calling for two of the runs was cut short and restarted. Combining the original call directory (fastq_pass) and the restart directory (fastq_new) into the fastq_pass.tar.gz. The restarted reads start with fastq_runid_.

      for id in "${flowcells[@]}"; do
          pushd $DIR/$id/
    
          gzip *_sequencing_summary.txt fastq_{pass,fail}/*.fastq
          for dir in fast{5,q}_{fail,pass} fast5_skip; do
              [ -d $dir ] || continue
              find $dir -type f -execdir md5sum {} \; > $dir.pre.md5
              tar -C $dir -vcf $dir.tar ./
              tar --to-command=md5sum -vxf $dir.tar |
                  awk 'NR > 2{printf "%s  ./%s\n", $2, $1}' \
                      RS='./' FS="[\n ]" > $dir.post.md5
              diff $dir.{pre,post}.md5 > $dir.md5.check
          done
    
          if [ -d "fastq_new" ] ; then
              gzip fastq_new/*.fastq
    
              gzip --stdout fastq_new/sequencing_summary.txt >> *_sequencing_summary.txt.gz
              rm fastq_new/*.log fastq_new/sequencing_summary.txt fastq_new/sequencing_telemetry.js
    
              find fastq_new -type f -execdir md5sum {} \; >> fastq_pass.pre.md5
              tar -C fastq_new -vrf fastq_pass.tar ./
              tar --to-command=md5sum -vxf fastq_pass.tar |
                  awk 'NR > 2{printf "%s  ./%s\n", $2, $1}' \
                      RS='./' FS="[\n ]" > fastq_pass.post.md5
              diff fastq_pass.{pre,post}.md5  > fastq_pass.md5.check
          fi
    
    
          popd
    done
        
  • The following will get the fastq data from the tar file into a single fastq.gz file
    ROOT=$(git rev-parse --show-toplevel)
    
    for dir in "${flowcells[@]}"; do
        tar -Oxf $DIR/$dir/fastq_pass.tar > $DIR/$dir/fastq_pass.fq.gz
    done
        

    FAL29558 and FAL18435 had malformed fastq files. Excluding them since we have coverage to spare.

    tar -Oxf $DIR/kd/nanopore/20190826_1854_GA50000_FAL29558_fd9e83e6/fastq_pass.tar \
        --exclude=./FAL29558_c95ea3ae2ba850cfa310d38dc7f59268d6e2e2b8_628.fastq.gz \
        > $DIR/kd/nanopore/20190826_1854_GA50000_FAL29558_fd9e83e6/fastq_pass.fq.gz
    
    tar -Oxf $DIR/kd/nanopore/20190826_1854_GA30000_FAL18435_98887108/fastq_pass.tar \
        --exclude=./FAL18435_a661f44b2edcc62a3f030caaf8581f84b2fa6838_373.fastq.gz \
        > $DIR/kd/nanopore/20190826_1854_GA30000_FAL18435_98887108/fastq_pass.fq.gz
    
        
  • Add data to git-annex
    for id in "${flowcells[@]}"; do
        pushd $DIR/$id/
    
        git add report.pdf final_summary.txt
        git annex add *sequencing_summary.txt.gz \
            fast?_{pass,fail}.tar \
            fastq_pass.fq.gz \
            report.md duty_time.csv.gz throughput.csv.gz
        popd
    done
        

Illumina

RunInsertSizeInstrument
SRR6195037350Illumina MiSeq
SRR6195036350Illumina MiSeq
SRR6195040350Illumina HiSeq 2000
SRR6195039550Illumina MiSeq
SRR6195038550Illumina MiSeq
SRR6195041550Illumina HiSeq 2000
cd $DIR/kd/illumina/

for SRA in "${acc[@]}"; do

    URL=ftp://ftp.sra.ebi.ac.uk/vol1/fastq/${SRA:0:6}/

    if [ ${#SRA} -gt 9 ]; then
        DIR=00${SRA:9}
        URL+=${DIR: -3}/
    fi

    URL+=$SRA

    wget $URL/${SRA}_1.fastq.gz
    wget $URL/${SRA}_2.fastq.gz

done

Hi-C

Hi-C library made by Phase-Genomics

nameforwardreverse
kd-hickd/hi-c/kokia_S3HiC_R1.fastq.gzkd/hi-c/kokia_S3HiC_R2.fastq.gz

RNA-Seq

RunInstrument
SRR6195950HiSeq X Ten

SRA upload

ROOT=$(git rev-parse --show-toplevel)

for name in "${!kc[@]}"; do
    file=${kc[$name]}
    tar -Oxf $DIR/$file | zcat -f > $DIR/for-ncbi/$name.fq
done

for name in "${!kk[@]}"; do
    file=${kk[$name]}
    tar -Oxf $DIR/$file | zcat -f > $DIR/for-ncbi/$name.fq
done

for name in "${!kd[@]}"; do
    file=${kd[$name]}
    cp $DIR/$file/sup.pass.fq.gz $DIR/for-ncbi/$name.fq.gz
done