ROOT=$(git rev-parse --show-toplevel)
DIR=$ROOT/wgs/0-raw/
PATH=$DIR/ont-guppy/bin/:$PATH
id="${flowcells[$SLURM_ARRAY_TASK_ID]}"
SCRATCH=/local/scratch/tony.arick/$SLURM_JOB_ID
cp $DIR/$id/fast5_all.tar $SCRATCH
mkdir $SCRATCH/fast5/ $SCRATCH/super
tar -C $SCRATCH/fast5 -xvf $SCRATCH/fast5_all.tar
guppy_basecaller -c dna_r10.4.1_e8.2_260bps_sup.cfg \
--compress_fastq \
--device "cuda:1:100%" \
--input_path $SCRATCH/fast5 \
--save_path $SCRATCH/super
cp -r $SCRATCH/super $DIR/$id/
The machine crashed several times on the last couple run (suspected software issues).
ID | Directory |
---|---|
FAT17276 | Kk/20220401_1133_X3_FAT17276_1c6df87c |
FAT09348 | Kk/20220401_1507_X4_FAT09348_18327be0 |
FAU07188 | Kk/20220610_1415_X4_FAU07188_e2f8daf7 |
FAU08590 | Kk/20220610_1415_X5_FAU08590_f1023326 |
FAU93705.1 | Kk/20230207_2137_X3_FAU93705_83076433 |
FAU93705.2 | Kk/20230208_1718_X3_FAU93705_32b3c63f |
FAU92850.1 | Kk/20230215_2215_X5_FAU92850_879f24da |
FAU92850.2 | Kk/20230217_1422_X5_FAU92850_880260d2 |
FAU92850.3 | Kk/20230218_0141_X5_FAU92850_5e177b5a |
First two runs were super basecalled on the GridION. This took 9 days while using the V100s takes 8hrs. So, the second set of runs (20220610) were super basecalled on the V100s, making the names slightly different, hence the new table.
ID | Directory |
---|---|
FAT17276 | Kk/20220401_1133_X3_FAT17276_1c6df87c/fastq_pass.tar |
FAT09348 | Kk/20220401_1507_X4_FAT09348_18327be0/fastq_pass.tar |
FAU07188 | Kk/20220610_1415_X4_FAU07188_e2f8daf7/super.pass.tar |
FAU08590 | Kk/20220610_1415_X5_FAU08590_f1023326/super.pass.tar |
FAU93705.1 | Kk/20230207_2137_X3_FAU93705_83076433/super.pass.tar |
FAU93705.2 | Kk/20230208_1718_X3_FAU93705_32b3c63f/super.pass.tar |
FAU92850.1 | Kk/20230215_2215_X5_FAU92850_879f24da/super.pass.tar |
FAU92850.2 | Kk/20230217_1422_X5_FAU92850_880260d2/super.pass.tar |
FAU92850.3 | Kk/20230218_0141_X5_FAU92850_5e177b5a/super.pass.tar |
tar
‘ing fastq
and fast5
files to simplify transfer. Both pass and fail fast5
files are added together. The file name labels the data as pass/fail. Running md5sum
pre- and post- tar
to validate.
check () {
error=$1
if [ $error -eq 0 ]; then
echo "[ OK ]"
else
echo "[FAIL]"
fi
}
for id in "${flowcells[@]}"; do
pushd $DIR/$id/
printf "compressing summary ............ "
gzip sequencing_summary_*.txt
check $?
for dir in fastq_{fail,pass} fast5_pass; do
printf "pre-tar md5sum [$dir] .... "
find $dir -type f -execdir md5sum {} \; > $dir.pre.md5
check $?
printf "tar files [$dir] ......... "
tar -C $dir -cf $dir.tar ./
check $?
printf "post-tar md5sum [$dir] ... "
tar --to-command=md5sum -vxf $dir.tar |
awk 'NR > 2{printf "%s ./%s\n", $2, $1}' \
RS='./' FS="[\n ]" > $dir.post.md5
check $?
printf "compare md5sum [$dir] .... "
cmp --quiet $dir.{pre,post}.md5
check $?
done
dir=fast5_fail
mv fast5_pass.tar fast5_all.tar
printf "pre-tar md5sum [$dir] .... "
find $dir -type f -execdir md5sum {} \; |
cat fast5_pass.pre.md5 - > fast5_all.pre.md5
check $?
printf "tar files [$dir] ......... "
tar -C $dir -rf fast5_all.tar ./
check $?
printf "post-tar md5sum [$dir] ... "
tar --to-command=md5sum -vxf $dir.tar |
awk 'NR > 2{printf "%s ./%s\n", $2, $1}' \
RS='./' FS="[\n ]" > fast5_all.post.md5
check $?
printf "compare md5sum [$dir] .... "
cmp --quiet fast5_all.{pre,post}.md5
check $?
popd
done
<<basecaller>>
tar
sup basecalled readscheck () { error=$1 if [ $error -eq 0 ]; then echo "[ OK ]" else echo "[FAIL]" fi } for id in "${flowcells[@]}"; do pushd $DIR/$id/ printf "compressing summary ............ " gzip -c super/sequencing_summary.txt > super.sequencing_summary.txt.gz check $? for dir in pass fail; do printf "pre-tar md5sum [$dir] .... " find super/$dir -type f -execdir md5sum {} \; > super.$dir.pre.md5 check $? printf "tar files [$dir] ......... " tar -C super/$dir -cf super.$dir.tar ./ check $? printf "post-tar md5sum [$dir] ... " tar --to-command=md5sum -vxf super.$dir.tar | awk 'NR > 2{printf "%s ./%s\n", $2, $1}' \ RS='./' FS="[\n ]" > super.$dir.post.md5 check $? printf "compare md5sum [$dir] .... " cmp --quiet super.$dir.{pre,post}.md5 check $? done popd done
Hi-C library made by Phase-Genomics plant kit
name | forward | reverse |
---|---|---|
kk-hic | Kk/hi-c/Kk_HiC_CKDL220020123-1A_HCWYNDSX5_L1_1.fq.gz | Kk/hi-c/Kk_HiC_CKDL220020123-1A_HCWYNDSX5_L1_2.fq.gz |
Kc runs follow the same patter as Kk above.
ID | Directory |
---|---|
FAT09344 | Kc/20220401_1133_X1_FAT09344_ed52cffd/ |
FAT12726 | Kc/20220401_1507_X2_FAT12726_12343096/ |
FAU08661 | Kc/20220610_1415_X1_FAU08661_325fe942/ |
FAU08677 | Kc/20220610_1415_X2_FAU08677_f6a3b4f3/ |
FAU95766.1 | Kc/20230207_2137_X1_FAU95766_25130a77/ |
FAU95766.2 | Kc/20230208_1718_X1_FAU95766_6dbf6786/ |
FAU97335.1 | Kc/20230215_2215_X4_FAU97335_633c4b27 |
FAU97335.2 | Kc/20230217_1422_X4_FAU97335_1ba4d19b |
FAU97335.3 | Kc/20230218_0141_X4_FAU97335_37bfe488 |
ID | Directory |
---|---|
FAT09344 | Kc/20220401_1133_X1_FAT09344_ed52cffd/fastq_pass.tar |
FAT12726 | Kc/20220401_1507_X2_FAT12726_12343096/fastq_pass.tar |
FAU08661 | Kc/20220610_1415_X1_FAU08661_325fe942/super.pass.tar |
FAU08677 | Kc/20220610_1415_X2_FAU08677_f6a3b4f3/super.pass.tar |
FAU95766.1 | Kc/20230207_2137_X1_FAU95766_25130a77/super.pass.tar |
FAU95766.2 | Kc/20230208_1718_X1_FAU95766_6dbf6786/super.pass.tar |
FAU97335.1 | Kc/20230215_2215_X4_FAU97335_633c4b27/super.pass.tar |
FAU97335.2 | Kc/20230217_1422_X4_FAU97335_1ba4d19b/super.pass.tar |
FAU97335.3 | Kc/20230218_0141_X4_FAU97335_37bfe488/super.pass.tar |
tar
data- see Kk
- Super Basecalling
<<basecaller>>
tar
sup basecalled reads<<backup-sup>>
- Combine fastq data
for dir in "${flowcells[@]}"; do tar -Oxf $DIR/$dir/fastq_pass.tar > $DIR/$dir/fastq_pass.fq.gz done
Plot quality heatmap using sequencing_summary. Limited the x and y axis to 3 IQR range.
library(tidyverse)
library(data.table)
outlier_value <- function(x) {
quantile(x, probs=0.75) + 3* IQR(x)
}
files <- list.files('Kc', pattern="sequencing_summary_.*.txt", recursive = T, full.names = T)
names(files) <- sub('Kc/(.*)/.*', '\\1', files)
data <- lapply(files, fread) %>%
bind_rows(.id='library') %>%
select(library, sequence_length_template, mean_qscore_template) %>%
mutate(length_outlier = sequence_length_template > outlier_value(sequence_length_template),
score_outlier = mean_qscore_template > outlier_value(mean_qscore_template)) %>%
mutate(x = ifelse(length_outlier, outlier_value(sequence_length_template), sequence_length_template),
y = ifelse(score_outlier, outlier_value(mean_qscore_template), mean_qscore_template),
outlier = length_outlier | score_outlier)
ggplot(data, aes(x, y, shape=outlier)) +
geom_bin2d(binwidth=c(100, 0.5)) +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
facet_grid(rows='library') +
theme_minimal()
Hi-C library made by Phase-Genomics plant kit
name | forward | reverse |
---|---|---|
kc-hic | Kc/hi-c/Kc_HiC_CKDL220020122-1A_HCWYNDSX5_L1_1.fq.gz | Kc/hi-c/Kc_HiC_CKDL220020122-1A_HCWYNDSX5_L1_2.fq.gz |
Biosample: SAMN07559306
QC | QC | Run | Run | |
ID | Pores | Date | Pores | Date |
---|---|---|---|---|
FAL29342 | 1573 | 2019-08-13 | 1536 | 2019-08-26 |
FAL22953 | 1181 | 2019-08-13 | 1161 | 2019-08-26 |
FAL18435 | 1282 | 2019-08-13 | 1253 | 2019-08-26 |
FAL18647 | 1270 | 2019-08-13 | 1210 | 2019-08-26 |
FAL29558 | 1368 | 2019-08-13 | 1362 | 2019-08-26 |
ID | Directory |
---|---|
FAL29342 | kd/nanopore/20190826_1854_GA10000_FAL29342_89e34370/ |
FAL22953 | kd/nanopore/20190826_1854_GA20000_FAL22953_79be28df/ |
FAL18435 | kd/nanopore/20190826_1854_GA30000_FAL18435_98887108/ |
FAL18647 | kd/nanopore/20190826_1854_GA40000_FAL18647_98f13006/ |
FAL29558 | kd/nanopore/20190826_1854_GA50000_FAL29558_fd9e83e6/ |
ROOT=$(git rev-parse --show-toplevel)
ml singularity
abyss-fac () {
singularity exec -B$ROOT \
/apps/singularity-3/abyss/abyss-v2.1.5-7-deb_cv1.sif \
/usr/lib/abyss/abyss-fac $@
}
for name in "${!flowcells[@]}"; do
abyss-fac $DIR/${flowcells[$name]}/fastq_pass.fq.gz
done
name | n | n:500 | L50 | min | N75 | N50 | N25 | E-size | max | sum |
---|---|---|---|---|---|---|---|---|---|---|
FAL29342 | 3309350 | 3211137 | 713370 | 500 | 4115 | 7720 | 12006 | 8683 | 81964 | 16.96e9 |
FAL22953 | 2013654 | 1945000 | 438810 | 500 | 4756 | 8569 | 12689 | 9390 | 74154 | 11.17e9 |
FAL18435 | 2630147 | 2566886 | 604515 | 500 | 4446 | 7996 | 11762 | 8732 | 85075 | 14.27e9 |
FAL18647 | 3139279 | 3066621 | 832101 | 500 | 4585 | 6987 | 9652 | 7577 | 158915 | 16.37e9 |
FAL29558 | 2874891 | 2794287 | 696116 | 500 | 4371 | 7130 | 10202 | 7773 | 84269 | 14.35e9 |
Two runs (0 and 2) had an issue, copied below, with read splitting. Removed the flag and reran just those.
[guppy/error] pipeline::ThreadedNode::worker_function: Exception thrown in dataHandlerNode worker thread: Basecall telemetry received for non-existent run segment of run ID
ml singularity
ROOT=$(git rev-parse --show-toplevel)
guppy () {
singularity exec -B$ROOT ~/guppy-gpu.sif guppy_basecaller "$@"
}
id="${flowcells[$SLURM_ARRAY_TASK_ID]}"
SCRATCH=/local/scratch/tony.arick/$SLURM_JOB_ID
cp $DIR/$id/fast5_pass.tar $DIR/$id/fast5_fail.tar $SCRATCH
mkdir $SCRATCH/fast5/ $SCRATCH/super
tar -C $SCRATCH/fast5 -xvf $SCRATCH/fast5_pass.tar
tar -C $SCRATCH/fast5 -xvf $SCRATCH/fast5_fail.tar
if [ -e $DIR/$id/fast5_skip.tar ]; then
cp $DIR/$id/fast5_skip.tar $SCRATCH
tar -C $SCRATCH/fast5 -xvf $SCRATCH/fast5_skip.tar
fi
guppy -c dna_r9.4.1_450bps_sup_plant.cfg \
--compress_fastq \
--device "cuda:all:100%" \
--input_path $SCRATCH/fast5 \
--save_path $SCRATCH/super
cp -r $SCRATCH/super $DIR/$id/sup
tar
files as usualdatadir=("sup/super/" "sup/" "sup/super/" "sup/" "sup/") for i in {0..4}; do id="${flowcells[$i]}" pushd $DIR/$id/ printf "Compressing summary " gzip -c ${datadir[$i]}/sequencing_summary.txt > sup.sequencing_summary.txt.gz echo "[ OK ]" for dir in pass fail; do printf "Pre tar md5sum " find ${datadir[$i]}/$dir -type f -execdir md5sum {} \; > sup.$dir.pre.md5 && echo "[ OK ]" || echo "[FAIL]" printf "Tar files " tar -C ${datadir[$i]}/$dir -cf sup.$dir.tar ./ && echo "[ OK ]" || echo "[FAIL]" printf "Post tar md5sum " tar --to-command=md5sum -vxf sup.$dir.tar | awk 'NR > 2{printf "%s ./%s\n", $2, $1}' \ RS='./' FS="[\n ]" > sup.$dir.post.md5 && echo "[ OK ]" || echo "[FAIL]" printf "Check md5sum " cmp --quiet sup.$dir.{pre,post}.md5 && echo "[ OK ]" || echo "[FAIL]" done printf "Create fastq file " tar -Oxf sup.pass.tar > sup.pass.fq.gz && echo "[ OK ]" || echo "[FAIL]" popd done
- Summary stats
ml singularity ROOT=$(git rev-parse --show-toplevel) abyss-fac () { /apps/singularity-3/abyss/abyss-v2.1.5-7-deb_cv1.sif /usr/lib/abyss/abyss-fac "$@" } abyss-fac $(printf "$DIR/%s/sup.pass.fq.gz " "${flowcells[@]}")
name | n | n:500 | L50 | min | N75 | N50 | N25 | E-size | max | sum |
---|---|---|---|---|---|---|---|---|---|---|
FAL29342 | 2399080 | 2335266 | 524164 | 500 | 4023 | 7523 | 11740 | 8459 | 62543 | 12.16e9 |
FAL22953 | 1588078 | 1496354 | 334293 | 500 | 4183 | 7628 | 11510 | 8414 | 239982 | 7.665e9 |
FAL18435 | 2101459 | 2044849 | 484096 | 500 | 4337 | 7837 | 11525 | 8505 | 122836 | 11.17e9 |
FAL18647 | 2853813 | 2747298 | 728732 | 500 | 4097 | 6425 | 8973 | 6919 | 70679 | 13.24e9 |
FAL29558 | 2622225 | 2509225 | 617130 | 500 | 3922 | 6512 | 9422 | 7105 | 63513 | 11.7e9 |
Total | 11564655 | 11132992 | 2688415 | 4112 | 7185 | 10634 | 7880 | 239982 | 55.94e9 |
- Since lustre doesn’t like lots of small files,
tar
the fast5 and fastq directories. Runmd5sum
pre- and post-tar
for paranoia. Notgzip
‘ing since the data beingtar
‘d is already compressed. Consolidate duplex run into pass/fail fastq files.Base-calling for two of the runs was cut short and restarted. Combining the original call directory (fastq_pass) and the restart directory (fastq_new) into the fastq_pass.tar.gz. The restarted reads start with
fastq_runid_
.for id in "${flowcells[@]}"; do pushd $DIR/$id/ gzip *_sequencing_summary.txt fastq_{pass,fail}/*.fastq for dir in fast{5,q}_{fail,pass} fast5_skip; do [ -d $dir ] || continue find $dir -type f -execdir md5sum {} \; > $dir.pre.md5 tar -C $dir -vcf $dir.tar ./ tar --to-command=md5sum -vxf $dir.tar | awk 'NR > 2{printf "%s ./%s\n", $2, $1}' \ RS='./' FS="[\n ]" > $dir.post.md5 diff $dir.{pre,post}.md5 > $dir.md5.check done if [ -d "fastq_new" ] ; then gzip fastq_new/*.fastq gzip --stdout fastq_new/sequencing_summary.txt >> *_sequencing_summary.txt.gz rm fastq_new/*.log fastq_new/sequencing_summary.txt fastq_new/sequencing_telemetry.js find fastq_new -type f -execdir md5sum {} \; >> fastq_pass.pre.md5 tar -C fastq_new -vrf fastq_pass.tar ./ tar --to-command=md5sum -vxf fastq_pass.tar | awk 'NR > 2{printf "%s ./%s\n", $2, $1}' \ RS='./' FS="[\n ]" > fastq_pass.post.md5 diff fastq_pass.{pre,post}.md5 > fastq_pass.md5.check fi popd done
- The following will get the fastq data from the tar file into a single fastq.gz
file
ROOT=$(git rev-parse --show-toplevel) for dir in "${flowcells[@]}"; do tar -Oxf $DIR/$dir/fastq_pass.tar > $DIR/$dir/fastq_pass.fq.gz done
FAL29558 and FAL18435 had malformed fastq files. Excluding them since we have coverage to spare.
tar -Oxf $DIR/kd/nanopore/20190826_1854_GA50000_FAL29558_fd9e83e6/fastq_pass.tar \ --exclude=./FAL29558_c95ea3ae2ba850cfa310d38dc7f59268d6e2e2b8_628.fastq.gz \ > $DIR/kd/nanopore/20190826_1854_GA50000_FAL29558_fd9e83e6/fastq_pass.fq.gz tar -Oxf $DIR/kd/nanopore/20190826_1854_GA30000_FAL18435_98887108/fastq_pass.tar \ --exclude=./FAL18435_a661f44b2edcc62a3f030caaf8581f84b2fa6838_373.fastq.gz \ > $DIR/kd/nanopore/20190826_1854_GA30000_FAL18435_98887108/fastq_pass.fq.gz
- Add data to git-annex
for id in "${flowcells[@]}"; do pushd $DIR/$id/ git add report.pdf final_summary.txt git annex add *sequencing_summary.txt.gz \ fast?_{pass,fail}.tar \ fastq_pass.fq.gz \ report.md duty_time.csv.gz throughput.csv.gz popd done
Run | InsertSize | Instrument |
---|---|---|
SRR6195037 | 350 | Illumina MiSeq |
SRR6195036 | 350 | Illumina MiSeq |
SRR6195040 | 350 | Illumina HiSeq 2000 |
SRR6195039 | 550 | Illumina MiSeq |
SRR6195038 | 550 | Illumina MiSeq |
SRR6195041 | 550 | Illumina HiSeq 2000 |
cd $DIR/kd/illumina/
for SRA in "${acc[@]}"; do
URL=ftp://ftp.sra.ebi.ac.uk/vol1/fastq/${SRA:0:6}/
if [ ${#SRA} -gt 9 ]; then
DIR=00${SRA:9}
URL+=${DIR: -3}/
fi
URL+=$SRA
wget $URL/${SRA}_1.fastq.gz
wget $URL/${SRA}_2.fastq.gz
done
Hi-C library made by Phase-Genomics
name | forward | reverse |
---|---|---|
kd-hic | kd/hi-c/kokia_S3HiC_R1.fastq.gz | kd/hi-c/kokia_S3HiC_R2.fastq.gz |
Run | Instrument |
---|---|
SRR6195950 | HiSeq X Ten |
ROOT=$(git rev-parse --show-toplevel)
for name in "${!kc[@]}"; do
file=${kc[$name]}
tar -Oxf $DIR/$file | zcat -f > $DIR/for-ncbi/$name.fq
done
for name in "${!kk[@]}"; do
file=${kk[$name]}
tar -Oxf $DIR/$file | zcat -f > $DIR/for-ncbi/$name.fq
done
for name in "${!kd[@]}"; do
file=${kd[$name]}
cp $DIR/$file/sup.pass.fq.gz $DIR/for-ncbi/$name.fq.gz
done