Using cutadapt v4.3 to remove illumina adapters
ml python/3.9.2
PATH=$HOME/.local/bin:$PATH
TRUSEQ_FWD=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA
TRUSEQ_REV=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
parallel --eta --xapply cutadapt \
-a $TRUSEQ_FWD -A $TRUSEQ_REV \
-o $DIR/1-filter/{1}.R1.fq.gz -p $DIR/1-filter/{1}.R2.fq.gz \
$DIR/0-raw/{2} $DIR/0-raw/{3} '>' $DIR/1-filter/{1}.log \
::: "${names[@]}" ::: "${fwd[@]}" ::: "${rev[@]}"
Plot log info
library(tidyverse)
row_to_colnames <- function(df) {
names(df) <- as.character(unlist(df[1,]))
df[-1,]
}
read.cutadapt <- function(log.file) {
con <- file(log.file)
lines <- readLines(con)
close.connection(con)
sections <- cumsum(grepl("^===", lines))
tabs <- grepl("\t", lines)
data <- lapply(split(lines[tabs], sections[tabs]), str_split_fixed, pattern="\t", n=5) %>%
lapply(as.data.frame) %>%
lapply(row_to_colnames) %>%
setNames(c('Fwd', 'Rev')) %>%
bind_rows(.id='Read') %>%
mutate_at(c("length", "count", "expect", "max.err"), as.numeric)
}
log <- list.files("1-filter", ".log", full.names = T) %>%
setNames(sub(".log", "", basename(.))) %>%
lapply(read.cutadapt) %>%
bind_rows(.id="log")
ggplot(log, aes(length, count, color=Read, group=length)) +
geom_boxplot() +
scale_x_continuous(trans='reverse', labels=function(x) 150-x) +
scale_y_continuous(labels=scales::label_number_si()) +
facet_grid(Read~.) +
theme_minimal()
ROOT=$(git rev-parse --show-toplevel)
ml singularity
bwa () {
singularity exec -B $ROOT \
/apps/singularity-3/bwa/bwa-0.7.17.sif bwa $@
}
samtools () {
singularity exec -B $ROOT \
/apps/singularity-3/samtools/samtools-v1.9-4-deb_cv1.sif \
samtools $@
}
fasta=$DIR/0-ref/Kd.contigs.fa
bwa index -a bwtsw $fasta
samtools faidx $fasta
ROOT=$(git rev-parse --show-toplevel)
name="${names[$SLURM_ARRAY_TASK_ID]}"
ml singularity
bwa () {
singularity exec -B $ROOT \
/apps/singularity-3/bwa/bwa-0.7.17.sif bwa $@
}
samtools () {
singularity exec -B $ROOT \
/apps/singularity-3/samtools/samtools-v1.9-4-deb_cv1.sif \
samtools $@
}
PATH=$PATH:$DIR/apps/samblaster-v.0.1.26/
fasta=$DIR/0-ref/Kd.contigs.fa
bwa mem -t 20 $fasta $DIR/1-filter/$name.R1.fq.gz $DIR/1-filter/$name.R2.fq.gz |
samblaster |
samtools view -bS |
samtools sort -m 35G -o $DIR/2-align/$name.bam -
for i in {0..162}; do
sed -e '/samblaster: -----/d' \
-e '/samblaster: Pair/,/samblaster: Total/s/samblaster: //p' \
-n $DIR/slurm-155609_$i.out |
sed 's/\(Pair\|Both\) /\1_/' |
awk 'NR == 1 {for(i=2; i < NF; i++) h[i]=$i;}
NR > 1 {for(i=2; i < NF; i++) print name, $1, h[i], $i;}' \
name=${names[$i]} OFS="\t"
done > $DIR/2-align/stats.txt
ROOT=$(git rev-parse --show-toplevel)
name="${names[$SLURM_ARRAY_TASK_ID]}"
ml singularity
samtools () {
singularity exec -B $ROOT \
/apps/singularity-3/samtools/samtools-v1.9-4-deb_cv1.sif \
samtools $@
}
samtools depth $DIR/2-align/$name.bam > $DIR/2-align/$name.depth
ROOT=$(git rev-parse --show-toplevel)
ml singularity
bcftools () {
singularity exec -B $ROOT \
/apps/singularity-3/bcftools/bcftools-1.9.sif \
bcftools "$@"
}
fasta=$DIR/0-ref/Kd.contigs.fa
bcftools mpileup -Ou -R <(split -nr/${SLURM_ARRAY_TASK_ID}/50 $fasta.fai | cut -f1) -f $fasta \
$(printf "$DIR/2-align/%s.bam " "${names[@]}") |
bcftools call -vmO z -o 3-variants/all.${SLURM_ARRAY_TASK_ID}.vcf.gz
ROOT=$(git rev-parse --show-toplevel)
ml singularity
bcftools () {
singularity exec -B $ROOT \
/apps/singularity-3/bcftools/bcftools-1.9.sif \
bcftools "$@"
}
bcftools concat 3-variants/all.{1..50}.vcf.gz > 3-variants/all.combined.vcf
library(tidyverse)
library(SNPRelate)
library(SeqArray)
library(ggrepel)
# Using SeqArray converter instead of SNPRelate's
seqVCF2GDS("3-variants/all.combined.vcf", '3-variants/all.gds')
genofile <- seqOpen('3-variants/all.gds')
# Run PCA using SNPRelate
#snpgdsVCF2GDS(vcf, 'all.gds', method='biallelic.only')
#genofile <- snpgdsOpen('all.gds')
pca <- snpgdsPCA(genofile, autosome.only=F, num.thread=48)
pc.percent <- pca$varprop*100
saveRDS(pca, '3-variants/all.pca.Rdata')
## Grab relevent data
## data <- data.frame(Library.ID = pca$sample.id,
## EV1 = pca$eigenvect[,1], # the first eigenvector
## EV2 = pca$eigenvect[,2], # the second eigenvector
## stringsAsFactors = FALSE) %>%
## mutate(label = basename(Library.ID),
## shape = sub("_.*", "", label))
## # Plot PCA, labeling outliers
## p <- ggplot(data, aes(x=EV1, y=EV2, color=shape)) +
## geom_point() +
## geom_label_repel(aes(label=label)) +
## scale_shape_manual(values=c(20, 4)) +
## xlab(sprintf("PC 1 (%0.2f%%)", pc.percent[1])) +
## ylab(sprintf("PC 2 (%0.2f%%)", pc.percent[2])) +
## theme_minimal()
## p
library(tidyverse)
library(ggrepel)
library(ggpp)
pca <- readRDS('3-variants/all.pca.Rdata')
pc.percent <- pca$varprop*100
## Grab relevent data
data <- data.frame(Library.ID = pca$sample.id,
EV1 = pca$eigenvect[,1], # the first eigenvector
EV2 = pca$eigenvect[,2], # the second eigenvector
stringsAsFactors = FALSE) %>%
mutate(name = sub(".bam$", "", basename(Library.ID)),
color = sub("_.*", "", name),
shape = 'original') %>%
filter(!name %in% c("Kk_3171", "Kc_A_1_1") )
data[data$name %in% sprintf("Kk_%d", c(3156:3160)), 'color'] <- 'Kc'
data[data$name %in% sprintf("Kk_%d", c(3156:3160)), 'shape'] <- 'change'
data[data$name %in% "SRR6195040", 'color'] <- 'Kd'
data[data$name %in% "SRR6195040", 'shape'] <- 'change'
data[data$name %in% c("Kd_HAVO_1a", "Kd_HAVO_1b", "Kk_3155"), 'label'] <-
c("Kd_HAVO_1a", "Kd_HAVO_1b", "Kk_3155")
# Plot PCA, labeling outliers
p <- ggplot(data, aes(x=EV1, y=EV2, shape=shape, color=color, label=label)) +
geom_point() +
geom_label_repel(max.overlaps = Inf,
position = position_nudge_center(x=-0.02, y=0.01)) +
scale_shape_manual(values=c(4, 20)) +
guides(shape='none') +
xlab(sprintf("PC 1 (%0.2f%%)", pc.percent[1])) +
ylab(sprintf("PC 2 (%0.2f%%)", pc.percent[2])) +
theme_minimal()
#plotly::ggplotly(p )
p