Skip to content

Latest commit

 

History

History
295 lines (240 loc) · 8.43 KB

readme.org

File metadata and controls

295 lines (240 loc) · 8.43 KB

GBS samples

Cutadapt

Using cutadapt v4.3 to remove illumina adapters

ml python/3.9.2
PATH=$HOME/.local/bin:$PATH

TRUSEQ_FWD=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA
TRUSEQ_REV=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT

parallel --eta --xapply cutadapt \
    -a $TRUSEQ_FWD -A $TRUSEQ_REV \
    -o $DIR/1-filter/{1}.R1.fq.gz -p $DIR/1-filter/{1}.R2.fq.gz \
    $DIR/0-raw/{2} $DIR/0-raw/{3} '>' $DIR/1-filter/{1}.log \
    ::: "${names[@]}" ::: "${fwd[@]}" ::: "${rev[@]}"

Plot log info

library(tidyverse)

row_to_colnames <- function(df) {
  names(df) <- as.character(unlist(df[1,]))
  df[-1,]
}

read.cutadapt <- function(log.file) {
con <- file(log.file)
lines <- readLines(con)
close.connection(con)
sections <- cumsum(grepl("^===", lines))
tabs <- grepl("\t", lines)

data <- lapply(split(lines[tabs], sections[tabs]), str_split_fixed, pattern="\t", n=5) %>%
  lapply(as.data.frame) %>%
  lapply(row_to_colnames) %>%
  setNames(c('Fwd', 'Rev')) %>%
  bind_rows(.id='Read') %>%
  mutate_at(c("length", "count", "expect", "max.err"), as.numeric)

}

log <- list.files("1-filter", ".log", full.names = T) %>%
  setNames(sub(".log", "", basename(.))) %>%
  lapply(read.cutadapt) %>%
  bind_rows(.id="log")

ggplot(log, aes(length, count, color=Read, group=length)) +
  geom_boxplot() +
  scale_x_continuous(trans='reverse', labels=function(x) 150-x) +
  scale_y_continuous(labels=scales::label_number_si()) +
  facet_grid(Read~.) +
  theme_minimal()

Align

Create bwa database for assembly

ROOT=$(git rev-parse --show-toplevel)

ml singularity
bwa () {
    singularity exec -B $ROOT \
        /apps/singularity-3/bwa/bwa-0.7.17.sif bwa $@
}
samtools () {
    singularity exec -B $ROOT \
        /apps/singularity-3/samtools/samtools-v1.9-4-deb_cv1.sif \
        samtools $@
}
fasta=$DIR/0-ref/Kd.contigs.fa
bwa index -a bwtsw $fasta
samtools faidx $fasta

Align to contigs

ROOT=$(git rev-parse --show-toplevel)
name="${names[$SLURM_ARRAY_TASK_ID]}"

ml singularity
bwa () {
    singularity exec -B $ROOT \
        /apps/singularity-3/bwa/bwa-0.7.17.sif bwa $@
}
samtools () {
    singularity exec -B $ROOT \
        /apps/singularity-3/samtools/samtools-v1.9-4-deb_cv1.sif \
        samtools $@
}
PATH=$PATH:$DIR/apps/samblaster-v.0.1.26/

fasta=$DIR/0-ref/Kd.contigs.fa

bwa mem -t 20 $fasta $DIR/1-filter/$name.R1.fq.gz $DIR/1-filter/$name.R2.fq.gz |
    samblaster |
    samtools view -bS |
    samtools sort -m 35G -o $DIR/2-align/$name.bam -

Stats

for i in {0..162}; do
sed -e '/samblaster: -----/d' \
    -e '/samblaster: Pair/,/samblaster: Total/s/samblaster: //p' \
    -n $DIR/slurm-155609_$i.out |
    sed 's/\(Pair\|Both\) /\1_/' |
    awk 'NR == 1 {for(i=2; i < NF; i++) h[i]=$i;}
        NR > 1 {for(i=2; i < NF; i++) print name, $1, h[i], $i;}' \
            name=${names[$i]} OFS="\t"
done > $DIR/2-align/stats.txt

Coverage

ROOT=$(git rev-parse --show-toplevel)
name="${names[$SLURM_ARRAY_TASK_ID]}"

ml singularity
samtools () {
    singularity exec -B $ROOT \
        /apps/singularity-3/samtools/samtools-v1.9-4-deb_cv1.sif \
        samtools $@
}

samtools depth $DIR/2-align/$name.bam > $DIR/2-align/$name.depth

Variant calling

ROOT=$(git rev-parse --show-toplevel)

ml singularity
bcftools () {
    singularity exec -B $ROOT \
        /apps/singularity-3/bcftools/bcftools-1.9.sif \
        bcftools "$@"
}

fasta=$DIR/0-ref/Kd.contigs.fa

bcftools mpileup -Ou -R <(split -nr/${SLURM_ARRAY_TASK_ID}/50 $fasta.fai | cut -f1) -f $fasta \
    $(printf "$DIR/2-align/%s.bam " "${names[@]}") |
    bcftools call -vmO z -o 3-variants/all.${SLURM_ARRAY_TASK_ID}.vcf.gz
ROOT=$(git rev-parse --show-toplevel)

ml singularity
bcftools () {
    singularity exec -B $ROOT \
        /apps/singularity-3/bcftools/bcftools-1.9.sif \
        bcftools "$@"
}

bcftools concat 3-variants/all.{1..50}.vcf.gz > 3-variants/all.combined.vcf
library(tidyverse)
library(SNPRelate)
library(SeqArray)
library(ggrepel)

# Using SeqArray converter instead of SNPRelate's
seqVCF2GDS("3-variants/all.combined.vcf", '3-variants/all.gds')
genofile <- seqOpen('3-variants/all.gds')
# Run PCA using SNPRelate
#snpgdsVCF2GDS(vcf, 'all.gds', method='biallelic.only')
#genofile <- snpgdsOpen('all.gds')
pca <- snpgdsPCA(genofile, autosome.only=F, num.thread=48)
pc.percent <- pca$varprop*100

saveRDS(pca, '3-variants/all.pca.Rdata')

## Grab relevent data
## data <- data.frame(Library.ID = pca$sample.id,
##     EV1 = pca$eigenvect[,1],    # the first eigenvector
##     EV2 = pca$eigenvect[,2],    # the second eigenvector
##     stringsAsFactors = FALSE) %>%
##   mutate(label = basename(Library.ID),
##          shape = sub("_.*", "", label))

## # Plot PCA, labeling outliers
## p <- ggplot(data, aes(x=EV1, y=EV2, color=shape)) +
##   geom_point() +
##   geom_label_repel(aes(label=label)) +
##   scale_shape_manual(values=c(20, 4)) +
##   xlab(sprintf("PC 1 (%0.2f%%)", pc.percent[1])) +
##   ylab(sprintf("PC 2 (%0.2f%%)", pc.percent[2])) +
##   theme_minimal()
## p
library(tidyverse)
library(ggrepel)
library(ggpp)

pca <- readRDS('3-variants/all.pca.Rdata')
pc.percent <- pca$varprop*100

## Grab relevent data
data <- data.frame(Library.ID = pca$sample.id,
    EV1 = pca$eigenvect[,1],    # the first eigenvector
    EV2 = pca$eigenvect[,2],    # the second eigenvector
    stringsAsFactors = FALSE) %>%
  mutate(name = sub(".bam$", "", basename(Library.ID)),
         color = sub("_.*", "", name),
         shape = 'original') %>%
  filter(!name %in% c("Kk_3171", "Kc_A_1_1") )


data[data$name %in% sprintf("Kk_%d", c(3156:3160)), 'color'] <- 'Kc'
data[data$name %in% sprintf("Kk_%d", c(3156:3160)), 'shape'] <- 'change'

data[data$name %in% "SRR6195040", 'color'] <- 'Kd'
data[data$name %in% "SRR6195040", 'shape'] <- 'change'

data[data$name %in% c("Kd_HAVO_1a", "Kd_HAVO_1b", "Kk_3155"), 'label'] <-
     c("Kd_HAVO_1a", "Kd_HAVO_1b", "Kk_3155")

# Plot PCA, labeling outliers
p <- ggplot(data, aes(x=EV1, y=EV2, shape=shape, color=color, label=label)) +
  geom_point() +
  geom_label_repel(max.overlaps = Inf,
                   position = position_nudge_center(x=-0.02, y=0.01)) +
  scale_shape_manual(values=c(4, 20)) +
  guides(shape='none') +
  xlab(sprintf("PC 1 (%0.2f%%)", pc.percent[1])) +
  ylab(sprintf("PC 2 (%0.2f%%)", pc.percent[2])) +
  theme_minimal()

#plotly::ggplotly(p  )

p

all-samples.png