Skip to content

Latest commit

 

History

History
888 lines (707 loc) · 26.8 KB

readme.org

File metadata and controls

888 lines (707 loc) · 26.8 KB

Compare Genomes

Namebraker gtfFasta
Kocoo2-annotation/Kocoo.gtf.gz1-assembly/6-ncbi/Kc.fa.gz
Kodry2-annotation/Kodry.gtf.gz1-assembly/6-ncbi/Kd.fa.gz
Kokau2-annotation/Kokau.gtf.gz1-assembly/6-ncbi/Kk.fa.gz
Gokir2-annotation/Gokir.gtf.gz1-assembly/0-ref/kirkii.fa.gz

Genespace (all)

  • Get bed files
    ROOT=$(git rev-parse --show-toplevel)
    
    for name in "${!anno[@]}"; do
        read gtf fasta <<<"${anno[$name]}"
        zcat $ROOT/wgs/$gtf |
            grep '^K' |
            awk '$3 == "transcript" {print $1,$4,$5,$9}' \
                FS="\t" OFS="\t" > $DIR/1-genespace/bed/$name.bed
    done
        

  • Run Genescape
    library(GENESPACE)
    
    gpar <- init_genespace(
          wd = file.path(DIR, "1-genespace") ,
          genomeIDs=c("Kocoo", "Kodry", "Kokau", "Gokir"), 
          path2mcscanx = file.path(DIR, "apps", "MCScanX"),
          path2orthofinder = file.path(DIR, "apps", "orthofinder"),
          path2diamond = file.path(DIR, "apps", "diamond"),
          nCores = 48)
    
    out <- run_genespace(gpar)
    
        
  • Get Pangenome
    library(GENESPACE)
    
    load(file.path(DIR, "1-genespace", "results", "gsParams.rda"),
         verbose = TRUE)
    
    data.table::fwrite(file = file.path(DIR, "1-genespace", "Gokir_syn.txt"),
           syntenic_pangenes(gsParam = gsParam,
                             refGenome = "Gokir",
                             maxPlacePerChr = 2,
                             minPropInterp2keep = 0.75),
           row.names = F, quote = F, sep = '\t')
    
    data.table::fwrite(file = file.path(DIR, "1-genespace", "Gokir_pan.txt"),
           query_pangenes(gsParam = gsParam,
                          refGenome = "Gokir",
                          showArrayMem=F,
                          showNSOrtho=F),
           row.names = F, quote = F, sep = '\t')
    
        
  • Fix riparian plot
    library(ggplot2)
    library(GENESPACE)
    
    load(file.path(DIR, "1-genespace", "results", "gsParams.rda"),
         verbose = TRUE)
    
    invchr <- data.frame(genome=rep("Kodry", 7),
                         chr=sprintf("Kd_%02d", c(3,6:11)))
    
    ## plot_riparian(gsParam = gsParam,
    ##               invertTheseChrs=invchr,
    ##               genomeIDs = c("Gokir", "Kokau", "Kodry", "Kocoo"),
    ##               refGenome = "Gokir",
    ##               useOrder = FALSE,
    ##               useRegions = FALSE,
    ##               reorderBySynteny = TRUE,
    ##               syntenyWeight = 1,
    ##               pdfFile = file.path(DIR, "1-genespace","Gokir_def.pdf"))
    
    ##custom colour & background:
    ggthemes <- theme(panel.background = element_rect(fill = "white"))
    customPal <- colorRampPalette(c("darkorange", "skyblue", "darkblue",
                                    "purple", "darkred", "salmon"))
    plot_riparian(gsParam = gsParam,
                  invertTheseChrs=invchr,
                  palette = customPal,
                  braidAlpha = .75,
                  chrFill = "lightgrey",
                  addThemes = ggthemes,
                  customRefChrOrder = c("KI_01", "KI_2_4",
                                        sprintf("KI_%02d", c(3,5:13))),
                  genomeIDs = c("Gokir", "Kokau", "Kodry", "Kocoo"),
                  refGenome = "Gokir",
                  pdfFile = file.path(DIR, "1-genespace","Gokir_col.pdf"))
    
    
        
DIR=/90daydata/gbru_kokia/Kokia/wgs/4-compare
RES=$DIR/1-genespace/orthofinder/Results_Mar21

sed -n -e 1p -e 5p \
    $RES/Comparative_Genomics_Statistics/Statistics_PerSpecies.tsv |
    column -t -s$'\t' |
    tr -d '\r'

sed -n -e 7p -e 17p -e 18p\
    $RES/Comparative_Genomics_Statistics/Statistics_Overall.tsv |
    column -t -s$'\t' |
    tr -d '\r'


echo ""
sed -n -e 8p \
    $RES/Comparative_Genomics_Statistics/Statistics_Overall.tsv |
    column -t -s$'\t' |
    tr -d '\r'

sed -n -e 1p -e 9,11p \
    $RES/Comparative_Genomics_Statistics/Statistics_PerSpecies.tsv |
    column -t -s$'\t' |
    tr -d '\r'

Genespace (Kokia)

  • Get bed files
    ROOT=$(git rev-parse --show-toplevel)
    
    for name in "${!anno[@]}"; do
        read gtf fasta <<<"${anno[$name]}"
        zcat $ROOT/wgs/$gtf |
            grep -E '^K[cdk]_([01][0-9]|2_4)' |
            awk '$3 == "transcript" {print $1,$4,$5,$9}' \
                FS="\t" OFS="\t" > $DIR/1-genespace-kokia/bed/$name.bed
    done
        

  • Run Genescape
    library(GENESPACE)
    
    gpar <- init_genespace(
          wd = file.path(DIR, "1-genespace-kokia") ,
          genomeIDs=c("Kocoo", "Kodry", "Kokau"),
          path2mcscanx = file.path(DIR, "apps", "MCScanX"),
          path2orthofinder = file.path(DIR, "apps", "orthofinder"),
          path2diamond = file.path(DIR, "apps", "diamond"),
          nCores = 48)
    
    out <- run_genespace(gpar)
    
     save(c(gpar, out), file = "1-genespace-kokia/out.RData")
    
        
DIR=/90daydata/gbru_kokia/Kokia/wgs/4-compare
RES=$DIR/1-genespace-kokia/orthofinder/Results_Mar22

sed -n -e 1p -e 3,6p \
    $RES/Comparative_Genomics_Statistics/Statistics_PerSpecies.tsv |
    column -t -s$'\t' |
    tr -d '\r'

sed -n -e 7p -e 17p -e 18p\
    $RES/Comparative_Genomics_Statistics/Statistics_Overall.tsv |
    column -t -s$'\t' |
    tr -d '\r'


echo ""
sed -n -e 8p \
    $RES/Comparative_Genomics_Statistics/Statistics_Overall.tsv |
    column -t -s$'\t' |
    tr -d '\r'

sed -n -e 1p -e 9,11p \
    $RES/Comparative_Genomics_Statistics/Statistics_PerSpecies.tsv |
    column -t -s$'\t' |
    tr -d '\r'

Hi-C

Mapping the hi-c libraries of the three kokia samples to the four genomes to validate SV found in genespace.

NameForwardReverse
KocooKc/hi-c/Kc_HiC_CKDL220020122-1A_HCWYNDSX5_L1_1.fq.gzKc/hi-c/Kc_HiC_CKDL220020122-1A_HCWYNDSX5_L1_2.fq.gz
Kodrykd/hi-c/kokia_S3HiC_R1.fastq.gzkd/hi-c/kokia_S3HiC_R2.fastq.gz
KokauKk/hi-c/Kk_HiC_CKDL220020123-1A_HCWYNDSX5_L1_1.fq.gzKk/hi-c/Kk_HiC_CKDL220020123-1A_HCWYNDSX5_L1_2.fq.gz
  • Create bwa database for assembly
    ROOT=$(git rev-parse --show-toplevel)
    PATH=$DIR/apps/bwa-0.7.17:$PATH
    
    names=(${!anno[@]})
    name=${names[$SLURM_ARRAY_TASK_ID]}
    readarray -t lib <<<"${anno[$name]}"
    gtf=${lib[0]}
    fasta=${lib[1]}
    
    bwa index -a bwtsw -p $DIR/2-hic/0-db/$name $ROOT/wgs/$fasta
        
  • Align Hi-C data to assembly
    ROOT=$(git rev-parse --show-toplevel)
    RAW=$(realpath $ROOT/wgs/0-raw/)
    
    PATH=$DIR/apps/bwa-0.7.17:$PATH
    PATH=$DIR/apps/samtools-1.17/bin:$PATH
    PATH=$PATH:$DIR/apps/samblaster-v.0.1.26/
    
    line=$(sed -n ${SLURM_ARRAY_TASK_ID}p <<<"$data")
    read name fwd rev <<<"$line"
    
    bwa mem -5SP -t 48 $DIR/2-hic/0-db/$DB $RAW/$fwd $RAW/$rev |
        samblaster |
        samtools view -bS -F 2316 |
        samtools sort -m 60G -o $DIR/2-hic/1-bwa/$DB-$name.bam
        
  • Graph
    ml gd/
    PATH=$DIR/apps/hic-viz:$PATH
    PATH=$DIR/apps/samtools-1.17/bin:$PATH
    
    for name in {Gokir,Kocoo,Kodry,Kokau}-{Kocoo,Kodry,Kokau}; do
        bam=$DIR/2-hic/1-bwa/$name.bam
        echo hic-viz $bam '>' $DIR/2-hic/2-viz/$name.png
    done | parallel
        
ROOT=$(git rev-parse --show-toplevel)
DIR=$ROOT/wgs/4-compare/
PATH=$DIR/apps/hic-viz:$PATH

hic-viz -f /usr/share/fonts/dejavu-sans-fonts/DejaVuSans.ttf \
    -b 750 -s 4 \
    -m 500 \
    -r <(echo Kc_{01,2_4,03} Kc_{05..13}) \
    $DIR/2-hic/1-bwa/Kocoo-Kocoo.bam > $DIR/2-hic/Kocoo.png
ROOT=$(git rev-parse --show-toplevel)
DIR=$ROOT/wgs/4-compare/
PATH=$DIR/apps/hic-viz:$PATH

hic-viz -f /usr/share/fonts/dejavu-sans-fonts/DejaVuSans.ttf \
    -b 750 -s 4 \
    -m 1000 \
    -r <(echo Kd_{01,2_4,03} Kd_{05..13}) \
    $DIR/2-hic/1-bwa/Kodry-Kodry.bam > $DIR/2-hic/Kodry.png
ROOT=$(git rev-parse --show-toplevel)
DIR=$ROOT/wgs/4-compare/
PATH=$DIR/apps/hic-viz:$PATH

hic-viz -f /usr/share/fonts/dejavu-sans-fonts/DejaVuSans.ttf \
    -b 750 -s 4 \
    -m 2000 \
    -r <(echo Kk_{01,2_4,03} Kk_{05..13}) \
    $DIR/2-hic/1-bwa/Kokau-Kokau.bam > $DIR/2-hic/Kokau.png

BUSCO

  • Get eudicot lineage
    LINEAGEURL=https://busco-data.ezlab.org/v5/data/lineages/
    wget -O- --no-check $LINEAGEURL/eudicots_odb10.2024-01-08.tar.gz |
          tar -xz -C $DIR/3-busco
        
  • Run all
    PATH=$DIR/apps/gffread-0.12.7.Linux_x86_64/:$PATH
    ROOT=$(git rev-parse --show-toplevel)
    ml singularity
    
    busco () {
        singularity exec -B $DIR \
        $DIR/apps/busco-v5.5.0_cv1 \
        busco "$@"
    }
    
    names=(${!anno[@]})
    name=${names[$SLURM_ARRAY_TASK_ID]}
    readarray -t lib <<<"${anno[$name]}"
    gtf=${lib[0]}
    fasta=${lib[1]}
    
    SCRATCH=/local/scratch/tony.arick/$SLURM_JOB_ID/
    zcat $ROOT/wgs/$fasta > $SCRATCH/$name.genome.fa
    zcat $ROOT/wgs/$gtf |
        grep '^K' > $SCRATCH/$name.gtf
    
    gffread -J \
        -w $SCRATCH/$name.trans.fa \
        -y $SCRATCH/$name.protein.fa \
        -g $SCRATCH/$name.genome.fa \
        $SCRATCH/$name.gtf
    
    for type in genome trans protein; do
        mkdir $SCRATCH/$type
        cd $SCRATCH/$type
    
        busco -i $SCRATCH/$name.$type.fa \
            -l $DIR/3-busco/eudicots_odb10 \
            -m $type  \
            -o $name \
            -c 48
    
        tar -C $SCRATCH/$type/$name -cf $DIR/3-busco/$name.$type.tar \
            short_summary.specific.eudicots_odb10.$name.txt \
            short_summary.specific.eudicots_odb10.$name.json \
            logs \
            run_eudicots_odb10
    done
        
  • stats
    for i in Kocoo Kodry Kokau Gokir; do
        type=trans
          tar -O -xf 3-busco/$i.$type.tar \
            short_summary.specific.eudicots_odb10.$i.txt
    done | less
        
  • Graph
    for i in Kocoo Kodry Kokau Gokir; do
        for type in genome trans protein; do
          tar -O -xf 3-busco/$i.$type.tar \
            short_summary.specific.eudicots_odb10.$i.json \
            > $i.$type.json;
      done
    done
        
    library(tidyverse)
    library(rjson)
    library(cowplot)
    
    
    plots <- lapply(c("Genome", "Protein"),
                    function(mode){
    
                      data <- paste(c("Gokir", "Kocoo", "Kodry", "Kokau"),
                                    tolower(mode), "json", sep=".") %>%
                        setNames(sub(".json", "", .)) %>%
                        lapply(function (file) fromJSON(file=file)$results) %>%
                        lapply(as.data.frame) %>%
                        bind_rows(.id="Species.type") %>%
                        separate(Species.type, into=c("Species", "Mode")) %>%
                        select(Species, label=one_line_summary,
                               Single.copy, Multi.copy, Fragmented, Missing) %>%
                        mutate(Species = factor(Species, c("Gokir", "Kocoo", "Kodry", "Kokau"),
                                                c("Gossypioides kirkii",
                                                  "Kokia cookei",
                                                  "Kokia drynarioides",
                                                  "Kokia kauaiensis"))) %>%
                        gather(-Species, -label, key="key", value="value")
    
                      labels <- select(data, Species, label)
    
                      ggplot(data) +
                        geom_col(aes(value, Species, fill=key)) +
                        geom_text(aes(0, Species, label=label), labels, hjust=-0.01) +
                        scale_fill_manual(values = c('#33a02c','#b2df8a',
                                                     '#fdbf6f', '#fb9a99'),
                                          name = element_blank(),
                                          breaks = c("Single.copy",
                                                     "Multi.copy",
                                                     "Missing",
                                                     "Fragmented"),
                                          labels = c("Single Copy",
                                                     "Duplicated",
                                                     "Missing",
                                                     "Fragmented")) +
                        scale_x_continuous(expand=c(0,0)) +
                        ggtitle(mode) +
                        theme_minimal() +
                        theme(axis.title = element_blank(),
                              axis.text.x = element_blank(),
                              legend.position="none",
                              axis.text.y = element_text(face="italic"))
                    })
    
    plots[[3]] = get_legend(plots[[1]] + theme(legend.position="bottom"))
    plots[[2]] = plots[[2]] + ggtitle("Annotation");
    plot_grid(plotlist = plots, rel_heights=c(1,1,0.3), ncol=1)
    
    ggsave("busco.all.png", width=7, height=3, bg="white")
        

    ./busco.all.png

    library(tidyverse)
    library(rjson)
    
    
    paste(c("Gokir", "Kocoo", "Kodry", "Kokau"),
                  "genome", "json", sep=".") %>%
      setNames(sub(".json", "", .)) %>%
      lapply(function (file) fromJSON(file=file)$results) %>%
      lapply(as.data.frame) %>%
      bind_rows(.id="Species.type") %>%
      separate(Species.type, into=c("Species", "Mode")) %>%
      select(Species, Number.of.contigs, Total.length, Percent.gaps, Contigs.N50) %>%
      mutate(Species = factor(Species, c("Gokir", "Kocoo", "Kodry", "Kokau"),
                              c("Gossypioides kirkii",
                                "Kokia cookei",
                                "Kokia drynarioides",
                                "Kokia kauaiensis")))
    
        

There seems to be a large loss in Kc_11. Validating it by aligning Kc reads and contigs to Kd and Kk

ROOT=$(git rev-parse --show-toplevel)

for file in "${runs[@]}"; do
    tar -Oxf $ROOT/wgs/0-raw/$file
done | zcat -f > $TMPDIR/kc.combined.fq

PATH=$DIR/apps/samtools-1.20/bin/:$PATH
PATH=$DIR/apps/minimap2-2.28_x64-linux/:$PATH

minimap2 -ax map-ont -t $SLURM_CPUS_PER_TASK \
    $ROOT/wgs/1-assembly/6-ncbi/Kk.fa.gz \
    $TMPDIR/kc.combined.fq |
    samtools view -Sb - |
    samtools sort -m300G -o $DIR/4-validate/Kk.bam -

samtools index $DIR/4-validate/Kk.bam

minimap2 -ax map-ont -t $SLURM_CPUS_PER_TASK \
    $ROOT/wgs/1-assembly/6-ncbi/Kd.fa.gz \
    $TMPDIR/kc.combined.fq |
    samtools view -Sb - |
    samtools sort -m300G -o $DIR/4-validate/Kd.bam -

samtools index $DIR/4-validate/Kd.bam
ROOT=$(git rev-parse --show-toplevel)

PATH=$DIR/apps/samtools-1.20/bin/:$PATH
PATH=$DIR/apps/bedtools-2.31.1/bin/:$PATH

for spec in Kd Kk; do
    zcat $ROOT/wgs/1-assembly/6-ncbi/$spec.fa.gz \
        > $TMPDIR/$spec.fa
    samtools faidx $TMPDIR/$spec.fa

    bedtools makewindows -g $TMPDIR/$spec.fa.fai \
        -w 20000 -s 10000 |
        samtools bedcov /dev/stdin \
            $DIR/4-validate/$spec.bam \
            > $DIR/4-validate/$spec.cov
done
library(tidyverse)
library(scales)

data <- list.files("4-validate", ".cov", full.names = T) %>%
  lapply(read.delim, header = F,
         col.names = c("Chr", "Start", "End", "Coverage")) %>%
  bind_rows %>%
  filter(Chr %in% paste(rep(c("Kd", "Kk"), each=12),
      rep(c("01", "2_4", "03", "05", "06",
            "07", "08", "09", 10:13), 2),
      sep="_")) %>%
  separate(Chr, into=c("Ref", "ChrNum"), sep=c(3)) %>%
  mutate(Coverage=ifelse(Coverage > 2000000, NA, Coverage),
         y= -1 * Start,
         ChrNum = factor(ChrNum, c("01", "2_4", "03", "05", "06",
            "07", "08", "09", 10:13)))
head(data)


label_number_chr <- function(x) {
  label_number(scale=1, scale_cut=cut_si("bp"))(-1 * x)
}
ggplot(data, aes(Ref, y, fill=Coverage)) +
  geom_tile(width = 0.75) +
  scale_y_continuous(labels = label_number_chr,
                     expand = c(0,0)) +
  facet_grid(cols = vars(ChrNum)) +
  theme_minimal() +
  theme(axis.text.x = element_blank(),
        axis.title = element_blank())
ROOT=$(git rev-parse --show-toplevel)
SCRATCH=/local/scratch/tony.arick/$SLURM_JOB_ID/

ml minimap2/2.17

minimap2 -x asm20 -t 48 \
    $ROOT/wgs/1-assembly/6-ncbi/Kk.fa.gz \
    $ROOT/wgs/1-assembly/6-ncbi/Kc.fa.gz \
    > $DIR/4-validate/Kc-vs-Kk.paf

minimap2 -x asm20 -t 48 \
    $ROOT/wgs/1-assembly/6-ncbi/Kd.fa.gz \
    $ROOT/wgs/1-assembly/6-ncbi/Kc.fa.gz \
    > $DIR/4-validate/Kc-vs-Kd.paf

Assembly stats

PATH=$DIR/apps/samtools-1.17/bin/:$PATH
ROOT=$(git rev-parse --show-toplevel)
ml python

for name in "${!anno[@]}"; do
    readarray -t lib <<<"${anno[$name]}"
    gtf=${lib[0]}
    fasta=${lib[1]}

    SCRATCH=/local/scratch/tony.arick/$SLURM_JOB_ID/
    zcat $ROOT/wgs/$fasta > $SCRATCH/$name.fa

    samtools faidx $SCRATCH/$name.fa
    cut -f 1,2 $SCRATCH/$name.fa.fai > $DIR/5-stats/$name.size

    echo $name
    cut -f 1 $SCRATCH/$name.fa.fai | head -12 |
    samtools faidx -r - $SCRATCH/$name.fa |
        tr -dc N |
        wc -c

    assembly_stats $SCRATCH/$name.fa > $DIR/5-stats/$name.json
done

number of Ns

Gokir28700
Kodry76800
Kokau65300
Kocoo88900

Merqury

Get data for Kd

for SRA in SRR61950{36..41}; do
    URL=ftp://ftp.sra.ebi.ac.uk/vol1/fastq/${SRA:0:6}/

    if [ ${#SRA} -gt 9 ]; then
        ACC=00${SRA:9}
        URL+=${ACC: -3}/
    fi

    URL+=$SRA

    wget -O $DIR/5-merqury/${SRA}_1.fastq.gz $URL/${SRA}_1.fastq.gz
    wget -O $DIR/5-merqury/${SRA}_2.fastq.gz $URL/${SRA}_2.fastq.gz
done

Using the nanopore data for Kc and Kk; however, this isn’t ideal.

ml singularity
singularity exec $ROOT/wgs/4-compare/apps/merqury-1.3.sif \
    /usr/local/share/merqury/best_k.sh 580000000
ROOT=$(git rev-parse --show-toplevel)
PATH=$DIR/apps/meryl-1.4.1/bin/:$PATH

# for file in "${kc_runs[@]}"; do
#     tar -Oxf $ROOT/wgs/0-raw/$file
# done | zcat -f > $TMPDIR/kc.combined.fq

# meryl count k=21 $TMPDIR/kc.combined.fq \
#     output $DIR/5-merqury/kc.reads.meryl
# rm $TMPDIR/kc.combined.fq

# for file in "${kk_runs[@]}"; do
#     tar -Oxf $ROOT/wgs/0-raw/$file
# done | zcat -f > $TMPDIR/kk.combined.fq

# meryl count k=21 $TMPDIR/kk.combined.fq \
#     output $DIR/5-merqury/kk.reads.meryl
# rm $TMPDIR/kk.combined.fq

for name in "${!kd_runs[@]}"; do
    meryl count k=21 $ROOT/wgs/0-raw/${kd_runs[$name]}/sup.pass.fq.gz \
        output $TMPDIR/$name.meryl
done
meryl union-sum output $DIR/5-merqury/kd.reads.nano.meryl \
    $(printf "$TMPDIR/%s.meryl " "${!kd_runs[@]}")


# for SRA in SRR61950{36..41}; do
#     meryl count k=21 $DIR/5-merqury/${SRA}_1.fastq.gz \
#         output $TMPDIR/${SRA}_1.meryl
#     meryl count k=21 $DIR/5-merqury/${SRA}_2.fastq.gz \
#         output $TMPDIR/${SRA}_2.meryl
# done
# meryl union-sum output $DIR/5-merqury/kd.reads.meryl \
#     $TMPDIR/SRR61950{36..41}_{1,2}.meryl
ROOT=$(git rev-parse --show-toplevel)
PATH=$DIR/apps/samtools-1.20/bin/:$PATH
PATH=$DIR/apps/bedtools-2.31.1/bin/:$PATH
PATH=$DIR/apps/meryl-1.4.1/bin/:$PATH
export MERQURY=$DIR/apps/merqury/

cd $DIR/5-merqury/

# zcat $ROOT/wgs/1-assembly/6-ncbi/Kc.fa.gz > $TMPDIR/Kc.fa
# $MERQURY/merqury.sh \
#     $DIR/5-merqury/kc.reads.meryl \
#     $TMPDIR/Kc.fa \
#     Kc

# zcat $ROOT/wgs/1-assembly/6-ncbi/Kk.fa.gz > $TMPDIR/Kk.fa
# $DIR/apps/merqury/merqury.sh \
#     $DIR/5-merqury/kk.reads.meryl \
#     $TMPDIR/Kk.fa \
#     Kk

zcat $ROOT/wgs/1-assembly/6-ncbi/Kd.fa.gz > $TMPDIR/Kd.fa
# $DIR/apps/merqury/merqury.sh \
#     $DIR/5-merqury/kd.reads.meryl \
#     $TMPDIR/Kd.fa \
#     Kd

$DIR/apps/merqury/merqury.sh \
    $DIR/5-merqury/kd.reads.nano.meryl \
    $TMPDIR/Kd.fa \
    Kd.nano

Completeness

SpeciesCompleteness
Kcall37019311438857067995.2705
Kd-illuminaall37347960938435105397.1715
Kd-nanoporeall37166088239067925195.132
Kkall37643291640002133494.1032

QV

SpeciesQV ScoreError Rate
Kc84449356100004441.44277.17341e-05
Kd-illumina273363455223695736.26580.000236277
Kd-nanopore24678055223695746.71942.12842e-05
Kk71892855584095742.10226.16287e-05

LAI

ROOT=$(git rev-parse --show-toplevel)
ml singularity

export BLAST_USAGE_REPORT=false

name="${names[$SLURM_ARRAY_TASK_ID]}"
readarray -t lib <<<"${anno[$name]}"
gtf=${lib[0]}
fasta=${lib[1]}

cd $TMPDIR
genome=$TMPDIR/$name.fa

zcat $ROOT/wgs/$fasta |
    sed '/>/s/_unplaced_/_Z/'> $genome

singularity exec -B $ROOT -B $TMPDIR \
        $DIR/apps/tetools-1.88.5.sif \
        LTRPipeline --debug --threads 80 $genome

tar -C $TMPDIR -cf $DIR/6-lai/$name.tar \
    --transform='s#^./##' .
ROOT=$(git rev-parse --show-toplevel)
ml singularity

export BLAST_USAGE_REPORT=false

name="${names[$SLURM_ARRAY_TASK_ID]}"
readarray -t lib <<<"${anno[$name]}"
gtf=${lib[0]}
fasta=${lib[1]}

cd $TMPDIR
genome=$TMPDIR/$name.fa

zcat $ROOT/wgs/$fasta |
    sed '/>/s/_unplaced_/_Z/'> $genome

tar -C $TMPDIR -Oxf $DIR/6-lai/$name.tar */seq.fa.LTRlib.fa \
    > $TMPDIR/repeats.fa
tar -C $TMPDIR -Oxf $DIR/6-lai/$name.tar */seq.fa.pass.list \
    > $TMPDIR/pass.list

singularity exec -B $ROOT -B $TMPDIR \
        $DIR/apps/tetools-1.88.5.sif \
        RepeatMasker -pa 80 \
        -lib $TMPDIR/repeats.fa \
        $TMPDIR/$name.fa

singularity exec -B $ROOT -B $TMPDIR \
        $DIR/apps/tetools-1.88.5.sif \
        /opt/LTR_retriever/LAI -genome $genome \
        -intact $TMPDIR/pass.list \
        -all $genome.out

tar -C $TMPDIR -cf $DIR/6-lai/$name.tar \
    --transform='s#^./##' .