Skip to content

Latest commit

 

History

History
396 lines (324 loc) · 11.5 KB

readme.org

File metadata and controls

396 lines (324 loc) · 11.5 KB

Annotation

NameFasta
Kc1-assembly/6-ncbi/Kc.fa.gz
Kd1-assembly/6-ncbi/Kd.fa.gz
Kk1-assembly/6-ncbi/Kk.fa.gz
Gk1-assembly/0-ref/kirkii.fa.gz

Repeats

  • Run RepeatModeler
    ROOT=$(git rev-parse --show-toplevel)
    ml singularity
    
    name=${names[$SLURM_ARRAY_TASK_ID]}
    fasta=${genomes[$name]}
    
    export BLAST_USAGE_REPORT=false
    
    SCRATCH=/local/scratch/tony.arick/$SLURM_JOB_ID/
    cd $SCRATCH
    
    singularity exec -B $ROOT -B $SCRATCH $DIR/apps/dfam-tetools-1.87.sif \
        BuildDatabase -name $SCRATCH/$name.db $ROOT/wgs/$fasta
    singularity exec -B $ROOT -B $SCRATCH $DIR/apps/dfam-tetools-1.87.sif \
        RepeatModeler -database $SCRATCH/$name.db \
        -threads 48 -LTRStruct
    
    mkdir $DIR/1-repeats/$name/
    cp -r $SCRATCH $DIR/1-repeats/$name/modeler
        

Stats

DIR=/90daydata/gbru_kokia/Kokia/wgs/2-annotation

for i in Kc Kd Kk; do
    echo $i
    sed -n -e 6,11p -e '/Unclass/p' \
        -e '/Gypsy/p' -e '/Copia/p' \
        $DIR/1-repeats/$i/masker/$i.fa.tbl
done

Braker3

ROOT=$(git rev-parse --show-toplevel)
name=${names[$SLURM_ARRAY_TASK_ID]}

ml singularity

SCRATCH=/local/scratch/tony.arick/$SLURM_JOB_ID/
mkdir $DIR/2-braker3/$name
cd $DIR/2-braker3/$name

mkdir $DIR/2-braker3/$name/home
cp $HOME/.gm_key $DIR/2-braker3/$name/home

singularity exec --cleanenv -H $DIR/2-braker3/$name/home \
    -B $ROOT -B $SCRATCH \
    $DIR/apps/braker3_v.1.0.4.1.sif braker.pl \
    --species=$name \
    --genome=$DIR/1-repeats/$name/masker/$name.fa.masked \
    --prot_seq=$DIR/2-braker3/Viridiplantae.fa \
    --threads=48 \
    --workingdir=$SCRATCH
cp -r $SCRATCH $DIR/2-braker3/$name/output

Rename

ROOT=$(git rev-parse --show-toplevel)
PATH=$DIR/apps/gffread-0.12.7.Linux_x86_64/:$PATH

declare -A names
names["Kc"]="Kocoo"
names["Kk"]="Kokau"
names["Kd"]="Kodry"
names["Gk"]="Gokir"

#for i in "${!genomes[@]}"; do
for i in Kc Kd Kk; do
    fasta=${genomes[$i]}
    cp $ROOT/wgs/$fasta $DIR/${names[$i]}.fa.gz

    tmp_fa=$(mktemp)
    zcat $ROOT/wgs/$fasta > $tmp_fa
    tmp_gtf=$(mktemp)
    #            tar -O -xf 2-braker3/$i.tar braker.gtf |
    cat $DIR/2-braker3/$i/output/braker.gtf |
        awk '$1 != chr { chr=$1; c=1;
                                   num=$1;
                                   if(num ~ /^K[cdkI]/){
                                   sub(".._", "0", num);
                                   num=substr(num, length(num)-2, 3)
                                   }else{
                                   sub("^.*_", "Z", num);
                                   }
                                }
                       }
                       $3 == "gene" {
                           name = $9;
                           rep=sprintf("%s.%03sG%04d00", spec, num, c++)
                       }
                       1 { gsub(name, rep, $9);
                           sub("\\.t", ".", $9);
                           print }' \
                               FS="\t" OFS="\t" spec=${names[$i]} |
        tee $tmp_gtf |
        gzip > $DIR/${names[$i]}.gtf.gz

    gffread -J -y $DIR/${names[$i]}.pep.fa \
        -w $DIR/${names[$i]}.cds.fa \
        -g $tmp_fa $tmp_gtf

    gzip $DIR/${names[$i]}.pep.fa $DIR/${names[$i]}.cds.fa

    rm $tmp_fa $tmp_gtf
done
  • Graph
    library(tidyverse)
    
    data <- list.files(pattern=".gtf.gz") %>%
      setNames(substring(., 0,5)) %>%
      lapply(data.table::fread) %>%
      bind_rows(.id='Species') %>%
      filter(V3 == "gene") %>%
      mutate(chr = substring(V1, 4)) %>%
      group_by(Species, chr) %>%
      count()
    
    data %>%
      mutate(chr = factor(chr, c('01', '2_4', '03',
                                 sprintf('%02d', 5:13)))) %>%
    ggplot(aes(chr, n, shape=Species, color=Species)) +
      geom_point(position=position_dodge(width=0.5)) +
      scale_y_continuous(limits=c(0,4600), expand=c(0,0),
                         name="Number of Genes") +
      scale_x_discrete(expand=c(0,0), name="Chromosome") +
      theme_minimal() +
      theme(panel.grid.major.x=element_blank())
    
    ggsave('gene-counts.png', width=12, height=4, bg='white')
        

./gene-counts.png

InterproScan

Download latest version of iprscan: 5.65-97.0

IPRSCAN_URL=https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5
cd $DIR/apps
wget $IPRSCAN_URL/5.65-97.0/interproscan-5.65-97.0-64-bit.tar.gz

ml singularity
singularity pull docker://interpro/interproscan:5.65-97.0

Initial IPR Setup

IPRDATA=$DIR/apps/interproscan-5.65-97.0/data
singularity exec \
    -B $IPRDATA:/opt/interproscan/data -B $DIR \
    $DIR/apps/interproscan_5.65-97.0.sif \
    cd /opt/interproscan && \
    python3 setup.py -f interproscan.properties
 

Get protein sequences from renamed gtf file

ROOT=$(git rev-parse --show-toplevel)
PATH=$DIR/apps/gffread-0.12.7.Linux_x86_64/:$PATH
PATH=$DIR/apps/samtools-1.17/:$PATH

declare -A names
names["Kc"]="Kocoo"
names["Kk"]="Kokau"
names["Kd"]="Kodry"
names["Gk"]="Gokir"

for name in ${!genomes[@]}; do
    tmp_fasta=$(mktemp)
    tmp_gtf=$(mktemp)

    zcat $ROOT/wgs/${genomes[$name]} > $tmp_fasta
    zcat $DIR/${names[$name]}.gtf.gz > $tmp_gtf

    gffread -J -y $DIR/4-iprscan/$name.pep.fa \
        -g $tmp_fasta $tmp_gtf

    rm $tmp_fasta
    rm $tmp_gtf

    samtools faidx $DIR/4-iprscan/$name.pep.fa
done

Run iprscan on protein sequences.

ml singularity

PATH=$DIR/apps/samtools-1.17/:$PATH
PATH=$DIR/apps/interproscan-5.65-97.0/:$PATH

SCRATCH=/local/scratch/tony.arick/$SLURM_JOB_ID/

# Set spec with --export=spec=Kd

#PREFIX=$spec.pep.$SLURM_ARRAY_TASK_ID
PREFIX=$spec.pep

# split -n "l/$SLURM_ARRAY_TASK_ID/20" $DIR/4-iprscan/$spec.pep.fa.fai |
#     cut -f 1 |
#     samtools faidx -r - $DIR/4-iprscan/$spec.pep.fa \
#         > $SCRATCH/$PREFIX.fa
cp $DIR/4-iprscan/$spec.pep.fa $SCRATCH/$PREFIX.fa

IPRDATA=$DIR/apps/interproscan-5.65-97.0/data
singularity exec \
    -B $IPRDATA:/opt/interproscan/data -B $DIR \
    $DIR/apps/interproscan_5.65-97.0.sif \
    /opt/interproscan/interproscan.sh \
    -i $SCRATCH/$PREFIX.fa \
    -f xml,tsv \
    -pathways \
    -iprlookup \
    -goterms \
    -dp  \
    -appl 'Pfam, PRINTS, PANTHER, TIGRFAM, SUPERFAMILY, PIRSF, ProSiteProfiles, ProSitePatterns, SMART' \
    -b $SCRATCH/$PREFIX \
    -cpu $SLURM_CPUS_PER_TASK \
    > $DIR/4-iprscan/$PREFIX.log

mv $SCRATCH/$PREFIX.{tsv,xml} $DIR/4-iprscan/

Stats

number of genes

DIR=/90daydata/gbru_kokia/Kokia/wgs/2-annotation
for name in Kocoo Kodry Kokau; do
    zcat $DIR/$name.gtf.gz |
        awk '{_[$3]++} END {for ( i in _) print name, i, _[i];}' \
            name=$name
done | grep -e gene -e transcript

exon stats

DIR=/90daydata/gbru_kokia/Kokia/wgs/2-annotation

for name in Kocoo Kodry Kokau; do
    zcat $DIR/$name.gtf.old.gz |
        awk '$3 == "exon" {_[$10]++}
        END { for(i in _){ print _[i]; }}' name=$name |
        Rscript -e "summary(as.numeric(readLines(file('stdin'))))"
done
DIR=/90daydata/gbru_kokia/Kokia/wgs/2-annotation

for name in Kocoo Kodry Kokau; do
    zcat $DIR/$name.gtf.old.gz |
        awk '$3 == "exon" {_[$10]++}
        END { for(i in _){ print _[i]; }}' name=$name |
        Rscript -e "table(as.numeric(readLines(file('stdin')))==1)"
done
Kocoogene39268891422.700418
Kodrygene38042755019.846485
Kokaugene39242836821.324092
Kocootranscript41953891421.247587
Kodrytranscript40766755018.520336
Kokautranscript41898836819.972314
DIR=/90daydata/gbru_kokia/Kokia/wgs/2-annotation
name=Kocoo
for name in Kocoo Kodry Kokau; do
    zcat $DIR/$name.gtf.old.gz |
        awk '$3 == "exon" {print $5 - $4}' |
        Rscript -e "summary(as.numeric(readLines(file('stdin'))))"
done