Name | Fasta |
---|---|
Kc | 1-assembly/6-ncbi/Kc.fa.gz |
Kd | 1-assembly/6-ncbi/Kd.fa.gz |
Kk | 1-assembly/6-ncbi/Kk.fa.gz |
Gk | 1-assembly/0-ref/kirkii.fa.gz |
- Run RepeatModeler
ROOT=$(git rev-parse --show-toplevel) ml singularity name=${names[$SLURM_ARRAY_TASK_ID]} fasta=${genomes[$name]} export BLAST_USAGE_REPORT=false SCRATCH=/local/scratch/tony.arick/$SLURM_JOB_ID/ cd $SCRATCH singularity exec -B $ROOT -B $SCRATCH $DIR/apps/dfam-tetools-1.87.sif \ BuildDatabase -name $SCRATCH/$name.db $ROOT/wgs/$fasta singularity exec -B $ROOT -B $SCRATCH $DIR/apps/dfam-tetools-1.87.sif \ RepeatModeler -database $SCRATCH/$name.db \ -threads 48 -LTRStruct mkdir $DIR/1-repeats/$name/ cp -r $SCRATCH $DIR/1-repeats/$name/modeler
DIR=/90daydata/gbru_kokia/Kokia/wgs/2-annotation
for i in Kc Kd Kk; do
echo $i
sed -n -e 6,11p -e '/Unclass/p' \
-e '/Gypsy/p' -e '/Copia/p' \
$DIR/1-repeats/$i/masker/$i.fa.tbl
done
ROOT=$(git rev-parse --show-toplevel)
name=${names[$SLURM_ARRAY_TASK_ID]}
ml singularity
SCRATCH=/local/scratch/tony.arick/$SLURM_JOB_ID/
mkdir $DIR/2-braker3/$name
cd $DIR/2-braker3/$name
mkdir $DIR/2-braker3/$name/home
cp $HOME/.gm_key $DIR/2-braker3/$name/home
singularity exec --cleanenv -H $DIR/2-braker3/$name/home \
-B $ROOT -B $SCRATCH \
$DIR/apps/braker3_v.1.0.4.1.sif braker.pl \
--species=$name \
--genome=$DIR/1-repeats/$name/masker/$name.fa.masked \
--prot_seq=$DIR/2-braker3/Viridiplantae.fa \
--threads=48 \
--workingdir=$SCRATCH
cp -r $SCRATCH $DIR/2-braker3/$name/output
ROOT=$(git rev-parse --show-toplevel)
PATH=$DIR/apps/gffread-0.12.7.Linux_x86_64/:$PATH
declare -A names
names["Kc"]="Kocoo"
names["Kk"]="Kokau"
names["Kd"]="Kodry"
names["Gk"]="Gokir"
#for i in "${!genomes[@]}"; do
for i in Kc Kd Kk; do
fasta=${genomes[$i]}
cp $ROOT/wgs/$fasta $DIR/${names[$i]}.fa.gz
tmp_fa=$(mktemp)
zcat $ROOT/wgs/$fasta > $tmp_fa
tmp_gtf=$(mktemp)
# tar -O -xf 2-braker3/$i.tar braker.gtf |
cat $DIR/2-braker3/$i/output/braker.gtf |
awk '$1 != chr { chr=$1; c=1;
num=$1;
if(num ~ /^K[cdkI]/){
sub(".._", "0", num);
num=substr(num, length(num)-2, 3)
}else{
sub("^.*_", "Z", num);
}
}
}
$3 == "gene" {
name = $9;
rep=sprintf("%s.%03sG%04d00", spec, num, c++)
}
1 { gsub(name, rep, $9);
sub("\\.t", ".", $9);
print }' \
FS="\t" OFS="\t" spec=${names[$i]} |
tee $tmp_gtf |
gzip > $DIR/${names[$i]}.gtf.gz
gffread -J -y $DIR/${names[$i]}.pep.fa \
-w $DIR/${names[$i]}.cds.fa \
-g $tmp_fa $tmp_gtf
gzip $DIR/${names[$i]}.pep.fa $DIR/${names[$i]}.cds.fa
rm $tmp_fa $tmp_gtf
done
- Graph
library(tidyverse) data <- list.files(pattern=".gtf.gz") %>% setNames(substring(., 0,5)) %>% lapply(data.table::fread) %>% bind_rows(.id='Species') %>% filter(V3 == "gene") %>% mutate(chr = substring(V1, 4)) %>% group_by(Species, chr) %>% count() data %>% mutate(chr = factor(chr, c('01', '2_4', '03', sprintf('%02d', 5:13)))) %>% ggplot(aes(chr, n, shape=Species, color=Species)) + geom_point(position=position_dodge(width=0.5)) + scale_y_continuous(limits=c(0,4600), expand=c(0,0), name="Number of Genes") + scale_x_discrete(expand=c(0,0), name="Chromosome") + theme_minimal() + theme(panel.grid.major.x=element_blank()) ggsave('gene-counts.png', width=12, height=4, bg='white')
Download latest version of iprscan: 5.65-97.0
IPRSCAN_URL=https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5
cd $DIR/apps
wget $IPRSCAN_URL/5.65-97.0/interproscan-5.65-97.0-64-bit.tar.gz
ml singularity
singularity pull docker://interpro/interproscan:5.65-97.0
Initial IPR Setup
IPRDATA=$DIR/apps/interproscan-5.65-97.0/data
singularity exec \
-B $IPRDATA:/opt/interproscan/data -B $DIR \
$DIR/apps/interproscan_5.65-97.0.sif \
cd /opt/interproscan && \
python3 setup.py -f interproscan.properties
Get protein sequences from renamed gtf file
ROOT=$(git rev-parse --show-toplevel)
PATH=$DIR/apps/gffread-0.12.7.Linux_x86_64/:$PATH
PATH=$DIR/apps/samtools-1.17/:$PATH
declare -A names
names["Kc"]="Kocoo"
names["Kk"]="Kokau"
names["Kd"]="Kodry"
names["Gk"]="Gokir"
for name in ${!genomes[@]}; do
tmp_fasta=$(mktemp)
tmp_gtf=$(mktemp)
zcat $ROOT/wgs/${genomes[$name]} > $tmp_fasta
zcat $DIR/${names[$name]}.gtf.gz > $tmp_gtf
gffread -J -y $DIR/4-iprscan/$name.pep.fa \
-g $tmp_fasta $tmp_gtf
rm $tmp_fasta
rm $tmp_gtf
samtools faidx $DIR/4-iprscan/$name.pep.fa
done
Run iprscan on protein sequences.
ml singularity
PATH=$DIR/apps/samtools-1.17/:$PATH
PATH=$DIR/apps/interproscan-5.65-97.0/:$PATH
SCRATCH=/local/scratch/tony.arick/$SLURM_JOB_ID/
# Set spec with --export=spec=Kd
#PREFIX=$spec.pep.$SLURM_ARRAY_TASK_ID
PREFIX=$spec.pep
# split -n "l/$SLURM_ARRAY_TASK_ID/20" $DIR/4-iprscan/$spec.pep.fa.fai |
# cut -f 1 |
# samtools faidx -r - $DIR/4-iprscan/$spec.pep.fa \
# > $SCRATCH/$PREFIX.fa
cp $DIR/4-iprscan/$spec.pep.fa $SCRATCH/$PREFIX.fa
IPRDATA=$DIR/apps/interproscan-5.65-97.0/data
singularity exec \
-B $IPRDATA:/opt/interproscan/data -B $DIR \
$DIR/apps/interproscan_5.65-97.0.sif \
/opt/interproscan/interproscan.sh \
-i $SCRATCH/$PREFIX.fa \
-f xml,tsv \
-pathways \
-iprlookup \
-goterms \
-dp \
-appl 'Pfam, PRINTS, PANTHER, TIGRFAM, SUPERFAMILY, PIRSF, ProSiteProfiles, ProSitePatterns, SMART' \
-b $SCRATCH/$PREFIX \
-cpu $SLURM_CPUS_PER_TASK \
> $DIR/4-iprscan/$PREFIX.log
mv $SCRATCH/$PREFIX.{tsv,xml} $DIR/4-iprscan/
number of genes
DIR=/90daydata/gbru_kokia/Kokia/wgs/2-annotation
for name in Kocoo Kodry Kokau; do
zcat $DIR/$name.gtf.gz |
awk '{_[$3]++} END {for ( i in _) print name, i, _[i];}' \
name=$name
done | grep -e gene -e transcript
exon stats
DIR=/90daydata/gbru_kokia/Kokia/wgs/2-annotation
for name in Kocoo Kodry Kokau; do
zcat $DIR/$name.gtf.old.gz |
awk '$3 == "exon" {_[$10]++}
END { for(i in _){ print _[i]; }}' name=$name |
Rscript -e "summary(as.numeric(readLines(file('stdin'))))"
done
DIR=/90daydata/gbru_kokia/Kokia/wgs/2-annotation
for name in Kocoo Kodry Kokau; do
zcat $DIR/$name.gtf.old.gz |
awk '$3 == "exon" {_[$10]++}
END { for(i in _){ print _[i]; }}' name=$name |
Rscript -e "table(as.numeric(readLines(file('stdin')))==1)"
done
Kocoo | gene | 39268 | 8914 | 22.700418 |
Kodry | gene | 38042 | 7550 | 19.846485 |
Kokau | gene | 39242 | 8368 | 21.324092 |
Kocoo | transcript | 41953 | 8914 | 21.247587 |
Kodry | transcript | 40766 | 7550 | 18.520336 |
Kokau | transcript | 41898 | 8368 | 19.972314 |
DIR=/90daydata/gbru_kokia/Kokia/wgs/2-annotation
name=Kocoo
for name in Kocoo Kodry Kokau; do
zcat $DIR/$name.gtf.old.gz |
awk '$3 == "exon" {print $5 - $4}' |
Rscript -e "summary(as.numeric(readLines(file('stdin'))))"
done