forked from functional-dark-side/agnostos-wf
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprofile_search.sh
97 lines (82 loc) · 3.14 KB
/
profile_search.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/bin/bash
set -e
set -x
usage() { echo -e "Usage: $(basename $0) [OPTIONS...]
Options:
--query genes fasta sequences \
--clu_hmm cluster HMMs db \
--clu_cat cluster categories table \
--info genes additional information \
--mmseqs location of mmseqs binary \
--threads number of threads to use" 1>&2; exit 1; }
OPT_LIST="query:,clu_hmm:,clu_cat:,info:,mmseqs:,threads:"
eval set -- $(getopt -o '' --long "${OPT_LIST}" -- "$@")
while true; do
case "$1" in
--query)
QUERY=$2; shift 2 ;;
--clu_hmm)
CLHMM=$2; shift 2 ;;
--clu_cat)
CLCAT=$2; shift 2 ;;
--info)
INFO=$2; shift 2 ;;
--mmseqs)
MMSEQS_BIN=$2; shift 2 ;;
--threads)
NSLOTS=$2; shift 2 ;;
--)
shift; break ;;
*)
usage ;;
esac
done
if [ -z "${QUERY}" ] ; then
usage
fi
MMSEQS_BIN=${MMSEQS_BIN:=~/opt/MMseqs2/bin/mmseqs}
# Fixed variables
MPI="srun --mpi=pmi2"
EFILTER="${PWD}"/agnostos-wf/Profile_search/evalue_filter.awk
MVOTE="${PWD}"/agnostos-wf/Profile_search/majority_vote_categ.R
LTMP=/vol/scratch/tmp
OUTDIR="${PWD}"/profile_search_res
mkdir -p "${OUTDIR}"
# Create query DB
# Create sequence databases
NAME=$(basename "${QUERY}" .fasta)
QUERY_DB="${OUTDIR}"/"${NAME}"_db
"${MMSEQS_BIN}" createdb "${QUERY}" "${QUERY_DB}"
# Decide to use mpi search based on the number of query sequences:
NSEQS=$(grep -c '^>' "${QUERY}")
if [[ ${NSEQS} -ge 10000 ]]; then
# Sequernce-profile search against the Cluster HMM profiles
sbatch --ntasks-per-node 1 --wait \
--nodes 9 --cpus-per-task 28 \
--job-name profs --partition nomaster \
--wrap " ${MMSEQS_BIN} search ${QUERY_DB} ${CLHMM} ${OUTDIR}/${NAME}_vs_mg_gtdb_hmm_db ${OUTDIR}/tmp \
--local-tmp ${LTMP} \
--threads ${NSLOTS} -e 1e-20 --cov-mode 2 -c 0.6 \
--split-mode 0 --split-memory-limit 150G \
--mpi-runner \"${MPI}\" "
else
# Sequernce-profile search against the Cluster HMM profiles
"${MMSEQS_BIN}" search "${QUERY_DB}" "${CLHMM}" "${OUTDIR}"/"${NAME}"_vs_mg_gtdb_hmm_db "${OUTDIR}"/tmp\
--threads "${NSLOTS}" -e 1e-20 --cov-mode 2 -c 0.6
fi
wait
# Convert result database in tsv-file
"${MMSEQS_BIN}" convertalis "${QUERY_DB}" "${CLHMM}" "${OUTDIR}"/"${NAME}"_vs_mg_gtdb_hmm_db \
"${OUTDIR}"/"${NAME}"_vs_mg_gtdb_hmm_hits.tsv \
--format-output 'query,target,pident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qcov,tcov' \
--threads "${NSLOTS}"
# Filter hits within 90% of the Log(best(e-value))
awk -v P=0.9 -f "${EFILTER}" "${OUTDIR}"/"${NAME}"_vs_mg_gtdb_hmm_hits.tsv > "${OUTDIR}"/"${NAME}"_vs_mg_gtdb_hmm_res_qcov06_e90.tsv
# Add functional category information
join -12 -21 <(sort -k2,2 "${OUTDIR}"/"${NAME}"_vs_mg_gtdb_hmm_res_qcov06_e90.tsv ) \
<(sort -k1,1 "${CLCAT}") \
> "${OUTDIR}"/"${NAME}"_vs_mg_gtdb_hmm_search_res.tsv
gzip "${OUTDIR}"/"${NAME}"_vs_mg_gtdb_hmm_search_res.tsv
# Parse results to get best-hits and category proportions per sample (or genome or contig..)
# NB: the info file should have the following format: <gene> <sample> (or <genome>, or <contig>)
Rscript --vanilla "${MVOTE}" "${OUTDIR}"/"${NAME}"_vs_mg_gtdb_hmm_search_res.tsv.gz "${INFO}"