Skip to content

Commit

Permalink
v0.4.1 (#48)
Browse files Browse the repository at this point in the history
* v0.4.1: no default for -d, fixed and improved debug mode (-Z), fixed link for GTDB release 07-RS207

* threads on check missing files
  • Loading branch information
pirovc authored Apr 8, 2022
1 parent 39602a9 commit a5df18f
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 82 deletions.
15 changes: 8 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -199,10 +199,10 @@ or
┌─┐┌─┐┌┐┌┌─┐┌┬┐┌─┐ ┬ ┬┌─┐┌┬┐┌─┐┌┬┐┌─┐┬─┐
│ ┬├┤ ││││ ││││├┤ │ │├─┘ ││├─┤ │ ├┤ ├┬┘
└─┘└─┘┘└┘└─┘┴ ┴└─┘────└─┘┴ ─┴┘┴ ┴ ┴ └─┘┴└─
v0.4.0
v0.4.1

Database options:
-d Database (comma-separated entries) [genbank, refseq] Default: refseq
-d Database (comma-separated entries) [genbank, refseq]

Organism options:
-g Organism group (comma-separated entries) [archaea, bacteria, fungi, human, invertebrate, metagenomes, other, plant, protozoa, vertebrate_mammalian, vertebrate_other, viral]. Example: archaea,bacteria.
Expand All @@ -227,9 +227,9 @@ or
Default: 0
-F Custom filter for the assembly summary in the format colA:val1|colB:valX,valY (case insensitive). Example: -F "2:PRJNA12377,PRJNA670754|14:Partial". For column information, see ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt
Default: ""
-D Start date to keep sequences (>=), based on the sequence release date. Format YYYYMMDD. Example: -D 20201030
-D Start date to keep sequences (>=), based on the sequence release date. Format YYYYMMDD. Example: 20201030
Default: ""
-E End date to keep sequences (<=), based on the sequence release date. Format YYYYMMDD. Example: -D 20201231
-E End date to keep sequences (<=), based on the sequence release date. Format YYYYMMDD. Example: 20201231
Default: ""
-z Keep only assemblies present on the latest GTDB release

Expand All @@ -251,8 +251,8 @@ or
Default: ""
-k Dry-run, no data is downloaded or updated - just checks for available sequences and changes
-i Fix failed downloads or any incomplete data from a previous run, keep current version
-m Check MD5 for downloaded files
-t Threads
-m Check MD5 of downloaded files
-t Threads to parallelize download and some file operations
Default: 1

Misc. options:
Expand All @@ -263,7 +263,8 @@ or
-n Conditional exit status. Exit Code = 1 if more than N files failed to download (integer for file number, float for percentage, 0 -> off)
Default: 0
-V Verbose log to report successful file downloads
-D Print debug information and exit
-Z Print debug information and run in debug mode


## References:

Expand Down
100 changes: 59 additions & 41 deletions genome_updater.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ IFS=$' '
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

version="0.4.0"
version="0.4.1"
genome_updater_args=$( printf "%q " "$@" )
export genome_updater_args

Expand All @@ -44,7 +44,7 @@ use_curl=${use_curl:-0}
# Export locale numeric to avoid errors on printf in different setups
export LC_NUMERIC="en_US.UTF-8"

gtdb_urls=( "https://data.gtdb.ecogenomic.org/releases/latest/ar122_taxonomy.tsv.gz"
gtdb_urls=( "https://data.gtdb.ecogenomic.org/releases/latest/ar53_taxonomy.tsv.gz"
"https://data.gtdb.ecogenomic.org/releases/latest/bac120_taxonomy.tsv.gz" )

#activate aliases in the script
Expand Down Expand Up @@ -500,7 +500,7 @@ remove_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] OR fi
# Reports the entries produced by list_files whose target file is missing or empty.
check_missing_files() # ${1} file, ${2} fields [assembly_accession,url], ${3} extension - returns assembly accession, url and filename
{
# Only prints an entry if its file doesn't exist or has zero size.
# xargs hands the list_files output to `sh -c` three words at a time (-n3), so inside the
# inline script ${0}, ${1} and ${2} are the three fields of one entry (accession, url and
# filename, per the header comment). ${2} is appended to ${target_output_prefix}${files_dir}
# and tested with `[ ! -s ... ]`; matching entries are echoed tab-separated.
# --no-run-if-empty skips the check entirely when list_files prints nothing.
# NOTE(review): the two pipelines below differ only by `-P "${threads}"` (parallel checks);
# they look like the removed and added side of the same diff hunk - confirm only one should remain.
list_files ${1} ${2} ${3} | xargs --no-run-if-empty -n3 sh -c 'if [ ! -s "'"${target_output_prefix}${files_dir}"'${2}" ]; then echo "${0}'$'\t''${1}'$'\t''${2}"; fi'
list_files ${1} ${2} ${3} | xargs -P "${threads}" --no-run-if-empty -n3 sh -c 'if [ ! -s "'"${target_output_prefix}${files_dir}"'${2}" ]; then echo "${0}'$'\t''${1}'$'\t''${2}"; fi'
}

check_complete_record() # parameters: ${1} file, ${2} field [assembly accession, url], ${3} extension - returns assembly accession, url
Expand Down Expand Up @@ -575,7 +575,7 @@ print_debug() # parameters: ${1} tools
}

# Defaults
database="refseq"
database=""
organism_group=""
species=""
taxids=""
Expand Down Expand Up @@ -624,7 +624,7 @@ function showhelp {
print_logo
echo
echo $'Database options:'
echo $' -d Database (comma-separated entries) [genbank, refseq]\tDefault: refseq'
echo $' -d Database (comma-separated entries) [genbank, refseq]'
echo
echo $'Organism options:'
echo $' -g Organism group (comma-separated entries) [archaea, bacteria, fungi, human, invertebrate, metagenomes, other, plant, protozoa, vertebrate_mammalian, vertebrate_other, viral]. Example: archaea,bacteria.\n\tDefault: ""'
Expand All @@ -640,8 +640,8 @@ function showhelp {
echo $' -P Number of top references for each species nodes to download. 0 for all. Selection order: RefSeq Category, Assembly level, Relation to type material, Date (most recent first)\n\tDefault: 0'
echo $' -A Number of top references for each taxids (leaf nodes) to download. 0 for all. Selection order: RefSeq Category, Assembly level, Relation to type material, Date (most recent first)\n\tDefault: 0'
echo $' -F custom filter for the assembly summary in the format colA:val1|colB:valX,valY (case insensitive). Example: -F "2:PRJNA12377,PRJNA670754|14:Partial" for column infos check ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt\n\tDefault: ""'
echo $' -D Start date to keep sequences (>=), based on the sequence release date. Format YYYYMMDD. Example: -D 20201030\n\tDefault: ""'
echo $' -E End date to keep sequences (<=), based on the sequence release date. Format YYYYMMDD. Example: -D 20201231\n\tDefault: ""'
echo $' -D Start date to keep sequences (>=), based on the sequence release date. Format YYYYMMDD. Example: 20201030\n\tDefault: ""'
echo $' -E End date to keep sequences (<=), based on the sequence release date. Format YYYYMMDD. Example: 20201231\n\tDefault: ""'
echo $' -z Keep only assemblies present on the latest GTDB release'
echo
echo $'Report options:'
Expand All @@ -657,8 +657,8 @@ function showhelp {
echo $' -B Base label to use as the current version. Can be used to rollback to an older version or to create multiple branches from a base version. It only applies for updates. \n\tDefault: ""'
echo $' -k Dry-run, no data is downloaded or updated - just checks for available sequences and changes'
echo $' -i Fix failed downloads or any incomplete data from a previous run, keep current version'
echo $' -m Check MD5 for downloaded files'
echo $' -t Threads\n\tDefault: 1'
echo $' -m Check MD5 of downloaded files'
echo $' -t Threads to parallelize download and some file operations\n\tDefault: 1'
echo
echo $'Misc. options:'
echo $' -x Allow the deletion of regular extra files if any found in the files folder. Symbolic links that do not belong to the current version will always be deleted.'
Expand All @@ -667,7 +667,7 @@ function showhelp {
echo $' -w Silent output with download progress (%) and download version at the end'
echo $' -n Conditional exit status. Exit Code = 1 if more than N files failed to download (integer for file number, float for percentage, 0 -> off)\n\tDefault: 0'
echo $' -V Verbose log to report successful file downloads'
echo $' -D Print print debug information and exit'
echo $' -Z Print debug information and run in debug mode'
echo
}

Expand All @@ -690,52 +690,63 @@ done
if [ "${tool_not_found}" -eq 1 ]; then exit 1; fi

# Parse the command-line options into the corresponding script variables.
# NOTE(review): this region appears to interleave the removed and added sides of a diff
# (two `while getopts` headers, many case arms listed twice) - confirm which side is current.
OPTIND=1 # Reset getopts
while getopts "d:g:S:T:c:l:F:o:e:R:b:B:t:f:P:A:D:E:zn:akixmurpswhDV" opt; do
# NOTE(review): in the option string below `D` is still listed twice (`D:` and a bare `D`
# right after `c:`); the bare `D` looks like a leftover of the old debug flag, which is now
# `Z` - confirm and remove it.
# NOTE(review): neither option string starts with `:`, so getopts itself reports a missing
# argument and sets opt to `?`; the `:)` arm at the end is presumably never reached and a
# missing argument ends in the `h|\?)` arm (help, exit status 0) - verify.
while getopts "aA:b:B:d:D:c:De:E:f:F:g:hikl:mn:o:pP:rR:sS:t:T:uVwxzZ" opt; do
case ${opt} in
d) database=${OPTARG} ;;
g) organism_group=${OPTARG// } ;; #remove spaces
S) species=${OPTARG// } ;; #remove spaces
T) taxids=${OPTARG// } ;; #remove spaces
c) refseq_category=${OPTARG} ;;
l) assembly_level=${OPTARG} ;;
F) custom_filter=${OPTARG} ;;
o) working_dir=${OPTARG} ;;
e) external_assembly_summary=${OPTARG} ;;
R) retry_download_batch=${OPTARG} ;;
a) download_taxonomy=1 ;;
A) top_assemblies_taxids=${OPTARG} ;;
b) label=${OPTARG} ;;
B) rollback_label=${OPTARG} ;;
t) threads=${OPTARG} ;;
f) file_formats=${OPTARG// } ;; #remove spaces
P) top_assemblies_species=${OPTARG} ;;
A) top_assemblies_taxids=${OPTARG} ;;
c) refseq_category=${OPTARG} ;;
d) database=${OPTARG} ;;
D) date_start=${OPTARG} ;;
e) external_assembly_summary=${OPTARG} ;;
E) date_end=${OPTARG} ;;
z) gtdb_only=1 ;;
a) download_taxonomy=1 ;;
k) dry_run=1 ;;
f) file_formats=${OPTARG// } ;; #remove spaces
F) custom_filter=${OPTARG} ;;
g) organism_group=${OPTARG// } ;; #remove spaces
# Help requested or option not recognised by getopts: print usage and exit with status 0
h|\?) showhelp; exit 0 ;;
i) just_fix=1 ;;
x) delete_extra_files=1 ;;
k) dry_run=1 ;;
l) assembly_level=${OPTARG} ;;
m) check_md5=1 ;;
u) updated_assembly_accession=1 ;;
r) updated_sequence_accession=1 ;;
p) url_list=1 ;;
n) conditional_exit=${OPTARG} ;;
o) working_dir=${OPTARG} ;;
p) url_list=1 ;;
P) top_assemblies_species=${OPTARG} ;;
r) updated_sequence_accession=1 ;;
R) retry_download_batch=${OPTARG} ;;
s) silent=1 ;;
w) silent_progress=1 ;;
# NOTE(review): `D)` appears twice in this case (date_start above, debug_mode here); only the
# first matching arm runs, so this one looks like the removed side of the diff - confirm.
D) debug_mode=1 ;;
S) species=${OPTARG// } ;; #remove spaces
t) threads=${OPTARG} ;;
T) taxids=${OPTARG// } ;; #remove spaces
u) updated_assembly_accession=1 ;;
V) verbose_log=1 ;;
h|\?) showhelp; exit 0 ;;
w) silent_progress=1 ;;
x) delete_extra_files=1 ;;
z) gtdb_only=1 ;;
Z) debug_mode=1 ;;
# Missing-argument handler; writes the error to stderr and exits with status 1
:) echo "Option -${OPTARG} requires an argument." >&2; exit 1 ;;
esac
done
if [ ${OPTIND} -eq 1 ]; then showhelp; exit 1; fi
shift $((OPTIND-1))
[ "${1:-}" = "--" ] && shift

# Print tools and versions
if [ "${debug_mode}" -eq 1 ] ; then
print_debug tools;
exit 0;
# If debug is the only parameter, exit, otherwise set debug mode for the run (set -x)
if [ ${OPTIND} -eq 2 ]; then
exit 0;
else
set -x
fi
fi
# No params
if [ ${OPTIND} -eq 1 ]; then
showhelp;
exit 1;
fi
shift $((OPTIND-1))
[ "${1:-}" = "--" ] && shift

######################### General parameter validation #########################
if [[ -z "${database}" ]]; then
echo "Database is required (-d)"; exit 1;
Expand Down Expand Up @@ -842,7 +853,6 @@ if [[ "${MODE}" == "UPDATE" ]] || [[ "${MODE}" == "FIX" ]]; then # get existing
if [[ -f "${rollback_assembly_summary}" ]]; then
rm ${default_assembly_summary}
ln -s -r "${rollback_assembly_summary}" "${default_assembly_summary}"

else
echo "Rollback label/assembly_summary.txt not found ["${rollback_assembly_summary}"]"; exit 1
fi
Expand Down Expand Up @@ -928,6 +938,10 @@ else
fi
echolog "-------------------------------------------" "1"

if [ "${debug_mode}" -eq 1 ] ; then
ls -laR "${working_dir}"
fi

# new
if [[ "${MODE}" == "NEW" ]]; then

Expand Down Expand Up @@ -983,7 +997,6 @@ if [[ "${MODE}" == "NEW" ]]; then
fi
echolog "" "1"
fi

fi

else # update/fix
Expand Down Expand Up @@ -1176,6 +1189,11 @@ if [ "${dry_run}" -eq 0 ]; then
if [ "${silent_progress}" -eq 1 ] ; then
echo "$(dirname $(readlink -m ${default_assembly_summary}))"
fi

if [ "${debug_mode}" -eq 1 ] ; then
ls -laR "${working_dir}"
fi

# Exit conditional status
exit $(exit_status ${expected_files} ${current_files})
fi
Loading

0 comments on commit a5df18f

Please sign in to comment.