From 19494372b0c18115bb82ef126a526b33cd5c7a34 Mon Sep 17 00:00:00 2001 From: JianJunJin Date: Wed, 3 Apr 2024 17:12:08 -0400 Subject: [PATCH] 1.7.7.1 - bug fixes 1. fix an import bug with 'from scipy import stats, log, inf' (issue #132 #315) 2. fix a ZeroDivisionError bug when the estimated coverage is 0 (issue #311) 3. reword log message 'Disentangling failed' -> 'Disentangling unsuccessful' to avoid panic (issue #308) 4. fix a bug in parsing options when '-F anonym' is used (issue #319) 5. pass max_multiplicity in the no-slim case 6. minor adjustment --- GetOrganelleLib/statistical_func.py | 11 +++- GetOrganelleLib/versions.py | 10 ++++ Utilities/disentangle_organelle_assembly.py | 6 +- Utilities/evaluate_assembly_using_mapping.py | 4 +- Utilities/fastg_to_gfa.py | 2 +- Utilities/get_annotated_regions_from_gb.py | 2 +- Utilities/gfa_to_fasta.py | 2 +- Utilities/gfa_to_fastg.py | 2 +- Utilities/join_spades_fastg_by_blast.py | 2 +- .../rm_low_coverage_duplicated_contigs.py | 2 +- Utilities/round_statistics.py | 2 +- Utilities/slim_graph.py | 2 +- get_organelle_from_assembly.py | 26 +++++---- get_organelle_from_reads.py | 55 +++++++++++-------- 14 files changed, 79 insertions(+), 49 deletions(-) diff --git a/GetOrganelleLib/statistical_func.py b/GetOrganelleLib/statistical_func.py index 50f2c2a..ebbaf87 100755 --- a/GetOrganelleLib/statistical_func.py +++ b/GetOrganelleLib/statistical_func.py @@ -1,5 +1,5 @@ try: - from scipy import stats, inf, log + from scipy import stats except ImportError: class stats: class norm: @@ -7,6 +7,15 @@ def logpdf(foo1, foo2, foo3): raise ImportError("Failed in 'from scipy import stats, inf, log'!") inf = float("inf") from math import log +try: + from scipy import inf, log +except ImportError: + try: + from numpy import inf, log + except ImportError: + inf = float("inf") + from math import log + from copy import deepcopy try: import numpy as np diff --git a/GetOrganelleLib/versions.py b/GetOrganelleLib/versions.py index 13cced1..a8c8aad 100644 ---
a/GetOrganelleLib/versions.py +++ b/GetOrganelleLib/versions.py @@ -5,6 +5,16 @@ def get_versions(): versions = [ + { + "number": "1.7.7.1", + "features": [ + "1. fix a import bug of 'from scipy import stat, log, inf' issue (issue #132 #315)", + "2. fix a ZeroDivisionError bug when the estimated coverage is 0 (issue #311)", + "3. Disentangling failed -> Disentangling unsuccessful to avoid panic (issue #308)", + "4. fix a bug in parsing options when '-F anonym' is used (issue #319)", + ], + "time": "2024-04-03 17:05 UTC-5" + }, { "number": "1.7.7.0", "features": [ diff --git a/Utilities/disentangle_organelle_assembly.py b/Utilities/disentangle_organelle_assembly.py index 5866a55..48d716d 100755 --- a/Utilities/disentangle_organelle_assembly.py +++ b/Utilities/disentangle_organelle_assembly.py @@ -317,10 +317,10 @@ def disentangle_circular_assembly(fastg_file, tab_file, prefix, weight_factor, t except KeyError as e: if str(e).strip("'") == options.mode: log_handler.error(options.mode + " not found in " + str(options.tab_file) + "!") - log_handler.error("Disentangling failed!") + log_handler.error("Disentangling unsuccessful!") else: log_handler.exception(str(e)) - log_handler.error("Disentangling failed!") + log_handler.error("Disentangling unsuccessful!") if not options.acyclic_allowed: log_handler.info("You might try again with '--linear' to export contig(s) " "instead of circular genome.") @@ -329,7 +329,7 @@ def disentangle_circular_assembly(fastg_file, tab_file, prefix, weight_factor, t log_handler.info("Please open an issue at https://github.com/Kinggerm/GetOrganelle/issues if you find bugs!\n") except Exception as e: log_handler.exception(str(e)) - log_handler.error("Disentangling failed!") + log_handler.error("Disentangling unsuccessful!") if not options.acyclic_allowed: log_handler.info("You might try again with '--linear' to export contig(s) " "instead of circular genome.") diff --git a/Utilities/evaluate_assembly_using_mapping.py 
b/Utilities/evaluate_assembly_using_mapping.py index 0ea5e8b..83fbac8 100755 --- a/Utilities/evaluate_assembly_using_mapping.py +++ b/Utilities/evaluate_assembly_using_mapping.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python +#!/usr/bin/env python # coding:utf8 from argparse import ArgumentParser @@ -29,7 +29,7 @@ try: # python2 UnicodeDecodeError ± - reload(sys) + reload(sys) # type: ignore sys.setdefaultencoding('utf8') except NameError: pass diff --git a/Utilities/fastg_to_gfa.py b/Utilities/fastg_to_gfa.py index 8ab5075..312e03a 100755 --- a/Utilities/fastg_to_gfa.py +++ b/Utilities/fastg_to_gfa.py @@ -21,7 +21,7 @@ def main(): if type(2/1) == float: fastg = input('Please input gfa file:').strip() else: - fastg = raw_input('Please input gfa file:').strip() + fastg = raw_input('Please input gfa file:').strip() # type: ignore PATH_OF_THIS_SCRIPT = os.path.split(os.path.realpath(__file__))[0] sys.path.insert(0, os.path.join(PATH_OF_THIS_SCRIPT, "..")) from GetOrganelleLib.assembly_parser import Assembly diff --git a/Utilities/get_annotated_regions_from_gb.py b/Utilities/get_annotated_regions_from_gb.py index a12afe9..2606d02 100755 --- a/Utilities/get_annotated_regions_from_gb.py +++ b/Utilities/get_annotated_regions_from_gb.py @@ -1,4 +1,4 @@ -#! 
/usr/bin/env python +#!/usr/bin/env python import os import time diff --git a/Utilities/gfa_to_fasta.py b/Utilities/gfa_to_fasta.py index 85b2ba6..8207710 100755 --- a/Utilities/gfa_to_fasta.py +++ b/Utilities/gfa_to_fasta.py @@ -33,7 +33,7 @@ def main(): if type(2/1) == float: gfa_file = input('Please input gfa file:').strip() else: - gfa_file = raw_input('Please input gfa file:').strip() + gfa_file = raw_input('Please input gfa file:').strip() # type: ignore if gfa_file.strip(): write_fasta(gfa_file +'.fasta', read_gfa_as_fasta(gfa_file), False) diff --git a/Utilities/gfa_to_fastg.py b/Utilities/gfa_to_fastg.py index 6d4ed25..9cb2bcc 100755 --- a/Utilities/gfa_to_fastg.py +++ b/Utilities/gfa_to_fastg.py @@ -21,7 +21,7 @@ def main(): if type(2/1) == float: gfa_file = input('Please input gfa file:').strip() else: - gfa_file = raw_input('Please input gfa file:').strip() + gfa_file = raw_input('Please input gfa file:').strip() # type: ignore PATH_OF_THIS_SCRIPT = os.path.split(os.path.realpath(__file__))[0] sys.path.insert(0, os.path.join(PATH_OF_THIS_SCRIPT, "..")) from GetOrganelleLib.assembly_parser import Assembly diff --git a/Utilities/join_spades_fastg_by_blast.py b/Utilities/join_spades_fastg_by_blast.py index 0661650..4575f26 100755 --- a/Utilities/join_spades_fastg_by_blast.py +++ b/Utilities/join_spades_fastg_by_blast.py @@ -7,7 +7,7 @@ import subprocess try: # python2 - import commands + import commands # type: ignore except: pass from argparse import ArgumentParser diff --git a/Utilities/rm_low_coverage_duplicated_contigs.py b/Utilities/rm_low_coverage_duplicated_contigs.py index 3b5b248..af314ca 100755 --- a/Utilities/rm_low_coverage_duplicated_contigs.py +++ b/Utilities/rm_low_coverage_duplicated_contigs.py @@ -3,7 +3,7 @@ from argparse import ArgumentParser import subprocess try: - import commands + import commands # type: ignore except: pass import os diff --git a/Utilities/round_statistics.py b/Utilities/round_statistics.py index 83354c5..1090000 
100755 --- a/Utilities/round_statistics.py +++ b/Utilities/round_statistics.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python +#!/usr/bin/env python from argparse import ArgumentParser import os diff --git a/Utilities/slim_graph.py b/Utilities/slim_graph.py index 9899b5c..9e0d651 100755 --- a/Utilities/slim_graph.py +++ b/Utilities/slim_graph.py @@ -6,7 +6,7 @@ import subprocess try: # python2 - import commands + import commands # type: ignore except: pass inf = float("inf") diff --git a/get_organelle_from_assembly.py b/get_organelle_from_assembly.py index 1f905de..41016d3 100755 --- a/get_organelle_from_assembly.py +++ b/get_organelle_from_assembly.py @@ -396,7 +396,7 @@ def _check_default_db(this_sub_organelle, extra_type=""): log_types.append("embplant_pt") log_handler.info("LABEL DB: " + single_line_db_versions(existing_label_db, log_types)) # working directory - log_handler.info("WORKING DIR: " + os.getcwd()) + log_handler.info("WORKING_DIR=" + os.getcwd()) log_handler.info(" ".join(["\"" + arg + "\"" if " " in arg else arg for arg in sys.argv]) + "\n") assert is_valid_path(os.path.realpath(options.output_base)), \ @@ -450,10 +450,13 @@ def _check_default_db(this_sub_organelle, extra_type=""): elif sub_organelle_t in ("embplant_nr", "fungus_nr", "animal_mt"): options.expected_max_size.append(int(raw_default_value / 10)) elif sub_organelle_t == "anonym": - ref_seqs = read_fasta(options.genes_fasta[got_t])[1] - options.expected_max_size.append(10 * sum([len(this_seq) for this_seq in ref_seqs])) - log_handler.info("Setting '--expected-max-size " + str(options.expected_max_size) + - "' for estimating the word size value for anonym type.") + if options.genes_fasta: + ref_seqs = read_fasta(options.genes_fasta[got_t])[1] + options.expected_max_size.append(10 * sum([len(this_seq) for this_seq in ref_seqs])) + log_handler.info("Setting '--expected-max-size " + str(options.expected_max_size) + + "' for estimating the word size value for anonym type.") + else: + 
options.expected_max_size.append(inf) else: temp_val_len = len(str(options.expected_max_size).split(",")) if temp_val_len != organelle_type_len: @@ -619,6 +622,7 @@ def disentangle_inside(fastg_f, tab_f, o_p, w_f, log_in, type_f=3., mode_in="emb if no_slim: input_graph.estimate_copy_and_depth_by_cov(mode=mode_in, log_handler=log_in, verbose=verbose_in) target_results = input_graph.estimate_copy_and_depth_precisely( + maximum_copy_num=max_copy_in, broken_graph_allowed=acyclic_allowed_in, return_new_graphs=True, verbose=verbose_in, log_handler=log_in) else: @@ -841,11 +845,11 @@ def disentangle_inside(fastg_f, tab_f, o_p, w_f, log_in, type_f=3., mode_in="emb if verbose: raise e except RuntimeError as e: - log_handler.info("Disentangling failed: RuntimeError: " + str(e).strip()) + log_handler.info("Disentangling unsuccessful: RuntimeError: " + str(e).strip()) except TimeoutError as e: log_handler.info("Disentangling timeout. (see " + timeout_flag + " for more)") except ProcessingGraphFailed as e: - log_handler.info("Disentangling failed: " + str(e).strip()) + log_handler.info("Disentangling unsuccessful: " + str(e).strip()) except Exception as e: log_handler.exception("") raise e @@ -879,11 +883,11 @@ def disentangle_inside(fastg_f, tab_f, o_p, w_f, log_in, type_f=3., mode_in="emb if verbose: raise e except RuntimeError as e: - log_handler.info("Disentangling failed: RuntimeError: " + str(e).strip()) + log_handler.info("Disentangling unsuccessful: RuntimeError: " + str(e).strip()) except TimeoutError as e: log_handler.info("Disentangling timeout. 
(see " + timeout_flag + " for more)") except ProcessingGraphFailed as e: - log_handler.info("Disentangling failed: " + str(e).strip()) + log_handler.info("Disentangling unsuccessful: " + str(e).strip()) except Exception as e: log_handler.exception("") raise e @@ -915,11 +919,11 @@ def disentangle_inside(fastg_f, tab_f, o_p, w_f, log_in, type_f=3., mode_in="emb except RuntimeError as e: if verbose: log_handler.exception("") - log_handler.info("Disentangling failed: RuntimeError: " + str(e).strip()) + log_handler.info("Disentangling unsuccessful: RuntimeError: " + str(e).strip()) except TimeoutError as e: log_handler.info("Disentangling timeout. (see " + timeout_flag + " for more)") except ProcessingGraphFailed as e: - log_handler.info("Disentangling failed: " + str(e).strip()) + log_handler.info("Disentangling unsuccessful: " + str(e).strip()) except Exception as e: raise e else: diff --git a/get_organelle_from_reads.py b/get_organelle_from_reads.py index 4e29313..c3027d8 100755 --- a/get_organelle_from_reads.py +++ b/get_organelle_from_reads.py @@ -716,7 +716,7 @@ def _check_default_db(this_sub_organelle, extra_type=""): log_label_types.append("embplant_pt") log_handler.info("LABEL DB: " + single_line_db_versions(existing_label_db, log_label_types)) # working directory - log_handler.info("WORKING DIR: " + os.getcwd()) + log_handler.info("WORKING_DIR=" + os.getcwd()) log_handler.info(" ".join(["\"" + arg + "\"" if " " in arg else arg for arg in sys.argv]) + "\n") # if options.run_spades: @@ -872,11 +872,14 @@ def _check_default_db(this_sub_organelle, extra_type=""): elif sub_organelle_t in ("embplant_nr", "fungus_nr", "animal_mt"): options.expected_max_size.append(int(raw_default_value / 10)) elif sub_organelle_t == "anonym": - ref_seqs = read_fasta(options.genes_fasta[got_t])[1] - options.expected_max_size.append(10 * sum([len(this_seq) for this_seq in ref_seqs])) - log_handler.info( - "Setting '--expected-max-size " + ",".join([str(t_s) for t_s in 
options.expected_max_size]) + - "' for estimating the word size value for anonym type.") + if options.genes_fasta: + ref_seqs = read_fasta(options.genes_fasta[got_t])[1] + options.expected_max_size.append(10 * sum([len(this_seq) for this_seq in ref_seqs])) + log_handler.info( + "Setting '--expected-max-size " + ",".join([str(t_s) for t_s in options.expected_max_size]) + + "' for estimating the word size value for anonym type.") + else: + options.expected_max_size.append(inf) else: temp_val_len = len(str(options.expected_max_size).split(",")) if temp_val_len != organelle_type_len: @@ -1243,18 +1246,22 @@ def estimate_word_size(base_cov, base_cov_deviation, read_length, target_size, m echo_problem = True word_cov = max(min_word_cov, word_cov) word_cov = trans_word_cov(word_cov, base_cov, mean_error_rate / 2., read_length) - # 1. relationship between kmer coverage and base coverage, k_cov = base_cov * (read_len - k_len + 1) / read_len - estimated_word_size = int(read_length * (1 - word_cov / base_cov)) + 1 - # print(estimated_word_size) - estimated_word_size = min(int(read_length * MAX_RATIO_RL_WS), max(min_word_size, estimated_word_size)) - if echo_problem: - if log_handler: - log_handler.warning("Guessing that you are using too few data for assembling " + organelle_type + "!") - log_handler.warning("GetOrganelle is still trying ...") - else: - sys.stdout.write("Guessing that you are using too few data for assembling " + organelle_type + "!\n") - sys.stdout.write("GetOrganelle is still trying ...\n") - return int(round(estimated_word_size, 0)) + if base_cov == 0: + log_handler.error("Word size estimation failed due to improper seed or too few input data!") + sys.exit() + else: + # 1. 
relationship between kmer coverage and base coverage, k_cov = base_cov * (read_len - k_len + 1) / read_len + estimated_word_size = int(read_length * (1 - word_cov / base_cov)) + 1 + # print(estimated_word_size) + estimated_word_size = min(int(read_length * MAX_RATIO_RL_WS), max(min_word_size, estimated_word_size)) + if echo_problem: + if log_handler: + log_handler.warning("Guessing that you are using too few data for assembling " + organelle_type + "!") + log_handler.warning("GetOrganelle is still trying ...") + else: + sys.stdout.write("Guessing that you are using too few data for assembling " + organelle_type + "!\n") + sys.stdout.write("GetOrganelle is still trying ...\n") + return int(round(estimated_word_size, 0)) def calculate_word_size_according_to_ratio(word_size_ratio, mean_read_len, log_handler): @@ -3619,11 +3626,11 @@ def disentangle_inside(fastg_f, tab_f, o_p, w_f, log_in, type_f=3., mode_in="emb except RuntimeError as e: if verbose: log_handler.exception("") - log_handler.info("Disentangling failed: RuntimeError: " + str(e).strip()) + log_handler.info("Disentangling unsuccessful: RuntimeError: " + str(e).strip()) except TimeoutError: log_handler.info("Disentangling timeout. (see " + timeout_flag + " for more)") except ProcessingGraphFailed as e: - log_handler.info("Disentangling failed: " + str(e).strip()) + log_handler.info("Disentangling unsuccessful: " + str(e).strip()) except Exception as e: log_handler.exception("") sys.exit() @@ -3663,11 +3670,11 @@ def disentangle_inside(fastg_f, tab_f, o_p, w_f, log_in, type_f=3., mode_in="emb except RuntimeError as e: if verbose: log_handler.exception("") - log_handler.info("Disentangling failed: RuntimeError: " + str(e).strip()) + log_handler.info("Disentangling unsuccessful: RuntimeError: " + str(e).strip()) except TimeoutError: log_handler.info("Disentangling timeout. 
(see " + timeout_flag + " for more)") except ProcessingGraphFailed as e: - log_handler.info("Disentangling failed: " + str(e).strip()) + log_handler.info("Disentangling unsuccessful: " + str(e).strip()) except Exception as e: log_handler.exception("") sys.exit() @@ -3708,11 +3715,11 @@ def disentangle_inside(fastg_f, tab_f, o_p, w_f, log_in, type_f=3., mode_in="emb except RuntimeError as e: if verbose: log_handler.exception("") - log_handler.info("Disentangling failed: RuntimeError: " + str(e).strip()) + log_handler.info("Disentangling unsuccessful: RuntimeError: " + str(e).strip()) except TimeoutError: log_handler.info("Disentangling timeout. (see " + timeout_flag + " for more)") except ProcessingGraphFailed as e: - log_handler.info("Disentangling failed: " + str(e).strip()) + log_handler.info("Disentangling unsuccessful: " + str(e).strip()) except Exception as e: raise e else: