Various enchancements to circularisation

rrwick · May 13, 2020 · 82f751f · 82f751f
1 parent 935dd4f
commit 82f751f
Show file tree

Hide file tree

Showing 4 changed files with 135 additions and 53 deletions.
diff --git a/trycycler/__main__.py b/trycycler/__main__.py
@@ -151,9 +151,9 @@ def reconcile_subparser(subparsers):
                                 'in order to fix circularisation')
 
     final_check_args = group.add_argument_group('Final check')
-    final_check_args.add_argument('--min_identity', type=float, default=99.0,
+    final_check_args.add_argument('--min_identity', type=float, default=98.0,
                                   help='Minimum allowed pairwise percent identity')
-    final_check_args.add_argument('--max_indel_size', type=int, default=100,
+    final_check_args.add_argument('--max_indel_size', type=int, default=250,
                                   help='Maximum allowed pairwise indel size')
 
     other_args = group.add_argument_group('Other')

diff --git a/trycycler/circularisation.py b/trycycler/circularisation.py
@@ -11,8 +11,6 @@
 If not, see <http://www.gnu.org/licenses/>.
 """
 
-import sys
-
 from .alignment import align_a_to_b, align_reads_to_seq, get_best_alignment_per_read
 from .log import log, section_header, explanation, quit_with_error
 from .misc import remove_duplicates
@@ -39,17 +37,19 @@ def circularise_seq_with_others(name_a, seqs, args):
     log(f'Circularising {name_a}:')
     seq_a = seqs[name_a]
 
-    candidate_seqs = []
+    candidate_seqs, fail_reasons = [], []
     for name_b, seq_b in seqs.items():
         if name_a != name_b:
-            candidate_seqs.append(circularise_seq_with_another(seq_a, seq_b, name_a, name_b, args))
+            candidate_seq, fail_reason = \
+                circularise_seq_with_another(seq_a, seq_b, name_a, name_b, args)
+            candidate_seqs.append(candidate_seq)
+            fail_reasons.append(fail_reason)
     candidate_seqs = [s for s in candidate_seqs if s is not None]
     candidate_seqs = remove_duplicates(candidate_seqs)
 
     if len(candidate_seqs) == 0:
         circularised_seq = None
-        # TODO: make this message more verbose with suggestions on what to do.
-        quit_with_error(f'Error: failed to circularise sequence {name_a}.')
+        quit_with_error(get_fail_message(name_a, fail_reasons))
 
     elif len(candidate_seqs) == 1:
         circularised_seq = candidate_seqs[0]
@@ -61,6 +61,43 @@ def circularise_seq_with_others(name_a, seqs, args):
     return circularised_seq
 
 
+def get_fail_message(name_a, fail_reasons):
+    fail_message = f'Error: failed to circularise sequence {name_a}.'
+    fail_reasons = set(x for x in fail_reasons if x is not None)
+    if len(fail_reasons) > 1:
+        return f'Error: failed to circularise sequence {name_a} for multiple reasons. You must ' \
+               f'either repair this sequence or exclude it and then try running trycycler ' \
+               f'reconcile again.'
+    elif 'end not found' in fail_reasons:
+        return f'Error: failed to circularise sequence {name_a} because its end could not be ' \
+               f'found in other sequences. You can either trim some sequence off the ' \
+               f'end of {name_a} or exclude the sequence altogether and try again.'
+    elif 'start not found' in fail_reasons:
+        return f'Error: failed to circularise sequence {name_a} because its start could not be ' \
+               f'found in other sequences. You can either trim some sequence off the ' \
+               f'start of {name_a} or exclude the sequence altogether and try again.'
+    elif 'same start/end' in fail_reasons:
+        return f'Error: failed to circularise sequence {name_a} because it had the same ' \
+               f'start/end as the other sequences. I.e. the circularisations were not different ' \
+               f'and therefore could not reconcile each other.'
+    elif 'multiple possibilities' in fail_reasons:
+        return f'Error: failed to circularise sequence {name_a} because its start/end sequences ' \
+               f'were found in multiple ambiguous places in other sequences. This is likely ' \
+               f'because {name_a} starts/ends in a repetitive region. You can either manually ' \
+               f'repair its circularisation (and ensure it does not start/end in a repetitive ' \
+               f'region) or exclude the sequence altogether and try again.'
+    elif 'too much extra' in fail_reasons:
+        return f'Error: failed to circularise sequence {name_a} because it contained too much ' \
+               f'start-end overlap. You can either manually trim the contig sequence, ' \
+               f'increase the value of the --max_trim_seq/--max_trim_seq_percent parameters or ' \
+               f'exclude the sequence altogether and try again.'
+    elif 'too much missing' in fail_reasons:
+        return f'Error: failed to circularise sequence {name_a} because it contained too large ' \
+               f'of a start-end gap. You can either increase the value of the --max_add_seq/' \
+               f'--max_add_seq_percent parameters or exclude the sequence altogether and try again.'
+    return fail_message
+
+
 def circularise_seq_with_another(seq_a, seq_b, name_a, name_b, args):
     log(f'  using {name_b}:')
 
@@ -70,16 +107,24 @@ def circularise_seq_with_another(seq_a, seq_b, name_a, name_b, args):
     max_trim_seq_relative = int(round(args.max_trim_seq_percent * len(seq_a) / 100.0))
     max_trim_seq = min(args.max_trim_seq, max_trim_seq_relative)
 
-    end_alignment, start_alignment = find_end_and_start(seq_a, seq_b, name_a, name_b, args)
+    end_alignment, start_alignment, fail_reason = \
+        find_end_and_start(seq_a, seq_b, name_a, name_b, args)
     if end_alignment is None or start_alignment is None:
-        log('    cannot circularise')
-        return None
+        log(f'    unable to circularise: {name_a}\'s start/end could not be found in {name_b}')
+        return None, fail_reason
 
     # If we found them with no gap at all (end immediately followed by start), that implies seq A
     # is already cleanly circularised.
     if end_alignment.ref_end == start_alignment.ref_start:
         log(f'    no adjustment needed ({name_a} is already circular)')
-        return seq_a
+        return seq_a, None
+
+    # If the starts or ends of the two sequences are the same, then we can't circularise because
+    # there's no difference.
+    if start_alignment.ref_start == 0 or end_alignment.ref_end == len(seq_b):
+        log(f'    unable to circularise: {name_a}\'s start/end is the same as {name_b}\'s '
+            f'start/end')
+        return None, 'same start/end'
 
     # If we found them with a small gap (end then missing seq then start), that implies seq A has a
     # gapped circularisation and needs some sequence added.
@@ -88,97 +133,135 @@ def circularise_seq_with_another(seq_a, seq_b, name_a, name_b, args):
         if len(missing_seq) < max_add_seq:
             log(f'    circularising {name_a} by adding {len(missing_seq)} bp of sequence from'
                 f' {name_b} ({end_alignment.ref_end}-{start_alignment.ref_start})')
-            return seq_a + missing_seq
+            return seq_a + missing_seq, None
         else:
             log(f'    unable to circularise: {name_a} requires {len(missing_seq)} bp to be added ' 
                 f'but settings only allow {max_add_seq} bp')
-            return None
+            return None, 'too much missing'
 
     # If we got here, then it seems seq A has overlapping circularisation and some sequence will
     # need to be removed. To figure out how much to trim, we take the part of seq B which precedes
     # the start alignment and align it back to seq A.
-    pre_start_alignment = find_pre_start_alignment(seq_a, seq_b, start_alignment)
+    pre_start_alignment = find_pre_start_alignment(seq_a, seq_b, name_a, name_b, start_alignment,
+                                                   args.verbose)
     if pre_start_alignment is not None:
         trim_point = pre_start_alignment.ref_end
         trim_amount = len(seq_a) - trim_point
         if trim_amount < max_trim_seq:
             log(f'    circularising {name_a} by trimming {trim_amount} bp of sequence from the end')
-            return seq_a[:trim_point]
+            return seq_a[:trim_point], None
         else:
             log(f'    unable to circularise: {name_a} requires {trim_amount} bp to be trimmed ' 
                 f'but settings only allow {max_trim_seq} bp')
-            return None
+            return None, 'too much extra'
 
     log('    unable to circularise')
-    return None
+    return None, 'other'
+
+
+def get_start_end_size(seq):
+    """
+    Don't use a start/end size of more than 10% of the sequence length.
+    """
+    start_end_size = settings.START_END_SIZE
+    if start_end_size > len(seq) / 10.0:
+        start_end_size = int(round(len(seq) / 10.0))
+    return start_end_size
 
 
 def find_end_and_start(seq_a, seq_b, name_a, name_b, args):
-    seq_a_start_start, seq_a_start_end = 0, settings.START_END_SIZE
-    seq_a_end_start, seq_a_end_end = len(seq_a) - settings.START_END_SIZE, len(seq_a)
+    start_end_size = get_start_end_size(seq_a)
+    seq_a_start_start, seq_a_start_end = 0, start_end_size
+    seq_a_end_start, seq_a_end_end = len(seq_a) - start_end_size, len(seq_a)
 
     start_seq = seq_a[seq_a_start_start:seq_a_start_end]
     end_seq = seq_a[seq_a_end_start:seq_a_end_end]
 
-    # Look for seq A's end sequence in seq B. We should hopefully find one instance.
+    # Look for seq A's end sequence in seq B.
     if args.verbose:
-        log(f'    looking for {name_a}\'s end in {name_b}...   ', end='')
+        log(f'    looking for {name_a}\'s end ({seq_a_end_start}-{seq_a_end_end}) in {name_b}...')
     end_alignments = align_a_to_b(end_seq, seq_b)
     end_alignments = [a for a in end_alignments if a.strand == '+'
                       and a.percent_identity >= settings.START_END_IDENTITY_THRESHOLD
                       and a.query_cov >= settings.START_END_COV_THRESHOLD]
     if len(end_alignments) == 0:
         if args.verbose:
-            log('not found')
-        return None, None
-    elif len(end_alignments) > 1:
-        if args.verbose:
-            log('multiple hits')
-        return None, None
-    assert len(end_alignments) == 1
-    end_alignment = end_alignments[0]
+            log('      not found')
+        return None, None, 'end not found'
     if args.verbose:
-        log(f'found at {end_alignment.ref_start}-{end_alignment.ref_end}')
+        for end_alignment in end_alignments:
+            log(f'      found at {end_alignment.ref_start}-{end_alignment.ref_end}')
 
-    # Look for seq A's start sequence in seq B. We should hopefully find one instance.
+    # Look for seq A's start sequence in seq B.
     if args.verbose:
-        log(f'    looking for {name_a}\'s start in {name_b}... ', end='')
+        log(f'    looking for {name_a}\'s start ({seq_a_start_start}-{seq_a_start_end}) in '
+            f'{name_b}... ')
     start_alignments = align_a_to_b(start_seq, seq_b)
     start_alignments = [a for a in start_alignments if a.strand == '+'
                         and a.percent_identity >= settings.START_END_IDENTITY_THRESHOLD
                         and a.query_cov >= settings.START_END_COV_THRESHOLD]
     if len(start_alignments) == 0:
         if args.verbose:
-            log('not found')
-        return None, None
-    elif len(start_alignments) > 1:
-        if args.verbose:
-            log('multiple hits')
-        return None, None
-    assert len(start_alignments) == 1
-    start_alignment = start_alignments[0]
+            log('      not found')
+        return None, None, 'start not found'
     if args.verbose:
-        log(f'found at {start_alignment.ref_start}-{start_alignment.ref_end}')
+        for start_alignment in start_alignments:
+            log(f'      found at {start_alignment.ref_start}-{start_alignment.ref_end}')
 
-    return end_alignment, start_alignment
+    if len(end_alignments) == 1 and len(start_alignments) == 1:
+        return end_alignments[0], start_alignments[0], None
+
+    # Since we have multiple possibilities, we will try all start/end combinations.
+    if args.verbose:
+        log(f'    checking different end-start alignment combinations...')
+    assert len(end_alignments) > 1 or len(start_alignments) > 1
+    combinations = []
+    for end_alignment in end_alignments:
+        end_1, end_2 = end_alignment.ref_start, end_alignment.ref_end
+        for start_alignment in start_alignments:
+            start_1, start_2 = start_alignment.ref_start, start_alignment.ref_end
+            distance = abs(end_alignment.ref_end - start_alignment.ref_start)
+            if args.verbose:
+                log(f'      {end_1}-{end_2}, {start_1}-{start_2} distance={distance}')
+            combinations.append((distance, end_alignment, start_alignment))
+    combinations = sorted(combinations, key=lambda x: x[0])  # sort from closest to furthest
+    assert len(combinations) > 1
+
+    best_second_best_diff = combinations[1][0] - combinations[0][0]
+    assert best_second_best_diff >= 0
+
+    if best_second_best_diff >= settings.START_END_MIN_COMBINATION_DIFF:
+        _, end_alignment, start_alignment = combinations[0]
+        return end_alignment, start_alignment, None
+    else:
+        return None, None, 'multiple possibilities'
 
 
-def find_pre_start_alignment(seq_a, seq_b, start_alignment):
-    pre_start_seq = seq_b[start_alignment.ref_start - settings.START_END_SIZE:
-                          start_alignment.ref_start]
-    if len(pre_start_seq) != settings.START_END_SIZE:
+def find_pre_start_alignment(seq_a, seq_b, name_a, name_b, start_alignment, verbose):
+    start_end_size = get_start_end_size(seq_a)
+    start, end = start_alignment.ref_start - start_end_size, start_alignment.ref_start
+    if start < 0:
+        return None
+    pre_start_seq = seq_b[start:end]
+    if verbose:
+        log(f'    looking for {name_b}\'s before-start ({start}-{end}) in {name_a}... ')
+
+    if len(pre_start_seq) != start_end_size:
         return None
     pre_start_alignments = align_a_to_b(pre_start_seq, seq_a)
     pre_start_alignments = [a for a in pre_start_alignments if a.strand == '+'
                             and a.percent_identity >= settings.START_END_IDENTITY_THRESHOLD
                             and a.query_cov >= settings.START_END_COV_THRESHOLD]
     if len(pre_start_alignments) == 0:
         return None
-    elif len(pre_start_alignments) > 1:
-        return None
-    else:
+    elif len(pre_start_alignments) == 1:
         return pre_start_alignments[0]
 
+    # If we got here, then there are multiple alignments. We choose the one closest to the end
+    # of the
+    pre_start_alignments = sorted(pre_start_alignments, key=lambda a: a.ref_end)
+    return pre_start_alignments[-1]
+
 
 def choose_best_circularisation(candidate_seqs, reads, threads):
     """

diff --git a/trycycler/initial_check.py b/trycycler/initial_check.py
@@ -11,8 +11,6 @@
 If not, see <http://www.gnu.org/licenses/>.
 """
 
-import sys
-
 from .log import log, section_header, explanation, dim, red, quit_with_error
 from .mash import get_mash_dist_matrix
 

diff --git a/trycycler/settings.py b/trycycler/settings.py
@@ -19,9 +19,10 @@
 
 MAX_MASH_DISTANCE = 0.25
 
-START_END_SIZE = 500
+START_END_SIZE = 1000
 START_END_COV_THRESHOLD = 95.0
 START_END_IDENTITY_THRESHOLD = 95.0
+START_END_MIN_COMBINATION_DIFF = 1000
 
 CIRCULARISATION_CHOICE_ALIGNMENT_SIZE = 1000