Lots of enchancements to cluster command

rrwick · May 5, 2020 · ff33db3 · ff33db3
1 parent da31f25
commit ff33db3
Show file tree

Hide file tree

Showing 8 changed files with 196 additions and 67 deletions.
diff --git a/trycycler/__main__.py b/trycycler/__main__.py
@@ -88,11 +88,18 @@ def cluster_subparser(subparsers):
     required_args.add_argument('-a', '--assemblies', type=str, required=True, nargs='+',
                                help='Input assemblies whose contigs will be clustered (multiple '
                                     'FASTA files)')
+    required_args.add_argument('-r', '--reads', type=str, required=True,
+                               help='Long reads (FASTQ format) used to generate the assemblies')
     required_args.add_argument('-o', '--out_dir', type=pathlib.Path, required=True,
                                help='Output directory')
 
     setting_args = group.add_argument_group('Settings')
-    setting_args.add_argument('-d', '--distance', type=float, default=0.01,
+    setting_args.add_argument('--min_contig_len', type=int, default=1000,
+                              help='Exclude contigs shorter than this many base pairs in length')
+    setting_args.add_argument('--min_contig_depth', type=float, default=0.1,
+                              help='Exclude contigs with less than this much read depth relative '
+                                   'to the mean read depth')
+    setting_args.add_argument('--distance', type=float, default=0.01,
                               help='Mash distance complete-linkage clustering threshold')
     setting_args.add_argument('-t', '--threads', type=int, default=get_default_thread_count(),
                               help='Number of threads to use for alignment')

diff --git a/trycycler/align.py b/trycycler/align.py
@@ -17,7 +17,7 @@
 from .circularisation import circularise
 from .initial_check import initial_sanity_check
 from .log import log, section_header, explanation, dim, red
-from .misc import get_sequence_file_type, load_fasta, get_fastq_stats
+from .misc import get_sequence_file_type, load_fasta, check_input_reads
 from .pairwise import get_pairwise_alignments
 from .software import check_minimap2
 from .starting_seq import get_starting_seq, rotate_to_starting_seq
@@ -65,17 +65,6 @@ def load_contig_sequences(cluster_dir):
     return contig_seqs, fasta_names
 
 
-def check_input_reads(filename):
-    read_type = get_sequence_file_type(filename)
-    if read_type != 'FASTQ':
-        sys.exit(f'\nError: input reads ({filename}) are not in FASTQ format')
-    log(f'Input reads: {filename}')
-    read_count, total_size, n50 = get_fastq_stats(filename)
-    log(f'  {read_count:,} reads ({total_size:,} bp)')
-    log(f'  N50 = {n50:,} bp')
-    log()
-
-
 def check_input_contigs(cluster_dir):
     filenames = get_contigs_from_cluster_dir(cluster_dir)
     if len(filenames) < 2:

diff --git a/trycycler/alignment.py b/trycycler/alignment.py
@@ -95,6 +95,19 @@ def align_reads_to_seq(reads, seq, threads, include_cigar=True):
     return alignments
 
 
+def align_reads_to_fasta(reads, fasta, threads, include_cigar=True):
+    minimap2_command = ['minimap2']
+    if include_cigar:
+        minimap2_command += ['--eqx', '-c']
+    minimap2_command += ['-x', 'map-ont', '-t', str(threads), str(fasta), str(reads)]
+    with open(os.devnull, 'w') as dev_null:
+        out = subprocess.check_output(minimap2_command, stderr=dev_null)
+    out = out.decode()
+    alignment_lines = out.splitlines()
+    alignments = [Alignment(x) for x in alignment_lines]
+    return alignments
+
+
 def get_best_alignment_per_read(alignments):
     alignments_per_read = collections.defaultdict(list)
     for a in alignments:

diff --git a/trycycler/cluster.py b/trycycler/cluster.py
@@ -22,20 +22,27 @@
 import scipy.spatial.distance as ssd
 import numpy as np
 
-from .log import log, section_header, explanation
+from .alignment import align_reads_to_fasta, get_best_alignment_per_read
+from .log import log, section_header, explanation, red
 from .mash import get_mash_dist_matrix
-from .misc import get_sequence_file_type, load_fasta
+from .misc import get_sequence_file_type, load_fasta, check_input_reads
 from .software import check_mash
 
 
 def cluster(args):
     welcome_message()
-    check_inputs_and_requirements(args)
+    assembly_lengths = check_inputs_and_requirements(args)
     seqs, fasta_names = load_assembly_sequences(args.assemblies)
     seq_names = list(seqs.keys())
+    depths, depth_filter = get_contig_depths(args.assemblies, seqs, seq_names, fasta_names,
+                                             args.reads, args.threads, assembly_lengths,
+                                             args.min_contig_depth)
+    seq_names = filter_contigs(args.assemblies, fasta_names, seq_names, seqs, args.min_contig_len,
+                               args.min_contig_depth, depth_filter)
     matrix = distance_matrix(seqs, seq_names, args.distance)
-    cluster_numbers = complete_linkage(seqs, matrix, args.distance, args.out_dir)
-    build_tree(seq_names, seqs, matrix, args.out_dir, cluster_numbers)
+    cluster_numbers = complete_linkage(seqs, seq_names, depths, matrix, args.distance, args.out_dir)
+    build_tree(seq_names, seqs, depths, matrix, args.out_dir, cluster_numbers)
+    finished_message()
 
 
 def welcome_message():
@@ -44,16 +51,27 @@ def welcome_message():
                 'assemblies (e.g. from different assemblers) into highly-similar groups.')
 
 
+def finished_message():
+    section_header('Finished!')
+    explanation('Now you must decide which clusters are good (i.e. contain well-assembled contigs '
+                'for replicons in the genome) and which are bad (i.e. contain incomplete or '
+                'spurious contigs). You can then delete the directories corresponding to the bad '
+                'clusters and then proceed to the next step in the pipeline: trycycler align.')
+
+
 def check_inputs_and_requirements(args):
-    check_input_assemblies(args.assemblies)
+    assembly_lengths = check_input_assemblies(args.assemblies)
+    check_input_reads(args.reads)
     check_output_directory(args.out_dir)
     check_required_software()
+    return assembly_lengths
 
 
 def check_input_assemblies(filenames):
     if len(filenames) < 2:
         sys.exit('\nError: two or more input assemblies are required')
-    log(f'Input assemblies:')
+    log_lines = []
+    total_lengths = {}
     for i, f in enumerate(filenames):
         contig_type = get_sequence_file_type(f)
         if contig_type != 'FASTA':
@@ -66,17 +84,29 @@ def check_input_assemblies(filenames):
             contig_names.add(contig_name)
         contig_count = len(seqs)
         total_length = sum(len(s[1]) for s in seqs)
-        noun = 'contig' if contig_count == 1 else 'contigs'
         letter = string.ascii_uppercase[i]
-        log(f'  {letter}: {f} ({contig_count} {noun}, {total_length:,} bp)')
+        total_lengths[letter] = total_length
+        log_lines.append((letter, f, contig_count, total_length))
+
+    longest_filename = max(len(f) for _, f, _, _ in log_lines)
+    longest_length = max(len(f'{l:,}') for _, _, _, l in log_lines)
+    longest_contigs = max(len(f'{c:,}') for _, _, c, _ in log_lines)
+    log(f'Input assemblies:')
+    for letter, f, contig_count, total_length in log_lines:
+        noun = 'contig' if contig_count == 1 else 'contigs'
+        padded_f = f.ljust(longest_filename)
+        padded_length = f'{total_length:,}'.rjust(longest_length)
+        padded_contigs = f'{contig_count:,}'.rjust(longest_contigs)
+        log(f'  {letter}: {padded_f} ({padded_length} bp, {padded_contigs} {noun})')
     log()
+    return total_lengths
 
 
 def check_output_directory(directory):
     if directory.is_file():
-        sys.exit(f'\nError: output directory ({directory}) already exists as a file')
+        sys.exit(f'Error: output directory ({directory}) already exists as a file')
     if directory.is_dir():
-        sys.exit(f'\nError: output directory ({directory}) already exists')
+        sys.exit(f'Error: output directory ({directory}) already exists')
     else:
         log(f'Creating output directory: {directory}')
         directory.mkdir(parents=True)
@@ -99,40 +129,125 @@ def load_assembly_sequences(filenames):
         for name, seq in seqs:
             if len(seq) < 32:  # very short sequences won't work in Mash
                 continue
-            name = name.replace('#', '_')  # hashes in names can sometimes cause downstream problems
+            name = name.replace('#', '_')  # hashes in names can cause downstream problems
+            # TODO: make some tests with hashes in the names: might cause problems with depths
             full_name = f'{letter}_{name}'
             assembly_seqs[full_name] = seq
     return assembly_seqs, fasta_names
 
 
+def get_contig_depths(assembly_filenames, seqs, seq_names, fasta_names, read_filename, threads,
+                      assembly_lengths, min_contig_depth):
+    section_header('Getting contig depths')
+    explanation('Trycycler now aligns the reads to each of the assemblies to assign a read depth '
+                'value to each of the contigs.')
+
+    name_to_letter = {v: k for k, v in fasta_names.items()}
+    depths = {n: 0.0 for n in seq_names}
+    depth_filter = {}
+    for assembly_filename in assembly_filenames:
+        letter = name_to_letter[assembly_filename]
+        log(f'{letter} ({assembly_filename}):')
+        alignments = align_reads_to_fasta(read_filename, assembly_filename, threads)
+        alignments = get_best_alignment_per_read(alignments)
+        log(f'  {len(alignments):,} alignments', end='')
+        total_alignment_length = sum((a.ref_end - a.ref_start) for a in alignments)
+        mean_depth = total_alignment_length / assembly_lengths[letter]
+        threshold_depth = mean_depth * min_contig_depth
+        log(f', mean depth = {mean_depth:.1f}x')
+        for a in alignments:
+            seq_name = f'{letter}_{a.ref_name}'
+            depth_contribution = (a.ref_end - a.ref_start) / a.ref_length
+            assert depth_contribution > 0.0
+            depths[seq_name] += depth_contribution
+        log_lines = []
+        for seq_name, depth in depths.items():
+            if seq_name.startswith(letter + '_'):
+                seq_length = len(seqs[seq_name])
+                log_lines.append((seq_name, seq_length, depth))
+                depth_filter[seq_name] = (depth >= threshold_depth)
+
+        longest_seq_name = max(len(n) + 1 for n, _, _ in log_lines)
+        longest_length = max(len(f'{l:,}') for _, l, _, in log_lines)
+        longest_depth = max(len(f'{d:.1f}') for _, _, d in log_lines)
+        for seq_name, seq_length, depth in log_lines:
+            padded_seq_name = f'{seq_name}:'.ljust(longest_seq_name)
+            padded_length = f'{seq_length:,}'.rjust(longest_length)
+            padded_depth = f'{depth:.1f}'.rjust(longest_depth)
+            output_line = f'  {padded_seq_name} {padded_length} bp, {padded_depth}x'
+            if depth_filter[seq_name]:
+                log(output_line)
+            else:
+                log(red(output_line))
+        log()
+
+    return depths, depth_filter
+
+
+def filter_contigs(assembly_filenames, fasta_names, seq_names, seqs, min_contig_len,
+                   min_contig_depth, depth_filter):
+    section_header('Filtering contigs')
+    explanation(f'Contigs are now filtered out if they are too short (<{min_contig_len:,} bp) or '
+                f'too low depth (<{min_contig_depth:.1f} times the mean depth).')
+
+    final_seq_names = []
+    name_to_letter = {v: k for k, v in fasta_names.items()}
+    for assembly_filename in assembly_filenames:
+        pass_names, fail_length_names, fail_depth_names = [], [], []
+        letter = name_to_letter[assembly_filename]
+        log(f'{letter} ({assembly_filename}):')
+        for seq_name in seq_names:
+            if seq_name.startswith(letter + '_'):
+                if len(seqs[seq_name]) < min_contig_len:
+                    fail_length_names.append(seq_name)
+                elif not depth_filter[seq_name]:
+                    fail_depth_names.append(seq_name)
+                else:
+                    pass_names.append(seq_name)
+        if not fail_depth_names and not fail_length_names:
+            log('  all contigs passed filtering')
+        else:
+            if pass_names:
+                log('  passed filtering:         ' + ', '.join(pass_names))
+            if fail_length_names:
+                log('  removed for short length: ' + ', '.join(fail_length_names))
+            if fail_depth_names:
+                log('  removed for low depth:    ' + ', '.join(fail_depth_names))
+        final_seq_names += pass_names
+        log()
+
+    return final_seq_names
+
+
 def distance_matrix(seqs, seq_names, distance):
-    section_header('Distance matrix')
+    section_header('Building distance matrix')
     explanation('Mash is used to build a distance matrix of all contigs in the assemblies.')
-    mash_matrix = get_mash_dist_matrix(seq_names, seqs, distance)
+    mash_matrix = get_mash_dist_matrix(seq_names, seqs, distance, indent=False)
     return mash_matrix
 
 
-def save_matrix_to_phylip(seq_names, seqs, matrix, out_dir, cluster_numbers):
+def save_matrix_to_phylip(seq_names, seqs, depths, matrix, out_dir, cluster_numbers):
     phylip = out_dir / 'contigs.phylip'
     log(f'saving distance matrix: {phylip}')
     with open(phylip, 'wt') as f:
         f.write(str(len(seq_names)))
         f.write('\n')
         for a in seq_names:
             seq_len = len(seqs[a])
-            f.write(f'cluster_{cluster_numbers[a]}_{a}_{seq_len}_bp')
+            depth = depths[a]
+            f.write(f'cluster_{cluster_numbers[a]}_{a}_{seq_len}_bp_{depth:.1f}x')
             for b in seq_names:
                 f.write('\t')
                 f.write(str(matrix[(a, b)]))
             f.write('\n')
     return phylip
 
 
-def build_tree(seq_names, seqs, matrix, out_dir, cluster_numbers):
-    section_header('Build FastME tree')
+def build_tree(seq_names, seqs, depths, matrix, out_dir, cluster_numbers):
+    section_header('Building FastME tree')
     explanation('R (ape and phangorn) are used to build a FastME tree of the relationships '
                 'between the contigs.')
-    phylip = save_matrix_to_phylip(seq_names, seqs, matrix, out_dir, cluster_numbers)
+    phylip = save_matrix_to_phylip(seq_names, seqs, depths, matrix, out_dir, cluster_numbers)
     with tempfile.TemporaryDirectory() as temp_dir:
         temp_dir = pathlib.Path(temp_dir)
         tree_script, newick = create_tree_script(temp_dir, phylip)
@@ -157,10 +272,10 @@ def create_tree_script(temp_dir, phylip):
     return str(tree_script), pathlib.Path(newick).relative_to(pathlib.Path.cwd())
 
 
-def complete_linkage(seqs, distances, threshold, out_dir):
+def complete_linkage(seqs, seq_names, depths, distances, threshold, out_dir):
     section_header('Clustering')
-    explanation('The contigs are now split into clusters using a complete-linkage approach.')
-    seq_names = list(seqs.keys())
+    explanation('The contigs are now split into clusters using a complete-linkage hierarchical '
+                'approach.')
 
     # Build the distance matrix as a numpy array.
     matrix = []
@@ -183,18 +298,33 @@ def complete_linkage(seqs, distances, threshold, out_dir):
     cluster_names = sorted(clusters.keys(), reverse=True,
                            key=lambda c: sum(len(s[1]) for s in clusters[c]))
 
+    cluster_dir = out_dir / f'cluster_{0:03d}' / '1_contigs'
+    longest_fasta = max(len(str(cluster_dir / f'{name}.fasta')) + 1 for name in seq_names)
+    longest_length = max(len(f'{len(s):,}') for s in seqs.values())
+    longest_depth = max(len(f'{d:.1f}') for d in depths.values())
+
     cluster_numbers = {}
     for i, cluster_name in enumerate(cluster_names):
         cluster_num = i + 1
         cluster_dir = out_dir / f'cluster_{cluster_num:03d}' / '1_contigs'
         cluster_dir.mkdir(parents=True)
         log(f'{cluster_dir}:')
+        log_lines = []
         for name, seq in clusters[cluster_name]:
             cluster_numbers[name] = cluster_num
             seq_fasta = cluster_dir / f'{name}.fasta'
+            seq_length = len(seq)
+            seq_depth = depths[name]
             with open(seq_fasta, 'wt') as f:
                 f.write(f'>{name}\n')
                 f.write(f'{seq}\n')
-            log(f'  {seq_fasta} ({len(seq):,} bp)')
+            log_lines.append((seq_fasta, seq_length, seq_depth))
+
+        for seq_fasta, seq_length, seq_depth in log_lines:
+            padded_fasta = f'{seq_fasta}:'.ljust(longest_fasta)
+            padded_length = f'{seq_length:,}'.rjust(longest_length)
+            padded_depth = f'{seq_depth:.1f}'.rjust(longest_depth)
+            log(f'  {padded_fasta} {padded_length} bp, {padded_depth}x')
         log()
+
     return cluster_numbers