Skip to content

Commit

Permalink
Lots of enchancements to cluster command
Browse files Browse the repository at this point in the history
  • Loading branch information
rrwick committed May 5, 2020
1 parent da31f25 commit ff33db3
Show file tree
Hide file tree
Showing 8 changed files with 196 additions and 67 deletions.
9 changes: 8 additions & 1 deletion trycycler/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,18 @@ def cluster_subparser(subparsers):
required_args.add_argument('-a', '--assemblies', type=str, required=True, nargs='+',
help='Input assemblies whose contigs will be clustered (multiple '
'FASTA files)')
required_args.add_argument('-r', '--reads', type=str, required=True,
help='Long reads (FASTQ format) used to generate the assemblies')
required_args.add_argument('-o', '--out_dir', type=pathlib.Path, required=True,
help='Output directory')

setting_args = group.add_argument_group('Settings')
setting_args.add_argument('-d', '--distance', type=float, default=0.01,
setting_args.add_argument('--min_contig_len', type=int, default=1000,
help='Exclude contigs shorter than this many base pairs in length')
setting_args.add_argument('--min_contig_depth', type=float, default=0.1,
help='Exclude contigs with less than this much read depth relative '
'to the mean read depth')
setting_args.add_argument('--distance', type=float, default=0.01,
help='Mash distance complete-linkage clustering threshold')
setting_args.add_argument('-t', '--threads', type=int, default=get_default_thread_count(),
help='Number of threads to use for alignment')
Expand Down
13 changes: 1 addition & 12 deletions trycycler/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from .circularisation import circularise
from .initial_check import initial_sanity_check
from .log import log, section_header, explanation, dim, red
from .misc import get_sequence_file_type, load_fasta, get_fastq_stats
from .misc import get_sequence_file_type, load_fasta, check_input_reads
from .pairwise import get_pairwise_alignments
from .software import check_minimap2
from .starting_seq import get_starting_seq, rotate_to_starting_seq
Expand Down Expand Up @@ -65,17 +65,6 @@ def load_contig_sequences(cluster_dir):
return contig_seqs, fasta_names


def check_input_reads(filename):
read_type = get_sequence_file_type(filename)
if read_type != 'FASTQ':
sys.exit(f'\nError: input reads ({filename}) are not in FASTQ format')
log(f'Input reads: {filename}')
read_count, total_size, n50 = get_fastq_stats(filename)
log(f' {read_count:,} reads ({total_size:,} bp)')
log(f' N50 = {n50:,} bp')
log()


def check_input_contigs(cluster_dir):
filenames = get_contigs_from_cluster_dir(cluster_dir)
if len(filenames) < 2:
Expand Down
13 changes: 13 additions & 0 deletions trycycler/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,19 @@ def align_reads_to_seq(reads, seq, threads, include_cigar=True):
return alignments


def align_reads_to_fasta(reads, fasta, threads, include_cigar=True):
minimap2_command = ['minimap2']
if include_cigar:
minimap2_command += ['--eqx', '-c']
minimap2_command += ['-x', 'map-ont', '-t', str(threads), str(fasta), str(reads)]
with open(os.devnull, 'w') as dev_null:
out = subprocess.check_output(minimap2_command, stderr=dev_null)
out = out.decode()
alignment_lines = out.splitlines()
alignments = [Alignment(x) for x in alignment_lines]
return alignments


def get_best_alignment_per_read(alignments):
alignments_per_read = collections.defaultdict(list)
for a in alignments:
Expand Down
176 changes: 153 additions & 23 deletions trycycler/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,27 @@
import scipy.spatial.distance as ssd
import numpy as np

from .log import log, section_header, explanation
from .alignment import align_reads_to_fasta, get_best_alignment_per_read
from .log import log, section_header, explanation, red
from .mash import get_mash_dist_matrix
from .misc import get_sequence_file_type, load_fasta
from .misc import get_sequence_file_type, load_fasta, check_input_reads
from .software import check_mash


def cluster(args):
welcome_message()
check_inputs_and_requirements(args)
assembly_lengths = check_inputs_and_requirements(args)
seqs, fasta_names = load_assembly_sequences(args.assemblies)
seq_names = list(seqs.keys())
depths, depth_filter = get_contig_depths(args.assemblies, seqs, seq_names, fasta_names,
args.reads, args.threads, assembly_lengths,
args.min_contig_depth)
seq_names = filter_contigs(args.assemblies, fasta_names, seq_names, seqs, args.min_contig_len,
args.min_contig_depth, depth_filter)
matrix = distance_matrix(seqs, seq_names, args.distance)
cluster_numbers = complete_linkage(seqs, matrix, args.distance, args.out_dir)
build_tree(seq_names, seqs, matrix, args.out_dir, cluster_numbers)
cluster_numbers = complete_linkage(seqs, seq_names, depths, matrix, args.distance, args.out_dir)
build_tree(seq_names, seqs, depths, matrix, args.out_dir, cluster_numbers)
finished_message()


def welcome_message():
Expand All @@ -44,16 +51,27 @@ def welcome_message():
'assemblies (e.g. from different assemblers) into highly-similar groups.')


def finished_message():
section_header('Finished!')
explanation('Now you must decide which clusters are good (i.e. contain well-assembled contigs '
'for replicons in the genome) and which are bad (i.e. contain incomplete or '
'spurious contigs). You can then delete the directories corresponding to the bad '
'clusters and then proceed to the next step in the pipeline: trycycler align.')


def check_inputs_and_requirements(args):
check_input_assemblies(args.assemblies)
assembly_lengths = check_input_assemblies(args.assemblies)
check_input_reads(args.reads)
check_output_directory(args.out_dir)
check_required_software()
return assembly_lengths


def check_input_assemblies(filenames):
if len(filenames) < 2:
sys.exit('\nError: two or more input assemblies are required')
log(f'Input assemblies:')
log_lines = []
total_lengths = {}
for i, f in enumerate(filenames):
contig_type = get_sequence_file_type(f)
if contig_type != 'FASTA':
Expand All @@ -66,17 +84,29 @@ def check_input_assemblies(filenames):
contig_names.add(contig_name)
contig_count = len(seqs)
total_length = sum(len(s[1]) for s in seqs)
noun = 'contig' if contig_count == 1 else 'contigs'
letter = string.ascii_uppercase[i]
log(f' {letter}: {f} ({contig_count} {noun}, {total_length:,} bp)')
total_lengths[letter] = total_length
log_lines.append((letter, f, contig_count, total_length))

longest_filename = max(len(f) for _, f, _, _ in log_lines)
longest_length = max(len(f'{l:,}') for _, _, _, l in log_lines)
longest_contigs = max(len(f'{c:,}') for _, _, c, _ in log_lines)
log(f'Input assemblies:')
for letter, f, contig_count, total_length in log_lines:
noun = 'contig' if contig_count == 1 else 'contigs'
padded_f = f.ljust(longest_filename)
padded_length = f'{total_length:,}'.rjust(longest_length)
padded_contigs = f'{contig_count:,}'.rjust(longest_contigs)
log(f' {letter}: {padded_f} ({padded_length} bp, {padded_contigs} {noun})')
log()
return total_lengths


def check_output_directory(directory):
if directory.is_file():
sys.exit(f'\nError: output directory ({directory}) already exists as a file')
sys.exit(f'Error: output directory ({directory}) already exists as a file')
if directory.is_dir():
sys.exit(f'\nError: output directory ({directory}) already exists')
sys.exit(f'Error: output directory ({directory}) already exists')
else:
log(f'Creating output directory: {directory}')
directory.mkdir(parents=True)
Expand All @@ -99,40 +129,125 @@ def load_assembly_sequences(filenames):
for name, seq in seqs:
if len(seq) < 32: # very short sequences won't work in Mash
continue
name = name.replace('#', '_') # hashes in names can sometimes cause downstream problems
name = name.replace('#', '_') # hashes in names can cause downstream problems
# TODO: make some tests with hashes in the names: might cause problems with depths
full_name = f'{letter}_{name}'
assembly_seqs[full_name] = seq
return assembly_seqs, fasta_names


def get_contig_depths(assembly_filenames, seqs, seq_names, fasta_names, read_filename, threads,
assembly_lengths, min_contig_depth):
section_header('Getting contig depths')
explanation('Trycycler now aligns the reads to each of the assemblies to assign a read depth '
'value to each of the contigs.')

name_to_letter = {v: k for k, v in fasta_names.items()}
depths = {n: 0.0 for n in seq_names}
depth_filter = {}
for assembly_filename in assembly_filenames:
letter = name_to_letter[assembly_filename]
log(f'{letter} ({assembly_filename}):')
alignments = align_reads_to_fasta(read_filename, assembly_filename, threads)
alignments = get_best_alignment_per_read(alignments)
log(f' {len(alignments):,} alignments', end='')
total_alignment_length = sum((a.ref_end - a.ref_start) for a in alignments)
mean_depth = total_alignment_length / assembly_lengths[letter]
threshold_depth = mean_depth * min_contig_depth
log(f', mean depth = {mean_depth:.1f}x')
for a in alignments:
seq_name = f'{letter}_{a.ref_name}'
depth_contribution = (a.ref_end - a.ref_start) / a.ref_length
assert depth_contribution > 0.0
depths[seq_name] += depth_contribution
log_lines = []
for seq_name, depth in depths.items():
if seq_name.startswith(letter + '_'):
seq_length = len(seqs[seq_name])
log_lines.append((seq_name, seq_length, depth))
depth_filter[seq_name] = (depth >= threshold_depth)

longest_seq_name = max(len(n) + 1 for n, _, _ in log_lines)
longest_length = max(len(f'{l:,}') for _, l, _, in log_lines)
longest_depth = max(len(f'{d:.1f}') for _, _, d in log_lines)
for seq_name, seq_length, depth in log_lines:
padded_seq_name = f'{seq_name}:'.ljust(longest_seq_name)
padded_length = f'{seq_length:,}'.rjust(longest_length)
padded_depth = f'{depth:.1f}'.rjust(longest_depth)
output_line = f' {padded_seq_name} {padded_length} bp, {padded_depth}x'
if depth_filter[seq_name]:
log(output_line)
else:
log(red(output_line))
log()

return depths, depth_filter


def filter_contigs(assembly_filenames, fasta_names, seq_names, seqs, min_contig_len,
min_contig_depth, depth_filter):
section_header('Filtering contigs')
explanation(f'Contigs are now filtered out if they are too short (<{min_contig_len:,} bp) or '
f'too low depth (<{min_contig_depth:.1f} times the mean depth).')

final_seq_names = []
name_to_letter = {v: k for k, v in fasta_names.items()}
for assembly_filename in assembly_filenames:
pass_names, fail_length_names, fail_depth_names = [], [], []
letter = name_to_letter[assembly_filename]
log(f'{letter} ({assembly_filename}):')
for seq_name in seq_names:
if seq_name.startswith(letter + '_'):
if len(seqs[seq_name]) < min_contig_len:
fail_length_names.append(seq_name)
elif not depth_filter[seq_name]:
fail_depth_names.append(seq_name)
else:
pass_names.append(seq_name)
if not fail_depth_names and not fail_length_names:
log(' all contigs passed filtering')
else:
if pass_names:
log(' passed filtering: ' + ', '.join(pass_names))
if fail_length_names:
log(' removed for short length: ' + ', '.join(fail_length_names))
if fail_depth_names:
log(' removed for low depth: ' + ', '.join(fail_depth_names))
final_seq_names += pass_names
log()

return final_seq_names


def distance_matrix(seqs, seq_names, distance):
section_header('Distance matrix')
section_header('Building distance matrix')
explanation('Mash is used to build a distance matrix of all contigs in the assemblies.')
mash_matrix = get_mash_dist_matrix(seq_names, seqs, distance)
mash_matrix = get_mash_dist_matrix(seq_names, seqs, distance, indent=False)
return mash_matrix


def save_matrix_to_phylip(seq_names, seqs, matrix, out_dir, cluster_numbers):
def save_matrix_to_phylip(seq_names, seqs, depths, matrix, out_dir, cluster_numbers):
phylip = out_dir / 'contigs.phylip'
log(f'saving distance matrix: {phylip}')
with open(phylip, 'wt') as f:
f.write(str(len(seq_names)))
f.write('\n')
for a in seq_names:
seq_len = len(seqs[a])
f.write(f'cluster_{cluster_numbers[a]}_{a}_{seq_len}_bp')
depth = depths[a]
f.write(f'cluster_{cluster_numbers[a]}_{a}_{seq_len}_bp_{depth:.1f}x')
for b in seq_names:
f.write('\t')
f.write(str(matrix[(a, b)]))
f.write('\n')
return phylip


def build_tree(seq_names, seqs, matrix, out_dir, cluster_numbers):
section_header('Build FastME tree')
def build_tree(seq_names, seqs, depths, matrix, out_dir, cluster_numbers):
section_header('Building FastME tree')
explanation('R (ape and phangorn) are used to build a FastME tree of the relationships '
'between the contigs.')
phylip = save_matrix_to_phylip(seq_names, seqs, matrix, out_dir, cluster_numbers)
phylip = save_matrix_to_phylip(seq_names, seqs, depths, matrix, out_dir, cluster_numbers)
with tempfile.TemporaryDirectory() as temp_dir:
temp_dir = pathlib.Path(temp_dir)
tree_script, newick = create_tree_script(temp_dir, phylip)
Expand All @@ -157,10 +272,10 @@ def create_tree_script(temp_dir, phylip):
return str(tree_script), pathlib.Path(newick).relative_to(pathlib.Path.cwd())


def complete_linkage(seqs, distances, threshold, out_dir):
def complete_linkage(seqs, seq_names, depths, distances, threshold, out_dir):
section_header('Clustering')
explanation('The contigs are now split into clusters using a complete-linkage approach.')
seq_names = list(seqs.keys())
explanation('The contigs are now split into clusters using a complete-linkage hierarchical '
'approach.')

# Build the distance matrix as a numpy array.
matrix = []
Expand All @@ -183,18 +298,33 @@ def complete_linkage(seqs, distances, threshold, out_dir):
cluster_names = sorted(clusters.keys(), reverse=True,
key=lambda c: sum(len(s[1]) for s in clusters[c]))

cluster_dir = out_dir / f'cluster_{0:03d}' / '1_contigs'
longest_fasta = max(len(str(cluster_dir / f'{name}.fasta')) + 1 for name in seq_names)
longest_length = max(len(f'{len(s):,}') for s in seqs.values())
longest_depth = max(len(f'{d:.1f}') for d in depths.values())

cluster_numbers = {}
for i, cluster_name in enumerate(cluster_names):
cluster_num = i + 1
cluster_dir = out_dir / f'cluster_{cluster_num:03d}' / '1_contigs'
cluster_dir.mkdir(parents=True)
log(f'{cluster_dir}:')
log_lines = []
for name, seq in clusters[cluster_name]:
cluster_numbers[name] = cluster_num
seq_fasta = cluster_dir / f'{name}.fasta'
seq_length = len(seq)
seq_depth = depths[name]
with open(seq_fasta, 'wt') as f:
f.write(f'>{name}\n')
f.write(f'{seq}\n')
log(f' {seq_fasta} ({len(seq):,} bp)')
log_lines.append((seq_fasta, seq_length, seq_depth))

for seq_fasta, seq_length, seq_depth in log_lines:
padded_fasta = f'{seq_fasta}:'.ljust(longest_fasta)
padded_length = f'{seq_length:,}'.rjust(longest_length)
padded_depth = f'{seq_depth:.1f}'.rjust(longest_depth)
log(f' {padded_fasta} {padded_length} bp, {padded_depth}x')
log()

return cluster_numbers
Loading

0 comments on commit ff33db3

Please sign in to comment.