-
Notifications
You must be signed in to change notification settings - Fork 28
/
Copy pathpartition.py
152 lines (126 loc) · 6.41 KB
/
partition.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""
Copyright 2020 Ryan Wick ([email protected])
https://github.com/rrwick/Trycycler
This file is part of Trycycler. Trycycler is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by the Free Software Foundation,
either version 3 of the License, or (at your option) any later version. Trycycler is distributed
in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details. You should have received a copy of the GNU General Public License along with Trycycler.
If not, see <http://www.gnu.org/licenses/>.
"""
import collections
import sys
from .alignment import align_reads_to_seq
from .intrange import IntRange
from .log import log, section_header, explanation
from .misc import get_sequence_file_type, load_fasta, iterate_fastq, check_input_reads
from .software import check_minimap2
def partition(args):
welcome_message()
check_inputs_and_requirements(args)
best_clusters = align_reads(args.cluster_dirs, args.reads, args.threads, args.min_aligned_len,
args.min_read_cov)
save_reads_per_cluster(args.cluster_dirs, args.reads, best_clusters)
def welcome_message():
section_header('Starting Trycycler partitioning')
explanation('Trycycler partition is a tool for partitioning reads by cluster. I.e. each read '
'will be assigned to one cluster and saved in a file for that cluster.')
def check_inputs_and_requirements(args):
check_input_reads(args.reads)
check_input_clusters(args.cluster_dirs)
check_required_software()
def check_input_clusters(cluster_dirs):
if len(cluster_dirs) < 1:
sys.exit('\nError: one or more input cluster directories are required')
log(f'Input clusters:')
for d in cluster_dirs:
contigs = sorted(d.glob('2_all_seqs.fasta'))
if not contigs:
sys.exit(f'\nError: there is not a 2_all_seqs.fasta file in {d}')
assert len(contigs) == 1
f = contigs[0]
contig_type = get_sequence_file_type(f)
if contig_type != 'FASTA':
sys.exit(f'\nError: input contig file ({f}) is not in FASTA format')
seqs = load_fasta(f)
if len(seqs) == 0:
sys.exit(f'\nError: contig file ({f}) contains no sequences')
noun = 'contig' if len(seqs) == 1 else 'contigs'
mean_len = sum(len(s[1]) for s in seqs) // len(seqs)
log(f' {f}: {len(seqs)} {noun}, mean length = {mean_len:,} bp')
log()
def check_required_software():
log('Checking required software:')
check_minimap2()
log()
def align_reads(cluster_dirs, reads, threads, min_aligned_len, min_read_cov):
section_header('Aligning reads to each contig')
explanation('The reads are independently aligned to each of the contigs and Trycycler will '
'remember the single best alignment for each read.')
best_clusters = {}
best_matching_bases = collections.defaultdict(int)
for d in cluster_dirs:
contigs = sorted(d.glob('2_all_seqs.fasta'))
assert len(contigs) == 1
seqs = load_fasta(contigs[0])
seq_labels = [f'{name} ({len(seq):,} bp): ' for name, seq in seqs]
longest_label = max(len(x) for x in seq_labels)
for name, seq in seqs:
seq_len = len(seq)
log(f'{name} ({len(seq):,} bp): '.rjust(longest_label), end='')
doubled_seq = seq + seq
alignments = align_reads_to_seq(reads, doubled_seq, threads, include_cigar=False)
# Toss out alignments entirely in the second half of the doubled sequence.
alignments = [a for a in alignments if a.ref_start < seq_len]
log(f'{len(alignments):,} alignments')
# Group alignments by read name.
alignments_by_read = collections.defaultdict(list)
for a in alignments:
alignments_by_read[a.query_name].append(a)
# Only consider reads for which enough sequence aligned and a large enough fraction
# of the read aligned. Note that it's okay for this to come in multiple alignments,
# so a read which glitches out in the middle can still pass.
for read_name, read_alignments in alignments_by_read.items():
r = IntRange()
read_length = None
for a in read_alignments:
read_length = a.query_length
r.add_range(a.query_start, a.query_end)
total_len = r.total_length()
read_coverage = 100.0 * total_len / read_length
if total_len < min_aligned_len or read_coverage < min_read_cov:
continue
# If the read passed the checks, then we remember its best single alignment.
for a in read_alignments:
if a.matching_bases > best_matching_bases[read_name]:
best_clusters[read_name] = d
best_matching_bases[read_name] = a.matching_bases
log()
return best_clusters
def save_reads_per_cluster(cluster_dirs, reads, best_clusters):
section_header('Saving reads to cluster directories')
explanation('Reads are now saved in a file in the cluster directory to which they aligned '
'best.')
for d in cluster_dirs:
save_reads_one_cluster(d, reads, best_clusters)
def save_reads_one_cluster(cluster_dir, reads, best_clusters):
cluster_reads = cluster_dir / '4_reads.fastq'
log(f'{cluster_reads}:')
total_read_count, total_read_bases = 0, 0
cluster_read_count, cluster_read_bases = 0, 0
with open(cluster_reads, 'wt') as f:
for read_name, header, seq, qual in iterate_fastq(reads):
total_read_count += 1
total_read_bases += len(seq)
if read_name in best_clusters:
best_cluster = best_clusters[read_name]
if best_cluster == cluster_dir:
cluster_read_count += 1
cluster_read_bases += len(seq)
f.write(f'{header}\n{seq}\n+\n{qual}\n')
count_percent = 100 * cluster_read_count / total_read_count
bases_percent = 100 * cluster_read_bases / total_read_bases
log(f' {cluster_read_count:,} reads ({count_percent:.2f}%)')
log(f' {cluster_read_bases:,} bases ({bases_percent:.2f}%)')
log()