#!/usr/bin/env python
"""Main VaSeBuilder module."""
import sys
import io
import time
import random
import logging
import gzip
import os
from datetime import datetime
from collections import OrderedDict
import numpy as np
import pysam
# Import VaSe specific classes.
from sample_mapper import SampleMapper
from variant_context_file import VariantContextFile
from variant_context import VariantContext
from overlap_context import OverlapContext
class VaSeBuilder:
"""Method object with all main functionality.
The init also saves the identifier, date, and time of the current run.
Attributes
----------
creation_id : str
Unique (uuid) identifier to identify VaSeBuilder runs
creation_time : str
The date and time of creation to identify VaSeBuilder runs
vaselogger : Logger
VaSeBuilder logger to log VaSeBuilder activity
"""
def __init__(self, vaseid):
self.vaselogger = logging.getLogger("VaSe_Logger")
self.creation_id = str(vaseid)
self.creation_time = datetime.now()
self.vaselogger.info(f"VaSeBuilder: {self.creation_id} ; {self.creation_time}")
# VariantContextFile that saves the acceptor, donor, and variant contexts
# with their associated data.
self.contexts = VariantContextFile()
# Dictionary used for debug messages.
self.debug_dict = {"vw": "Establishing search window",
"dr": "Gathering donor variant reads",
"dc": "Determining donor context",
"ar": "Gathering acceptor variant reads",
"ac": "Determining acceptor context",
"cc": "Determining combined context",
"cdr": "Gathering combined context donor reads",
"car": "Gathering combined context acceptor reads",
"done": "Variant complete. Processing"}
# Method to print debug messages.
def debug_msg(self, step, variant_id, starting_time=None):
"""Print preformed debug message for a given step.
If a starting_time parameter is set, the massage output will include
the elapsed time from starting_time to the the time the
message is printed.
Parameters
----------
        step : str
            Key of the processing step in self.debug_dict
        variant_id : str
            Identifier of the variant being processed
        starting_time : float, optional
            Start time (as returned by time.time()) used to report elapsed
            time. The default is None.
Returns
-------
None.
"""
process = self.debug_dict[step]
for_var = f"for variant {variant_id}"
if starting_time:
took = f" took {time.time() - starting_time} seconds."
else:
took = "."
self.vaselogger.debug(f"{process} {for_var}{took}")
# ===METHODS TO PRODUCE VARIANT CONTEXTS===================================
def get_sample_vcf_variants_2(self, variant_fileloc, filterlist=None):
"""Read and return read variants from a variant file.
Parameters
----------
variant_fileloc : str
Path to variant file to read
filterlist : list of tuple
Variants to include
Returns
-------
sample_variant_list : list of pysam.VariantRecord
            Variants read from the variant file
"""
sample_variant_list = []
try:
variant_file = pysam.VariantFile(variant_fileloc, "r")
vcfvars = list(variant_file.fetch())
except IOError:
self.vaselogger.warning(f"Could not open variant file {variant_fileloc}")
return sample_variant_list
if filterlist is None:
sample_variant_list = [(var, None) for var in vcfvars]
return sample_variant_list
for var in vcfvars:
variant_to_add = self.filter_vcf_variant(var, filterlist)
if variant_to_add is not None:
sample_variant_list.append(variant_to_add)
return sample_variant_list
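    # Illustrative sketch (assumed file path, not part of the original
    # module): without a filter list every record is returned paired with
    # None; with a filter list, only matching records are kept, paired with
    # any prioritization settings found in the filter.
    #
    #     variants = builder.get_sample_vcf_variants_2("sample1.vcf.gz")
    #     for record, priority in variants:
    #         print(record.chrom, record.pos, priority)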
def refetch_donor_variants(self, samples, varconfile):
"""Refetch VCF variants from sample VCFs using variant context as a filter.
Used in BuildSpikeIns tool while running with a pre-made variant
context file. Variant context files only store alignment read IDs and
basic variant info, so the original full records must be retrieved
from their original files before constructing spike-ins. Only variants
matching those in the variant context file will be retrieved from each
sample. Variants retrieved will overwrite the placeholder filter
variants located in the variant contexts read from file.
Parameters
----------
samples : list of sample_mapper.Sample objects.
varconfile : VariantContextFile
"""
all_varcons = varconfile.get_variant_contexts_by_sampleid()
for sample in samples:
sample_varcons = all_varcons[sample.hash_id]
for sample_varcon in sample_varcons:
varcon_variants = self.get_sample_vcf_variants_2(sample.vcf, sample_varcon.variants)
sample_varcon.variants = [var[0] for var in varcon_variants]
def rebuild(self, samples, varconfile, reference):
"""Refetch reads and variants for variant contexts."""
all_varcons = varconfile.get_variant_contexts_by_sampleid()
viables = [sample for sample in samples
if sample.hash_id in all_varcons]
self.refetch_donor_reads(viables, varconfile, reference)
self.refetch_donor_variants(viables, varconfile)
@staticmethod
def filter_vcf_variant(vcfvariant, filtervariantlist):
"""Check if a sample variant is in the filter list and set prioritization.
Compares variant sample, chromosome, position, and alleles. Alleles
are only checked for at least one matching allele in both ref and alt,
not exact matches for all alleles. Prioritization settings are added if
present in the filter. If multiple matching variants are found, the
highest prioritization settings are used if present.
Parameters
----------
vcfvariant : pysam.VariantRecord
Sample variant to check
filtervariantlist : list of InclusionVariant objects
List of variants to include and their prioritizations
Returns
-------
(vcfvariant, [Filter...])
vcfvariant and its prioritization Filter objects, or None if none found
"""
# Check if chrom and pos match with an inclusion variant.
matches = [filtervar for filtervar in filtervariantlist
if filtervar.chrom == vcfvariant.chrom
and filtervar.pos == vcfvariant.pos]
# Check if alleles match with matched position variants.
matches = [filtervar for filtervar in matches
if set(vcfvariant.ref.split(",")) & set(filtervar.ref.split(","))
and set(vcfvariant.alts) & set(filtervar.alts)]
# Return None if no matches found.
if not matches:
return None
# Get existing prioritization filters for all matches.
match_priorities = [x.priorities for x in matches
if x.priorities is not None]
# Set highest prioritization filter, or None if None.
if not match_priorities:
return (vcfvariant, None)
return (vcfvariant, max(match_priorities))
@staticmethod
def determine_variant_type(vcfvariantstart, vcfvariantstop):
"""Determine and return the variant type.
Determination of the variant type is based on the distance between
the right and left most genomic position of the variant reference
and alternative allele(s).
Parameters
----------
vcfvariantstart : int
Leftmost genomic position of the variant
vcfvariantstop : int
Rightmost genomic position of the variant
Returns
-------
str
Type of variant (snp/indel)
"""
if (vcfvariantstop - vcfvariantstart) <= 1:
return "snp"
return "indel"
def determine_read_search_window(self, varianttype, vcfvariant):
"""Determine and return the search window for fetching reads.
        The determined search window depends on the provided variant type.
        SNPs return a search window of SNP position -1 and +1. Indels return
        a search window based on the indel's leftmost position and length.
        If the variant type is neither SNP nor indel, the search window
        returned will be [-1, -1].
Parameters
----------
varianttype : str
Variant type to determine search window for
vcfvariant : VcfVariant
Variant to determine search window with
Returns
-------
list of int
Search window start and stop, -1 and -1 if variant type is invalid
"""
if varianttype == "snp":
return [vcfvariant.get_variant_pos() - 1, vcfvariant.get_variant_pos() + 1]
if varianttype == "indel":
return self.determine_indel_read_range(vcfvariant.get_variant_pos(),
vcfvariant.get_variant_ref_allele(),
vcfvariant.get_variant_alt_alleles())
return [-1, -1]
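    # Hedged sketch (assuming a VcfVariant whose get_variant_pos() is 100):
    #
    #     builder.determine_read_search_window("snp", variant)
    #     # -> [99, 101]
    #     builder.determine_read_search_window("other", variant)
    #     # -> [-1, -1]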
# Returns the search start and stop to use for searching BAM reads
# overlapping with the range of the indel.
@staticmethod
def determine_indel_read_range(variantpos, variantref, variantalts):
"""Determine and return the search start and stop to use for an indel.
Parameters
----------
variantpos : int
Leftmost genomic position of the variant
variantref : str
Variant reference allele(s)
variantalts : tuple
Variant alternative allele(s)
Returns
-------
list of int
Search window start and stop
"""
searchstart = variantpos
searchstop = variantpos + max(
max([len(x) for x in variantref.split(",")]),
max([len(x) for x in variantalts])
)
return [searchstart, searchstop]
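    # Worked example: for a deletion at position 100 with ref "ATTT" and
    # alts ("A",), the longest allele is 4 bases, so the search window is
    # [100, 104]; reads overlapping any base of the indel are then fetched.
    #
    #     VaSeBuilder.determine_indel_read_range(100, "ATTT", ("A",))
    #     # -> [100, 104]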
def get_variant_reads(self, variantchrom, variantstart, variantend, bamfile):
"""Fetch and return reads overlapping with a specified variant.
First reads overlapping directly with the variant position are
fetched. Then the read mates are fetched using the RNEXT and PNEXT
values of each read. Lastly, it is ensured that each read only
occurs once.
Parameters
----------
        variantchrom : str
            Chromosome name to fetch reads from
        variantstart : int
            Leftmost genomic position to use for fetching
        variantend : int
            Rightmost genomic position to use for fetching
        bamfile : pysam.AlignmentFile
            Already opened pysam AlignmentFile
Returns
-------
variantreads : list of pysam.AlignedSegment
Fetched reads and their read mates
"""
hardclipped_read_num = 0
duplicate_read_num = 0
secondary_read_num = 0
read_objects = []
variantreads = []
clipped_reads = []
list_r1 = []
list_r2 = []
for vread in bamfile.fetch(variantchrom, variantstart-1, variantend+1):
if vread.is_duplicate:
duplicate_read_num += 1
if vread.is_secondary:
secondary_read_num += 1
if self.read_is_hard_clipped(vread):
hardclipped_read_num += 1
clipped_reads.append(vread)
continue
read_objects.append(vread)
self.vaselogger.debug(f"Fetched {hardclipped_read_num} reads with hardclipped bases")
for clipped in clipped_reads:
read_objects.append(self.fetch_primary_from_secondary(clipped, bamfile))
for vread in read_objects:
if vread.is_read1:
list_r1.append(vread)
elif vread.is_read2:
list_r2.append(vread)
list_r1_ids = [x.query_name for x in list_r1]
list_r2_ids = [x.query_name for x in list_r2]
for r1 in list_r1:
if r1.query_name not in list_r2_ids:
list_r2.append(self.fetch_mate_read(r1.query_name, r1.next_reference_name,
r1.next_reference_start,
self.get_read_pair_num(r1), bamfile))
for r2 in list_r2:
if r2.query_name not in list_r1_ids:
list_r1.append(self.fetch_mate_read(r2.query_name, r2.next_reference_name,
r2.next_reference_start,
self.get_read_pair_num(r2), bamfile))
variantreads = list_r1 + list_r2
variantreads = self.uniqify_variant_reads(variantreads)
return variantreads
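    # Usage sketch (hypothetical file name; requires an indexed BAM):
    #
    #     import pysam
    #     bam = pysam.AlignmentFile("sample1.bam", "rb")
    #     reads = builder.get_variant_reads("21", 9411327, 9411328, bam)
    #     # 'reads' holds each overlapping read and its mate exactly once.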
@staticmethod
def fetch_primary_from_secondary(secondary_read, bamfile):
"""Fetch the primary alignment of a read based on the position recorded in its SA tag.
The SA tag is read from a provided Pysam read object. Then, reads
at the recorded alignment position are fetched. The first
non-secondary alignment with a matching read ID and pair number is
returned.
Parameters
----------
secondary_read : pysam.AlignedSegment
Secondary alignment whose primary alignment will be located
bamfile : pysam.AlignmentFile
Already opened pysam AlignmentFile
Returns
-------
        primary_read : pysam.AlignedSegment or None
            Primary alignment with the same read ID and pair number as the
            input secondary alignment, or None if no primary alignment is
            found at the SA-tag position
        """
        # Initialize to None so a missing primary alignment does not raise
        # an UnboundLocalError at the return statement.
        primary_read = None
        primary_locus = secondary_read.get_tag("SA").split(",")[0:2]
        for fetched in bamfile.fetch(primary_locus[0],
                                     int(primary_locus[1]) - 1,
                                     int(primary_locus[1])):
            if ((fetched.query_name == secondary_read.query_name)
                    and (fetched.is_read1 == secondary_read.is_read1)
                    and not fetched.is_secondary):
                primary_read = fetched
                break
        return primary_read
@staticmethod
def uniqify_variant_reads(variantreads):
"""Ensure each read only occurs once and return the updated set.
Parameters
----------
variantreads : list of pysam.AlignedSegment
Sets of reads to process
Returns
-------
unique_variantreads : list of pysam.AlignedSegment
Set of reads with each read occurring only once
"""
unique_variantreads = []
checklist = []
for fetched in variantreads:
readpn = "2"
if fetched.is_read1:
readpn = "1"
id_pair = (fetched.query_name, readpn)
if id_pair not in checklist:
unique_variantreads.append(fetched)
checklist.append(id_pair)
return unique_variantreads
def fetch_mate_read(self, readid, rnext, pnext, pair_num, bamfile):
"""Fetch and return the mate read of a specified read.
The mate read is fetched from an opened alignment file using the
RNEXT and PNEXT value of the read. Of the fetched reads, only the
read with the same identifier is returned.
Parameters
----------
        readid : str
            Identifier of the read to search the mate for
        rnext : str
            Chromosome name the mate read is located on
        pnext : int
            Leftmost genomic position of the mate read
        pair_num : str
            Read pair number ('1' or '2') of the read itself, not the mate
        bamfile : pysam.AlignmentFile
            Already opened pysam alignment file
Returns
-------
pysam.AlignedSegment or None
The mate read if found, None if not
"""
for bamread in bamfile.fetch(rnext, pnext, pnext + 1):
if (bamread.query_name == readid
and bamread.reference_start == pnext
and self.get_read_pair_num(bamread) != pair_num):
return bamread
return None
def determine_context(self, contextreads, contextorigin, contextchr):
"""Determine and return an acceptor/donor context.
        The acceptor/donor context is determined from a set of reads
        overlapping with the variant, including their read mates. The
        leftmost and rightmost genomic positions of all reads are collected
        and filtered for outliers. The context start and end are then set to
        the minimum leftmost and maximum rightmost positions, respectively.
Parameters
----------
contextreads : list of pysam.AlignedSegment
Reads that form the context
contextorigin : int
Variant genomic position the context will be based on
contextchr : str
Chromosome name the context is located on
Returns
-------
list of str and int
Essential context data (chromosome, variant pos, start, end)
"""
# Check whether there are reads to determine the context for.
if not contextreads:
return []
# Get read start and stop position, while filtering out read mates
# that map to different chr.
starts = [conread.reference_start
for conread in contextreads
if conread.reference_name == contextchr]
stops = [conread.reference_end
for conread in contextreads
if conread.reference_name == contextchr and conread.reference_end is not None]
# Number of mates filtered for mapping to different chr.
num_diff_chr = len(contextreads) - len(starts)
# Filter outlier read start/stops.
filtered_starts = self.filter_outliers(starts)
filtered_stops = self.filter_outliers(stops)
# Number of filtered outlier positions.
num_filtered = (len(starts)
+ len(stops)
- len(filtered_starts)
- len(filtered_stops))
self.vaselogger.debug(f"{num_diff_chr} read mate(s) filtered due to "
"alignment to different reference sequence.")
self.vaselogger.debug(f"{num_filtered} outlier read position(s) filtered.")
# Set variant context as chr, min start, max end.
contextstart = min(filtered_starts)
contextend = max(filtered_stops)
return [contextchr, contextorigin, contextstart, contextend]
@staticmethod
def filter_outliers(pos_list, k=3):
"""Filter outliers from a list of start or stop positions and return the filtered list.
Outlier start/stop positions are filtered from the list using Tukey's
Fences method. For more info please see:
https://en.wikipedia.org/wiki/Outlier#Tukey's_fences
Parameters
----------
pos_list : list of int
Start/stop positions
k : int
Factor to determine outlier
Returns
-------
        filtered : list of int
            Read positions with outliers removed
"""
# First and third quartile values of the positions.
quartile_1 = np.percentile(pos_list, 25)
quartile_3 = np.percentile(pos_list, 75)
# Interquartile range.
iq_range = quartile_3 - quartile_1
# Only include positions that fall within the range of
# (quartile_1 to quartile_3) +/- k*iq_range.
filtered = [x for x in pos_list
if (quartile_1 - (k * iq_range)) <= x <= (quartile_3 + (k * iq_range))]
return filtered
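    # Worked example of Tukey's fences with the default k=3: for
    # [10, 12, 13, 14, 15, 16, 100], Q1 = 12.5 and Q3 = 15.5, so the IQR is
    # 3 and the fences are [12.5 - 9, 15.5 + 9] = [3.5, 24.5]; the outlier
    # position 100 is dropped.
    #
    #     VaSeBuilder.filter_outliers([10, 12, 13, 14, 15, 16, 100])
    #     # -> [10, 12, 13, 14, 15, 16]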
@staticmethod
def determine_largest_context(contextorigin, acceptor_context,
donor_context):
"""Determine the size of the variant context based on both the acceptor and donor reads.
Parameters
----------
contextorigin : int
Variant position that the context is based on
acceptor_context : list of str and int
Essential acceptor context data
donor_context : list of str and int
Essential donor context data
Returns
-------
list of str and int
Context (chromosome, variant pos, start, end)
"""
largest_context = [donor_context[0]]
largest_context.append(contextorigin)
# Determine and save the smallest context start.
largest_context.append(min(acceptor_context[2], donor_context[2]))
# Determine and save the largest context end.
largest_context.append(max(acceptor_context[3], donor_context[3]))
return largest_context
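    # Worked example (hypothetical coordinates): with an acceptor context
    # ["21", 150, 90, 210] and a donor context ["21", 150, 100, 250], the
    # variant context spans the union of both:
    #
    #     VaSeBuilder.determine_largest_context(150,
    #                                           ["21", 150, 90, 210],
    #                                           ["21", 150, 100, 250])
    #     # -> ["21", 150, 90, 250]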
# ===METHODS TO PRODUCE THE VALIDATION FASTQ FILES=========================
@staticmethod
def build_donor_fq(donorbamreaddata, fr, fastq_outpath):
"""Build and write a fastq file containing only donor reads.
Parameters
----------
donorbamreaddata : list of tuple
Donor reads to write to fastq file
fr : str
Write forward ('1') or reverse ('2') fastq file
fastq_outpath : str
Path and name to write donor fastq file to
"""
# Write the new VaSe FastQ file.
# vasefq_outname = self.set_fastq_out_path(fastq_outpath, fr, 1)
fqgz_outfile = io.BufferedWriter(open(fastq_outpath, "wb"))
donorbamreaddata.sort(key=lambda dbr: dbr[0], reverse=False)
for bamread in donorbamreaddata:
# Check if the BAM read is R1 or R2.
if bamread[1] == fr:
fqlines = ("@" + str(bamread[0]) + "\n"
+ str(bamread[2]) + "\n"
+ "+\n"
+ str(bamread[3]) + "\n")
fqgz_outfile.write(fqlines.encode("utf-8"))
fqgz_outfile.flush()
fqgz_outfile.close()
@staticmethod
def is_required_read(bamread, fr):
"""Check and return whether the current read is the one that is required.
When writing the validation fastq files only R1, or forward (F),
reads should go to the R1 validation fastq file. Similarly, the R2
reads should only go into the R2 validation fastq file.
Parameters
----------
bamread : DonorBamRead
Read to check
fr : str
Whether the read is required for R1 or R2
Returns
-------
bool
True if the read is required, False if not
"""
if fr == "F":
return bamread.is_read1
return bamread.is_read2
@staticmethod
def set_fastq_out_path(outpath, fr, lnum):
"""Set and return the fastq output path and filename.
Parameters
----------
outpath : str
Path and suffix to write fastq file to
fr: str
Will the fastq file be R1 ('1') or R2 ('2')
lnum: int
            Lane number (e.g. 1, 2, 3, or 4)
Returns
-------
str
Full path to write fastq file to
"""
if fr == "1":
return f"{outpath}_{datetime.now().date()}_L{lnum}_R1.fastq"
return f"{outpath}_{datetime.now().date()}_L{lnum}_R2.fastq"
# ===METHODS TO OBTAIN SOME DATA OF THE VASEBUILDER OBJECT=================
def get_creation_id(self):
"""Return the identifier of the current VaSeBuilder object.
Returns
-------
self.creation_id : str
VaSeBuilder creation identifier
"""
return self.creation_id
def get_creation_time(self):
"""Return the date and time the current VaSeBuilder object has been made.
Returns
-------
str
VaSeBuilder creation date and time
"""
return self.creation_time
@staticmethod
def get_vcf_variant_id(vcfvariant):
"""Return an identifier for a variant as CHROM_POS.
Parameters
----------
vcfvariant : pysam.VariantRecord
Variant for which to construct an identifier
Returns
-------
str
Variant identifier as CHROM_POS
"""
return f"{vcfvariant.chrom}_{vcfvariant.pos}"
@staticmethod
def get_read_pair_num(pysam_bamread):
"""Return the read pair number of a provided read.
Parameters
----------
pysam_bamread : pysam.AlignedSegment
Read to determine the read pair number of
Returns
-------
str
Read pair number ('1' or '2')
"""
if pysam_bamread.is_read1:
return "1"
return "2"
# ===METHODS TO WRITE OUTPUT FILES=========================================
def write_used_donor_files(self, outfileloc, samples,
used_donor_files, vbuuid, file_type):
"""Write a list of donor alignment or variant files used in constructing variant contexts.
Parameters
----------
outfileloc : str
Path to write the output file to
        samples : list of sample_mapper.Sample objects
            Samples with their donor alignment and variant files
        used_donor_files : list
            Donor alignment or variant files used to construct variant contexts
        vbuuid : str
            Unique identifier of the current VaSeBuilder run
        file_type : str
            Type of donor file to record: 'a' (alignment) or 'v' (variant)
"""
try:
with open(outfileloc, "w") as outfile:
outfile.write(f"#VBUUID: {vbuuid}\n")
outfile.write("#SampleId\tDonorFile\n")
for sample in samples:
if file_type == "a":
if sample.bam in used_donor_files:
# XXX: Need to figure out what to do about sample name in the filenames.
outfile.write(f"{sample.hash_id}\t{sample.bam}\n")
elif file_type == "v":
if sample.vcf in used_donor_files:
outfile.write(f"{sample.hash_id}\t{sample.vcf}\n")
except IOError:
self.vaselogger.critical("Could not write used donor files to "
f"{outfileloc}")
def write_optional_output_files(self, outpath, contextfile):
"""Write optional output files to a specified output location.
The optional output files contain data about acceptor and donor
contexts as well as output files containing the reads with unmapped
mates per variant context. The optional output files are only
        produced when VaSeBuilder is run with debug set to True.
Parameters
----------
outpath: str
Path to write the optional output file to
contextfile: VariantContextFile
Variant context data
"""
# Write the optional acceptor context files; acceptor contexts,
# read ids with unmapped mate and left/right positions.
self.vaselogger.debug("Writing acceptor contexts to "
f"{outpath}acceptorcontexts.txt")
contextfile.write_acceptor_context_file(f"{outpath}acceptorcontexts.txt",
self.creation_id)
self.vaselogger.debug("Writing acceptor context statistics to "
f"{outpath}acceptorcontextstats.txt")
contextfile.write_acceptor_context_stats(f"{outpath}acceptorcontextstats.txt",
self.creation_id)
self.vaselogger.debug("Writing acceptor context read identifiers with "
"unmapped mates to"
f"mates to {outpath}acceptor_unmapped.txt")
contextfile.write_acceptor_unmapped_mates(f"{outpath}acceptor_unmapped.txt",
self.creation_id)
self.vaselogger.debug("Writing left and right most read positions of "
"each acceptor context to "
f"{outpath}acceptor_positions.txt")
contextfile.write_acceptor_left_right_positions(f"{outpath}acceptor_positions.txt",
self.creation_id)
# Write the optional donor context files; donor contexts,
# read ids with unmapped mate and left/right positions.
self.vaselogger.debug("Writing donor contexts to "
f"{outpath}donorcontexts.txt")
contextfile.write_donor_context_file(f"{outpath}donorcontexts.txt",
self.creation_id)
self.vaselogger.debug("Writing donor context statistics to "
f"{outpath}donorcontextstats.txt")
contextfile.write_donor_context_stats(f"{outpath}donorcontextstats.txt",
self.creation_id)
self.vaselogger.debug("Writing donor context read identifiers "
f"with unmapped mates to {outpath}donor_unmapped.txt")
contextfile.write_donor_unmapped_mates(f"{outpath}donor_unmapped.txt",
self.creation_id)
self.vaselogger.debug("Writing left and right most read positions "
f"of each donor context to {outpath}donor_positions.txt")
contextfile.write_donor_left_right_positions(f"{outpath}donor_positions.txt",
self.creation_id)
# Write the optional variant context files; acceptor & donor
# unmapped mates and left/right positions.
self.vaselogger.debug("Writing variant context acceptor read identifiers with unmapped "
f"mates to {outpath}varcon_unmapped_acceptor.txt")
contextfile.write_reads_with_unmapped_mate("acceptor",
f"{outpath}varcon_unmapped_acceptor.txt",
self.creation_id)
self.vaselogger.debug("Writing variant context donor read identifiers "
f"with unmapped mates to {outpath}varcon_unmapped_donor.txt")
contextfile.write_reads_with_unmapped_mate("donor", f"{outpath}varcon_unmapped_donor.txt",
self.creation_id)
self.vaselogger.debug("Writing variant context left- and right-most read positions of "
f"acceptor reads to {outpath}varcon_positions_acceptor.txt")
contextfile.write_left_right_positions(
"acceptor",
f"{outpath}varcon_positions_acceptor.txt",
self.creation_id
)
self.vaselogger.debug("Writing variant context left and right most "
"read positions of donor reads to "
f"{outpath}varcon_positions_donor.txt")
contextfile.write_left_right_positions(
"donor",
f"{outpath}varcon_positions_donor.txt",
self.creation_id
)
def write_bed_file(self, variantcontextdata, bedoutloc, vbuuid):
"""Write variant contexts as a BED file.
Parameters
----------
variantcontextdata : list of VariantContext
Variants contexts to write to BED file
bedoutloc : str
Path to write the BED file to
"""
try:
with open(bedoutloc, "w") as bedoutfile:
bedoutfile.write(f"#VBUUID: {vbuuid}\n")
for varcon in variantcontextdata:
bedoutfile.write(f"{varcon.get_variant_context_chrom()}\t"
f"{varcon.get_variant_context_start()}\t"
f"{varcon.get_variant_context_end()}\t"
f"{varcon.get_variant_context_id()}\n")
except IOError:
self.vaselogger.warning("Could not write variant context data to BED file: "
f"{bedoutloc}")
def check_sequence_names(self, referencefile, alignmentfile):
"""Check if the chromosome names in the genome reference and alignment file are the same.
Parameters
----------
referencefile : str
Path to genome reference fasta file
alignmentfile : pysam.AlignmentFile
Returns
-------
None
"""
reference_seqnames = self.get_reference_sequence_names(referencefile)
alignment_seqnames = SampleMapper.get_alignment_sequence_names(alignmentfile)
shared_seqnames = reference_seqnames & alignment_seqnames
if (len(shared_seqnames) < len(reference_seqnames)
or len(shared_seqnames) < len(alignment_seqnames)):
self.vaselogger.warning("Reference file and alignment file do not "
"contain the same sequence names")
def get_reference_sequence_names(self, reference_fileloc):
"""Return the sequence names from the reference genome fasta file.
Parameters
----------
reference_fileloc: str
Path to genomic reference fasta file
Returns
-------
reference_seqnames : list of str
Extracted chromosome names
"""
reference_seqnames = set()
try:
with open(reference_fileloc, "r") as reference_file:
for fileline in reference_file:
if fileline.startswith(">"):
filelinedata = fileline.strip().split(" ")
reference_seqnames.add(filelinedata[0][1:])
except IOError:
self.vaselogger.critical(f"Could not read genome reference file {reference_fileloc}")
sys.exit()
return reference_seqnames
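    # Parsing sketch: only the first whitespace-delimited token of each
    # FASTA header line is kept, minus the leading ">". For example:
    #
    #     >chr1 AC:CM000663.2 ...   ->   "chr1"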
@staticmethod
def divide_donorfastqs_over_acceptors(list_of_donorfqs, num_of_acceptors):
"""Divide a list of donor fastq files over a number of acceptor fastq files.
Donor fastq files are divided equally into a specified number of
groups. The number of groups is specified by the number of acceptor
fastq files to use as template.
Parameters
----------
list_of_donorfqs: list of str
Paths to donor fastq files
num_of_acceptors: int
Number of template fastq files
"""
cycler = 0
split_donors = [[] for x in range(num_of_acceptors)]
copy_list_donorfqs = list(list_of_donorfqs)
while copy_list_donorfqs:
cycle = cycler % num_of_acceptors
split_donors[cycle].append(copy_list_donorfqs.pop())
cycler += 1
return split_donors
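    # Worked example: files are popped from the end of the list and dealt
    # round-robin over the acceptor groups:
    #
    #     VaSeBuilder.divide_donorfastqs_over_acceptors(
    #         ["d1", "d2", "d3", "d4", "d5"], 2)
    #     # -> [["d5", "d3", "d1"], ["d4", "d2"]]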
# BUILDS A SET OF R1/R2 VALIDATION FASTQS WITH ALREADY EXISTING DONOR FASTQS
@classmethod
def make_bam_headers(cls, samples, variant_context_file):
"""Construct simple BAM headers for all samples.
Retrieves original BAM header information from the first read from the
first variant context (arbitrary selection) constructed for each
sample. If no variant context was created for a sample, it is skipped.
Headers are reduced to only necessary information, and sample IDs are
replaced with the hashed ID for each sample (if a hash was created).
Returns a dictionary of sample_ID: header pairs.
Parameters
----------
samples : list of sample_mapper.Sample objects
variant_context_file : VariantContextFile
Returns
-------
        used_headers : dict of str: OrderedDict
            Mapping of sample hash ID to reduced BAM header
"""
# Get all varcons per sample and all used donor BAM filenames.
varcons_per_sample = variant_context_file.get_variant_contexts_by_sampleid()
# used_bams = variant_context_file.get_donor_alignment_files()
# Empty dict to store results.
used_headers = {}
for sample in samples:
# Skip unused samples.
if sample.hash_id not in varcons_per_sample:
continue
# if sample.bam not in used_bams:
# continue
# Retrieve header from arbitrary read from this sample.
head = varcons_per_sample[sample.hash_id][0].variant_context_dreads[0].header.as_dict()
# Replace header sample and library fields with hash (if hashed).
if "RG" in head:
for i in range(len(head["RG"])):
head["RG"][i]["SM"] = sample.hash_id
head["RG"][i]["LB"] = sample.hash_id
# Keep only basic header information.
head = cls.select_bam_header_fields(head, ["HD", "SQ", "RG"])
used_headers[sample.hash_id] = head
return used_headers
def run_a_mode_v3(self, samples, variant_context_file, out_path, prefix="VaSe"):
"""Write all spike-ins to a single BAM and single VCF.
All donor reads from all created variant contexts are written to a
single BAM file, which has a merged header constructed from all used
sample BAMs with only minimal necessary header lines retained. If
hashing was enabled when creating sample objects (default), sample IDs
in the BAM headers will be replaced with their hashes. All donor
variants from all created variant contexts are written to one
single-sample VCF file with an arbitrary sample name.
Parameters
----------
samples : list of sample_mapper.Sample objects
variant_context_file : VariantContextFile
out_path : str
Path to output directory.
prefix : str
Prefix for output BAM and VCF filenames
"""
self.vaselogger.debug("Running VaSeBuilder A-mode")
# Get all used headers and replace IDs if necessary, then merge them.
headers = self.make_bam_headers(samples, variant_context_file)
merged_header = list(headers.values())[0]
for header in list(headers.values())[1:]:
merged_header = self.merge_donor_alignment_headers(merged_header, header)
# Get all donor reads from all variant contexts and write to BAM.
donor_reads_to_add = variant_context_file.get_all_variant_context_donor_reads_2()
outpathbam = f"{out_path}{prefix}.bam"
self.vaselogger.debug(f"Start writing A-mode donor BAM output file to {outpathbam}")
self.write_spike_in_bam(merged_header, donor_reads_to_add, outpathbam)
# Get all variants from all variant contexts and write to VCF.
donor_variants_to_add = variant_context_file.get_all_variant_context_variant_records()
outpathvcf = f"{out_path}{prefix}.vcf"
self.vaselogger.debug(f"Start writing A-mode donor VCF output file to {outpathvcf}")
self.write_vcf_slice("VaSeBuilder", donor_variants_to_add, outpathvcf)
@staticmethod
def merge_donor_alignment_headers(base_header, header_to_add):
"""Merge a new header into a provided header.
Parameters
----------
base_header : OrderedDict
Header to add another header to
header_to_add : OrderedDict
AlignmentFile header to merge into larger header
Returns
-------
base_header : OrderedDict
"""
merged_header = OrderedDict(base_header)
present_read_groups = {x["ID"] for x in base_header["RG"]