From 0754245539ce65850bc35d26b32fcd208c6a11e1 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Sun, 28 Feb 2021 14:20:37 +0100 Subject: [PATCH 1/2] Add quality iterator --- src/fastq/record.jl | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/fastq/record.jl b/src/fastq/record.jl index c8056eb..3a1189d 100644 --- a/src/fastq/record.jl +++ b/src/fastq/record.jl @@ -300,21 +300,27 @@ end "Get the length of the fastq record's sequence." @inline seqlen(record::Record) = last(record.sequence) - first(record.sequence) + 1 +""" + quality_iter(record::Record, [offset::Integer=33, [part::UnitRange]])::Vector{UInt8} + +Get an iterator of base quality of `record`. This iterator is corrupted if the record is mutated. +""" +function quality_iter(record::Record, offset::Integer=33, part::UnitRange{Int}=1:lastindex(record.quality)) + checkfilled(record) + offs = convert(UInt8, offset) + part = record.quality[part] + data = record.data + return (@inbounds(data[i]) - offs for i in part) +end + """ quality(record::Record, [offset::Integer=33, [part::UnitRange]])::Vector{UInt8} Get the base quality of `record`. """ function quality(record::Record, offset::Integer=33, part::UnitRange{Int}=1:lastindex(record.quality))::Vector{UInt8} - checkfilled(record) - quality = record.data[record.quality[part]] - for i in 1:lastindex(part) - # TODO: Checked arithmetic? - @inbounds quality[i] -= offset - end - return quality + collect(quality_iter(record, offset, part)) end - """ quality(record::Record, encoding_name::Symbol, [part::UnitRange])::Vector{UInt8} From 6b0f588dfdcd760627574074ae38b22966c1a0b4 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Sun, 28 Feb 2021 14:46:15 +0100 Subject: [PATCH 2/2] Add sequence iterator --- src/fasta/fasta.jl | 1 + src/fasta/record.jl | 15 +++++++++++++++ src/fastq/record.jl | 15 +++++++++++++++ test/runtests.jl | 6 +++++- 4 files changed, 36 insertions(+), 1 deletion(-) diff --git a/src/fasta/fasta.jl b/src/fasta/fasta.jl index c34f44a..c9f81f3 100644 --- a/src/fasta/fasta.jl +++ b/src/fasta/fasta.jl @@ -10,6 +10,7 @@ import BioGenerics: BioGenerics, isfilled import BioGenerics.Exceptions: missingerror import BioGenerics.Automa: State import BioSequences +import BioSymbols import TranscodingStreams: TranscodingStreams, TranscodingStream import ..FASTX: identifier, description, sequence diff --git a/src/fasta/record.jl b/src/fasta/record.jl index 51885a9..bb42183 100644 --- a/src/fasta/record.jl +++ b/src/fasta/record.jl @@ -212,6 +212,21 @@ function header(record::Record) return String(record.data[range]) end +""" + sequence_iter(T, record::Record) + +Yields an iterator of the sequence, with elements of type `T`. `T` is constructed +through `T(Char(x))` for each byte `x`. E.g. `sequence_iter(DNA, record)`. +Mutating the record will corrupt the iterator. +""" +function sequence_iter(::Type{T}, record::Record, + part::UnitRange{<:Integer}=1:lastindex(record.sequence)) where {T <: BioSymbols.BioSymbol} + checkfilled(record) + seqpart = record.sequence[part] + data = record.data + return (T(Char(@inbounds (data[i]))) for i in seqpart) +end + """ sequence(::Type{S}, record::Record, [part::UnitRange{Int}])::S diff --git a/src/fastq/record.jl b/src/fastq/record.jl index 3a1189d..35e7df3 100644 --- a/src/fastq/record.jl +++ b/src/fastq/record.jl @@ -240,6 +240,21 @@ function Base.copyto!(dest::BioSequences.LongSequence, doff, src::Record, soff, return copyto!(dest, doff, src.data, src.sequence[soff], N) end +""" + sequence_iter(T, record::Record) + +Yields an iterator of the sequence, with elements of type `T`. `T` is constructed +through `T(Char(x))` for each byte `x`. E.g. `sequence_iter(DNA, record)`. +Mutating the record will corrupt the iterator. +""" +function sequence_iter(::Type{T}, record::Record, + part::UnitRange{<:Integer}=1:lastindex(record.sequence)) where {T <: BioSymbols.BioSymbol} + checkfilled(record) + seqpart = record.sequence[part] + data = record.data + return (T(Char(@inbounds (data[i]))) for i in seqpart) +end + """ sequence(::Type{S}, record::Record, [part::UnitRange{Int}]) diff --git a/test/runtests.jl b/test/runtests.jl index 6836077..38fd123 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,7 @@ using Test using FASTX using FormatSpecimens +using BioSymbols import BioGenerics import BioGenerics.Testing: intempdir import BioSequences: @@ -35,7 +36,8 @@ import BioSequences: @test BioGenerics.hassequence(record) @test FASTA.hassequence(record) @test FASTA.sequence(record) == dna"ACGT" - @test FASTA.sequence(record, 2:3) == dna"CG" + @test collect(FASTA.sequence_iter(DNA, record)) == [DNA_A, DNA_C, DNA_G, DNA_T] + @test FASTA.sequence(record, 2:3) == LongDNASeq(collect(FASTA.sequence_iter(DNA, record, 2:3))) == dna"CG" @test FASTA.sequence(String, record) == "ACGT" @test FASTA.sequence(String, record, 2:3) == "CG" @@ -331,6 +333,8 @@ end @test FASTQ.header(record) == FASTQ.identifier(record) * " " * FASTA.description(record) @test FASTQ.hassequence(record) == BioGenerics.hassequence(record) == true @test FASTQ.sequence(LongDNASeq, record) == seq + @test LongDNASeq(collect(FASTQ.sequence_iter(DNA, record))) == seq + @test LongDNASeq(collect(FASTQ.sequence_iter(DNA, record, 3:7))) == dna"GCTCA" @test copyto!(LongDNASeq(FASTQ.seqlen(record)), record) == seq @test_throws ArgumentError copyto!(LongDNASeq(10), FASTQ.Record()) @test FASTQ.sequence(record) == BioGenerics.sequence(record) == seq