Skip to content

Commit

Permalink
Merge pull request #41 from jakobnissen/qualityiter
Browse files Browse the repository at this point in the history
Add quality and sequence iterator
  • Loading branch information
jakobnissen authored Mar 3, 2021
2 parents b1a8ecb + 6b0f588 commit e72dcfe
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 9 deletions.
1 change: 1 addition & 0 deletions src/fasta/fasta.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import BioGenerics: BioGenerics, isfilled
import BioGenerics.Exceptions: missingerror
import BioGenerics.Automa: State
import BioSequences
import BioSymbols
import TranscodingStreams: TranscodingStreams, TranscodingStream
import ..FASTX: identifier, description, sequence

Expand Down
15 changes: 15 additions & 0 deletions src/fasta/record.jl
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,21 @@ function header(record::Record)
return String(record.data[range])
end

"""
sequence_iter(T, record::Record)
Yields an iterator of the sequence, with elements of type `T`. `T` is constructed
through `T(Char(x))` for each byte `x`. E.g. `sequence_iter(DNA, record)`.
Mutating the record will corrupt the iterator.
"""
function sequence_iter(::Type{T}, record::Record,
part::UnitRange{<:Integer}=1:lastindex(record.sequence)) where {T <: BioSymbols.BioSymbol}
checkfilled(record)
seqpart = record.sequence[part]
data = record.data
return (T(Char(@inbounds (data[i]))) for i in seqpart)
end

"""
sequence(::Type{S}, record::Record, [part::UnitRange{Int}])::S
Expand Down
37 changes: 29 additions & 8 deletions src/fastq/record.jl
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,21 @@ function Base.copyto!(dest::BioSequences.LongSequence, doff, src::Record, soff,
return copyto!(dest, doff, src.data, src.sequence[soff], N)
end

"""
sequence_iter(T, record::Record)
Yields an iterator of the sequence, with elements of type `T`. `T` is constructed
through `T(Char(x))` for each byte `x`. E.g. `sequence_iter(DNA, record)`.
Mutating the record will corrupt the iterator.
"""
function sequence_iter(::Type{T}, record::Record,
part::UnitRange{<:Integer}=1:lastindex(record.sequence)) where {T <: BioSymbols.BioSymbol}
checkfilled(record)
seqpart = record.sequence[part]
data = record.data
return (T(Char(@inbounds (data[i]))) for i in seqpart)
end

"""
sequence(::Type{S}, record::Record, [part::UnitRange{Int}])
Expand Down Expand Up @@ -300,21 +315,27 @@ end
"Get the length of the fastq record's sequence."
@inline seqlen(record::Record) = last(record.sequence) - first(record.sequence) + 1

"""
quality_iter(record::Record, [offset::Integer=33, [part::UnitRange]])::Vector{UInt8}
Get an iterator of base quality of `record`. This iterator is corrupted if the record is mutated.
"""
function quality_iter(record::Record, offset::Integer=33, part::UnitRange{Int}=1:lastindex(record.quality))
checkfilled(record)
offs = convert(UInt8, offset)
part = record.quality[part]
data = record.data
return (@inbounds(data[i]) - offs for i in part)
end

"""
quality(record::Record, [offset::Integer=33, [part::UnitRange]])::Vector{UInt8}
Get the base quality of `record`.
"""
function quality(record::Record, offset::Integer=33, part::UnitRange{Int}=1:lastindex(record.quality))::Vector{UInt8}
checkfilled(record)
quality = record.data[record.quality[part]]
for i in 1:lastindex(part)
# TODO: Checked arithmetic?
@inbounds quality[i] -= offset
end
return quality
collect(quality_iter(record, offset, part))
end

"""
quality(record::Record, encoding_name::Symbol, [part::UnitRange])::Vector{UInt8}
Expand Down
6 changes: 5 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using Test
using FASTX
using FormatSpecimens
using BioSymbols
import BioGenerics
import BioGenerics.Testing: intempdir
import BioSequences:
Expand Down Expand Up @@ -35,7 +36,8 @@ import BioSequences:
@test BioGenerics.hassequence(record)
@test FASTA.hassequence(record)
@test FASTA.sequence(record) == dna"ACGT"
@test FASTA.sequence(record, 2:3) == dna"CG"
@test collect(FASTA.sequence_iter(DNA, record)) == [DNA_A, DNA_C, DNA_G, DNA_T]
@test FASTA.sequence(record, 2:3) == LongDNASeq(collect(FASTA.sequence_iter(DNA, record, 2:3))) == dna"CG"
@test FASTA.sequence(String, record) == "ACGT"
@test FASTA.sequence(String, record, 2:3) == "CG"

Expand Down Expand Up @@ -331,6 +333,8 @@ end
@test FASTQ.header(record) == FASTQ.identifier(record) * " " * FASTA.description(record)
@test FASTQ.hassequence(record) == BioGenerics.hassequence(record) == true
@test FASTQ.sequence(LongDNASeq, record) == seq
@test LongDNASeq(collect(FASTQ.sequence_iter(DNA, record))) == seq
@test LongDNASeq(collect(FASTQ.sequence_iter(DNA, record, 3:7))) == dna"GCTCA"
@test copyto!(LongDNASeq(FASTQ.seqlen(record)), record) == seq
@test_throws ArgumentError copyto!(LongDNASeq(10), FASTQ.Record())
@test FASTQ.sequence(record) == BioGenerics.sequence(record) == seq
Expand Down

0 comments on commit e72dcfe

Please sign in to comment.