Skip to content

Commit

Permalink
More tests and tweaks to FASTA
Browse files Browse the repository at this point in the history
  • Loading branch information
jakobnissen committed Jul 26, 2022
1 parent 816a366 commit 17f3dfc
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 7 deletions.
7 changes: 6 additions & 1 deletion src/fasta/record.jl
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ function Record(description::AbstractString, sequence::Union{BioSequences.BioSeq
end

function Base.:(==)(record1::Record, record2::Record)
record1.description_len == record2.description_len || return false
filled1 = filled(record1)
filled1 == filled(record2) || return false
return memcmp(pointer(record1.data), pointer(record2.data), filled1) == 0
Expand Down Expand Up @@ -253,5 +254,9 @@ end

# TODO: Base's hash does not hash all elements. Do we have a better implementation?
# Hash a record from its description length and the used portion of its data
# buffer, so that unused trailing bytes in `record.data` do not affect the hash.
function Base.hash(record::Record, h::UInt)
    # The description length is informative of the record's content
    # in a way that the sequence length and identifier length aren't.
    # E.g. you could have ">A\nAG" vs ">AA\nG"
    h = hash(record.description_len, h)
    # `==` passes `filled(record)` to `memcmp` as a byte count, so the used
    # bytes are taken to be `1:filled(record)`. Indexing the view with the bare
    # count would select only the single byte at that position (and would be
    # out of bounds when the count is zero) instead of the whole used prefix.
    # NOTE(review): assumes `filled` returns an integer count — confirm.
    return hash(view(record.data, 1:filled(record)), h)
end
73 changes: 67 additions & 6 deletions test/fasta.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
using FASTX: FASTA
using FASTX.FASTA: Record, identifier, description, sequence
using FASTX.FASTA: Record, identifier, description, sequence, Reader, Writer

using BioSequences: LongDNA, LongRNA, LongAA, @dna_str, @rna_str, @aa_str

Expand Down Expand Up @@ -90,15 +90,21 @@ end
@test description(record) == "ag | kop[\ta]"
@test sequence(String, record) == ""

# Trailing whitespace
# Trailing description whitespace
record = Record(">hdr name\t \r\npkmn\naj")
@test identifier(record) == "hdr"
@test description(record) == "hdr name\t "
@test sequence(String, record) == "pkmnaj"

# Trailing sequence whitespace
record = Record(">here\nplKn\n.\n \t\v\n\n \n \n")
@test identifier(record) == description(record) == "here"
@test sequence(String, record) == "plKn. \t\v "
end

# Tests trailing bytes in data field are OK
@testset "Noncoding bytes" begin
record = FASTA.Record(">abc\nOOJM\nQQ")
record = Record(">abc\nOOJM\nQQ")
resize!(record.data, 1000)
@test identifier(record) == description(record) == "abc"
@test sequence(String, record) == "OOJMQQ"
Expand Down Expand Up @@ -220,15 +226,70 @@ end
end
end


# Includes "unique"
@testset "Hashing" begin
    # Records built from text; the 2nd and 3rd differ only in where the
    # description ends and the sequence begins.
    records = [Record(text) for text in (">A\n\n", ">A\nAG", ">AA\nG")]

    # Same as the last record above, but with noncoding data in its buffer
    padded = Record(codeunits("AAGGGG"), 2, 2, 1)
    push!(records, padded)

    n = length(records)
    @test hash(records[1]) == hash(records[1])
    @test hash(records[n]) == hash(records[n - 1])
    @test isequal(records[n], records[n - 1])
    @test !isequal(records[3], records[2])
    # `unique` relies on both `hash` and `isequal`; only the padded duplicate
    # should be dropped.
    @test length(unique(records)) == n - 1
end

@testset "Reader basics" begin
    # An empty input yields no records
    empty_reader = Reader(IOBuffer(""))
    @test isnothing(iterate(empty_reader))
    close(empty_reader)

    # Iteration is resumable: each stateless `iterate` call continues
    # from where the previous one stopped.
    resumed = Reader(IOBuffer(">header\nTAG\nAA\n\r\n\r\n>header2\nAAA\n\nGG\n\r\n"))
    rec, _ = iterate(resumed)
    @test identifier(rec) == "header"
    @test sequence(String, rec) == "TAGAA"
    rec, _ = iterate(resumed)
    @test identifier(rec) == "header2"
    @test sequence(String, rec) == "AAAGG"
    @test isnothing(iterate(resumed))
    close(resumed)

    # Collected records must be distinct objects, even when their content
    # is the same.
    copying = Reader(IOBuffer(">A\nG\n>A\nG"))
    collected = collect(copying)
    @test first(collected) !== last(collected)
    close(copying)
end

@testset "Writer basics" begin
    # Write every record through a fresh Writer into an in-memory buffer,
    # then check that the written text matches `regex`.
    function test_writer(records, regex::Regex)
        io = IOBuffer()
        writer = Writer(io)
        foreach(rec -> write(writer, rec), records)
        close(writer) # necessary to flush
        written = String(take!(io))
        @test occursin(regex, written)
    end

    # TODO: empty writer

    # TODO: empty records
    #records = [Record(), Record()]
    #test_writer(records, r"^>\n\n>\n\n$")

    # TODO: does not write noncoding bytes in records
end

@testset "Writter append" begin
Expand All @@ -237,8 +298,8 @@ end
@testset "Writer width" begin
    # TODO: placeholder — this testset has no assertions yet.
end

# Records can be written, then re-read without loss
# When with arbitrary whitespace
# Records can be written, then re-read without loss,
# except arbitrary whitespace in the sequence
@testset "Roundtrip" begin
    # TODO: placeholder — this testset has no assertions yet.
end

Expand Down

0 comments on commit 17f3dfc

Please sign in to comment.