Skip to content

Commit

Permalink
Add function barrier in npzreadarray and introduce readheader (#44)
Browse files Browse the repository at this point in the history
This change does two things:

- Make the `Header` type parametric and concrete: this seems to cut down on allocations while reading arrays

On master:
```julia
julia> npzwrite("abc.npy", ones(3,4));

julia> @btime npzread("abc.npy");
  23.639 μs (91 allocations: 5.80 KiB)
```

After this PR: 
```julia
julia> @btime npzread("abc.npy");
  20.911 μs (80 allocations: 5.28 KiB)
```

The difference in run-time is marginal.

- Introduce a function `readheader` that returns the header. This can be convenient for obtaining the `eltype` and `size` of the array without reading it in (e.g. to pre-allocate an array of the correct size). This function is not exported.

```julia
julia> npzwrite("abc.npy", ones(3,4));

julia> NPZ.readheader("abc.npy")
NPZ.Header{Float64,2,typeof(ltoh)}(ltoh, true, (3, 4))

julia> NPZ.readheader("abc.npy") |> size
(3, 4)

julia> NPZ.readheader("abc.npy") |> eltype
Float64
```
  • Loading branch information
jishnub authored Jan 9, 2021
1 parent 9a66947 commit 8834d3c
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 19 deletions.
8 changes: 4 additions & 4 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
name = "NPZ"
uuid = "15e1cf62-19b3-5cfa-8e77-841668bca605"
version = "0.4.0"
version = "0.4.1"

[deps]
Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"

[compat]
Compat = "≥ 1.0.0"
ZipFile = "0.3.0"
julia = "0.7.0"
Compat = "1, 2, 3"
ZipFile = "0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9"
julia = "0.7, 1"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Expand Down
78 changes: 63 additions & 15 deletions src/NPZ.jl
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ function parsetuple(s::AbstractString)
s = parsechar(s, ',')
end
s = parsechar(s, ')')
tup, s
Tuple(tup), s
end

function parsedtype(s::AbstractString)
Expand All @@ -159,24 +159,31 @@ function parsedtype(s::AbstractString)
(toh, Numpy2Julia[t]), s
end

struct Header
descr::Tuple{Function, DataType}
struct Header{T,N,F<:Function}
descr::F
fortran_order::Bool
shape::Vector{Int}
shape::NTuple{N,Int}
end

# Convenience constructor: only the element type `T` is spelled out by the caller;
# the rank `N` and the type `F` of `descr` are taken from the arguments.
function Header{T}(descr::F, fortran_order, shape::NTuple{N,Int}) where {T,N,F}
    return Header{T,N,F}(descr, fortran_order, shape)
end

# Array-like accessors, so a header can be queried for the stored array's
# size, element type and rank.
function Base.size(h::Header)
    return h.shape
end
Base.eltype(::Header{T}) where {T} = T
Base.ndims(::Header{T,N}) where {T,N} = N

function parseheader(s::AbstractString)
s = parsechar(s, '{')

dict = Dict{String,Any}()
T = Any
for _ in 1:3
s = strip(s)
key, s = parsestring(s)
s = strip(s)
s = parsechar(s, ':')
s = strip(s)
if key == "descr"
dict[key], s = parsedtype(s)
(descr, T), s = parsedtype(s)
dict[key] = descr
elseif key == "fortran_order"
dict[key], s = parsebool(s)
elseif key == "shape"
Expand All @@ -196,10 +203,10 @@ function parseheader(s::AbstractString)
if s != ""
error("malformed header")
end
Header(dict["descr"], dict["fortran_order"], dict["shape"])
Header{T}(dict["descr"], dict["fortran_order"], dict["shape"])
end

function npzreadarray(f::IO)
function readheader(f::IO)
@compat b = read!(f, Vector{UInt8}(undef, length(NPYMagic)))
if b != NPYMagic
error("not a numpy array file")
Expand All @@ -208,26 +215,33 @@ function npzreadarray(f::IO)

# support for version 2 files
if b[1] == 1
hdrlen = readle(f, UInt16)
hdrlen = UInt32(readle(f, UInt16))
elseif b[1] == 2
hdrlen = readle(f, UInt32)
hdrlen = UInt32(readle(f, UInt32))
else
error("unsupported NPZ version")
end

@compat hdr = ascii(String(read!(f, Vector{UInt8}(undef, hdrlen))))
hdr = parseheader(strip(hdr))
parseheader(strip(hdr))
end

toh, typ = hdr.descr
function _npzreadarray(f, hdr::Header{T}) where {T}
toh = hdr.descr
if hdr.fortran_order
@compat x = map(toh, read!(f, Array{typ}(undef, hdr.shape...)))
@compat x = map(toh, read!(f, Array{T}(undef, hdr.shape)))
else
@compat x = map(toh, read!(f, Array{typ}(undef, reverse(hdr.shape)...)))
@compat x = map(toh, read!(f, Array{T}(undef, reverse(hdr.shape))))
if ndims(x) > 1
x = permutedims(x, collect(ndims(x):-1:1))
end
end
x isa Array{<:Any, 0} ? x[1] : x
ndims(x) == 0 ? x[1] : x
end

# Parse the header first, then hand it to `_npzreadarray`, which is specialized on
# the header's type parameters (this split acts as a function barrier).
npzreadarray(f::IO) = _npzreadarray(f, readheader(f))

function samestart(a::AbstractVector, b::AbstractVector)
Expand Down Expand Up @@ -287,7 +301,6 @@ function npzread(filename::AbstractString, vars...)
close(f)
error("not a NPY or NPZ/Zip file: $filename")
end

close(f)
return data
end
Expand All @@ -300,6 +313,41 @@ function npzread(dir::ZipFile.Reader,
if f.name in vars || _maybetrimext(f.name) in vars)
end

"""
    readheader(filename, [vars...])

Return the header of the array stored in `filename` if it is an NPY file, or a collection
of headers, one for each variable, if it is an NPZ file.
The `eltype` and `size` of an array may be extracted from its header using the
corresponding accessor functions.
"""
function readheader(filename::AbstractString, vars...)
    # Detect if the file is a numpy npy array file or a npz/zip file.
    f = open(filename)
    try
        # `read!` throws if the file is shorter than `MaxMagicLen`; the enclosing
        # `finally` makes sure the handle is still released in that case.
        @compat b = read!(f, Vector{UInt8}(undef, MaxMagicLen))

        if samestart(b, ZIPMagic)
            # NPZ/Zip archive: delegate to the `ZipFile.Reader` method, which returns
            # a `Dict` of headers keyed by variable name.
            fz = ZipFile.Reader(filename)
            try
                return readheader(fz, vars...)
            finally
                # Close the archive even when reading one of its members fails.
                close(fz)
            end
        elseif samestart(b, NPYMagic)
            # Plain NPY file: rewind past the sniffed magic bytes and parse the single
            # header. `vars` is not used on this path.
            seekstart(f)
            return readheader(f)
        else
            error("not a NPY or NPZ/Zip file: $filename")
        end
    finally
        # Runs on every exit path (normal return or exception), so `f` never leaks.
        close(f)
    end
end
# Collect the headers of the members of an opened NPZ archive into a `Dict` keyed by
# the member name as returned by `_maybetrimext`. By default every member is included;
# otherwise a member is included when either its full name or its
# `_maybetrimext`-processed name appears in `vars`.
function readheader(dir::ZipFile.Reader,
                    vars = map(f -> _maybetrimext(f.name), dir.files))
    wanted(entry) = entry.name in vars || _maybetrimext(entry.name) in vars

    return Dict(_maybetrimext(entry.name) => readheader(entry)
                for entry in dir.files if wanted(entry))
end

function npzwritearray(
f::IO, x::AbstractArray{UInt8}, T::DataType, shape)

Expand Down
21 changes: 21 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
using NPZ, Compat

import NPZ: readheader

@static if VERSION >= v"0.7.0-DEV.2575"
using Test
else
Expand Down Expand Up @@ -118,3 +120,22 @@ end
npzwrite(f, zeros())
@test npzread(f) == 0.0
end

@testset "readheader" begin
    path = tempname()

    # A single array written on its own: the header mirrors the array's metadata.
    arr = zeros(2,3)
    npzwrite(path, arr)
    hdr = readheader(path)
    for prop in (size, ndims, eltype)
        @test prop(hdr) == prop(arr)
    end

    # Several variables written together: one header per variable name.
    npzwrite(path, ones(2,2), x = ones(Int,3), y = 3)
    hdrs = readheader(path)
    for (name, shape) in (("arr_0", (2,2)), ("x", (3,)), ("y", ()))
        @test size(hdrs[name]) == shape
        @test eltype(hdrs[name]) == eltype(npzread(path, [name])[name])
    end
end

2 comments on commit 8834d3c

@fhs
Copy link
Owner

@fhs fhs commented on 8834d3c Jan 9, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/27619

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.4.1 -m "<description of version>" 8834d3ceb9be626d254204580ad585edfacb8b1a
git push origin v0.4.1

Please sign in to comment.