Add function barrier in npzreadarray and introduce readheader (#44)

This change does two things:

- Make the `Header` type parametric and concrete: this seems to cut down on allocations while reading arrays.

  On master:
  ```julia
  julia> npzwrite("abc.npy", ones(3,4));
  julia> @btime npzread("abc.npy");
    23.639 μs (91 allocations: 5.80 KiB)
  ```
  After this PR:
  ```julia
  julia> @btime npzread("abc.npy");
    20.911 μs (80 allocations: 5.28 KiB)
  ```
  The difference in run-time is marginal.

- Introduce a function `readheader` that returns the header. This might be convenient to get the `eltype` and `size` of the array without reading it in (e.g. to pre-allocate an array of the correct size). This function is not exported.
  ```julia
  julia> npzwrite("abc.npy", ones(3,4));
  julia> NPZ.readheader("abc.npy")
  NPZ.Header{Float64,2,typeof(ltoh)}(ltoh, true, (3, 4))
  julia> NPZ.readheader("abc.npy") |> size
  (3, 4)
  julia> NPZ.readheader("abc.npy") |> eltype
  Float64
  ```
fhs · Jan 9, 2021 · 8834d3c · 8834d3c · fhs · Jan 9, 2021
1 parent 9a66947
commit 8834d3c
Show file tree

Hide file tree

Showing 3 changed files with 88 additions and 19 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,15 +1,15 @@
 name = "NPZ"
 uuid = "15e1cf62-19b3-5cfa-8e77-841668bca605"
-version = "0.4.0"
+version = "0.4.1"
 
 [deps]
 Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
 ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
 
 [compat]
-Compat = "≥ 1.0.0"
-ZipFile = "≥ 0.3.0"
-julia = "≥ 0.7.0"
+Compat = "1, 2, 3"
+ZipFile = "0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9"
+julia = "0.7, 1"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

diff --git a/src/NPZ.jl b/src/NPZ.jl
@@ -137,7 +137,7 @@ function parsetuple(s::AbstractString)
         s = parsechar(s, ',')
     end
     s = parsechar(s, ')')
-    tup, s
+    Tuple(tup), s
 end
 
 function parsedtype(s::AbstractString)
@@ -159,24 +159,31 @@ function parsedtype(s::AbstractString)
     (toh, Numpy2Julia[t]), s
 end
 
-struct Header
-    descr::Tuple{Function, DataType}
+# Parsed NPY header. `T` is the element type reported by `eltype`, `N` is the
+# number of dimensions (the length of `shape`), and `F` is the concrete type of
+# the function stored in `descr`.
+struct Header{T,N,F<:Function}
+    # Function taken from the parsed `descr` entry; `_npzreadarray` maps it over
+    # the raw values after reading them.
+    descr::F
     fortran_order::Bool
-    shape::Vector{Int}
+    # Array dimensions as a fixed-length tuple, so `N` is part of the type.
+    shape::NTuple{N,Int}
 end
 
+# Convenience constructor: only the element type `T` is given explicitly; `N`
+# and `F` are taken from the argument types.
+function Header{T}(descr::F, fortran_order, shape::NTuple{N,Int}) where {T,N,F}
+    return Header{T,N,F}(descr, fortran_order, shape)
+end
+
+# Array-like accessors so a header can be queried without reading the data.
+Base.size(h::Header) = h.shape
+Base.eltype(::Header{T}) where {T} = T
+Base.ndims(::Header{T,N}) where {T,N} = N
+
 function parseheader(s::AbstractString)
     s = parsechar(s, '{')
 
     dict = Dict{String,Any}()
+    T = Any
     for _ in 1:3
         s = strip(s)
         key, s = parsestring(s)
         s = strip(s)
         s = parsechar(s, ':')
         s = strip(s)
         if key == "descr"
-            dict[key], s = parsedtype(s)
+            (descr, T), s = parsedtype(s)
+            dict[key] = descr
         elseif key == "fortran_order"
             dict[key], s = parsebool(s)
         elseif key == "shape"
@@ -196,10 +203,10 @@ function parseheader(s::AbstractString)
     if s != ""
         error("malformed header")
     end
-    Header(dict["descr"], dict["fortran_order"], dict["shape"])
+    Header{T}(dict["descr"], dict["fortran_order"], dict["shape"])
 end
 
-function npzreadarray(f::IO)
+function readheader(f::IO)
     @compat b = read!(f, Vector{UInt8}(undef, length(NPYMagic)))
     if b != NPYMagic
         error("not a numpy array file")
@@ -208,26 +215,33 @@ function npzreadarray(f::IO)
 
     # support for version 2 files
     if b[1] == 1
-        hdrlen = readle(f, UInt16)
+        hdrlen = UInt32(readle(f, UInt16))
     elseif b[1] == 2 
-        hdrlen = readle(f, UInt32)
+        hdrlen = UInt32(readle(f, UInt32))
     else
         error("unsupported NPZ version")
     end
 
     @compat hdr = ascii(String(read!(f, Vector{UInt8}(undef, hdrlen))))
-    hdr = parseheader(strip(hdr))
+    parseheader(strip(hdr))
+end
 
-    toh, typ = hdr.descr
+# Read the array payload that follows an already-parsed header. Taking the
+# header as an argument lets this method specialize on its type parameters.
+function _npzreadarray(f, hdr::Header{T}) where {T}
+    toh = hdr.descr
     if hdr.fortran_order
-        @compat x = map(toh, read!(f, Array{typ}(undef, hdr.shape...)))
+        @compat x = map(toh, read!(f, Array{T}(undef, hdr.shape)))
     else
-        @compat x = map(toh, read!(f, Array{typ}(undef, reverse(hdr.shape)...)))
+        # Not Fortran order: read with the dimensions reversed, then permute
+        # the axes back so the result has the shape given in the header.
+        @compat x = map(toh, read!(f, Array{T}(undef, reverse(hdr.shape))))
         if ndims(x) > 1
             x = permutedims(x, collect(ndims(x):-1:1))
         end
     end
-    x isa Array{<:Any, 0} ? x[1] : x
+    # A zero-dimensional array is unwrapped to its single element.
+    ndims(x) == 0 ? x[1] : x
+end
+
+# Read one NPY array from `f`: parse the header, then hand it to
+# `_npzreadarray` to read the data that follows.
+function npzreadarray(f::IO)
+    header = readheader(f)
+    return _npzreadarray(f, header)
 end
 
 function samestart(a::AbstractVector, b::AbstractVector)
@@ -287,7 +301,6 @@ function npzread(filename::AbstractString, vars...)
         close(f)
         error("not a NPY or NPZ/Zip file: $filename")
     end
-
     close(f)
     return data
 end
@@ -300,6 +313,41 @@ function npzread(dir::ZipFile.Reader,
             if f.name in vars || _maybetrimext(f.name) in vars)
 end
 
+"""
+    readheader(filename, [vars...])
+
+Return a header or a collection of headers corresponding to each variable contained in
+`filename`. The header contains information about the `eltype` and `size` of the array
+that may be extracted using the corresponding accessor functions.
+
+For a NPY file a single header is returned. For a NPZ/Zip file, `vars` is forwarded to
+the `ZipFile.Reader` method and the result is a `Dict` of headers keyed by variable name.
+An error is raised if the file is neither.
+"""
+function readheader(filename::AbstractString, vars...)
+    # The `do` block closes `f` on every exit path, including when the magic
+    # bytes cannot be read or header parsing throws.
+    open(filename) do f
+        # Detect if the file is a numpy npy array file or a npz/zip file.
+        @compat b = read!(f, Vector{UInt8}(undef, MaxMagicLen))
+
+        if samestart(b, ZIPMagic)
+            fz = ZipFile.Reader(filename)
+            try
+                return readheader(fz, vars...)
+            finally
+                # Release the archive even if reading a member header fails.
+                close(fz)
+            end
+        elseif samestart(b, NPYMagic)
+            seekstart(f)
+            return readheader(f)
+        else
+            error("not a NPY or NPZ/Zip file: $filename")
+        end
+    end
+end
+# Collect the headers of the requested variables in an opened NPZ archive,
+# keyed by member name with `_maybetrimext` applied. A member is included when
+# either its full name or its trimmed name is in `vars`; by default every
+# member is included.
+function readheader(dir::ZipFile.Reader,
+    vars = map(f -> _maybetrimext(f.name), dir.files))
+
+    selected = Iterators.filter(dir.files) do entry
+        entry.name in vars || _maybetrimext(entry.name) in vars
+    end
+    return Dict(_maybetrimext(entry.name) => readheader(entry) for entry in selected)
+end
+
 function npzwritearray(
     f::IO, x::AbstractArray{UInt8}, T::DataType, shape)
 

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,5 +1,7 @@
 using NPZ, Compat
 
+import NPZ: readheader
+
 @static if VERSION >= v"0.7.0-DEV.2575"
     using Test
 else
@@ -118,3 +120,22 @@ end
     npzwrite(f, zeros())
     @test npzread(f) == 0.0
 end
+
+@testset "readheader" begin
+    path = tempname()
+
+    # Single NPY array: the header mirrors the array's metadata.
+    original = zeros(2,3)
+    npzwrite(path, original)
+    header = readheader(path)
+    @test size(header) == size(original)
+    @test ndims(header) == ndims(original)
+    @test eltype(header) == eltype(original)
+
+    # NPZ archive: one header per stored variable.
+    npzwrite(path, ones(2,2), x = ones(Int,3), y = 3)
+    headers = readheader(path)
+    for (name, dims) in (("arr_0", (2,2)), ("x", (3,)), ("y", ()))
+        @test size(headers[name]) == dims
+        @test eltype(headers[name]) == eltype(npzread(path, [name])[name])
+    end
+end