handle multiple blocks in datastream (#21)

JuliaHEP · Jul 3, 2021 · 67f1e36 · 67f1e36
1 parent 94b8e6e
commit 67f1e36
Show file tree

Hide file tree

Showing 6 changed files with 40 additions and 17 deletions.
diff --git a/src/UnROOT.jl b/src/UnROOT.jl
@@ -10,9 +10,9 @@ using Parameters
 using StaticArrays
 
 include("constants.jl")
-include("utils.jl")
 include("io.jl")
 include("types.jl")
+include("utils.jl")
 include("streamers.jl")
 include("bootstrap.jl")
 include("root.jl")

diff --git a/src/streamers.jl b/src/streamers.jl
@@ -79,10 +79,8 @@ function Streamers(io)
 
     if iscompressed(tkey)
         @debug "Compressed stream at $(start)"
-        _start = tkey.fSeekKey
         seekstart(io, tkey)
         compression_header = unpack(io, CompressionHeader)
-        skipped = position(io) - _start
         #FIXME for some reason we need to re-pack such that it ends at exact bytes.
         skipped = position(io) - start
         # notice our `TKey` size is not the same as official TKey, can't use sizeof()

diff --git a/src/types.jl b/src/types.jl
@@ -101,22 +101,29 @@ function datastream(io, tkey::T) where T<:Union{TKey, TBasketKey}
     @debug "Compressed stream at $(start)"
     _start = tkey.fSeekKey
     seekstart(io, tkey)
-    compression_header = unpack(io, CompressionHeader)
-    skipped = position(io) - _start
-    io_buf = IOBuffer(read(io, tkey.fNbytes - skipped))
-    if String(compression_header.algo) == "ZL"
-        return IOBuffer(read(ZlibDecompressorStream(io_buf), tkey.fObjlen))
-    elseif String(compression_header.algo) == "XZ"
-        #FIXME doesn't work, why
-        return IOBuffer(read(XzDecompressorStream(io_buf), tkey.fObjlen))
-    elseif String(compression_header.algo) == "L4"
-        #FIXME doesn't work
-        skip(io_buf, 8) #skip checksum
-        stream = IOBuffer(lz4_decompress(read(io_buf), tkey.fObjlen))
-    else
-        error("Unsupported compression type '$(String(compression_header.algo))'")
+    fufilled = 0
+    uncomp_data = Vector{UInt8}(undef, tkey.fObjlen)
+    while fufilled < tkey.fObjlen # careful with 0/1-based index when thinking about offsets
+        compression_header = unpack(io, CompressionHeader)
+        cname, _, compbytes, uncompbytes = unpack(compression_header)
+        io_buf = IOBuffer(read(io, compbytes))
+
+        # indexing `0+1 to 0+2` are two bytes, no need to +1 in the second term
+        uncomp_data[fufilled+1:fufilled+uncompbytes] .= if cname == "ZL"
+            read(ZlibDecompressorStream(io_buf), uncompbytes)
+        elseif cname == "XZ"
+            read(XzDecompressorStream(io_buf), uncompbytes)
+        elseif cname == "L4"
+            skip(io_buf, 8) #skip checksum
+            lz4_decompress(read(io_buf), uncompbytes)
+        else
+            error("Unsupported compression type '$(String(compression_header.algo))'")
+        end
+
+        fufilled += uncompbytes
     end
 
+    return IOBuffer(uncomp_data)
 end
 
 

diff --git a/src/utils.jl b/src/utils.jl
@@ -31,3 +31,21 @@ macro stack(into, structs...)
         end
     )
 end
+
+"""
+    unpack(x::CompressionHeader)
+
+Decode `x` and return the tuple `(algname, ver, compressedbytes, uncompressedbytes)`:
+- `algname`: name of the compression algorithm, i.e. `x.algo` converted to a `String`
+- `ver`: `x.method` converted to `Int` (documented here as the level of the compression; TODO confirm)
+- `compressedbytes`, `uncompressedbytes`: 3-byte sizes assembled low byte first (`c1..c3`, `u1..u3`) according to [uproot3](https://github.com/scikit-hep/uproot3/blob/54f5151fb7c686c3a161fbe44b9f299e482f346b/uproot3/source/compressed.py#L132)
+"""
+function unpack(x::CompressionHeader)
+    algname = String(x.algo)
+    ver = Int(x.method)
+    # widen to `Int` before shifting: a `UInt8` shifted left by 8 or more bits is 0x00 (all bits fall off)
+    compressedbytes = x.c1 + (Int(x.c2) << 8) + (Int(x.c3) << 16)
+    uncompressedbytes = x.u1 + (Int(x.u2) << 8) + (Int(x.u3) << 16)
+
+    return algname, ver, compressedbytes, uncompressedbytes
+end
diff --git a/test/samples/__pycache__/lz4.cpython-39.pyc b/test/samples/__pycache__/lz4.cpython-39.pyc
diff --git a/test/samples/tree_with_large_array_lz4.root b/test/samples/tree_with_large_array_lz4.root