Skip to content

Commit

Permalink
Add Iterator interface fix #6 (#28)
Browse files Browse the repository at this point in the history
* Fix entry size bug

Co-authored-by: Nick Amin <[email protected]>

* Indexing interface for `LazyBranch`

* Indexing interface's test

* Support map and broadcasting over LazyBranch via iteration

* Bump version


fix #6
  • Loading branch information
Moelf authored Jul 8, 2021
1 parent a975f22 commit 0338002
Show file tree
Hide file tree
Showing 11 changed files with 268 additions and 82 deletions.
4 changes: 3 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
name = "UnROOT"
uuid = "3cd96dde-e98d-4713-81e9-a4a1b0235ce9"
authors = ["Tamas Gal", "Jerry Ling"]
version = "0.1.9"
version = "0.2.0"

[deps]
CodecLz4 = "5ba52731-8f18-5e0d-9241-30f10d1ec561"
CodecXz = "ba30903b-d9e8-5048-a5ec-d1f5b0d4b47b"
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2"
LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637"
MD5 = "6ac74813-4b46-53a4-afec-0b5dc9d7885c"
Memoization = "6fafb56a-5788-4b4e-91ca-c0cea6611c73"
Expand All @@ -18,6 +19,7 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
CodecLz4 = "^0.3.0, ^0.4.0"
CodecXz = "^0.6.0, ^0.7.0"
CodecZlib = "^0.6.0, ^0.7.0"
CodecZstd = "^0.6.0, ^0.7.0"
LRUCache = "^1.3.0"
MD5 = "^0.2.1"
Memoization = "^0.1.10"
Expand Down
41 changes: 32 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,38 @@ Three's also a [discussion](https://github.com/scikit-hep/uproot/issues/401) rea
documentation on uproot's issue page.

## Status
The project is in early prototyping phase and contributions are very
welcome.

We support reading all scalar branch and jagged branch of "basic" types, as
a metric, UnROOT can already read all branches of CMS' NanoAOD:
We support reading all scalar and jagged branches of "basic" types, and provide an
indexing interface (and thus iteration too) backed by a basket cache. As
a metric, UnROOT can read all branches of CMS NanoAOD:

``` julia
using UnROOT

julia> t = ROOTFile("test/samples/NanoAODv5_sample.root")
ROOTFile("test/samples/NanoAODv5_sample.root") with 2 entries and 21 streamers.

# reading a flat branch
julia> b = t["Events/Electron_dxy"];

julia> BA = LazyBranch(t, b);

# you can access a branch by index, this is fairly fast, memory footprint ~ single basket
julia> for i = 5:8
@show BA[i]
end
BA[i] = Float32[]
BA[i] = Float32[-0.0012559891]
BA[i] = Float32[0.06121826, 0.00064229965]
BA[i] = Float32[0.005870819, 0.00054883957, -0.00617218]

# or a range
julia> BA[5:8]
4-element Vector{Vector{Float32}}:
[]
[-0.0012559891]
[0.06121826, 0.00064229965]
[0.005870819, 0.00054883957, -0.00617218]

# or dump a branch
julia> array(t, "Events/HLT_Mu3_PFJet40")
1000-element BitVector:
0
Expand All @@ -47,7 +66,7 @@ julia> array(t, "Events/HLT_Mu3_PFJet40")
0
...

# reading a jagged branch
# a jagged branch
julia> array(t, "Events/Electron_dxy")
1000-element Vector{Vector{Float32}}:
[0.00037050247]
Expand All @@ -56,7 +75,7 @@ julia> array(t, "Events/Electron_dxy")
[-0.0015697479]
...

# reading branch is also thread-safe, although may not be faster depending to disk I/O and cache
# reading branches is also thread-safe, although it may not be much faster depending on disk I/O and cache
julia> using ThreadsX

julia> branch_names = keys(t["Events"])
Expand Down Expand Up @@ -95,7 +114,9 @@ julia> UnROOT.splitup(data, offsets, UnROOT.KM3NETDAQHit)
[UnROOT.KM3NETDAQHit(1073742790, 0x00, 9, 0x60)......
```
This is what happens behind the scenes with some additional debug output:
<details><summary>This is what happens behind the scenes with some additional debug output: </summary>
<p>
``` julia
julia> using UnROOT
Expand Down Expand Up @@ -173,6 +194,8 @@ Compressed datastream of 1317 bytes at 6180 (TKey 't1' (TTree))
10
10
```
</p>
</details>
## Main challenges
Expand Down
6 changes: 4 additions & 2 deletions src/UnROOT.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
module UnROOT

export ROOTFile, array
export ROOTFile, array, LazyBranch

import Base: keys, get, getindex, show, length, iterate, position, ntoh, lock, unlock
using Base.Threads: SpinLock
using Memoization, LRUCache
ntoh(b::Bool) = b

using CodecZlib, CodecLz4, CodecXz
using CodecZlib, CodecLz4, CodecXz, CodecZstd
using Mixers
using Parameters
using StaticArrays
Expand All @@ -19,6 +19,8 @@ include("utils.jl")
include("streamers.jl")
include("bootstrap.jl")
include("root.jl")
include("arrayapi.jl")
# include("itr.jl")
include("custom.jl")

@static if VERSION < v"1.1"
Expand Down
122 changes: 122 additions & 0 deletions src/arrayapi.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""
arrays(f::ROOTFile, treename)
Reads all branches from a tree.
"""
function arrays(f::ROOTFile, treename)
names = keys(f[treename])
res = Vector{Any}(undef, length(names))
Threads.@threads for i in eachindex(names)
res[i] = array(f, "$treename/$(names[i])")
end
res
end


"""
array(f::ROOTFile, path; raw=false)
Reads an array from a branch. Set `raw=true` to return raw data and correct offsets.
"""
array(f::ROOTFile, path::AbstractString; raw=false) = array(f::ROOTFile, f[path]; raw=raw)

function array(f::ROOTFile, branch; raw=false)
ismissing(branch) && error("No branch found at $path")
(!raw && length(branch.fLeaves.elements) > 1) && error(
"Branches with multiple leaves are not supported yet. Try reading with `array(...; raw=true)`.")

rawdata, rawoffsets = readbranchraw(f, branch)
if raw
return rawdata, rawoffsets
end
leaf = first(branch.fLeaves.elements)
jagt = JaggType(leaf)
T = eltype(branch)
interped_data(rawdata, rawoffsets, branch, jagt, T)
end

"""
basketarray(f::ROOTFile, path, ith; raw=false)
Reads an array from ith basket of a branch. Set `raw=true` to return raw data and correct offsets.
"""
basketarray(f::ROOTFile, path::AbstractString, ithbasket) = basketarray(f, f[path], ithbasket)

function basketarray(f::ROOTFile, branch, ithbasket)
ismissing(branch) && error("No branch found at $path")
length(branch.fLeaves.elements) > 1 && error(
"Branches with multiple leaves are not supported yet. Try reading with `array(...; raw=true)`.")

rawdata, rawoffsets = readbasket(f, branch, ithbasket)
leaf = first(branch.fLeaves.elements)
jagt = JaggType(leaf)
T = eltype(branch)
interped_data(rawdata, rawoffsets, branch, jagt, T)
end

# function barrier to make getting individual index faster
# TODO upstream some types into parametric types for Branch/BranchElement
#
"""
    LazyBranch(f::ROOTFile, branch)

Construct an accessor for a given branch such that `BA[idx]` and `BA[1:20]` are almost
type-stable. Only one basket is kept in memory at a time (usually <20MB).

The type parameters are `T = eltype(branch)` and `J`, the jagged type of the branch's
first leaf.

# Example
```julia
julia> rf = ROOTFile("./test/samples/tree_with_large_array.root");

julia> b = rf["t1/int32_array"];

julia> ab = UnROOT.LazyBranch(rf, b);

julia> ab[1]
0

julia> ab[begin:end]
0
1
...
```
"""
mutable struct LazyBranch{T, J}
    f::ROOTFile                       # file the branch is read from
    b::Union{TBranch, TBranchElement} # the branch being accessed
    L::Int64                          # number of entries, `length(b)`
    fEntry::Vector{Int64}             # `b.fBasketEntry`, used to locate the basket of an index
    buffer_seek::Int64                # index of the basket held in `buffer`; -1 means none yet
    buffer::Vector{T}                 # data of the most recently read basket

    function LazyBranch(f::ROOTFile, b::Union{TBranch, TBranchElement})
        T = eltype(b)
        # we don't know how to deal with multiple leaves yet
        J = JaggType(first(b.fLeaves.elements))
        # Start with an empty buffer; the first `getindex` call fills it.
        return new{T, J}(f, b, length(b), b.fBasketEntry, -1, T[])
    end
end
# Indexing and iteration traits of a `LazyBranch`: it is addressed with `1:ba.L`
# and yields elements of type `T`.
Base.firstindex(ba::LazyBranch) = 1
Base.lastindex(ba::LazyBranch) = ba.L
Base.length(ba::LazyBranch) = ba.L
# Defined on the type, as the iteration interface expects. `eltype(ba)` on an instance
# still returns `T` through Base's `eltype(x) = eltype(typeof(x))` fallback.
Base.eltype(::Type{<:LazyBranch{T}}) where {T} = T

"""
    getindex(ba::LazyBranch, idx::Integer)

Return entry `idx` (1-based) of the branch.

The basket containing `idx` is looked up in `ba.fEntry`. If it is not the basket
currently held in `ba.buffer`, it is read with `basketarray` and replaces the buffer,
so consecutive accesses within one basket reuse the same data.

Throws a `BoundsError` if `idx` is outside `1:length(ba)` or if no basket covers it.
"""
function Base.getindex(ba::LazyBranch{T, J}, idx::Integer) where {T, J}
    (1 <= idx <= ba.L) || throw(BoundsError(ba, idx))
    # The arithmetic below treats `fEntry[k]` as the number of entries preceding basket
    # `k`: the wanted basket is the one just before the first basket starting after
    # entry `idx - 1`. The lambda form is kept to support 1.0 syntax.
    next_basket = findfirst(x -> x > (idx - 1), ba.fEntry)
    next_basket === nothing && throw(BoundsError(ba, idx))
    seek_idx = next_basket - 1
    localidx = idx - ba.fEntry[seek_idx]
    if seek_idx != ba.buffer_seek # update buffer
        ba.buffer = basketarray(ba.f, ba.b, seek_idx)
        ba.buffer_seek = seek_idx
    end
    return ba.buffer[localidx]
end

# Iteration protocol: the state is the next 1-based index, and iteration ends once
# that index has passed `ba.L`.
function Base.iterate(ba::LazyBranch{T, J}, idx=1) where {T, J}
    return idx > ba.L ? nothing : (ba[idx], idx + 1)
end

# TODO this is not terribly slow, but we can get faster implementation still ;)
# Gathers the entries of `rang` one at a time through the scalar `getindex`.
function Base.getindex(ba::LazyBranch{T, J}, rang::UnitRange) where {T, J}
    return collect(ba[i] for i in rang)
end
9 changes: 8 additions & 1 deletion src/bootstrap.jl
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,14 @@ function unpack(io, tkey::TKey, refs::Dict{Int32, Any}, ::Type{T}) where {T<:TLe
end

abstract type TBranch <: ROOTStreamedObject end
abstract type TBranchElement <: ROOTStreamedObject end
# A branch is as long as its `fEntries` field and is indexed by `Base.OneTo(fEntries)`.
function Base.length(b::Union{TBranch, TBranchElement})
    return b.fEntries
end
function Base.eachindex(b::Union{TBranch, TBranchElement})
    return Base.OneTo(b.fEntries)
end
# Element type of a branch, decided from its first leaf: the leaf's primitive type when
# the leaf is not jagged, otherwise a `Vector` of the type given by `interp_jaggT`.
function Base.eltype(b::Union{TBranch, TBranchElement})
    firstleaf = first(b.fLeaves.elements)
    if JaggType(firstleaf) === Nojagg
        return primitivetype(firstleaf)
    end
    return Vector{interp_jaggT(b, firstleaf)}
end

@with_kw struct TBranch_12 <: TBranch
cursor::Cursor
# from TNamed
Expand Down Expand Up @@ -522,7 +530,6 @@ function readfields!(cursor::Cursor, fields, ::Type{T}) where {T<:TBranch_13}
fields[:fFileName] = readtype(io, String)
end

abstract type TBranchElement <: ROOTStreamedObject end
@with_kw struct TBranchElement_9 <: TBranchElement
cursor::Cursor
# from TNamed
Expand Down
37 changes: 37 additions & 0 deletions src/itr.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# TODO this doesn't look too useful to deserve its own thing in light of the LazyBranch
# struct BranchItr{T, J}
# f::ROOTFile
# b::Union{TBranch, TBranchElement}
# current::Int64
# total_entry::Int64

# function BranchItr(f::ROOTFile, b::Union{TBranch, TBranchElement})
# T = eltype(b)
# J = JaggType(only(b.fLeaves.elements))
# # we don't know how to deal with multiple leaves yet
# new{T, J}(f, b, 0, b.fEntries)
# end
# end
# Base.length(itr::BranchItr) = itr.total_entry
# Base.eltype(itr::BranchItr{T,J}) where {T,J} = T

# function Base.iterate(itr::BranchItr{T, J}, state=(itr.current, 1, T[], 0)) where {T, J}
# current, ithbasket, entries, remaining = state

# current >= itr.total_entry && return nothing

# if iszero(remaining)
# rawdata, rawoffsets = readbasket(itr.f, itr.b, ithbasket)
# entries = interped_data(rawdata, rawoffsets, itr.b, J, T)
# remaining = length(entries)
# ithbasket += 1
# end
# return (popfirst!(entries), (current+1, ithbasket, entries, remaining-1))
# end

# function Base.show(io::IO, itr::BranchItr)
# summary(io, itr)
# println()
# println(io, "Branch: $(itr.b.fName)")
# print(io, "Total entry: $(itr.total_entry)")
# end
Loading

2 comments on commit 0338002

@Moelf
Copy link
Member Author

@Moelf Moelf commented on 0338002 Jul 8, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/40513

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.2.0 -m "<description of version>" 033800267ca5364c5f2016f76f66ae808361c61b
git push origin v0.2.0

Please sign in to comment.