
Commit

Merge pull request #59 from CoffeaTeam/processor
Introduce some processor framework ideas
lgray authored Apr 12, 2019
2 parents 51e71c2 + 87b5a84 commit 7874f05
Showing 13 changed files with 466 additions and 26 deletions.
7 changes: 0 additions & 7 deletions binder/requirements.txt

This file was deleted.

9 changes: 4 additions & 5 deletions fnal_column_analysis_tools/hist/export.py
@@ -2,14 +2,13 @@
 from uproot_methods.classes.TH1 import Methods as TH1Methods


-def export1d(hist, axis):
+def export1d(hist):
     if hist.dense_dim() != 1:
         raise ValueError("export1d() can only support one dense dimension")
+    if hist.sparse_dim() != 0:
+        raise ValueError("export1d() expects zero sparse dimensions")

-    axis = hist.axis(axis)
-    if not isinstance(axis, DenseAxis):
-        raise ValueError("export1d() only exports dense axes")
-
+    axis = hist.axes()[0]
     sumw, sumw2 = hist.values(sumw2=True, overflow='all')[()]
     edges = axis.edges(overflow='none')
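A usage sketch of the new signature: with the axis argument gone, export1d() now requires exactly one dense axis and zero sparse axes, and infers the axis itself. This assumes the uproot 3 file-writing API (uproot.recreate); the file and key names are illustrative only.

import numpy as np
import uproot
from fnal_column_analysis_tools import hist
from fnal_column_analysis_tools.hist.export import export1d

# Build a one-dense-dimension histogram and fill it
h = hist.Hist("Counts", hist.Bin("pt", "pT [GeV]", 20, 0, 100))
h.fill(pt=np.random.exponential(25., size=1000))

# export1d() now takes only the histogram; the single dense axis is inferred
fout = uproot.recreate("output.root")
fout["h_pt"] = export1d(h)
fout.close()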
26 changes: 15 additions & 11 deletions fnal_column_analysis_tools/hist/hist_tools.py
@@ -1,6 +1,7 @@
 from __future__ import division
 from collections import namedtuple
-from fnal_column_analysis_tools.util import numpy as np
+from ..util import numpy as np
+from ..processor.accumulator import AccumulatorABC
 import copy
 import functools
 import math
@@ -105,6 +106,8 @@ def __lt__(self, other):
         return False

     def __eq__(self, other):
+        if not isinstance(other, Interval):
+            return False
         if other.nan() and self.nan():
             return True
         if self._lo == other._lo and self._hi == other._hi:
@@ -232,7 +235,7 @@ def _ireduce(self, the_slice):
         if isinstance(the_slice, _regex_pattern):
             out = [v for v in self._categories if the_slice.match(v)]
         elif isinstance(the_slice, basestring):
-            pattern = "^" + re.escape(the_slice).replace(r'\*', '.*')
+            pattern = "^" + re.escape(the_slice).replace(r'\*', '.*') + "$"
             m = re.compile(pattern)
             out = [v for v in self._categories if m.match(v)]
         elif isinstance(the_slice, list):
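Why the trailing "$" matters: re.match anchors only at the start of the string, so the old pattern also selected any category that merely begins with the requested name. A quick stdlib-only illustration (category names here are hypothetical):

import re

categories = ["ttbar", "ttbar_boosted", "wjets"]

old = re.compile("^" + re.escape("ttbar").replace(r'\*', '.*'))
new = re.compile("^" + re.escape("ttbar").replace(r'\*', '.*') + "$")

print([v for v in categories if old.match(v)])  # ['ttbar', 'ttbar_boosted'] -- over-matches
print([v for v in categories if new.match(v)])  # ['ttbar'] -- exact match; '*' still wildcards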
@@ -459,18 +462,20 @@ def identifiers(self, overflow='none'):
         return self._intervals[overflow_behavior(overflow)]


-class Hist(object):
+class Hist(AccumulatorABC):
     """
     Specify a multidimensional histogram
         label: description of meaning of frequencies (axis descriptions specified in axis constructor)
         dtype: underlying numpy dtype of frequencies
         *axes: positional list of Cat or Bin objects
     """
+    DEFAULT_DTYPE = 'd'
+
     def __init__(self, label, *axes, **kwargs):
         if not isinstance(label, basestring):
             raise TypeError("label must be a string")
         self._label = label
-        self._dtype = kwargs.pop('dtype', 'd')  # Much nicer in python3 :(
+        self._dtype = kwargs.pop('dtype', Hist.DEFAULT_DTYPE)  # Much nicer in python3 :(
         if not all(isinstance(ax, Axis) for ax in axes):
             raise TypeError("All axes must be derived from Axis class")
         # if we stably partition axes to sparse, then dense, some things simplify
@@ -503,6 +508,9 @@ def copy(self, content=True):
         out._sumw2 = copy.deepcopy(self._sumw2)
         return out

+    def identity(self):
+        return self.copy(content=False)
+
     def clear(self):
         self._sumw = {}
         self._sumw2 = None
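With Hist now deriving from AccumulatorABC, histogram merging goes through the identity()/add() contract: h1 + h2 builds an empty clone via identity() and accumulates both operands into it. A sketch (axis names are illustrative):

import numpy as np
from fnal_column_analysis_tools import hist

h1 = hist.Hist("Counts", hist.Bin("x", "x value", 10, 0., 1.))
h2 = h1.identity()                  # empty histogram with identical axes
h1.fill(x=np.array([0.1, 0.2]))
h2.fill(x=np.array([0.3]))

total = h1 + h2                     # AccumulatorABC.__add__ -> identity(), then add() twice
print(total.values()[()].sum())     # 3.0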
@@ -563,7 +571,7 @@ def compatible(self, other):
                 return False
         return True

-    def __iadd__(self, other):
+    def add(self, other):
         if not self.compatible(other):
             raise ValueError("Cannot add this histogram with histogram %r of dissimilar dimensions" % other)

@@ -588,11 +596,6 @@ def add_dict(l, r):
         add_dict(self._sumw, other._sumw)
         return self

-    def __add__(self, other):
-        out = self.copy()
-        out += other
-        return out
-
     def __getitem__(self, keys):
         if not isinstance(keys, tuple):
             keys = (keys,)
@@ -640,7 +643,8 @@ def dense_op(array):

     def fill(self, **values):
         if not all(d.name in values for d in self._axes):
-            raise ValueError("Not all axes specified for this histogram!")
+            missing = ", ".join(d.name for d in self._axes if d.name not in values)
+            raise ValueError("Not all axes specified for %r. Missing: %s" % (self, missing))

         if "weight" in values and self._sumw2 is None:
             self._init_sumw2()
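The reworked fill() error now names exactly which axes were omitted, which helps for histograms with many axes. A sketch of the behavior (axis names illustrative):

import numpy as np
from fnal_column_analysis_tools import hist

h = hist.Hist("Counts",
              hist.Cat("dataset", "Dataset name"),
              hist.Bin("pt", "pT [GeV]", 20, 0, 100))

try:
    h.fill(pt=np.array([1., 2.]))  # forgot the 'dataset' axis
except ValueError as err:
    print(err)  # "... Missing: dataset"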
10 changes: 8 additions & 2 deletions fnal_column_analysis_tools/hist/plot.py
@@ -58,9 +58,13 @@ def clopper_pearson_interval(num, denom, coverage=_coverage1sd):
     c.f. http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
     """
-    lo = scipy.stats.beta.ppf((1-coverage)/2, k, n-k+1)
-    hi = scipy.stats.beta.ppf((1+coverage)/2, k+1, n-k)
+    if np.any(num > denom):
+        raise ValueError("Found numerator larger than denominator while calculating binomial uncertainty")
+    lo = scipy.stats.beta.ppf((1-coverage)/2, num, denom-num+1)
+    hi = scipy.stats.beta.ppf((1+coverage)/2, num+1, denom-num)
     interval = np.array([lo, hi])
+    interval[:, num==0.] = 0.
+    interval[:, num==denom] = 1.
     return interval
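The renamed arguments (num/denom instead of the previously undefined k/n) fix a latent NameError, and the two boundary assignments clamp the interval where the beta quantiles are undefined (no events passing, or all events passing). A usage sketch:

import numpy as np
from fnal_column_analysis_tools.hist.plot import clopper_pearson_interval

num = np.array([0., 3., 10.])      # passing counts per bin
denom = np.array([10., 10., 10.])  # total counts per bin
lo, hi = clopper_pearson_interval(num, denom)
print(lo, hi)  # per-bin 1-sigma Clopper-Pearson bounds on the efficiency num/denom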


@@ -154,6 +158,8 @@ def plot1d(hist, ax=None, clear=True, overlay=None, stack=False, overflow='none'
             sumw = np.r_[sumw, sumw[-1]]
             sumw2 = np.r_[sumw2, sumw2[-1]]
         label = str(identifier)
+        if label == '':
+            label = '<blank>'
         primitives[label] = []
         first_color = None
         if stack:
2 changes: 2 additions & 0 deletions fnal_column_analysis_tools/lumi_tools/__init__.py
@@ -119,6 +119,8 @@ def __iadd__(self, other):
         # TODO: re-apply unique? Or wait until end
         if isinstance(other, LumiList):
             self.array = np.r_[self.array, other.array]
+        else:
+            raise ValueError("Expected LumiList object, got %r" % other)
         return self

     def clear(self):
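With the new guard, accumulating anything other than a LumiList fails loudly instead of silently returning self with nothing merged. A sketch, assuming the LumiList(runs, lumis) constructor used elsewhere in lumi_tools:

import numpy as np
from fnal_column_analysis_tools.lumi_tools import LumiList

good = LumiList(runs=np.array([1, 1]), lumis=np.array([10, 11]))
more = LumiList(runs=np.array([2]), lumis=np.array([5]))

good += more          # fine: run/lumi arrays are concatenated
try:
    good += [(2, 6)]  # not a LumiList
except ValueError as err:
    print(err)        # Expected LumiList object, got [(2, 6)]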
17 changes: 17 additions & 0 deletions fnal_column_analysis_tools/processor/__init__.py
@@ -0,0 +1,17 @@
from .processor import ProcessorABC
from .dataframe import (
    LazyDataFrame,
    PreloadedDataFrame,
)
from .helpers import Weights, PackedSelection
from .executor import (
    iterative_executor,
    futures_executor,
    condor_executor,
)
from .accumulator import (
    accumulator,
    set_accumulator,
    dict_accumulator,
    defaultdict_accumulator,
)
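The ProcessorABC definition itself is not shown in this diff, so the following is only a sketch of how these exports are meant to compose, assuming the conventional accumulator/process/postprocess interface; every name below beyond the imports is illustrative:

import numpy as np
from fnal_column_analysis_tools import processor, hist

class MyProcessor(processor.ProcessorABC):
    def __init__(self):
        # one histogram plus a running event counter, merged for free by the framework
        self._accumulator = processor.dict_accumulator({
            'mass': hist.Hist("Counts", hist.Bin("mass", "m [GeV]", 30, 60, 120)),
            'nevents': processor.accumulator(0),
        })

    @property
    def accumulator(self):
        return self._accumulator

    def process(self, df):
        output = self.accumulator.identity()
        output['mass'].fill(mass=df['dimuon_mass'])  # column name is illustrative
        output['nevents'] += df.size
        return output

    def postprocess(self, accumulator):
        return accumulator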
102 changes: 102 additions & 0 deletions fnal_column_analysis_tools/processor/accumulator.py
@@ -0,0 +1,102 @@
from six import with_metaclass
from abc import ABCMeta, abstractmethod
import collections

try:
    from collections.abc import Set
except ImportError:
    from collections import Set


class AccumulatorABC(with_metaclass(ABCMeta)):
    '''
    ABC for an accumulator. Derived classes must implement:
        identity(): returns a new object of the same type as self,
            such that self + self.identity() == self
        add(other): adds an object of the same type as self to self
    Concrete implementations are provided for __add__ and __iadd__.
    '''
    @abstractmethod
    def identity(self):
        pass

    @abstractmethod
    def add(self, other):
        pass

    def __add__(self, other):
        ret = self.identity()
        ret.add(self)
        ret.add(other)
        return ret

    def __iadd__(self, other):
        self.add(other)
        return self


class accumulator(AccumulatorABC):
    '''
    Holds a value, of type and identity as provided to initializer
    '''
    def __init__(self, identity):
        self.value = identity
        self._identity = identity

    def identity(self):
        return accumulator(self._identity)

    def add(self, other):
        if isinstance(other, AccumulatorABC):
            self.value += other.value
        else:
            self.value += other


class set_accumulator(set, AccumulatorABC):
    '''
    A set with accumulator semantics
    '''
    def identity(self):
        return set_accumulator()

    def add(self, other):
        if isinstance(other, Set):
            set.update(self, other)
        else:
            set.add(self, other)


class dict_accumulator(dict, AccumulatorABC):
    '''
    Like a dict but also has accumulator semantics
    It is assumed that the contents of the dict have accumulator semantics
    '''
    def identity(self):
        ret = dict_accumulator()
        for key, value in self.items():
            ret[key] = value.identity()
        return ret

    def add(self, other):
        if isinstance(other, dict_accumulator):
            for key, value in other.items():
                if key not in self:
                    self[key] = value.identity()
                self[key] += value
        else:
            raise ValueError("Cannot add %r to dict_accumulator" % type(other))


class defaultdict_accumulator(collections.defaultdict, AccumulatorABC):
    '''
    Like a defaultdict but also has accumulator semantics
    It is assumed that the contents of the dict have accumulator semantics
    '''
    def identity(self):
        return defaultdict_accumulator(self.default_factory)

    def add(self, other):
        for key, value in other.items():
            self[key] += value
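These pieces nest: a dict_accumulator whose values are themselves accumulators merges recursively through a single +. A quick demonstration using only what this file defines (the keys and filenames are illustrative):

out1 = dict_accumulator({'sumw': accumulator(0.), 'files': set_accumulator()})
out2 = out1.identity()

out1['sumw'] += 10.
out1['files'].add('file1.root')
out2['sumw'] += 32.
out2['files'].add('file2.root')

merged = out1 + out2
print(merged['sumw'].value)     # 42.0
print(sorted(merged['files']))  # ['file1.root', 'file2.root']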
112 changes: 112 additions & 0 deletions fnal_column_analysis_tools/processor/dataframe.py
@@ -0,0 +1,112 @@
import warnings
from ..util import awkward

try:
    from collections.abc import MutableMapping
except ImportError:
    from collections import MutableMapping


class LazyDataFrame(MutableMapping):
    """
    Simple delayed uproot reader (a la lazyarrays)
    Keeps track of values accessed, for later parsing.
    """
    def __init__(self, tree, stride=None, index=None, preload_items=None):
        self._tree = tree
        self._branchargs = {'awkwardlib': awkward}
        self._stride = None
        if (stride is not None) and (index is not None):
            self._stride = stride
            self._branchargs['entrystart'] = index * stride
            self._branchargs['entrystop'] = min(self._tree.numentries, (index + 1) * stride)
        self._dict = {}
        self._materialized = set()
        if preload_items:
            self.preload(preload_items)

    def __delitem__(self, key):
        del self._dict[key]

    def __getitem__(self, key):
        if key in self._dict:
            return self._dict[key]
        elif key in self._tree:
            self._materialized.add(key)
            self._dict[key] = self._tree[key].array(**self._branchargs)
            return self._dict[key]
        else:
            raise KeyError(key)

    def __iter__(self):
        # only already-materialized columns are yielded; warn, since generic
        # mapping iteration usually expects every available branch
        warnings.warn("An iterator has requested to read all branches from the tree", RuntimeWarning)
        for item in self._dict:
            self._materialized.add(item)
            yield item

    def __len__(self):
        return len(self._dict)

    def __setitem__(self, key, value):
        self._dict[key] = value

    @property
    def available(self):
        return self._tree.keys()

    @property
    def materialized(self):
        return self._materialized

    @property
    def size(self):
        if self._stride is None:
            return self._tree.numentries
        return self._branchargs['entrystop'] - self._branchargs['entrystart']

    def preload(self, columns):
        for name in columns:
            if name in self._tree:
                _ = self[name]

class PreloadedDataFrame(MutableMapping):
    """
    For instances like spark where the columns are preloaded
    Require input number of rows (don't want to implicitly rely on picking a random item)
    Still keep track of what was accessed in case it is of use
    """
    def __init__(self, size, items):
        self._size = size
        self._dict = items
        self._accessed = set()

    def __delitem__(self, key):
        del self._dict[key]

    def __getitem__(self, key):
        self._accessed.add(key)
        return self._dict[key]

    def __iter__(self):
        for key in self._dict:
            self._accessed.add(key)
            yield key

    def __len__(self):
        return len(self._dict)

    def __setitem__(self, key, value):
        self._dict[key] = value

    @property
    def available(self):
        # PreloadedDataFrame has no backing tree; the available columns are the dict keys
        return self._dict.keys()

    @property
    def materialized(self):
        return self._accessed

    @property
    def size(self):
        return self._size
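A usage sketch for LazyDataFrame, assuming uproot 3's open() API; the file and branch names are illustrative. Only the branches actually touched are read, and materialized reports them afterwards:

import uproot
from fnal_column_analysis_tools.processor import LazyDataFrame

tree = uproot.open("nano.root")["Events"]        # file/tree names are illustrative
df = LazyDataFrame(tree, stride=50000, index=0)  # first chunk of 50k entries

pt = df["Muon_pt"]          # branch read on first access, then cached
pt_again = df["Muon_pt"]    # served from the cache
df["weight"] = pt * 0.      # derived columns can be stored back

print(df.size)              # entries in this chunk
print(df.materialized)      # {'Muon_pt'}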
