
Commit

Merge pull request #59 from CoffeaTeam/processor
Introduce some processor framework ideas
lgray authored Apr 12, 2019
2 parents 51e71c2 + 87b5a84 commit 7874f05
Showing 13 changed files with 466 additions and 26 deletions.
7 changes: 0 additions & 7 deletions binder/requirements.txt

This file was deleted.

9 changes: 4 additions & 5 deletions fnal_column_analysis_tools/hist/export.py
@@ -2,14 +2,13 @@
 from uproot_methods.classes.TH1 import Methods as TH1Methods


-def export1d(hist, axis):
+def export1d(hist):
     if hist.dense_dim() != 1:
         raise ValueError("export1d() can only support one dense dimension")
+    if hist.sparse_dim() != 0:
+        raise ValueError("export1d() expects zero sparse dimensions")

-    axis = hist.axis(axis)
-    if not isinstance(axis, DenseAxis):
-        raise ValueError("export1d() only exports dense axes")
-
+    axis = hist.axes()[0]
     sumw, sumw2 = hist.values(sumw2=True, overflow='all')[()]
     edges = axis.edges(overflow='none')
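A usage sketch of the new signature: with the axis argument gone, export1d() now requires exactly one dense axis and zero sparse axes, and infers the axis itself. This assumes the uproot 3 file-writing API (uproot.recreate); the file and key names are illustrative only.

import numpy as np
import uproot
from fnal_column_analysis_tools import hist
from fnal_column_analysis_tools.hist.export import export1d

# Build a one-dense-dimension histogram and fill it
h = hist.Hist("Counts", hist.Bin("pt", "pT [GeV]", 20, 0, 100))
h.fill(pt=np.random.exponential(25., size=1000))

# export1d() now takes only the histogram; the single dense axis is inferred
fout = uproot.recreate("output.root")
fout["h_pt"] = export1d(h)
fout.close()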
26 changes: 15 additions & 11 deletions fnal_column_analysis_tools/hist/hist_tools.py
@@ -1,6 +1,7 @@
 from __future__ import division
 from collections import namedtuple
-from fnal_column_analysis_tools.util import numpy as np
+from ..util import numpy as np
+from ..processor.accumulator import AccumulatorABC
 import copy
 import functools
 import math
@@ -105,6 +106,8 @@ def __lt__(self, other):
         return False

     def __eq__(self, other):
+        if not isinstance(other, Interval):
+            return False
         if other.nan() and self.nan():
             return True
         if self._lo == other._lo and self._hi == other._hi:
@@ -232,7 +235,7 @@ def _ireduce(self, the_slice):
         if isinstance(the_slice, _regex_pattern):
             out = [v for v in self._categories if the_slice.match(v)]
         elif isinstance(the_slice, basestring):
-            pattern = "^" + re.escape(the_slice).replace(r'\*', '.*')
+            pattern = "^" + re.escape(the_slice).replace(r'\*', '.*') + "$"
             m = re.compile(pattern)
             out = [v for v in self._categories if m.match(v)]
         elif isinstance(the_slice, list):
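Why the trailing "$" matters: re.match anchors only at the start of the string, so the old pattern also selected any category that merely begins with the requested name. A quick stdlib-only illustration (category names here are hypothetical):

import re

categories = ["ttbar", "ttbar_boosted", "wjets"]

old = re.compile("^" + re.escape("ttbar").replace(r'\*', '.*'))
new = re.compile("^" + re.escape("ttbar").replace(r'\*', '.*') + "$")

print([v for v in categories if old.match(v)])  # ['ttbar', 'ttbar_boosted'] -- over-matches
print([v for v in categories if new.match(v)])  # ['ttbar'] -- exact match; '*' still wildcards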
@@ -459,18 +462,20 @@ def identifiers(self, overflow='none'):
         return self._intervals[overflow_behavior(overflow)]


-class Hist(object):
+class Hist(AccumulatorABC):
     """
     Specify a multidimensional histogram
         label: description of meaning of frequencies (axis descriptions specified in axis constructor)
         dtype: underlying numpy dtype of frequencies
         *axes: positional list of Cat or Bin objects
     """
+    DEFAULT_DTYPE = 'd'
+
     def __init__(self, label, *axes, **kwargs):
         if not isinstance(label, basestring):
             raise TypeError("label must be a string")
         self._label = label
-        self._dtype = kwargs.pop('dtype', 'd')  # Much nicer in python3 :(
+        self._dtype = kwargs.pop('dtype', Hist.DEFAULT_DTYPE)  # Much nicer in python3 :(
         if not all(isinstance(ax, Axis) for ax in axes):
             raise TypeError("All axes must be derived from Axis class")
         # if we stably partition axes to sparse, then dense, some things simplify
@@ -503,6 +508,9 @@ def copy(self, content=True):
         out._sumw2 = copy.deepcopy(self._sumw2)
         return out

+    def identity(self):
+        return self.copy(content=False)
+
     def clear(self):
         self._sumw = {}
         self._sumw2 = None
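With Hist now deriving from AccumulatorABC, histogram merging goes through the identity()/add() contract: h1 + h2 builds an empty clone via identity() and accumulates both operands into it. A sketch (axis names are illustrative):

import numpy as np
from fnal_column_analysis_tools import hist

h1 = hist.Hist("Counts", hist.Bin("x", "x value", 10, 0., 1.))
h2 = h1.identity()                  # empty histogram with identical axes
h1.fill(x=np.array([0.1, 0.2]))
h2.fill(x=np.array([0.3]))

total = h1 + h2                     # AccumulatorABC.__add__ -> identity(), then add() twice
print(total.values()[()].sum())     # 3.0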
@@ -563,7 +571,7 @@ def compatible(self, other):
                 return False
         return True

-    def __iadd__(self, other):
+    def add(self, other):
         if not self.compatible(other):
             raise ValueError("Cannot add this histogram with histogram %r of dissimilar dimensions" % other)

@@ -588,11 +596,6 @@ def add_dict(l, r):
         add_dict(self._sumw, other._sumw)
         return self

-    def __add__(self, other):
-        out = self.copy()
-        out += other
-        return out
-
     def __getitem__(self, keys):
         if not isinstance(keys, tuple):
             keys = (keys,)
@@ -640,7 +643,8 @@ def dense_op(array):

     def fill(self, **values):
         if not all(d.name in values for d in self._axes):
-            raise ValueError("Not all axes specified for this histogram!")
+            missing = ", ".join(d.name for d in self._axes if d.name not in values)
+            raise ValueError("Not all axes specified for %r. Missing: %s" % (self, missing))

         if "weight" in values and self._sumw2 is None:
             self._init_sumw2()
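The reworked fill() error now names exactly which axes were omitted, which helps for histograms with many axes. A sketch of the behavior (axis names illustrative):

import numpy as np
from fnal_column_analysis_tools import hist

h = hist.Hist("Counts",
              hist.Cat("dataset", "Dataset name"),
              hist.Bin("pt", "pT [GeV]", 20, 0, 100))

try:
    h.fill(pt=np.array([1., 2.]))  # forgot the 'dataset' axis
except ValueError as err:
    print(err)  # "... Missing: dataset"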
10 changes: 8 additions & 2 deletions fnal_column_analysis_tools/hist/plot.py
@@ -58,9 +58,13 @@ def clopper_pearson_interval(num, denom, coverage=_coverage1sd):
     c.f. http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
     """
-    lo = scipy.stats.beta.ppf((1-coverage)/2, k, n-k+1)
-    hi = scipy.stats.beta.ppf((1+coverage)/2, k+1, n-k)
+    if np.any(num > denom):
+        raise ValueError("Found numerator larger than denominator while calculating binomial uncertainty")
+    lo = scipy.stats.beta.ppf((1-coverage)/2, num, denom-num+1)
+    hi = scipy.stats.beta.ppf((1+coverage)/2, num+1, denom-num)
     interval = np.array([lo, hi])
+    interval[:, num==0.] = 0.
+    interval[:, num==denom] = 1.
     return interval
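The renamed arguments (num/denom instead of the previously undefined k/n) fix a latent NameError, and the two boundary assignments clamp the interval where the beta quantiles are undefined (no events passing, or all events passing). A usage sketch:

import numpy as np
from fnal_column_analysis_tools.hist.plot import clopper_pearson_interval

num = np.array([0., 3., 10.])      # passing counts per bin
denom = np.array([10., 10., 10.])  # total counts per bin
lo, hi = clopper_pearson_interval(num, denom)
print(lo, hi)  # per-bin 1-sigma Clopper-Pearson bounds on the efficiency num/denom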


@@ -154,6 +158,8 @@ def plot1d(hist, ax=None, clear=True, overlay=None, stack=False, overflow='none'
             sumw = np.r_[sumw, sumw[-1]]
             sumw2 = np.r_[sumw2, sumw2[-1]]
         label = str(identifier)
+        if label == '':
+            label = '<blank>'
         primitives[label] = []
         first_color = None
         if stack:
2 changes: 2 additions & 0 deletions fnal_column_analysis_tools/lumi_tools/__init__.py
@@ -119,6 +119,8 @@ def __iadd__(self, other):
         # TODO: re-apply unique? Or wait until end
         if isinstance(other, LumiList):
             self.array = np.r_[self.array, other.array]
+        else:
+            raise ValueError("Expected LumiList object, got %r" % other)
         return self

     def clear(self):
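With the new guard, accumulating anything other than a LumiList fails loudly instead of silently returning self with nothing merged. A sketch, assuming the LumiList(runs, lumis) constructor used elsewhere in lumi_tools:

import numpy as np
from fnal_column_analysis_tools.lumi_tools import LumiList

good = LumiList(runs=np.array([1, 1]), lumis=np.array([10, 11]))
more = LumiList(runs=np.array([2]), lumis=np.array([5]))

good += more          # fine: run/lumi arrays are concatenated
try:
    good += [(2, 6)]  # not a LumiList
except ValueError as err:
    print(err)        # Expected LumiList object, got [(2, 6)]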
17 changes: 17 additions & 0 deletions fnal_column_analysis_tools/processor/__init__.py
@@ -0,0 +1,17 @@
from .processor import ProcessorABC
from .dataframe import (
    LazyDataFrame,
    PreloadedDataFrame,
)
from .helpers import Weights, PackedSelection
from .executor import (
    iterative_executor,
    futures_executor,
    condor_executor,
)
from .accumulator import (
    accumulator,
    set_accumulator,
    dict_accumulator,
    defaultdict_accumulator,
)
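The ProcessorABC definition itself is not shown in this diff, so the following is only a sketch of how these exports are meant to compose, assuming the conventional accumulator/process/postprocess interface; every name below beyond the imports is illustrative:

import numpy as np
from fnal_column_analysis_tools import processor, hist

class MyProcessor(processor.ProcessorABC):
    def __init__(self):
        # one histogram plus a running event counter, merged for free by the framework
        self._accumulator = processor.dict_accumulator({
            'mass': hist.Hist("Counts", hist.Bin("mass", "m [GeV]", 30, 60, 120)),
            'nevents': processor.accumulator(0),
        })

    @property
    def accumulator(self):
        return self._accumulator

    def process(self, df):
        output = self.accumulator.identity()
        output['mass'].fill(mass=df['dimuon_mass'])  # column name is illustrative
        output['nevents'] += df.size
        return output

    def postprocess(self, accumulator):
        return accumulator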
102 changes: 102 additions & 0 deletions fnal_column_analysis_tools/processor/accumulator.py
@@ -0,0 +1,102 @@
from six import with_metaclass
from abc import ABCMeta, abstractmethod
import collections

try:
    from collections.abc import Set
except ImportError:
    from collections import Set


class AccumulatorABC(with_metaclass(ABCMeta)):
    '''
    ABC for an accumulator. Derived classes must implement:
        identity(): returns a new object of the same type as self,
            such that self + self.identity() == self
        add(other): adds an object of the same type as self to self
    Concrete implementations are provided for __add__ and __iadd__.
    '''
    @abstractmethod
    def identity(self):
        pass

    @abstractmethod
    def add(self, other):
        pass

    def __add__(self, other):
        ret = self.identity()
        ret.add(self)
        ret.add(other)
        return ret

    def __iadd__(self, other):
        self.add(other)
        return self


class accumulator(AccumulatorABC):
    '''
    Holds a value, of type and identity as provided to initializer
    '''
    def __init__(self, identity):
        self.value = identity
        self._identity = identity

    def identity(self):
        return accumulator(self._identity)

    def add(self, other):
        if isinstance(other, AccumulatorABC):
            self.value += other.value
        else:
            self.value += other


class set_accumulator(set, AccumulatorABC):
    '''
    A set with accumulator semantics
    '''
    def identity(self):
        return set_accumulator()

    def add(self, other):
        if isinstance(other, Set):
            set.update(self, other)
        else:
            set.add(self, other)


class dict_accumulator(dict, AccumulatorABC):
    '''
    Like a dict but also has accumulator semantics
    It is assumed that the contents of the dict have accumulator semantics
    '''
    def identity(self):
        ret = dict_accumulator()
        for key, value in self.items():
            ret[key] = value.identity()
        return ret

    def add(self, other):
        if isinstance(other, dict_accumulator):
            for key, value in other.items():
                if key not in self:
                    self[key] = value.identity()
                self[key] += value
        else:
            raise ValueError("Cannot add %r to dict_accumulator" % type(other))


class defaultdict_accumulator(collections.defaultdict, AccumulatorABC):
    '''
    Like a defaultdict but also has accumulator semantics
    It is assumed that the contents of the dict have accumulator semantics
    '''
    def identity(self):
        return defaultdict_accumulator(self.default_factory)

    def add(self, other):
        for key, value in other.items():
            self[key] += value
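These pieces nest: a dict_accumulator whose values are themselves accumulators merges recursively through a single +. A quick demonstration using only what this file defines (the keys and filenames are illustrative):

out1 = dict_accumulator({'sumw': accumulator(0.), 'files': set_accumulator()})
out2 = out1.identity()

out1['sumw'] += 10.
out1['files'].add('file1.root')
out2['sumw'] += 32.
out2['files'].add('file2.root')

merged = out1 + out2
print(merged['sumw'].value)     # 42.0
print(sorted(merged['files']))  # ['file1.root', 'file2.root']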
112 changes: 112 additions & 0 deletions fnal_column_analysis_tools/processor/dataframe.py
@@ -0,0 +1,112 @@
import warnings
from ..util import awkward

try:
    from collections.abc import MutableMapping
except ImportError:
    from collections import MutableMapping


class LazyDataFrame(MutableMapping):
    """
    Simple delayed uproot reader (a la lazyarrays)
    Keeps track of values accessed, for later parsing.
    """
    def __init__(self, tree, stride=None, index=None, preload_items=None):
        self._tree = tree
        self._branchargs = {'awkwardlib': awkward}
        self._stride = None
        if (stride is not None) and (index is not None):
            self._stride = stride
            self._branchargs['entrystart'] = index * stride
            self._branchargs['entrystop'] = min(self._tree.numentries, (index + 1) * stride)
        self._dict = {}
        self._materialized = set()
        if preload_items:
            self.preload(preload_items)

    def __delitem__(self, key):
        del self._dict[key]

    def __getitem__(self, key):
        if key in self._dict:
            return self._dict[key]
        elif key in self._tree:
            self._materialized.add(key)
            self._dict[key] = self._tree[key].array(**self._branchargs)
            return self._dict[key]
        else:
            raise KeyError(key)

    def __iter__(self):
        # only already-materialized columns are yielded; warn, since generic
        # mapping iteration usually expects every available branch
        warnings.warn("An iterator has requested to read all branches from the tree", RuntimeWarning)
        for item in self._dict:
            self._materialized.add(item)
            yield item

    def __len__(self):
        return len(self._dict)

    def __setitem__(self, key, value):
        self._dict[key] = value

    @property
    def available(self):
        return self._tree.keys()

    @property
    def materialized(self):
        return self._materialized

    @property
    def size(self):
        if self._stride is None:
            return self._tree.numentries
        return self._branchargs['entrystop'] - self._branchargs['entrystart']

    def preload(self, columns):
        for name in columns:
            if name in self._tree:
                _ = self[name]

class PreloadedDataFrame(MutableMapping):
    """
    For instances like spark where the columns are preloaded
    Require input number of rows (don't want to implicitly rely on picking a random item)
    Still keep track of what was accessed in case it is of use
    """
    def __init__(self, size, items):
        self._size = size
        self._dict = items
        self._accessed = set()

    def __delitem__(self, key):
        del self._dict[key]

    def __getitem__(self, key):
        self._accessed.add(key)
        return self._dict[key]

    def __iter__(self):
        for key in self._dict:
            self._accessed.add(key)
            yield key

    def __len__(self):
        return len(self._dict)

    def __setitem__(self, key, value):
        self._dict[key] = value

    @property
    def available(self):
        # PreloadedDataFrame has no backing tree; the available columns are the dict keys
        return self._dict.keys()

    @property
    def materialized(self):
        return self._accessed

    @property
    def size(self):
        return self._size
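A usage sketch for LazyDataFrame, assuming uproot 3's open() API; the file and branch names are illustrative. Only the branches actually touched are read, and materialized reports them afterwards:

import uproot
from fnal_column_analysis_tools.processor import LazyDataFrame

tree = uproot.open("nano.root")["Events"]        # file/tree names are illustrative
df = LazyDataFrame(tree, stride=50000, index=0)  # first chunk of 50k entries

pt = df["Muon_pt"]          # branch read on first access, then cached
pt_again = df["Muon_pt"]    # served from the cache
df["weight"] = pt * 0.      # derived columns can be stored back

print(df.size)              # entries in this chunk
print(df.materialized)      # {'Muon_pt'}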
