Skip to content

Commit

Permalink
Merge pull request h2oai#115 from h2oai/pasha_create_frame
Browse files Browse the repository at this point in the history
Pasha create frame
  • Loading branch information
navdeep-G authored Aug 21, 2016
2 parents 9a9ca73 + 4ac288c commit 619d974
Show file tree
Hide file tree
Showing 5 changed files with 210 additions and 62 deletions.
18 changes: 10 additions & 8 deletions h2o-core/src/main/java/hex/CreateFrame.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ public class CreateFrame extends Iced {
public final Job<Frame> _job;
public long rows = 10000;
public int cols = 10;
public long seed = new Random().nextLong();
public long seed = -1;
public long seed_for_column_types = -1;
public boolean randomize = true;
public long value = 0;
Expand All @@ -41,8 +41,10 @@ public class CreateFrame extends Iced {
public CreateFrame() { this(Key.<Frame>make()); }

public Job<Frame> execImpl() {
if (seed_for_column_types==-1) seed_for_column_types = seed;
if (integer_fraction + binary_fraction + categorical_fraction + time_fraction + string_fraction > 1) throw new IllegalArgumentException("Integer, binary, categorical, time and string fractions must add up to <= 1.");
if (seed == -1) seed = new Random().nextLong();
if (seed_for_column_types == -1) seed_for_column_types = seed;
if (integer_fraction + binary_fraction + categorical_fraction + time_fraction + string_fraction > 1.00000001)
throw new IllegalArgumentException("Integer, binary, categorical, time and string fractions must add up to <= 1.");
if (missing_fraction < 0 || missing_fraction > 1) throw new IllegalArgumentException("Missing fraction must be between 0 and 1.");
if (integer_fraction < 0 || integer_fraction > 1) throw new IllegalArgumentException("Integer fraction must be between 0 and 1.");
if (binary_fraction < 0 || binary_fraction > 1) throw new IllegalArgumentException("Binary fraction must be between 0 and 1.");
Expand All @@ -54,15 +56,15 @@ public Job<Frame> execImpl() {
if (response_factors < 1) throw new IllegalArgumentException("Response factors must be either 1 (real-valued response), or >=2 (factor levels).");
if (response_factors > 1024) throw new IllegalArgumentException("Response factors must be <= 1024.");
if (factors > 1000000) throw new IllegalArgumentException("Number of factors must be <= 1,000,000).");
if (cols <= 0 || rows <= 0) throw new IllegalArgumentException("Must have number of rows > 0 and columns > 1.");
if (cols <= 0 || rows <= 0) throw new IllegalArgumentException("Must have number of rows > 0 and columns > 0.");

// estimate byte size of the frame
double byte_estimate = randomize ? rows * cols * (
binary_fraction * 1./8 //bits
+ categorical_fraction * (factors < 128 ? 1 : factors < 32768 ? 2 : 4)
+ integer_fraction * (integer_range < 128 ? 1 : integer_range < 32768 ? 2 : integer_range < (1<<31) ? 4 : 8)
+ time_fraction * 8
+ (1-integer_fraction - binary_fraction - categorical_fraction - time_fraction - string_fraction) * 8 ) //reals
+ categorical_fraction * (factors < 128 ? 1 : factors < 32768 ? 2 : 4)
+ integer_fraction * (integer_range < 128 ? 1 : integer_range < 32768 ? 2 : integer_range < (1<<31) ? 4 : 8)
+ time_fraction * 8
+ (1-integer_fraction - binary_fraction - categorical_fraction - time_fraction - string_fraction) * 8 ) //reals
+ rows //response is
: 0; // all constants - should be small

Expand Down
21 changes: 16 additions & 5 deletions h2o-core/src/main/java/water/fvec/FrameCreator.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,24 @@ public FrameCreator(CreateFrame createFrame) {
int[] shuffled_idx = new int[idx.length];
ArrayUtils.shuffleArray(idx, idx.length, shuffled_idx, _createFrame.seed_for_column_types, 0);

int catcols = (int)(_createFrame.categorical_fraction * _createFrame.cols);
int intcols = (int)(_createFrame.integer_fraction * _createFrame.cols);
int bincols = (int)(_createFrame.binary_fraction * _createFrame.cols);
int timecols = (int)(_createFrame.time_fraction * _createFrame.cols);
int stringcols = (int)(_createFrame.string_fraction * _createFrame.cols);
// Sometimes the client requests, say, a categorical fraction of 0.3. By the time this number arrives here, it becomes
// something like 0.299999999997. If we just multiply by the number of columns (say 10000) and take the integer part,
// we'd have only 2999 columns -- not what the client expects. This is why we add 0.1 to each count before
// truncating it to an integer.
int catcols = (int)(_createFrame.categorical_fraction * _createFrame.cols + 0.1);
int intcols = (int)(_createFrame.integer_fraction * _createFrame.cols + 0.1);
int bincols = (int)(_createFrame.binary_fraction * _createFrame.cols + 0.1);
int timecols = (int)(_createFrame.time_fraction * _createFrame.cols + 0.1);
int stringcols = (int)(_createFrame.string_fraction * _createFrame.cols + 0.1);
int realcols = _createFrame.cols - catcols - intcols - bincols - timecols - stringcols;

// At this point we might have accidentally allocated too many columns. In such a case, adjust their counts.
if (realcols < 0 && catcols > 0) { catcols--; realcols++; }
if (realcols < 0 && intcols > 0) { intcols--; realcols++; }
if (realcols < 0 && bincols > 0) { bincols--; realcols++; }
if (realcols < 0 && timecols > 0) { timecols--; realcols++; }
if (realcols < 0 && stringcols > 0) { stringcols--; realcols++; }

assert(catcols >= 0);
assert(intcols >= 0);
assert(bincols >= 0);
Expand Down
100 changes: 65 additions & 35 deletions h2o-py/h2o/h2o.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from h2o.backend import H2OLocalServer
from h2o.exceptions import H2OConnectionError, H2OValueError
from h2o.utils.shared_utils import deprecated, gen_header, is_list_of_lists, py_tmp_key, quoted, urlopen
from h2o.utils.typechecks import I, U, assert_is_type, assert_satisfies, is_type, numeric
from h2o.utils.typechecks import BoundInt, BoundNumeric, I, U, assert_is_type, assert_satisfies, is_type, numeric
from .estimators.deeplearning import H2OAutoEncoderEstimator
from .estimators.deeplearning import H2ODeepLearningEstimator
from .estimators.estimator_base import H2OEstimator
Expand Down Expand Up @@ -871,23 +871,24 @@ def cluster():



def create_frame(id=None, rows=10000, cols=10, randomize=True, value=0, real_range=100,
categorical_fraction=0.2, factors=100, integer_fraction=0.2, integer_range=100,
binary_fraction=0.1, binary_ones_fraction=0.02, time_fraction=0, string_fraction=0,
def create_frame(frame_id=None, rows=10000, cols=10, randomize=True,
real_fraction=None, categorical_fraction=None, integer_fraction=None,
binary_fraction=None, time_fraction=None, string_fraction=None,
value=0, real_range=100, factors=100, integer_range=100, binary_ones_fraction=0.02,
missing_fraction=0.01, response_factors=2, has_response=False, seed=None, seed_for_column_types=None):
"""
Create a new frame with random data in H2O.
Create a new frame with random data.
Creates a data frame in H2O with real-valued, categorical, integer,
and binary columns specified by the user.
Creates a data frame in H2O with real-valued, categorical, integer, and binary columns specified by the user.
:param id: the destination key. If empty, this will be auto-generated by H2O.
:param frame_id: the destination key. If empty, this will be auto-generated.
:param rows: the number of rows of data to generate.
:param cols: the number of columns of data to generate. Excludes the response column if has_response is True.
:param randomize: If True, data values will be randomly generated. This must be True if either
categorical_fraction or integer_fraction is non-zero.
:param value: if randomize is False, then all real-valued entries will be set to this value.
:param real_range: the range of randomly generated real values.
:param real_fraction: the fraction of columns that are real-valued.
:param categorical_fraction: the fraction of total columns that are categorical.
:param factors: the number of (unique) factor levels in each categorical column.
:param integer_fraction: the fraction of total columns that are integer-valued.
Expand All @@ -905,50 +906,79 @@ def create_frame(id=None, rows=10000, cols=10, randomize=True, value=0, real_ran
:returns: an :class:`H2OFrame` object
"""
assert_is_type(id, str, None)
assert_is_type(rows, int)
assert_is_type(cols, int)
t_fraction = U(None, BoundNumeric(0, 1))
assert_is_type(frame_id, str, None)
assert_is_type(rows, BoundInt(1))
assert_is_type(cols, BoundInt(1))
assert_is_type(randomize, bool)
assert_is_type(value, numeric)
assert_is_type(real_range, numeric)
assert_is_type(factors, int)
assert_is_type(integer_range, int)
assert_is_type(categorical_fraction, numeric)
assert_is_type(integer_fraction, numeric)
assert_is_type(binary_fraction, numeric)
assert_is_type(time_fraction, numeric)
assert_is_type(string_fraction, numeric)
assert_is_type(missing_fraction, numeric)
assert_is_type(response_factors, int, None)
assert_is_type(real_range, BoundNumeric(0))
assert_is_type(real_fraction, t_fraction)
assert_is_type(categorical_fraction, t_fraction)
assert_is_type(integer_fraction, t_fraction)
assert_is_type(binary_fraction, t_fraction)
assert_is_type(time_fraction, t_fraction)
assert_is_type(string_fraction, t_fraction)
assert_is_type(missing_fraction, t_fraction)
assert_is_type(binary_ones_fraction, t_fraction)
assert_is_type(factors, BoundInt(1))
assert_is_type(integer_range, BoundInt(1))
assert_is_type(response_factors, None, BoundInt(1))
assert_is_type(has_response, bool)
assert_is_type(seed, int, None)
assert_is_type(seed_for_column_types, int, None)
assert_satisfies(categorical_fraction, 0 <= categorical_fraction <= 1)
assert_satisfies(integer_fraction, 0 <= integer_fraction <= 1)
assert_satisfies(binary_fraction, 0 <= binary_fraction <= 1)
assert_satisfies(time_fraction, 0 <= time_fraction <= 1)
assert_satisfies(string_fraction, 0 <= string_fraction <= 1)
assert_satisfies(missing_fraction, 0 <= missing_fraction <= 1)
assert_satisfies(binary_ones_fraction, 0 <= binary_ones_fraction <= 1)
if (categorical_fraction or integer_fraction) and not randomize:
raise H2OValueError("`randomize` should be True when either categorical or integer columns are used.")
if categorical_fraction + integer_fraction + binary_fraction + time_fraction + string_fraction > 1:

# The total column fraction that the user has specified explicitly. This sum should not exceed 1. We will respect
# all explicitly set fractions, and will auto-select the remaining fractions.
frcs = [real_fraction, categorical_fraction, integer_fraction, binary_fraction, time_fraction, string_fraction]
wgts = [0.5, 0.2, 0.2, 0.1, 0.0, 0.0]
sum_explicit_fractions = sum(0 if f is None else f for f in frcs)
count_explicit_fractions = sum(0 if f is None else 1 for f in frcs)
remainder = 1 - sum_explicit_fractions
if sum_explicit_fractions >= 1 + 1e-10:
raise H2OValueError("Fractions of binary, integer, categorical, time and string columns should add up "
"to a number less than 1.")
parms = {"dest": py_tmp_key(append=h2oconn.session_id) if id is None else id,
elif sum_explicit_fractions >= 1 - 1e-10:
# The fractions already add up to almost 1. No need to do anything (the server will absorb the tiny
# remainder into the real_fraction column).
pass
else:
# sum_explicit_fractions < 1 => distribute the remainder among the columns that were not set explicitly
if count_explicit_fractions == 6:
raise H2OValueError("Fraction of binary, integer, categorical, time and string columns add up to a "
"number less than 1.")
# Each column type receives a certain part (proportional to column's "weight") of the remaining fraction.
sum_implicit_weights = sum(wgts[i] if frcs[i] is None else 0 for i in range(6))
for i, f in enumerate(frcs):
if frcs[i] is not None: continue
if sum_implicit_weights == 0:
frcs[i] = remainder
else:
frcs[i] = remainder * wgts[i] / sum_implicit_weights
remainder -= frcs[i]
sum_implicit_weights -= wgts[i]
for i, f in enumerate(frcs):
if f is None:
frcs[i] = 0
real_fraction, categorical_fraction, integer_fraction, binary_fraction, time_fraction, string_fraction = frcs

parms = {"dest": frame_id if frame_id else py_tmp_key(append=h2oconn.session_id),
"rows": rows,
"cols": cols,
"randomize": randomize,
"value": value,
"real_range": real_range,
"categorical_fraction": categorical_fraction,
"factors": factors,
"integer_fraction": integer_fraction,
"integer_range": integer_range,
"binary_fraction": binary_fraction,
"binary_ones_fraction": binary_ones_fraction,
"time_fraction": time_fraction,
"string_fraction": string_fraction,
# "real_fraction" is not provided, the backend computes it as 1 - sum(5 other fractions)
"value": value,
"real_range": real_range,
"factors": factors,
"integer_range": integer_range,
"binary_ones_fraction": binary_ones_fraction,
"missing_fraction": missing_fraction,
"response_factors": response_factors,
"has_response": has_response,
Expand Down
72 changes: 69 additions & 3 deletions h2o-py/h2o/utils/typechecks.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@
# ``Dict`` is a dictionary type which should match exactly (i.e. each key must be present in tested variable)
Dict(error=str) # dictionary with only one key "error" with string value
# ``BoundInt``, ``BoundNumeric`` are numbers that are bounded from below and/or above
BoundInt(1, 100)
BoundNumeric(0, 1)
# Lazy class references: these types can be used anywhere without having to load the corresponding modules. Their
# resolution is deferred until the run time, and if the module cannot be loaded no exception will be raised (but
# of course the type check will fail).
Expand All @@ -98,12 +102,14 @@
import re
import sys
import tokenize
from types import FunctionType, BuiltinFunctionType
from types import BuiltinFunctionType, FunctionType

from h2o.utils.compatibility import * # NOQA
from h2o.exceptions import H2OTypeError, H2OValueError
from h2o.utils.compatibility import * # NOQA
from h2o.utils.compatibility import PY2, viewitems

__all__ = ("U", "I", "NOT", "Tuple", "Dict", "MagicType", "numeric", "h2oframe", "pandas_dataframe", "numpy_ndarray",
__all__ = ("U", "I", "NOT", "Tuple", "Dict", "MagicType", "BoundInt", "BoundNumeric",
"numeric", "h2oframe", "pandas_dataframe", "numpy_ndarray",
"assert_is_type", "assert_matches", "assert_satisfies", "is_type")


Expand Down Expand Up @@ -276,6 +282,66 @@ def name(self, src=None):
for key, ktype in viewitems(self._types))


class BoundInt(MagicType):
    """Integer type restricted to an (optionally open-ended) inclusive range."""

    def __init__(self, lb=None, ub=None):
        """
        Create a BoundInt object.

        A value matches when it is an integer lying within the given bounds, both ends inclusive. For example,
        ``BoundInt(0, 100)`` accepts every integer from 0 through 100, ``BoundInt(1)`` accepts positive integers,
        and ``BoundInt(None, -1)`` accepts negative integers.

        :param lb: lower bound (can be None or int)
        :param ub: upper bound (can be None or int)
        """
        self._lower_bound = lb
        self._upper_bound = ub

    def check(self, var):
        """Return True if the variable matches the specified type."""
        if not isinstance(var, _int_type):
            return False
        lo = self._lower_bound
        if lo is not None and not (var >= lo):
            return False
        hi = self._upper_bound
        return hi is None or var <= hi

    def name(self, src=None):
        """Return string representing the name of this type."""
        lo, hi = self._lower_bound, self._upper_bound
        if hi is None and lo is None:
            return "int"
        if hi is None:
            # Positive integers get a dedicated, shorter label.
            return "int>0" if lo == 1 else "int≥%d" % lo
        if lo is None:
            return "int≤%d" % hi
        return "int[%d…%d]" % (lo, hi)


class BoundNumeric(MagicType):
    """Numeric type bounded from below/above."""

    def __init__(self, lb=None, ub=None):
        """
        Create a BoundNumeric object.

        The type matches any numeric value that lies within the specified bounds (inclusively). A bound given
        as None leaves that side of the range open.

        :param lb: lower bound (can be None or numeric)
        :param ub: upper bound (can be None or numeric)
        """
        self._lower_bound = lb
        self._upper_bound = ub

    def check(self, var):
        """Return True if the variable matches the specified type."""
        return (isinstance(var, _num_type) and
                (self._lower_bound is None or var >= self._lower_bound) and
                (self._upper_bound is None or var <= self._upper_bound))

    def name(self, src=None):
        """
        Return string representing the name of this type.

        :param src: unused here; accepted so the signature matches the other type classes' ``name`` methods.
        """
        # Bounds may be fractional, so they are formatted with ``%s``: ``%d`` would silently truncate them
        # (e.g. a lower bound of 0.5 would be reported as 0), producing a misleading type name.
        if self._upper_bound is None and self._lower_bound is None: return "numeric"
        if self._upper_bound is None: return "numeric≥%s" % self._lower_bound
        if self._lower_bound is None: return "numeric≤%s" % self._upper_bound
        return "numeric[%s…%s]" % (self._lower_bound, self._upper_bound)


class _LazyClass(MagicType):
"""
Expand Down
Loading

0 comments on commit 619d974

Please sign in to comment.