Skip to content

Commit

Permalink
Add shortcut for symmetric covariance matrices
Browse files Browse the repository at this point in the history
  • Loading branch information
ludi committed Feb 4, 2016
1 parent 7242d6c commit 618cd68
Show file tree
Hide file tree
Showing 5 changed files with 112 additions and 23 deletions.
96 changes: 84 additions & 12 deletions h2o-core/src/main/java/water/rapids/ASTVariance.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,17 @@
/** Variance between columns of a frame */
class ASTVariance extends ASTPrim {
@Override
public String[] args() { return new String[]{"ary", "x","y","use"}; }
public String[] args() { return new String[]{"ary", "x","y","use", "symmetric"}; }
private enum Mode { Everything, AllObs, CompleteObs, PairwiseCompleteObs }
@Override int nargs() { return 1+3; /* (var X Y use) */}
@Override int nargs() { return 1+4; /* (var X Y use symmetric) */}
@Override public String str() { return "var"; }
@Override Val apply( Env env, Env.StackHelp stk, AST asts[] ) {
Frame frx = stk.track(asts[1].exec(env)).getFrame();
Frame fry = stk.track(asts[2].exec(env)).getFrame();
if( frx.numRows() != fry.numRows() )
throw new IllegalArgumentException("Frames must have the same number of rows, found "+frx.numRows()+" and "+fry.numRows());
String use = stk.track(asts[3].exec(env)).getStr();
boolean symmetric = asts[4].exec(env).getNum()==1;
Mode mode;
//In R, if the use arg is set, the na.rm arg has no effect (same result whether it is T or F). The na.rm param only
// comes into play when no use arg is set. Without a use arg, setting na.rm = T is equivalent to use = "complete.obs",
Expand All @@ -33,7 +34,7 @@ private enum Mode { Everything, AllObs, CompleteObs, PairwiseCompleteObs }
default: throw new IllegalArgumentException("unknown use mode, found: "+use);
}

return frx.numRows() == 1 ? scalar(frx,fry,mode) : array(frx,fry,mode);
return frx.numRows() == 1 ? scalar(frx,fry,mode) : array(frx,fry,mode, symmetric);
}

// Scalar covariance for 1 row
Expand Down Expand Up @@ -65,13 +66,50 @@ private ValNum scalar( Frame frx, Frame fry, Mode mode) {
// Matrix covariance. Compute covariance between all columns from each Frame
// against each other. Return a matrix of covariances which is frx.numCols
// tall and fry.numCols wide.
private Val array( Frame frx, Frame fry, Mode mode) {
private Val array( Frame frx, Frame fry, Mode mode, boolean symmetric) {
Vec[] vecxs = frx.vecs(); int ncolx = vecxs.length;
Vec[] vecys = fry.vecs(); int ncoly = vecys.length;

if (mode.equals(Mode.Everything) || mode.equals(Mode.AllObs)) {
// Launch tasks; each does all Xs vs one Y
CoVarTaskEverything[] cvts = new CoVarTaskEverything[ncoly];
if (symmetric) {
int[] idx = new int[ncoly];
for (int y = 0; y < ncoly; y++) idx[y] = y;
int[] first_index = new int[]{0};
//compute covariances between column_i and and column_i, column_i+1, ...
Frame reduced_fr = new Frame(frx);
for (int y = 0; y <ncoly; y++) {
cvts[y] = new CoVarTaskEverything().dfork(new Frame(vecys[y]).add(reduced_fr));
idx = ArrayUtils.removeIds(idx, first_index);
reduced_fr = new Frame(frx.vecs(idx));
}
//arrange the results into the bottom left of res_array. each successive cvts is 1 smaller in length
double[][] res_array = new double[ncoly][ncoly];
for (int y =0; y<ncoly; y++) {
double[] res_array_y = res_array[y];
CoVarTaskEverything cvtx = cvts[y].getResult();
if (mode.equals(Mode.AllObs))
for (double ss : cvtx._ss)
if (Double.isNaN(ss)) throw new IllegalArgumentException("Mode is 'all.obs' but NAs are present");
double[] res = ArrayUtils.div(ArrayUtils.subtract(cvtx._ss, ArrayUtils.mult(cvtx._xsum,
ArrayUtils.div(cvtx._ysum, frx.numRows()))), frx.numRows() - 1);
System.arraycopy(res, 0, res_array_y, y, ncoly - y);
}
//copy over the bottom left of res_array to its top right
for (int y = 0; y < ncoly -1; y++) {
for (int x = y+1; x < ncoly ; x++) {
res_array[x][y] = res_array[y][x];
}
}
//set Frame
Vec[] res = new Vec[ncoly];
Key<Vec>[] keys = Vec.VectorGroup.VG_LEN1.addVecs(ncoly);
for (int y = 0; y < ncoly; y++) {
res[y] = Vec.makeVec(res_array[y], keys[y]);
}
return new ValFrame(new Frame(fry._names, res));
}
// Launch tasks; each does all Xs vs one Y
for (int y = 0; y < ncoly; y++)
cvts[y] = new CoVarTaskEverything().dfork(new Frame(vecys[y]).add(frx));
// Short cut for the 1-row-1-col result: return a scalar
Expand Down Expand Up @@ -109,7 +147,6 @@ private Val array( Frame frx, Frame fry, Mode mode) {
//}
return new ValFrame(new Frame(fry._names, res));
}

if (mode.equals(Mode.CompleteObs)) {
CoVarTaskComplete cvs = new CoVarTaskComplete(ncolx, ncoly).doAll(new Frame(fry).add(frx));

Expand All @@ -125,10 +162,44 @@ private Val array( Frame frx, Frame fry, Mode mode) {
}
return new ValFrame(new Frame(fry._names, res));
}

if (mode.equals(Mode.PairwiseCompleteObs)) {

else { //if (mode.equals(Mode.PairwiseCompleteObs)) {
CoVarTaskPairwise[] cvts = new CoVarTaskPairwise[ncoly];
if (symmetric) {
int[] idx = new int[ncoly];
for (int y = 0; y < ncoly; y++) idx[y] = y;
int[] first_index = new int[]{0};
//compute covariances between column_i and and column_i, column_i+1, ...
Frame reduced_fr = new Frame(frx);
for (int y = 0; y <ncoly; y++) {
cvts[y] = new CoVarTaskPairwise().dfork(new Frame(vecys[y]).add(reduced_fr));
idx = ArrayUtils.removeIds(idx, first_index);
reduced_fr = new Frame(frx.vecs(idx));
}
//arrange the results into the bottom left of res_array. each successive cvts is 1 smaller in length
double[][] res_array = new double[ncoly][ncoly];
for (int y =0; y<ncoly; y++) {
double[] res_array_y = res_array[y];
CoVarTaskPairwise cvtx = cvts[y].getResult();

double[] res = ArrayUtils.div(ArrayUtils.subtract(cvtx._ss, ArrayUtils.mult(cvtx._xsum,
ArrayUtils.div(cvtx._ysum, ArrayUtils.subtract(frx.numRows(), cvtx._NACount.clone())))),
ArrayUtils.subtract(frx.numRows() - 1, cvtx._NACount.clone()));
System.arraycopy(res, 0, res_array_y, y, ncoly - y);
}
//copy over the bottom left of res_array to its top right
for (int y = 0; y < ncoly -1; y++) {
for (int x = y+1; x < ncoly ; x++) {
res_array[x][y] = res_array[y][x];
}
}
//set Frame
Vec[] res = new Vec[ncoly];
Key<Vec>[] keys = Vec.VectorGroup.VG_LEN1.addVecs(ncoly);
for (int y = 0; y < ncoly; y++) {
res[y] = Vec.makeVec(res_array[y], keys[y]);
}
return new ValFrame(new Frame(fry._names, res));
}
for (int y = 0; y < ncoly; y++)
cvts[y] = new CoVarTaskPairwise().dfork(new Frame(vecys[y]).add(frx));
// Short cut for the 1-row-1-col result: return a scalar
Expand All @@ -142,12 +213,12 @@ private Val array( Frame frx, Frame fry, Mode mode) {
for (int y = 0; y < ncoly; y++) {
CoVarTaskPairwise cvtx = cvts[y].getResult();
res[y] = Vec.makeVec(ArrayUtils.div(ArrayUtils.subtract(cvtx._ss, ArrayUtils.mult(cvtx._xsum,
ArrayUtils.div(cvtx._ysum, ArrayUtils.subtract(frx.numRows(), cvtx._NACount.clone())))), ArrayUtils.subtract(frx.numRows() - 1, cvtx._NACount.clone())), keys[y]);
ArrayUtils.div(cvtx._ysum, ArrayUtils.subtract(frx.numRows(), cvtx._NACount.clone())))),
ArrayUtils.subtract(frx.numRows() - 1, cvtx._NACount.clone())), keys[y]);
}

return new ValFrame(new Frame(fry._names, res));
}
throw new IllegalArgumentException("unknown use mode, found: "+mode);
}


Expand Down Expand Up @@ -237,8 +308,9 @@ private static class CoVarTaskComplete extends MRTask<CoVarTaskComplete> {
if (add) {
for (int y=0; y < _ncoly; y++) {
_ss_y = _ss[y];
yval = yvals[y];
for (int x = 0; x < _ncolx; x++)
_ss_y[x] += xvals[x] * yvals[y];
_ss_y[x] += xvals[x] * yval;
}
ArrayUtils.add(_xsum, xvals);
ArrayUtils.add(_ysum, yvals);
Expand Down
4 changes: 2 additions & 2 deletions h2o-core/src/test/java/water/rapids/RapidsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -205,10 +205,10 @@ public class RapidsTest extends TestUtil {

@Test public void testVariance() {
// Checking variance: scalar
String tree = "({x . (var x x \"everything\")} (rows a.hex [0]))";
String tree = "({x . (var x x \"everything\" FALSE)} (rows a.hex [0]))";
checkTree(tree);

tree = "({x . (var x x \"everything\")} a.hex)";
tree = "({x . (var x x \"everything\" FALSE)} a.hex)";
checkTree(tree);

tree = "(table (trunc (cols a.hex 1)) FALSE)";
Expand Down
9 changes: 6 additions & 3 deletions h2o-py/h2o/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1502,10 +1502,13 @@ def var(self,y=None,na_rm=False, use=None):
An H2OFrame of the covariance matrix of the columns of this H2OFrame with itself (if y is not given), or with the columns of y
(if y is given). If self and y are single rows or single columns, the variance or covariance is given as a scalar.
"""
if y is None: y = self
symmetric = False
if y is None:
y = self
if self.ncol > 1 and self.nrow > 1: symmetric = True
if use is None: use = "complete.obs" if na_rm else "everything"
if self.nrow==1 or (self.ncol==1 and y.ncol==1): return ExprNode("var",self,y,use)._eager_scalar()
return H2OFrame._expr(expr=ExprNode("var",self,y,use))._frame()
if self.nrow==1 or (self.ncol==1 and y.ncol==1): return ExprNode("var",self,y,use,symmetric)._eager_scalar()
return H2OFrame._expr(expr=ExprNode("var",self,y,use,symmetric))._frame()

def sd(self, na_rm=False):
"""Compute the standard deviation.
Expand Down
8 changes: 6 additions & 2 deletions h2o-r/h2o-package/R/frame.R
Original file line number Diff line number Diff line change
Expand Up @@ -1872,15 +1872,19 @@ mean.H2OFrame <- h2o.mean
#' }
#' @export
h2o.var <- function(x, y = NULL, na.rm = FALSE, use) {
if( is.null(y) ) y <- x
symmetric <- FALSE
if( is.null(y) ) {
y <- x
if( ncol(x) > 1 && nrow(x) > 1) symmetric <- TRUE
}
if(!missing(use)) {
if (use == "na.or.complete")
stop("Unimplemented : `use` may be either \"everything\", \"all.obs\", \"complete.obs\", or \"pairwise.complete.obs\"")
} else {
if (na.rm) use <- "complete.obs" else use <- "everything"
}
# Eager, mostly to match prior semantics but no real reason it need to be
expr <- .newExpr("var",x,y,.quote(use))
expr <- .newExpr("var",x,y,.quote(use),symmetric)
if( (nrow(x)==1L || (ncol(x)==1L && ncol(y)==1L)) ) .eval.scalar(expr)
else .fetch.data(expr,ncol(x))
}
Expand Down
18 changes: 14 additions & 4 deletions h2o-r/tests/testdir_munging/unop/runit_covar.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@ source("../../../scripts/h2o-r-test-setup.R")


test.var <- function() {


#1 row case
n <- 20
g <- runif(n)
Expand All @@ -29,21 +27,33 @@ test.var <- function() {
g[1,4] <- NA
h[2,3] <- NA
run.var.tests(g,h,has_nas=TRUE)

}

run.var.tests <- function (g,h,one_row=FALSE,has_nas=FALSE) {
h2o_g <- as.h2o(g)
h2o_h <- as.h2o(h)
uses <- c("everything", "all.obs", "complete.obs", "pairwise.complete.obs")
if (has_nas) uses <- uses[-2]
for (na.rm in c(FALSE, TRUE)) {
for (use in uses) {
#2 inputs
if (one_row) {
h2o_var <- var(as.h2o(t(g)),as.h2o(t(h)), na.rm = na.rm, use = use)
} else {
h2o_var <- var(as.h2o(g),as.h2o(h), na.rm = na.rm, use = use)
h2o_var <- var(h2o_g, h2o_h, na.rm = na.rm, use = use)
}
R_var <- var(g,h, na.rm = na.rm, use = use)
h2o_and_R_equal(h2o_var, R_var)

#1 input
h2o_var <- var(h2o_g, na.rm=na.rm, use=use)
R_var <- var(g, na.rm=na.rm, use=use)
h2o_and_R_equal(h2o_var, R_var)

#other input
h2o_var <- var(h2o_h, na.rm=na.rm, use=use)
R_var <- var(h, na.rm=na.rm, use=use)
h2o_and_R_equal(h2o_var, R_var)
}
}
}
Expand Down

0 comments on commit 618cd68

Please sign in to comment.