From 3e42434f652e4df57356418bf95e5e4855f1488d Mon Sep 17 00:00:00 2001
From: tomasnykodym
Date: Mon, 22 Aug 2016 11:44:16 -0700
Subject: [PATCH] PUBDEV-1612: orc parser. (#93)

Built on top of Michal M's avro-parser, using Tomas's parser setup with
modifications to support ORC parsing. Additional tests were added by Nidhi,
who fixed the R unit test on HDFS. Extra help from Jeff G was needed on a
lot of setup issues.

Includes:
- Added corresponding HDFS tests from pyunit tests.
- Removed bad tests.
---
 build.gradle | 4 +-
 gradle.properties | 16 +-
 h2o-app/build.gradle | 2 +
 h2o-assembly/build.gradle | 3 +
 .../java/water/api/ParseSetupHandler.java | 16 +-
 .../src/main/java/water/fvec/FileVec.java | 23 +-
 .../water/parser/DefaultParserProviders.java | 17 +-
 .../java/water/parser/FVecParseWriter.java | 5 +-
 .../src/main/java/water/parser/ORCParser.java | 7 -
 .../main/java/water/parser/ParseDataset.java | 47 +-
 .../main/java/water/parser/ParseSetup.java | 43 +-
 .../java/water/parser/ParserProvider.java | 22 +-
 .../rapids/ast/prims/mungers/AstFlatten.java | 6 +-
 h2o-core/src/main/java/water/util/Log.java | 13 +-
 h2o-core/src/test/java/water/TestUtil.java | 81 +++
 .../test/java/water/parser/ParserTest2.java | 18 +-
 .../test/java/water/rapids/GroupingBench.java | 2 +-
 h2o-hadoop/assemblyjar.gradle | 12 +
 h2o-hadoop/h2o-cdh5.2-assembly/build.gradle | 2 +
 h2o-hadoop/h2o-cdh5.2/build.gradle | 1 +
 h2o-hadoop/h2o-cdh5.3-assembly/build.gradle | 2 +
 h2o-hadoop/h2o-cdh5.4.2-assembly/build.gradle | 6 +-
 h2o-hadoop/h2o-cdh5.5.3-assembly/build.gradle | 4 +-
 h2o-hadoop/h2o-cdh5.6.0-assembly/build.gradle | 2 +
 h2o-hadoop/h2o-cdh5.7.0-assembly/build.gradle | 2 +
 h2o-hadoop/h2o-hdp2.1-assembly/build.gradle | 3 +
 h2o-hadoop/h2o-hdp2.2-assembly/build.gradle | 2 +
 h2o-hadoop/h2o-hdp2.3-assembly/build.gradle | 3 +
 h2o-hadoop/h2o-hdp2.4-assembly/build.gradle | 2 +
 .../h2o-mapr3.1.1-assembly/build.gradle | 1 +
 .../h2o-mapr4.0.1-assembly/build.gradle | 1 +
 h2o-hadoop/h2o-mapr5.0-assembly/build.gradle | 1 +
 h2o-hadoop/h2o-mapr5.1-assembly/build.gradle | 1 +
 .../water/parser/avro/AvroParserProvider.java | 4 +-
 h2o-parsers/h2o-orc-parser/build.gradle | 38 ++
 .../main/java/water/parser/orc/OrcParser.java | 614 ++++++++++++++++++
 .../water/parser/orc/OrcParserProvider.java | 138 ++++
 .../main/java/water/parser/orc/OrcUtil.java | 72 ++
 .../services/water.parser.ParserProvider | 1 +
 .../water/parser/ParseTestMultiFileOrc.java | 55 ++
 .../java/water/parser/ParseTestORCCSV.java | 78 +++
 .../test/java/water/parser/ParseTestOrc.java | 434 +++++++++++++
 h2o-parsers/h2o-orc-parser/testMultiNode.sh | 124 ++++
 h2o-persist-hdfs/build.gradle | 16 +-
 .../main/java/water/persist/PersistHdfs.java | 22 +-
 h2o-py/h2o/expr.py | 3 +-
 h2o-py/tests/pyunit_utils/utilsPY.py | 247 +++++++
 h2o-py/tests/testdir_hdfs/index.list | 7 +
 .../pyunit_INTERNAL_HDFS_airlines_orc.py | 81 +++
 .../pyunit_INTERNAL_HDFS_baddata_orc.py | 44 ++
 ...NTERNAL_HDFS_hexdev_29_import_types_orc.py | 45 ++
 ...HDFS_import_folder_airline_05_orc_large.py | 73 +++
 .../pyunit_INTERNAL_HDFS_import_folder_orc.py | 46 ++
 ...nit_INTERNAL_HDFS_iris_import_types_orc.py | 44 ++
 ...pyunit_INTERNAL_HDFS_milsongs_orc_large.py | 45 ++
 .../pyunit_INTERNAL_HDFS_orc_parser.py | 60 ++
 .../pyunit_INTERNAL_HDFS_prostate_orc.py | 48 ++
 ...pyunit_INTERNAL_HDFS_timestamp_date_orc.py | 57 ++
 .../pyunit_NOFEATURE_orc_parser.py | 53 ++
 .../pyunit_NOFEATURE_orc_parser_baddata.py | 27 +
 ...ATURE_orc_parser_hexdev_29_import_types.py | 30 +
 ...unit_NOFEATURE_orc_parser_import_folder.py | 30 +
..._parser_import_folder_airline_05p_large.py | 61 ++ ...orc_parser_import_folder_milsongs_large.py | 30 + ..._NOFEATURE_orc_parser_iris_import_types.py | 30 + .../pyunit_NOFEATURE_orc_parser_prostate.py | 33 + ...nit_orc_NOFEATURE_parser_timestamp_date.py | 49 ++ .../runit_INTERNAL_HDFS_airlines_orc.R | 67 ++ .../runit_NOFEATURE_orc_parser.R | 40 ++ ..._NOFEATURE_orc_parser_airlines_05p_large.R | 46 ++ .../runit_NOFEATURE_orc_parser_baddata.R | 28 + ...unit_orc_NOFEATURE_parser_milsongs_large.R | 33 + scripts/run.py | 2 +- scripts/saveTableAsOrc.textile | 161 ++++- settings.gradle | 12 +- 75 files changed, 3391 insertions(+), 107 deletions(-) delete mode 100644 h2o-core/src/main/java/water/parser/ORCParser.java create mode 100644 h2o-parsers/h2o-orc-parser/build.gradle create mode 100644 h2o-parsers/h2o-orc-parser/src/main/java/water/parser/orc/OrcParser.java create mode 100644 h2o-parsers/h2o-orc-parser/src/main/java/water/parser/orc/OrcParserProvider.java create mode 100644 h2o-parsers/h2o-orc-parser/src/main/java/water/parser/orc/OrcUtil.java create mode 100644 h2o-parsers/h2o-orc-parser/src/main/resources/META-INF/services/water.parser.ParserProvider create mode 100644 h2o-parsers/h2o-orc-parser/src/test/java/water/parser/ParseTestMultiFileOrc.java create mode 100644 h2o-parsers/h2o-orc-parser/src/test/java/water/parser/ParseTestORCCSV.java create mode 100644 h2o-parsers/h2o-orc-parser/src/test/java/water/parser/ParseTestOrc.java create mode 100755 h2o-parsers/h2o-orc-parser/testMultiNode.sh create mode 100644 h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_airlines_orc.py create mode 100644 h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_baddata_orc.py create mode 100644 h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_hexdev_29_import_types_orc.py create mode 100644 h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_import_folder_airline_05_orc_large.py create mode 100644 h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_import_folder_orc.py create mode 100644 h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_iris_import_types_orc.py create mode 100644 h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_milsongs_orc_large.py create mode 100644 h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_orc_parser.py create mode 100644 h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_prostate_orc.py create mode 100644 h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_timestamp_date_orc.py create mode 100644 h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser.py create mode 100644 h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_baddata.py create mode 100644 h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_hexdev_29_import_types.py create mode 100644 h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_import_folder.py create mode 100644 h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_import_folder_airline_05p_large.py create mode 100644 h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_import_folder_milsongs_large.py create mode 100644 h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_iris_import_types.py create mode 100644 h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_prostate.py create mode 100644 h2o-py/tests/testdir_parser/pyunit_orc_NOFEATURE_parser_timestamp_date.py create mode 100644 h2o-r/tests/testdir_hdfs/runit_INTERNAL_HDFS_airlines_orc.R create mode 100644 h2o-r/tests/testdir_parser/runit_NOFEATURE_orc_parser.R create mode 100644 h2o-r/tests/testdir_parser/runit_NOFEATURE_orc_parser_airlines_05p_large.R create mode 100644 
h2o-r/tests/testdir_parser/runit_NOFEATURE_orc_parser_baddata.R create mode 100644 h2o-r/tests/testdir_parser/runit_orc_NOFEATURE_parser_milsongs_large.R diff --git a/build.gradle b/build.gradle index 77694e827f69..f44ad8a7d4f0 100644 --- a/build.gradle +++ b/build.gradle @@ -54,7 +54,8 @@ ext { project(':h2o-persist-s3'), project(':h2o-genmodel'), project(':h2o-bindings'), - project(':h2o-avro-parser') + project(':h2o-avro-parser'), + project(':h2o-orc-parser'), ] javaProjects = [ @@ -69,6 +70,7 @@ ext { project(':h2o-genmodel'), project(':h2o-bindings'), project(':h2o-avro-parser'), + project(':h2o-orc-parser'), ] scalaProjects = [ diff --git a/gradle.properties b/gradle.properties index 3dd45ffef709..787033053579 100644 --- a/gradle.properties +++ b/gradle.properties @@ -10,5 +10,19 @@ doJava6Bytecode=auto # Run animal sniffer - by default false, but if java6 bytecode is requested # then animal sniffer is run doAnimalSniffer=false -# Increase PermGen size for build +# The flag to include ORC support inside default h2o.jar. +# WARNING: this will upgrade default Hadoop client version to one supporting ORC +doIncludeOrc=false +# +# Version of hadoop dependency which is used for jUnit test execution +# +orcDefaultHadoopClientVersion=2.6.0-cdh5.4.0 +orcDefaultHiveExecVersion=1.1.0-cdh5.4.0 +# +# Default hadoop client version +# +defaultHadoopClientVersion=2.0.0-cdh4.3.0 +# +# Gradle arguments +# org.gradle.jvmargs='-XX:MaxPermSize=384m' diff --git a/h2o-app/build.gradle b/h2o-app/build.gradle index a5c985d4ad66..263a15f3d069 100644 --- a/h2o-app/build.gradle +++ b/h2o-app/build.gradle @@ -9,5 +9,7 @@ dependencies { compile project(":h2o-core") compile project(":h2o-genmodel") compile project(":h2o-avro-parser") + // Note: orc parser is included at the assembly level for each + // Hadoop distribution } diff --git a/h2o-assembly/build.gradle b/h2o-assembly/build.gradle index 81318cbb6af3..20830c8b583a 100644 --- a/h2o-assembly/build.gradle +++ b/h2o-assembly/build.gradle @@ -7,6 +7,9 @@ dependencies { compile project(":h2o-app") compile project(":h2o-persist-s3") compile project(":h2o-persist-hdfs") + if (project.hasProperty("doIncludeOrc") && project.doIncludeOrc == "true") { + compile project(":h2o-orc-parser") + } compile "org.slf4j:slf4j-log4j12:1.7.5" } diff --git a/h2o-core/src/main/java/water/api/ParseSetupHandler.java b/h2o-core/src/main/java/water/api/ParseSetupHandler.java index 4abc9dffc2b5..b6124bcae226 100644 --- a/h2o-core/src/main/java/water/api/ParseSetupHandler.java +++ b/h2o-core/src/main/java/water/api/ParseSetupHandler.java @@ -9,7 +9,9 @@ import water.Key; import water.api.schemas3.ParseSetupV3; import water.exceptions.H2OIllegalArgumentException; +import water.parser.ParseDataset; import water.parser.ParseSetup; +import water.util.DistributedException; import water.util.PojoUtils; import static water.parser.DefaultParserProviders.GUESS_INFO; @@ -33,9 +35,17 @@ public ParseSetupV3 guessSetup(int version, ParseSetupV3 p) { if (p.na_strings != null) for(int i = 0; i < p.na_strings.length; i++) if (p.na_strings[i] != null && p.na_strings[i].length == 0) p.na_strings[i] = null; - - ParseSetup ps = ParseSetup.guessSetup(fkeys, new ParseSetup(p)); - + ParseSetup ps; + try{ + ps = ParseSetup.guessSetup(fkeys, new ParseSetup(p)); + } catch(Throwable ex) { + Throwable ex2 = ex; + if(ex instanceof DistributedException) + ex2 = ex.getCause(); + if(ex2 instanceof ParseDataset.H2OParseException) + throw new H2OIllegalArgumentException(ex2.getMessage()); + throw ex; + } if(ps._errs != 
null && ps._errs.length > 0) { p.warnings = new String[ps._errs.length]; for (int i = 0; i < ps._errs.length; ++i) diff --git a/h2o-core/src/main/java/water/fvec/FileVec.java b/h2o-core/src/main/java/water/fvec/FileVec.java index 1878ff475638..99d0a30f932b 100644 --- a/h2o-core/src/main/java/water/fvec/FileVec.java +++ b/h2o-core/src/main/java/water/fvec/FileVec.java @@ -9,6 +9,18 @@ public abstract class FileVec extends ByteVec { long _len; // File length final byte _be; + // Returns String with path for given key. + public static String getPathForKey(Key k) { + final int off = k._kb[0]==Key.CHK || k._kb[0]==Key.VEC ? Vec.KEY_PREFIX_LEN : 0; + String p = new String(k._kb,off,k._kb.length-off); + + if(p.startsWith("nfs:/")) + p = p.substring("nfs:/".length()); + else if (p.startsWith("nfs:\\")) + p = p.substring("nfs:\\".length()); + + return p; + } /** Log-2 of Chunk size. */ public static final int DFLT_LOG2_CHUNK_SIZE = 20/*1Meg*/+2/*4Meg*/; /** Default Chunk size in bytes, useful when breaking up large arrays into @@ -16,13 +28,17 @@ public abstract class FileVec extends ByteVec { * costs, lower increases fine-grained parallelism. */ public static final int DFLT_CHUNK_SIZE = 1 << DFLT_LOG2_CHUNK_SIZE; public int _chunkSize = DFLT_CHUNK_SIZE; + public int _nChunks = -1; protected FileVec(Key key, long len, byte be) { super(key,-1/*no rowLayout*/); _len = len; _be = be; } - + public void setNChunks(int n){ + _nChunks = n; + setChunkSize((int)length()/n); + } /** * Chunk size must be positive, 1G or less, and a power of two. * Any values that aren't a power of two will be reduced to the @@ -36,6 +52,7 @@ protected FileVec(Key key, long len, byte be) { * @return actual _chunkSize setting */ public int setChunkSize(int chunkSize) { return setChunkSize(null, chunkSize); } + public int setChunkSize(Frame fr, int chunkSize) { // Clear cached chunks first // Peeking into a file before the chunkSize has been set @@ -63,7 +80,11 @@ public int setChunkSize(Frame fr, int chunkSize) { } @Override public long length() { return _len; } + + @Override public int nChunks() { + if(_nChunks != -1) // number of chunks can be set explicitly + return _nChunks; return (int)Math.max(1,_len / _chunkSize + ((_len % _chunkSize != 0)?1:0)); } @Override public boolean writable() { return false; } diff --git a/h2o-core/src/main/java/water/parser/DefaultParserProviders.java b/h2o-core/src/main/java/water/parser/DefaultParserProviders.java index 808567c7883c..7d0803abb4fb 100644 --- a/h2o-core/src/main/java/water/parser/DefaultParserProviders.java +++ b/h2o-core/src/main/java/water/parser/DefaultParserProviders.java @@ -6,6 +6,7 @@ import water.Job; import water.Key; +import water.fvec.ByteVec; import water.util.Log; /** @@ -23,7 +24,7 @@ public final class DefaultParserProviders { public static final ParserInfo SVMLight_INFO = new ParserInfo("SVMLight", 1000, true); public static final ParserInfo CSV_INFO = new ParserInfo("CSV", Integer.MAX_VALUE, true); public static final ParserInfo GUESS_INFO = new ParserInfo("GUESS", -10000, false); - /** Priority of non-core parsers shoudl begin here.*/ + /** Priority of non-core parsers should begin here.*/ public static final int MAX_CORE_PRIO = 10000; public final static class ArffParserProvider extends AbstractParserProvide { @@ -39,7 +40,7 @@ public Parser createParser(ParseSetup setup, Key jobKey) { } @Override - public ParseSetup guessSetup(byte[] bits, byte sep, int ncols, boolean singleQuotes, + public ParseSetup guessSetup(ByteVec bv, byte[] bits, byte sep, int ncols, 
boolean singleQuotes, int checkHeader, String[] columnNames, byte[] columnTypes, String[][] domains, String[][] naStrings) { return ARFFParser.guessSetup(bits, sep, singleQuotes, columnNames, naStrings); @@ -59,7 +60,7 @@ public Parser createParser(ParseSetup setup, Key jobKey) { } @Override - public ParseSetup guessSetup(byte[] bits, byte sep, int ncols, boolean singleQuotes, + public ParseSetup guessSetup(ByteVec bv, byte[] bits, byte sep, int ncols, boolean singleQuotes, int checkHeader, String[] columnNames, byte[] columnTypes, String[][] domains, String[][] naStrings) { return XlsParser.guessSetup(bits); @@ -79,7 +80,7 @@ public Parser createParser(ParseSetup setup, Key jobKey) { } @Override - public ParseSetup guessSetup(byte[] bits, byte sep, int ncols, boolean singleQuotes, + public ParseSetup guessSetup(ByteVec bv, byte[] bits, byte sep, int ncols, boolean singleQuotes, int checkHeader, String[] columnNames, byte[] columnTypes, String[][] domains, String[][] naStrings) { return SVMLightParser.guessSetup(bits); @@ -99,7 +100,7 @@ public Parser createParser(ParseSetup setup, Key jobKey) { } @Override - public ParseSetup guessSetup(byte[] bits, byte sep, int ncols, boolean singleQuotes, + public ParseSetup guessSetup(ByteVec bv, byte[] bits, byte sep, int ncols, boolean singleQuotes, int checkHeader, String[] columnNames, byte[] columnTypes, String[][] domains, String[][] naStrings) { return CsvParser.guessSetup(bits, sep, ncols, singleQuotes, checkHeader, columnNames, columnTypes, naStrings); @@ -119,7 +120,7 @@ public Parser createParser(ParseSetup setup, Key jobKey) { } @Override - public ParseSetup guessSetup(byte[] bits, byte sep, int ncols, boolean singleQuotes, + public ParseSetup guessSetup(ByteVec bv, byte[] bits, byte sep, int ncols, boolean singleQuotes, int checkHeader, String[] columnNames, byte[] columnTypes, String[][] domains, String[][] naStrings) { List pps = ParserService.INSTANCE.getAllProviders(true); // Sort them based on priorities @@ -129,7 +130,7 @@ public ParseSetup guessSetup(byte[] bits, byte sep, int ncols, boolean singleQuo if (pp == this || pp.info().equals(GUESS_INFO)) continue; // Else try to guess with given provider try { - ParseSetup ps = pp.guessSetup(bits, sep, ncols, singleQuotes, checkHeader, columnNames, columnTypes, domains, naStrings); + ParseSetup ps = pp.guessSetup(bv, bits, sep, ncols, singleQuotes, checkHeader, columnNames, columnTypes, domains, naStrings); if( ps != null) { return ps; } @@ -142,7 +143,7 @@ public ParseSetup guessSetup(byte[] bits, byte sep, int ncols, boolean singleQuo } } - static abstract class AbstractParserProvide implements ParserProvider { + static abstract class AbstractParserProvide extends ParserProvider { @Override public ParseSetup createParserSetup(Key[] inputs, ParseSetup requiredSetup) { diff --git a/h2o-core/src/main/java/water/parser/FVecParseWriter.java b/h2o-core/src/main/java/water/parser/FVecParseWriter.java index 60bd57aa7e48..3713934da12d 100644 --- a/h2o-core/src/main/java/water/parser/FVecParseWriter.java +++ b/h2o-core/src/main/java/water/parser/FVecParseWriter.java @@ -169,7 +169,10 @@ else if(_errs.length < 20) _errCnt++; } - @Override public void setIsAllASCII(int colIdx, boolean b) {_nvs[colIdx]._isAllASCII = b;} + @Override public void setIsAllASCII(int colIdx, boolean b) { + if(colIdx < _nvs.length) + _nvs[colIdx]._isAllASCII = b; + } @Override public boolean hasErrors() { diff --git a/h2o-core/src/main/java/water/parser/ORCParser.java b/h2o-core/src/main/java/water/parser/ORCParser.java 
deleted file mode 100644 index 952ec530f987..000000000000 --- a/h2o-core/src/main/java/water/parser/ORCParser.java +++ /dev/null @@ -1,7 +0,0 @@ -package water.parser; - -/** - * Created by brandon on 9/22/15. - */ -public class ORCParser { -} diff --git a/h2o-core/src/main/java/water/parser/ParseDataset.java b/h2o-core/src/main/java/water/parser/ParseDataset.java index ea3f1dccb6a8..d16675399689 100644 --- a/h2o-core/src/main/java/water/parser/ParseDataset.java +++ b/h2o-core/src/main/java/water/parser/ParseDataset.java @@ -139,17 +139,22 @@ public static ParseDataset forkParseDataset(final Key dest, final Key[] k } Log.info("Total file size: "+ PrettyPrint.bytes(totalParseSize)); - // set the parse chunk size for files - for( int i = 0; i < keys.length; ++i ) { - Iced ice = DKV.getGet(keys[i]); - if(ice instanceof FileVec) { - ((FileVec) ice).setChunkSize(setup._chunk_size); - Log.info("Parse chunk size " + setup._chunk_size); - } else if(ice instanceof Frame && ((Frame)ice).vec(0) instanceof FileVec) { - ((FileVec) ((Frame) ice).vec(0)).setChunkSize((Frame) ice, setup._chunk_size); - Log.info("Parse chunk size " + setup._chunk_size); + + // no need to set this for ORC, it is already done: + if (!setup.getParseType().name().contains("ORC")) { + for( int i = 0; i < keys.length; ++i ) { + Iced ice = DKV.getGet(keys[i]); + + // set the parse chunk size for files + if (ice instanceof FileVec) { + ((FileVec) ice).setChunkSize(setup._chunk_size); + Log.info("Parse chunk size " + setup._chunk_size); + } else if (ice instanceof Frame && ((Frame) ice).vec(0) instanceof FileVec) { + ((FileVec) ((Frame) ice).vec(0)).setChunkSize((Frame) ice, setup._chunk_size); + Log.info("Parse chunk size " + setup._chunk_size); + } } - } + } else Log.info("Orc Parse chunk sizes may be different across files"); long memsz = H2O.CLOUD.free_mem(); if( totalParseSize > memsz*4 ) @@ -909,7 +914,7 @@ private FVecParseWriter streamParse(final InputStream is, final ParseSetup local // ------------------------------------------------------------------------ private static class DistributedParse extends MRTask { - private final ParseSetup _setup; + private ParseSetup _setup; private final int _vecIdStart; private final int _startChunkIdx; // for multifile parse, offset of the first chunk in the final dataset private final VectorGroup _vg; @@ -938,9 +943,10 @@ private static class DistributedParse extends MRTask { super.setupLocal(); _visited = new NonBlockingSetInt(); _espc = MemoryManager.malloc8(_nchunks); + _setup = ParserService.INSTANCE.getByInfo(_setup._parse_type).setupLocal(_fr.anyVec(),_setup); } @Override public void map( Chunk in ) { - if( _jobKey.get().stop_requested() ) return; + if( _jobKey.get().stop_requested() ) throw new Job.JobCancelledException(); AppendableVec [] avs = new AppendableVec[_setup._number_columns]; for(int i = 0; i < avs.length; ++i) if (_setup._column_types == null) // SVMLight @@ -956,22 +962,24 @@ private static class DistributedParse extends MRTask { case "ARFF": case "CSV": Categorical [] categoricals = categoricals(_cKey, _setup._number_columns); - dout = new FVecParseWriter(_vg,_startChunkIdx + in.cidx(), categoricals, _setup._column_types, _setup._chunk_size, avs); //TODO: use _setup._domains instead of categoricals + dout = new FVecParseWriter(_vg,_startChunkIdx + in.cidx(), categoricals, _setup._column_types, + _setup._chunk_size, avs); //TODO: use _setup._domains instead of categoricals break; case "SVMLight": dout = new SVMLightFVecParseWriter(_vg, _vecIdStart, in.cidx() + 
_startChunkIdx, _setup._chunk_size, avs); break; + case "ORC": // setup special case for ORC + Categorical [] orc_categoricals = categoricals(_cKey, _setup._number_columns); + dout = new FVecParseWriter(_vg, in.cidx() + _startChunkIdx, orc_categoricals, _setup._column_types, + _setup._chunk_size, avs); + break; default: // FIXME: should not be default and creation strategy should be forwarded to ParserProvider - dout = new FVecParseWriter(_vg, in.cidx() + _startChunkIdx, null, _setup._column_types, _setup._chunk_size, avs); + dout = new FVecParseWriter(_vg, in.cidx() + _startChunkIdx, null, _setup._column_types, + _setup._chunk_size, avs); break; } p.parseChunk(in.cidx(), din, dout); (_dout = dout).close(_fs); - if(_dout.hasErrors()) - for(ParseWriter.ParseErr err:_dout._errs) { - assert err != null : "Parse error cannot be null!"; - err._file = _srckey.toString(); - } Job.update(in._len, _jobKey); // Record bytes parsed // remove parsed data right away freeMem(in); @@ -1007,6 +1015,7 @@ private void freeMem(Chunk in) { _outerMFPT._dout[_outerMFPT._lo] = _dout; if(_dout.hasErrors()) { ParseWriter.ParseErr [] errs = _dout.removeErrors(); + for(ParseWriter.ParseErr err:errs)err._file = FileVec.getPathForKey(_srckey).toString(); Arrays.sort(errs, new Comparator() { @Override public int compare(ParseWriter.ParseErr o1, ParseWriter.ParseErr o2) { diff --git a/h2o-core/src/main/java/water/parser/ParseSetup.java b/h2o-core/src/main/java/water/parser/ParseSetup.java index 08ca8d4c604b..594c5b961c68 100644 --- a/h2o-core/src/main/java/water/parser/ParseSetup.java +++ b/h2o-core/src/main/java/water/parser/ParseSetup.java @@ -5,6 +5,7 @@ import water.exceptions.H2OIllegalArgumentException; import water.fvec.*; import water.util.ArrayUtils; +import water.util.FileUtils; import water.util.Log; import java.io.BufferedReader; @@ -25,7 +26,7 @@ public class ParseSetup extends Iced { public static final int HAS_HEADER = 1; public static final int GUESS_COL_CNT = -1; - ParserInfo _parse_type; // CSV, XLS, XSLX, SVMLight, Auto, ARFF + ParserInfo _parse_type; // CSV, XLS, XSLX, SVMLight, Auto, ARFF, ORC byte _separator; // Field separator, usually comma ',' or TAB or space ' ' // Whether or not single-quotes quote a field. E.g. how do we parse: // raw data: 123,'Mally,456,O'Mally @@ -40,6 +41,10 @@ public class ParseSetup extends Iced { String[][] _na_strings; // Strings for NA in a given column String[][] _data; // First few rows of parsed/tokenized data + String [] _fileNames = new String[]{"unknown"}; + + public void setFileName(String name) {_fileNames[0] = name;} + public ParseWriter.ParseErr[] _errs; public int _chunk_size = FileVec.DFLT_CHUNK_SIZE; // Optimal chunk size to be used store values PreviewParseWriter _column_previews = null; @@ -51,11 +56,14 @@ public ParseSetup(ParseSetup ps) { new ParseWriter.ParseErr[0], ps._chunk_size); } + public static ParseSetup makeSVMLightSetup(){ return new ParseSetup(SVMLight_INFO, ParseSetup.GUESS_SEP, false,ParseSetup.NO_HEADER,1,null,new byte[]{Vec.T_NUM},null,null,null, new ParseWriter.ParseErr[0]); } + // This method was called during guess setup, lot of things are null, like ctypes. 
+ // when it is called again, it either contains the guess column types or it will have user defined column types public ParseSetup(ParserInfo parse_type, byte sep, boolean singleQuotes, int checkHeader, int ncols, String[] columnNames, byte[] ctypes, String[][] domains, String[][] naStrings, String[][] data, ParseWriter.ParseErr[] errs, int chunkSize) { _parse_type = parse_type; _separator = sep; @@ -339,7 +347,7 @@ public GuessSetupTsk(ParseSetup userSetup) { || bv.length() <= FileVec.DFLT_CHUNK_SIZE || decompRatio > 1.0) { */ try { - _gblSetup = guessSetup(bits, _userSetup); + _gblSetup = guessSetup(bv, bits, _userSetup); for(ParseWriter.ParseErr e:_gblSetup._errs) { e._byteOffset += e._cidx*Parser.StreamData.bufSz; e._cidx = 0; @@ -389,6 +397,7 @@ public GuessSetupTsk(ParseSetup userSetup) { "Remaining files have been ignored."; }*/ } + _gblSetup.setFileName(FileUtils.keyToFileName(key)); } /** @@ -430,17 +439,17 @@ private ParseSetup mergeSetups(ParseSetup setupA, ParseSetup setupB, String file mergedSetup._check_header = unifyCheckHeader(setupA._check_header, setupB._check_header); mergedSetup._separator = unifyColumnSeparators(setupA._separator, setupB._separator); - mergedSetup._column_names = unifyColumnNames(setupA._column_names, setupB._column_names); if (setupA._parse_type.equals(ARFF_INFO) && setupB._parse_type.equals(CSV_INFO)) ;// do nothing parse_type and col_types are already set correctly else if (setupA._parse_type.equals(CSV_INFO) && setupB._parse_type.equals(ARFF_INFO)) { mergedSetup._parse_type = ARFF_INFO; mergedSetup._column_types = setupB._column_types; - } else if (setupA._parse_type.equals(setupB._parse_type)) { + } else if (setupA.isCompatible(setupB)) { mergedSetup._column_previews = PreviewParseWriter.unifyColumnPreviews(setupA._column_previews, setupB._column_previews); } else - throw new ParseDataset.H2OParseException("File type mismatch. Cannot parse files of type " - + setupA._parse_type + " and " + setupB._parse_type + " as one dataset."); + throw new ParseDataset.H2OParseException("File type mismatch. Cannot parse files " + setupA.file() + " and " + setupB.file() + " of type " + + setupA._parse_type.name() + " and " + setupB._parse_type.name() + " as one dataset."); + mergedSetup._column_names = unifyColumnNames(setupA._column_names, setupB._column_names); mergedSetup._number_columns = mergedSetup._parse_type.equals(CSV_INFO) ? Math.max(setupA._number_columns,setupB._number_columns):unifyColumnCount(setupA._number_columns, setupB._number_columns,mergedSetup, fileA, fileB); if (mergedSetup._data.length < PreviewParseWriter.MAX_PREVIEW_LINES) { int n = mergedSetup._data.length; @@ -449,6 +458,7 @@ else if (setupA._parse_type.equals(CSV_INFO) && setupB._parse_type.equals(ARFF_I System.arraycopy(setupB._data, 1, mergedSetup._data, n, m - n); } mergedSetup._errs = ArrayUtils.append(setupA._errs,setupB._errs); + mergedSetup._fileNames = ArrayUtils.append(setupA._fileNames,setupB._fileNames); if(mergedSetup._errs.length > 20) mergedSetup._errs = Arrays.copyOf(mergedSetup._errs,20); return mergedSetup; @@ -499,6 +509,18 @@ private static String[] unifyColumnNames(String[] namesA, String[] namesB){ } } + + private String file() { + String [] names = _fileNames; + if(names.length > 5) + names = Arrays.copyOf(names,5); + return Arrays.toString(names); + } + + protected boolean isCompatible(ParseSetup setupB) { + return _parse_type.equals(setupB._parse_type) && _number_columns == setupB._number_columns; + } + /** * Guess everything from a single pile-o-bits. 
Used in tests, or in initial * parser inspections when the user has not told us anything about separators @@ -507,14 +529,14 @@ private static String[] unifyColumnNames(String[] namesA, String[] namesB){ * @param bits Initial bytes from a parse source * @return ParseSetup settings from looking at all files */ - public static ParseSetup guessSetup( byte[] bits, ParseSetup userSetup ) { - return guessSetup(bits, userSetup._parse_type, userSetup._separator, GUESS_COL_CNT, userSetup._single_quotes, userSetup._check_header, userSetup._column_names, userSetup._column_types, null, null); + public static ParseSetup guessSetup( ByteVec bv, byte [] bits, ParseSetup userSetup ) { + return guessSetup(bv, bits, userSetup._parse_type, userSetup._separator, GUESS_COL_CNT, userSetup._single_quotes, userSetup._check_header, userSetup._column_names, userSetup._column_types, null, null); } - public static ParseSetup guessSetup(byte[] bits, ParserInfo parserType, byte sep, int ncols, boolean singleQuotes, int checkHeader, String[] columnNames, byte[] columnTypes, String[][] domains, String[][] naStrings ) { + public static ParseSetup guessSetup(ByteVec bv, byte [] bits, ParserInfo parserType, byte sep, int ncols, boolean singleQuotes, int checkHeader, String[] columnNames, byte[] columnTypes, String[][] domains, String[][] naStrings ) { ParserProvider pp = ParserService.INSTANCE.getByInfo(parserType); if (pp != null) { - return pp.guessSetup(bits, sep, ncols, singleQuotes, checkHeader, columnNames, columnTypes, domains, naStrings); + return pp.guessSetup(bv, bits, sep, ncols, singleQuotes, checkHeader, columnNames, columnTypes, domains, naStrings); } throw new ParseDataset.H2OParseException("Cannot determine file type."); } @@ -541,6 +563,7 @@ public static String createHexName(String n) { || n.endsWith("xls") || n.endsWith("txt") || n.endsWith("svm") + || n.endsWith("orc") || n.endsWith("arff"))) { n = n.substring(0, dot); dot = n.lastIndexOf('.'); diff --git a/h2o-core/src/main/java/water/parser/ParserProvider.java b/h2o-core/src/main/java/water/parser/ParserProvider.java index 584d76a7334c..01523f198642 100644 --- a/h2o-core/src/main/java/water/parser/ParserProvider.java +++ b/h2o-core/src/main/java/water/parser/ParserProvider.java @@ -1,23 +1,24 @@ package water.parser; -import water.Freezable; import water.Job; import water.Key; +import water.fvec.ByteVec; +import water.fvec.Vec; /** * Generic Parser provider. */ -public interface ParserProvider { +public abstract class ParserProvider { /** Technical information for this parser */ - ParserInfo info(); + public abstract ParserInfo info(); /** Create a new parser */ - Parser createParser(ParseSetup setup, Key jobKey); + public abstract Parser createParser(ParseSetup setup, Key jobKey); /** Returns parser setup of throws exception if input is not recognized */ // FIXME: should be more flexible - ParseSetup guessSetup(byte[] bits, byte sep, int ncols, boolean singleQuotes, int checkHeader, String[] columnNames, byte[] columnTypes, String[][] domains, String[][] naStrings ); + public abstract ParseSetup guessSetup(ByteVec v, byte[] bits, byte sep, int ncols, boolean singleQuotes, int checkHeader, String[] columnNames, byte[] columnTypes, String[][] domains, String[][] naStrings ); /** Create a parser specific setup. 
* @@ -26,5 +27,14 @@ public interface ParserProvider { * @param requiredSetup user given parser setup * @return parser specific setup */ - ParseSetup createParserSetup(Key[] inputs, ParseSetup requiredSetup); + public abstract ParseSetup createParserSetup(Key[] inputs, ParseSetup requiredSetup); + + /** + * Executed exactly once per-file-per-node during parse. + * Do any file-related non-distributed setup here. E.g. ORC reader creates node-shared instance of a (non-serializable) Reader. + * @param v + * @param setup + */ + + public ParseSetup setupLocal(Vec v, ParseSetup setup){ return setup;} } diff --git a/h2o-core/src/main/java/water/rapids/ast/prims/mungers/AstFlatten.java b/h2o-core/src/main/java/water/rapids/ast/prims/mungers/AstFlatten.java index 4eb1033c0160..a62bb494cbf7 100644 --- a/h2o-core/src/main/java/water/rapids/ast/prims/mungers/AstFlatten.java +++ b/h2o-core/src/main/java/water/rapids/ast/prims/mungers/AstFlatten.java @@ -39,11 +39,11 @@ public Val apply(Env env, Env.StackHelp stk, AstRoot asts[]) { case Vec.T_NUM: return new ValNum(vec.at(0)); case Vec.T_TIME: - return new ValNum(vec.at8(0)); + return vec.isNA(0)?new ValNum(Double.NaN) : new ValNum(vec.at8(0)); // check for missing values case Vec.T_STR: return new ValStr(vec.atStr(new BufferedString(), 0).toString()); - case Vec.T_CAT: - return new ValStr(vec.factor(vec.at8(0))); + case Vec.T_CAT: // check for missing values + return vec.isNA(0)?new ValStr("NA") : new ValStr(vec.factor(vec.at8(0))); default: throw H2O.unimpl("The type of vector: " + vec.get_type_str() + " is not supported by " + str()); } diff --git a/h2o-core/src/main/java/water/util/Log.java b/h2o-core/src/main/java/water/util/Log.java index 51e8f07703f9..438930a04ffa 100644 --- a/h2o-core/src/main/java/water/util/Log.java +++ b/h2o-core/src/main/java/water/util/Log.java @@ -1,16 +1,16 @@ package water.util; -import java.io.File; -import java.io.PrintWriter; -import java.io.StringWriter; -import java.util.ArrayList; - import org.apache.log4j.H2OPropertyConfigurator; import org.apache.log4j.LogManager; import org.apache.log4j.PropertyConfigurator; import water.H2O; import water.persist.PersistManager; +import java.io.File; +import java.io.PrintWriter; +import java.io.StringWriter; +import java.util.ArrayList; + /** Log for H2O. * * OOME: when the VM is low on memory, OutOfMemoryError can be thrown in the @@ -302,6 +302,9 @@ private static void setLog4jProperties(String logDir, java.util.Properties p) th p.setProperty("log4j.logger.org.reflections.Reflections", "ERROR"); p.setProperty("log4j.logger.com.brsanthu.googleanalytics", "ERROR"); + // Turn down the logging for external libraries that Orc parser depends on + p.setProperty("log4j.logger.org.apache.hadoop.util.NativeCodeLoader", "ERROR"); + // See the following document for information about the pattern layout. 
// http://logging.apache.org/log4j/1.2/apidocs/org/apache/log4j/PatternLayout.html // diff --git a/h2o-core/src/test/java/water/TestUtil.java b/h2o-core/src/test/java/water/TestUtil.java index 692974c0d17d..05f69aeaa1d2 100644 --- a/h2o-core/src/test/java/water/TestUtil.java +++ b/h2o-core/src/test/java/water/TestUtil.java @@ -9,6 +9,7 @@ import org.junit.runners.model.Statement; import water.fvec.*; import water.parser.BufferedString; +import water.parser.DefaultParserProviders; import water.parser.ParseDataset; import water.parser.ParseSetup; import water.util.Log; @@ -215,6 +216,7 @@ protected static Frame parse_test_file( Key outputKey, String fname) { NFSFileVec nfs = NFSFileVec.make(f); return ParseDataset.parse(outputKey, nfs._key); } + protected Frame parse_test_file( Key outputKey, String fname , boolean guessSetup) { File f = find_test_file(fname); assert f != null && f.exists():" file not found: " + fname; @@ -222,6 +224,38 @@ protected Frame parse_test_file( Key outputKey, String fname , boolean guessSetu return ParseDataset.parse(outputKey, new Key[]{nfs._key}, true, ParseSetup.guessSetup(new Key[]{nfs._key},false,1)); } + protected Frame parse_test_file( String fname, String na_string, int check_header, byte[] column_types ) { + File f = find_test_file_static(fname); + assert f != null && f.exists():" file not found: " + fname; + NFSFileVec nfs = NFSFileVec.make(f); + + Key[] res = {nfs._key}; + + // create new parseSetup in order to store our na_string + ParseSetup p = ParseSetup.guessSetup(res, new ParseSetup(DefaultParserProviders.GUESS_INFO,(byte) ',',true, + check_header,0,null,null,null,null,null)); + + // add the na_strings into p. + if (na_string != null) { + int column_number = p.getColumnTypes().length; + int na_length = na_string.length() - 1; + + String[][] na_strings = new String[column_number][na_length + 1]; + + for (int index = 0; index < column_number; index++) { + na_strings[index][na_length] = na_string; + } + + p.setNAStrings(na_strings); + } + + if (column_types != null) + p.setColumnTypes(column_types); + + return ParseDataset.parse(Key.make(), res, true, p); + + } + /** Find & parse a folder of CSV files. NPE if file not found. * @param fname Test filename * @return Frame or NPE */ @@ -239,6 +273,53 @@ protected Frame parse_test_folder( String fname ) { return ParseDataset.parse(Key.make(), res); } + + /** + * Parse a folder with csv files when a single na_string is specified. + * + * @param fname + * @param na_string + * @return + */ + protected Frame parse_test_folder( String fname, String na_string, int check_header, byte[] column_types ) { + File folder = find_test_file(fname); + assert folder.isDirectory(); + File[] files = folder.listFiles(); + Arrays.sort(files); + ArrayList keys = new ArrayList<>(); + for( File f : files ) + if( f.isFile() ) + keys.add(NFSFileVec.make(f)._key); + + Key[] res = new Key[keys.size()]; + keys.toArray(res); // generated the necessary key here + + // create new parseSetup in order to store our na_string + ParseSetup p = ParseSetup.guessSetup(res, new ParseSetup(DefaultParserProviders.GUESS_INFO,(byte) ',',true, + check_header,0,null,null,null,null,null)); + + // add the na_strings into p. 
+ if (na_string != null) { + int column_number = p.getColumnTypes().length; + int na_length = na_string.length() - 1; + + String[][] na_strings = new String[column_number][na_length + 1]; + + for (int index = 0; index < column_number; index++) { + na_strings[index][na_length] = na_string; + } + + p.setNAStrings(na_strings); + } + + if (column_types != null) + p.setColumnTypes(column_types); + + return ParseDataset.parse(Key.make(), res, true, p); + + } + + /** A Numeric Vec from an array of ints * @param rows Data * @return The Vec */ diff --git a/h2o-core/src/test/java/water/parser/ParserTest2.java b/h2o-core/src/test/java/water/parser/ParserTest2.java index 397faa59b2e9..b841ce9921c0 100644 --- a/h2o-core/src/test/java/water/parser/ParserTest2.java +++ b/h2o-core/src/test/java/water/parser/ParserTest2.java @@ -1,14 +1,18 @@ package water.parser; -import org.junit.*; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Ignore; +import org.junit.Test; +import water.Key; +import water.TestUtil; +import water.fvec.Frame; +import water.fvec.Vec; +import water.util.PrettyPrint; import java.util.Random; import java.util.UUID; -import water.*; -import water.fvec.*; -import water.util.PrettyPrint; - import static water.parser.DefaultParserProviders.CSV_INFO; public class ParserTest2 extends TestUtil { @@ -76,7 +80,7 @@ private static void testParsed(Frame fr, String[][] expected) { ar("'Tomas''s","test2'","test2",null), ar("last","'line''s","trailing","piece'") }; Key k = ParserTest.makeByteVec(data); - ParseSetup gSetupF = ParseSetup.guessSetup(data[0].getBytes(), CSV_INFO, (byte)',', 4, false/*single quote*/, ParseSetup.NO_HEADER, null, null, null, null); + ParseSetup gSetupF = ParseSetup.guessSetup(null, data[0].getBytes(), CSV_INFO, (byte)',', 4, false/*single quote*/, ParseSetup.NO_HEADER, null, null, null, null); gSetupF._column_types = ParseSetup.strToColumnTypes(new String[]{"Enum", "Enum", "Enum", "Enum"}); Frame frF = ParseDataset.parse(Key.make(), new Key[]{k}, false, gSetupF); testParsed(frF,expectFalse); @@ -84,7 +88,7 @@ private static void testParsed(Frame fr, String[][] expected) { String[][] expectTrue = new String[][] { ar("Tomass,test,first,line", null), ar("Tomas''stest2","test2"), ar("last", "lines trailing piece") }; - ParseSetup gSetupT = ParseSetup.guessSetup(data[0].getBytes(), CSV_INFO, (byte)',', 2, true/*single quote*/, ParseSetup.NO_HEADER, null, null, null, null); + ParseSetup gSetupT = ParseSetup.guessSetup(null, data[0].getBytes(), CSV_INFO, (byte)',', 2, true/*single quote*/, ParseSetup.NO_HEADER, null, null, null, null); gSetupT._column_types = ParseSetup.strToColumnTypes(new String[]{"Enum", "Enum", "Enum", "Enum"}); Frame frT = ParseDataset.parse(Key.make(), new Key[]{k}, true, gSetupT); //testParsed(frT,expectTrue); // not currently passing diff --git a/h2o-core/src/test/java/water/rapids/GroupingBench.java b/h2o-core/src/test/java/water/rapids/GroupingBench.java index 09ca2dd8923a..f126ca156fba 100644 --- a/h2o-core/src/test/java/water/rapids/GroupingBench.java +++ b/h2o-core/src/test/java/water/rapids/GroupingBench.java @@ -254,7 +254,7 @@ public class GroupingBench extends TestUtil { @Test public void runBench2() { Frame f1=null, f2=null, fx=null; - try { + try { // build a hi count cardinality frame final long card = (long)1e4; f1 = buildFrame(card,-1); diff --git a/h2o-hadoop/assemblyjar.gradle b/h2o-hadoop/assemblyjar.gradle index bfb7058655b6..4b90f0ba5f61 100644 --- a/h2o-hadoop/assemblyjar.gradle +++ 
b/h2o-hadoop/assemblyjar.gradle @@ -25,6 +25,17 @@ dependencies { if (project.hasProperty("maprExtraDependency")) { compile(project.property("maprExtraDependency")) } + if (orcSupported) { + compile(project(":h2o-orc-parser")) { + // We do not get any dependencies but directly rely on provided environment + transitive = false + } + + // Here we depends on hive-exec, but it is Hadoop version specific + compile("org.apache.hive:hive-exec:$orcHiveExecVersion") { + transitive = false + } + } } @@ -46,6 +57,7 @@ shadowJar { manifest { attributes 'Main-Class': 'water.hadoop.h2odriver' } + zip64 true } artifacts { diff --git a/h2o-hadoop/h2o-cdh5.2-assembly/build.gradle b/h2o-hadoop/h2o-cdh5.2-assembly/build.gradle index f423ca0be309..1525220449ed 100644 --- a/h2o-hadoop/h2o-cdh5.2-assembly/build.gradle +++ b/h2o-hadoop/h2o-cdh5.2-assembly/build.gradle @@ -1,6 +1,8 @@ ext { hadoopVersion = 'cdh5.2' hadoopMavenArtifactVersion = '2.5.0-cdh5.2.0' + orcSupported = true + orcHiveExecVersion = '0.13.1-cdh5.2.0' } apply from: '../assemblyjar.gradle' diff --git a/h2o-hadoop/h2o-cdh5.2/build.gradle b/h2o-hadoop/h2o-cdh5.2/build.gradle index f11842011daa..7c7e5a497b8d 100644 --- a/h2o-hadoop/h2o-cdh5.2/build.gradle +++ b/h2o-hadoop/h2o-cdh5.2/build.gradle @@ -1,6 +1,7 @@ ext { hadoopVersion = 'cdh5.2' hadoopMavenArtifactVersion = '2.5.0-cdh5.2.0' + orcSupported = false } apply from: '../driverjar.gradle' diff --git a/h2o-hadoop/h2o-cdh5.3-assembly/build.gradle b/h2o-hadoop/h2o-cdh5.3-assembly/build.gradle index 2cb863125a04..d257a648aa77 100644 --- a/h2o-hadoop/h2o-cdh5.3-assembly/build.gradle +++ b/h2o-hadoop/h2o-cdh5.3-assembly/build.gradle @@ -1,6 +1,8 @@ ext { hadoopVersion = 'cdh5.3' hadoopMavenArtifactVersion = '2.5.0-cdh5.3.0' + orcSupported = true + orcHiveExecVersion = '0.13.1-cdh5.3.0' } apply from: '../assemblyjar.gradle' diff --git a/h2o-hadoop/h2o-cdh5.4.2-assembly/build.gradle b/h2o-hadoop/h2o-cdh5.4.2-assembly/build.gradle index 2cb863125a04..56e97f6cbcec 100644 --- a/h2o-hadoop/h2o-cdh5.4.2-assembly/build.gradle +++ b/h2o-hadoop/h2o-cdh5.4.2-assembly/build.gradle @@ -1,6 +1,8 @@ ext { - hadoopVersion = 'cdh5.3' - hadoopMavenArtifactVersion = '2.5.0-cdh5.3.0' + hadoopVersion = 'cdh5.4.2' + hadoopMavenArtifactVersion = '2.5.0-cdh5.4.2' + orcSupported = true + orcHiveExecVersion = "1.1.0-$hadoopVersion" } apply from: '../assemblyjar.gradle' diff --git a/h2o-hadoop/h2o-cdh5.5.3-assembly/build.gradle b/h2o-hadoop/h2o-cdh5.5.3-assembly/build.gradle index 175f6d6201ce..69c380972960 100644 --- a/h2o-hadoop/h2o-cdh5.5.3-assembly/build.gradle +++ b/h2o-hadoop/h2o-cdh5.5.3-assembly/build.gradle @@ -1,6 +1,8 @@ ext { hadoopVersion = 'cdh5.5.3' - hadoopMavenArtifactVersion = '2.6.0-cdh5.5.2' + hadoopMavenArtifactVersion = '2.6.0-cdh5.5.4' + orcSupported = true + orcHiveExecVersion = "1.1.0-cdh5.5.4" } apply from: '../assemblyjar.gradle' diff --git a/h2o-hadoop/h2o-cdh5.6.0-assembly/build.gradle b/h2o-hadoop/h2o-cdh5.6.0-assembly/build.gradle index 9ab45de6b73e..6ae418505119 100644 --- a/h2o-hadoop/h2o-cdh5.6.0-assembly/build.gradle +++ b/h2o-hadoop/h2o-cdh5.6.0-assembly/build.gradle @@ -1,6 +1,8 @@ ext { hadoopVersion = 'cdh5.6.0' hadoopMavenArtifactVersion = '2.6.0-cdh5.6.0' + orcSupported = true + orcHiveExecVersion = "1.1.0-$hadoopVersion" } apply from: '../assemblyjar.gradle' diff --git a/h2o-hadoop/h2o-cdh5.7.0-assembly/build.gradle b/h2o-hadoop/h2o-cdh5.7.0-assembly/build.gradle index e415be8d18f9..e10aca76f2f8 100644 --- a/h2o-hadoop/h2o-cdh5.7.0-assembly/build.gradle +++ 
b/h2o-hadoop/h2o-cdh5.7.0-assembly/build.gradle @@ -1,6 +1,8 @@ ext { hadoopVersion = 'cdh5.7.0' hadoopMavenArtifactVersion = '2.6.0-cdh5.7.0' + orcSupported = true + orcHiveExecVersion = "1.1.0-$hadoopVersion" } apply from: '../assemblyjar.gradle' diff --git a/h2o-hadoop/h2o-hdp2.1-assembly/build.gradle b/h2o-hadoop/h2o-hdp2.1-assembly/build.gradle index db145e059a2f..a50f2cc92f88 100644 --- a/h2o-hadoop/h2o-hdp2.1-assembly/build.gradle +++ b/h2o-hadoop/h2o-hdp2.1-assembly/build.gradle @@ -1,6 +1,9 @@ ext { hadoopVersion = 'hdp2.1' hadoopMavenArtifactVersion = '2.4.0.2.1.1.0-385' + orcSupported = true + orcHiveExecVersion = "0.13.0" + } apply from: '../assemblyjar.gradle' diff --git a/h2o-hadoop/h2o-hdp2.2-assembly/build.gradle b/h2o-hadoop/h2o-hdp2.2-assembly/build.gradle index 768a0d4890e4..9a35a615a70d 100644 --- a/h2o-hadoop/h2o-hdp2.2-assembly/build.gradle +++ b/h2o-hadoop/h2o-hdp2.2-assembly/build.gradle @@ -1,6 +1,8 @@ ext { hadoopVersion = 'hdp2.2' hadoopMavenArtifactVersion = '2.6.0.2.2.0.0-2041' + orcSupported = true + orcHiveExecVersion = "0.14.0" } apply from: '../assemblyjar.gradle' diff --git a/h2o-hadoop/h2o-hdp2.3-assembly/build.gradle b/h2o-hadoop/h2o-hdp2.3-assembly/build.gradle index ee950735a84c..4745a633a6e1 100644 --- a/h2o-hadoop/h2o-hdp2.3-assembly/build.gradle +++ b/h2o-hadoop/h2o-hdp2.3-assembly/build.gradle @@ -1,6 +1,9 @@ ext { hadoopVersion = 'hdp2.3' hadoopMavenArtifactVersion = '2.7.1.2.3.2.0-2950' + orcSupported = true + orcHiveExecVersion = "1.2.1" + } apply from: '../assemblyjar.gradle' diff --git a/h2o-hadoop/h2o-hdp2.4-assembly/build.gradle b/h2o-hadoop/h2o-hdp2.4-assembly/build.gradle index b6980e300c4f..df351650f75a 100644 --- a/h2o-hadoop/h2o-hdp2.4-assembly/build.gradle +++ b/h2o-hadoop/h2o-hdp2.4-assembly/build.gradle @@ -1,6 +1,8 @@ ext { hadoopVersion = 'hdp2.4' hadoopMavenArtifactVersion = '2.7.1.2.4.0.0-169' + orcSupported = true + orcHiveExecVersion = "1.2.1" } apply from: '../assemblyjar.gradle' diff --git a/h2o-hadoop/h2o-mapr3.1.1-assembly/build.gradle b/h2o-hadoop/h2o-mapr3.1.1-assembly/build.gradle index b6c992030ce7..6bebf304757f 100644 --- a/h2o-hadoop/h2o-mapr3.1.1-assembly/build.gradle +++ b/h2o-hadoop/h2o-mapr3.1.1-assembly/build.gradle @@ -3,6 +3,7 @@ ext { hadoopVersion = 'mapr3.1.1' hadoopMavenArtifactVersion = '1.0.3-mapr-3.1.1' maprExtraDependency = 'org.json:org.json:chargebee-1.0' + orcSupported = false } apply from: '../assemblyjar.gradle' diff --git a/h2o-hadoop/h2o-mapr4.0.1-assembly/build.gradle b/h2o-hadoop/h2o-mapr4.0.1-assembly/build.gradle index 932ab3273d84..312e67cf6301 100644 --- a/h2o-hadoop/h2o-mapr4.0.1-assembly/build.gradle +++ b/h2o-hadoop/h2o-mapr4.0.1-assembly/build.gradle @@ -2,6 +2,7 @@ ext { hadoopVersion = 'mapr4.0.1' hadoopMavenArtifactVersion = '2.4.1-mapr-1408' maprExtraDependency = 'org.json:org.json:chargebee-1.0' + orcSupported = false } apply from: '../assemblyjar.gradle' diff --git a/h2o-hadoop/h2o-mapr5.0-assembly/build.gradle b/h2o-hadoop/h2o-mapr5.0-assembly/build.gradle index a64577c5d49d..2400cf85eecc 100644 --- a/h2o-hadoop/h2o-mapr5.0-assembly/build.gradle +++ b/h2o-hadoop/h2o-mapr5.0-assembly/build.gradle @@ -2,6 +2,7 @@ ext { hadoopVersion = 'mapr5.0' hadoopMavenArtifactVersion = '2.7.0-mapr-1506' maprExtraDependency = 'org.json:org.json:chargebee-1.0' + orcSupported = false } apply from: '../assemblyjar.gradle' diff --git a/h2o-hadoop/h2o-mapr5.1-assembly/build.gradle b/h2o-hadoop/h2o-mapr5.1-assembly/build.gradle index f1be24728c07..2229c9d0c788 100644 --- 
a/h2o-hadoop/h2o-mapr5.1-assembly/build.gradle +++ b/h2o-hadoop/h2o-mapr5.1-assembly/build.gradle @@ -2,6 +2,7 @@ ext { hadoopVersion = 'mapr5.1' hadoopMavenArtifactVersion = '2.7.0-mapr-1506' maprExtraDependency = 'org.json:org.json:chargebee-1.0' + orcSupported = false } apply from: '../assemblyjar.gradle' diff --git a/h2o-parsers/h2o-avro-parser/src/main/java/water/parser/avro/AvroParserProvider.java b/h2o-parsers/h2o-avro-parser/src/main/java/water/parser/avro/AvroParserProvider.java index 4966d81db5d2..2cb1edc95a69 100644 --- a/h2o-parsers/h2o-avro-parser/src/main/java/water/parser/avro/AvroParserProvider.java +++ b/h2o-parsers/h2o-avro-parser/src/main/java/water/parser/avro/AvroParserProvider.java @@ -16,7 +16,7 @@ /** * Avro parser provider. */ -public class AvroParserProvider implements ParserProvider { +public class AvroParserProvider extends ParserProvider { /* Setup for this parser */ static ParserInfo AVRO_INFO = new ParserInfo("AVRO", DefaultParserProviders.MAX_CORE_PRIO + 10, true, true); @@ -32,7 +32,7 @@ public Parser createParser(ParseSetup setup, Key jobKey) { } @Override - public ParseSetup guessSetup(byte[] bits, byte sep, int ncols, boolean singleQuotes, + public ParseSetup guessSetup(ByteVec bv, byte[] bits, byte sep, int ncols, boolean singleQuotes, int checkHeader, String[] columnNames, byte[] columnTypes, String[][] domains, String[][] naStrings) { return AvroParser.guessSetup(bits); diff --git a/h2o-parsers/h2o-orc-parser/build.gradle b/h2o-parsers/h2o-orc-parser/build.gradle new file mode 100644 index 000000000000..550186147979 --- /dev/null +++ b/h2o-parsers/h2o-orc-parser/build.gradle @@ -0,0 +1,38 @@ +// +// H2O Orc Parser +// +description = "H2O Orc Parser" + +dependencies { + compile project(":h2o-core") + // Only PersistHDFS API + compile(project(":h2o-persist-hdfs")) { + transitive = false + } + + // Note: What is connection between hive-exec version and hadoop-version and orc version? + // Note: In this case we are using hive version which is compatible with $orcDefaultHadoopClientVersion + // Note: for newest version it should be replaces by hive-orc + compile("org.apache.hive:hive-exec:$orcDefaultHiveExecVersion") { + transitive = false + } + // For compilation we need common + compile("org.apache.hadoop:hadoop-common:$orcDefaultHadoopClientVersion") { + transitive = false + } + + testCompile "junit:junit:${junitVersion}" + testCompile project(path: ":h2o-core", configuration: "testArchives") + // We need correct version of MapRe Hadoop to run JUnits + testCompile("org.apache.hadoop:hadoop-client:$orcDefaultHadoopClientVersion") +} + +apply from: "${rootDir}/gradle/dataCheck.gradle" + +test { + dependsOn ":h2o-core:testJar" + dependsOn smalldataCheck, cpLibs, jar, testJar, testMultiNode + + // Defeat task 'test' by running no tests. 
+ exclude '**' +} diff --git a/h2o-parsers/h2o-orc-parser/src/main/java/water/parser/orc/OrcParser.java b/h2o-parsers/h2o-orc-parser/src/main/java/water/parser/orc/OrcParser.java new file mode 100644 index 000000000000..6a579591cff6 --- /dev/null +++ b/h2o-parsers/h2o-orc-parser/src/main/java/water/parser/orc/OrcParser.java @@ -0,0 +1,614 @@ +package water.parser.orc; + +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.ql.exec.vector.*; +import org.apache.hadoop.hive.ql.io.orc.Reader; +import org.apache.hadoop.hive.ql.io.orc.RecordReader; +import org.apache.hadoop.hive.ql.io.orc.StripeInformation; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.hive.serde2.objectinspector.*; +import org.joda.time.DateTime; +import org.joda.time.MutableDateTime; +import water.H2O; +import water.Job; +import water.Key; +import water.fvec.Vec; +import water.parser.*; +import water.util.ArrayUtils; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; + +import static water.parser.orc.OrcUtil.isSupportedSchema; +import static water.parser.orc.OrcUtil.schemaToColumnType; + +// Orc support + +/** + * ORC parser for H2O distributed parsing subsystem. + * + * Basically, here is the plan: + * To parse an Orc file, we need to do the following in order to get the following useful + * information: + * 1. Get a Reader rdr. + * 2. From the reader rdr, we can get the following pieces of information: + * a. number of columns, column types and column names. We only support parsing of primitive types; + * b. Lists of StripeInformation that describes how many stripes of data that we will need to read; + * c. For each stripe, get information like rows per stripe, data size in bytes + * 3. The plan is to read the file in parallel in whole numbers of stripes. + * 4. Inside each stripe, we will read data out in batches of VectorizedRowBatch (1024 rows or less). + * + */ +public class OrcParser extends Parser { + + /** Orc Info */ + private final Reader orcFileReader; // can generate all the other fields from this reader + public static final int DAY_TO_MS = 24*3600*1000; + public static final int ADD_OFFSET = 8*3600*1000; + public static final int HOUR_OFFSET = 3600000; // in ms to offset for leap seconds, years + private MutableDateTime epoch = new MutableDateTime(); // used to help us out the leap seconds, years + private ArrayList storeWarnings = new ArrayList(); // store a list of warnings + + + OrcParser(ParseSetup setup, Key jobKey) { + super(setup, jobKey); + + epoch.setDate(0); // used to figure out leap seconds, years + + this.orcFileReader = ((OrcParser.OrcParseSetup) setup).orcFileReader; + } + + private transient int _cidx; + + private transient HashMap> _toStringMaps = new HashMap<>(); + /** + * This method calculates the number of stripes that will be read for each chunk. Since + * only single threading is supported in reading each stripe, we will never split one stripe + * over different chunks. + * + * @param chunkId: chunk index, calculated as file size/chunk size. The file size is calculated + * with data plus overhead in terms of headers and other info, number of chunks + * calculated will be higher than the actual chunks needed. If the chunk number + * is too high, the method will return without writing to + * dout. + * @param din: ParseReader, not used for parsing orc files + * @param dout: ParseWriter, used to add data to H2O frame. 
+ * @return: Parsewriter dout. + */ + @Override + protected final ParseWriter parseChunk(int chunkId, ParseReader din, ParseWriter dout) { + _cidx = chunkId; + // only do something if within file size and the orc file is not empty + List stripesInfo = ((OrcParseSetup) this._setup).getStripes(); + if(stripesInfo.size() == 0) { + dout.addError(new ParseWriter.ParseErr("Orc Parser: Empty file.", chunkId, 0L, -2L)); + return dout; // empty file + } + OrcParseSetup setup = (OrcParseSetup) this._setup; + StripeInformation thisStripe = stripesInfo.get(chunkId); // get one stripe + // write one stripe of data to H2O frame + String [] orcTypes = setup.getColumnTypesString(); + boolean[] toInclude = setup.getToInclude(); + try { + RecordReader perStripe = orcFileReader.rows(thisStripe.getOffset(), thisStripe.getDataLength(), + setup.getToInclude(), null, setup.getColumnNames()); + VectorizedRowBatch batch = null; + long rows = 0; + long rowCount = thisStripe.getNumberOfRows(); + while (rows != rowCount) { + batch = perStripe.nextBatch(batch); // read orc file stripes in vectorizedRowBatch + long currentBatchRow = batch.count(); + int nrows = (int)currentBatchRow; + if(currentBatchRow != nrows) + throw new IllegalArgumentException("got batch with too many records, does not fit in int"); + ColumnVector[] dataVectors = batch.cols; + int colIndex = 0; + for (int col = 0; col < batch.numCols; ++col) { // read one column at a time; + if (toInclude[col + 1]) { // only write a column if we actually want it + write1column(dataVectors[col], orcTypes[colIndex], colIndex, nrows, dout); + colIndex++; + } + } + rows += currentBatchRow; // record number of rows of data actually read + } + perStripe.close(); + } catch(IOException ioe) { + throw new RuntimeException(ioe); + } + return dout; + } + + + /** + * This method writes one column of H2O data frame at a time. + * + * @param oneColumn + * @param columnType + * @param cIdx + * @param rowNumber + * @param dout + */ + private void write1column(ColumnVector oneColumn, String columnType, int cIdx, int rowNumber,ParseWriter dout) { + if(oneColumn.isRepeating && !oneColumn.noNulls) { // ALL NAs + for(int i = 0; i < rowNumber; ++i) + dout.addInvalidCol(cIdx); + } else switch (columnType.toLowerCase()) { + case "bigint": + case "boolean": + case "int": + case "smallint": + case "tinyint": + writeLongcolumn((LongColumnVector)oneColumn, cIdx, rowNumber, dout); + break; + case "float": + case "double": + writeDoublecolumn((DoubleColumnVector)oneColumn, cIdx, rowNumber, dout); + break; + case "numeric": + case "real": + if (oneColumn instanceof LongColumnVector) + writeLongcolumn((LongColumnVector)oneColumn, cIdx, rowNumber, dout); + else + writeDoublecolumn((DoubleColumnVector)oneColumn, cIdx, rowNumber, dout); + break; + case "string": + case "varchar": + case "char": +// case "binary": //FIXME: only reading it as string right now. + writeStringcolumn((BytesColumnVector)oneColumn, cIdx, rowNumber, dout); + break; + case "date": + case "timestamp": + writeTimecolumn((LongColumnVector)oneColumn, columnType, cIdx, rowNumber, dout); + break; + case "decimal": + writeDecimalcolumn((DecimalColumnVector)oneColumn, cIdx, rowNumber, dout); + break; + default: + throw new IllegalArgumentException("Unsupported Orc schema type: " + columnType); + } + } + + /** + * This method is written to take care of the leap seconds, leap year effects. 
Our original + * plan of converting number of days from epoch does not quite work out right due to all these + * leap seconds, years accumulated over the century. However, I do notice that when we are + * not correcting for the leap seconds/years, if we build a dateTime object, the hour does not + * work out to be 00. Instead it is off. In this case, we just calculate the offset and take + * if off our straight forward timestamp calculation. + * + * @param daysSinceEpoch: number of days since epoch (1970 1/1) + * @return long: correct timestamp corresponding to daysSinceEpoch + */ + private long correctTimeStamp(long daysSinceEpoch) { + long timestamp = (daysSinceEpoch*DAY_TO_MS+ADD_OFFSET); + DateTime date = new DateTime(timestamp); + int hour = date.hourOfDay().get(); + if (hour == 0) + return timestamp; + else + return (timestamp-hour*HOUR_OFFSET); + } + + /** + * This method writes one column of H2O frame for column type timestamp. This is just a long that + * records the number of seconds since Jan 1, 2015. + * + * @param col + * @param cIdx + * @param rowNumber + * @param dout + */ + private void writeTimecolumn(LongColumnVector col, String columnType,int cIdx, + int rowNumber, ParseWriter dout) { + boolean timestamp = columnType.equals("timestamp"); + long [] oneColumn = col.vector; + if(col.isRepeating) { + long val = timestamp ? oneColumn[0] / 1000000 : correctTimeStamp(oneColumn[0]); + for (int rowIndex = 0; rowIndex < rowNumber; rowIndex++) + dout.addNumCol(cIdx, val, 0); + } else if(col.noNulls) { + for (int rowIndex = 0; rowIndex < rowNumber; rowIndex++) + dout.addNumCol(cIdx, timestamp ? oneColumn[rowIndex] / 1000000 : correctTimeStamp(oneColumn[rowIndex]), 0); + } else { + boolean[] isNull = col.isNull; + for (int rowIndex = 0; rowIndex < rowNumber; rowIndex++) { + if (isNull[rowIndex]) + dout.addInvalidCol(cIdx); + else + dout.addNumCol(cIdx, timestamp ? oneColumn[rowIndex] / 1000000 : correctTimeStamp(oneColumn[rowIndex]), 0); + } + } + } + + /** + * This method writes a column to H2O frame for column type Decimal. It is just written as some + * integer without using the scale field. Need to make sure this is what the customer wants. + * + * @param col + * @param cIdx + * @param rowNumber + * @param dout + */ + private void writeDecimalcolumn(DecimalColumnVector col, int cIdx, + int rowNumber, ParseWriter dout) { + HiveDecimalWritable[] oneColumn = col.vector; + if(col.isRepeating) { + HiveDecimal hd = oneColumn[0].getHiveDecimal(); + for (int rowIndex = 0; rowIndex < rowNumber; rowIndex++) + dout.addNumCol(cIdx, hd.unscaledValue().longValue(),-hd.scale()); + } else if(col.noNulls) { + for (int rowIndex = 0; rowIndex < rowNumber; rowIndex++) { + HiveDecimal hd = oneColumn[rowIndex].getHiveDecimal(); + dout.addNumCol(cIdx, hd.unscaledValue().longValue(),-hd.scale()); + } + } else { + boolean [] isNull = col.isNull; + for (int rowIndex = 0; rowIndex < rowNumber; rowIndex++) { + if (isNull[rowIndex]) + dout.addInvalidCol(cIdx); + else { + HiveDecimal hd = oneColumn[rowIndex].getHiveDecimal(); + dout.addNumCol(cIdx, hd.unscaledValue().longValue(), -hd.scale()); + } + } + } + } + + /** + * This method writes a column of H2O frame for Orc File column types of string, varchar, char and + * binary at some point. 
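For the decimal path above, the value handed to the writer is a mantissa/exponent pair rather than a double; a small, hypothetical illustration of the decomposition used by writeDecimalcolumn:

    // 123.45 is held by Hive as unscaled value 12345 with scale 2 and is forwarded as
    // addNumCol(col, 12345, -2), i.e. 12345 * 10^-2. The literal is hypothetical.
    org.apache.hadoop.hive.common.type.HiveDecimal hd =
        org.apache.hadoop.hive.common.type.HiveDecimal.create("123.45");
    long mantissa = hd.unscaledValue().longValue();   // 12345
    int exponent = -hd.scale();                       // -2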
+ * + * @param col + * @param cIdx + * @param rowNumber + * @param dout + */ + private void writeStringcolumn(BytesColumnVector col, int cIdx, int rowNumber, ParseWriter dout) { + BufferedString bs = new BufferedString(); + if(col.isRepeating) { + dout.addStrCol(cIdx, bs.set(col.vector[0], col.start[0], col.length[0])); + for (int rowIndex = 1; rowIndex < rowNumber; ++rowIndex) + dout.addStrCol(cIdx, bs); + } else if(col.noNulls){ + for (int rowIndex = 0; rowIndex < rowNumber; rowIndex++) + dout.addStrCol(cIdx, bs.set(col.vector[rowIndex], col.start[rowIndex], col.length[rowIndex])); + } else { + boolean [] isNull = col.isNull; + for (int rowIndex = 0; rowIndex < rowNumber; rowIndex++) { + if (isNull[rowIndex]) + dout.addInvalidCol(cIdx); + else + dout.addStrCol(cIdx, bs.set(col.vector[rowIndex], col.start[rowIndex], col.length[rowIndex])); + } + } + } + + + /** + * This method writes a column of H2O frame for Orc File column type of float or double. + * + * @param vec + * @param colId + * @param rowNumber + * @param dout + */ + private void writeDoublecolumn(DoubleColumnVector vec, int colId, int rowNumber, ParseWriter dout) { + double[] oneColumn = vec.vector; + byte t = _setup.getColumnTypes()[colId]; + switch(t) { + case Vec.T_CAT: + if(_toStringMaps.get(colId) == null) + _toStringMaps.put(colId,new HashMap()); + HashMap map = _toStringMaps.get(colId); + BufferedString bs = new BufferedString(); + if(vec.isRepeating) { + bs.set(Double.toString(oneColumn[0]).getBytes()); + for (int i = 0; i < rowNumber; ++i) + dout.addStrCol(colId, bs); + } else if (vec.noNulls) { + for (int i = 0; i < rowNumber; i++) { + double d = oneColumn[i]; + if(map.get(d) == null) // TODO probably more effficient if moved to the data output + map.put(d, Double.toString(d).getBytes()); + dout.addStrCol(colId, bs.set(map.get(d))); + } + } else { + for (int i = 0; i < rowNumber; i++) { + boolean [] isNull = vec.isNull; + if (isNull[i]) + dout.addInvalidCol(colId); + else { + double d = oneColumn[i]; + if(map.get(d) == null) + map.put(d,Double.toString(d).getBytes()); + dout.addStrCol(colId, bs.set(map.get(d))); + } + } + } + break; + default: + if(vec.isRepeating) { + for (int i = 0; i < rowNumber; ++i) + dout.addNumCol(colId, oneColumn[0]); + } else if (vec.noNulls) { + for (int rowIndex = 0; rowIndex < rowNumber; rowIndex++) + dout.addNumCol(colId, oneColumn[rowIndex]); + } else { + boolean [] isNull = vec.isNull; + for (int rowIndex = 0; rowIndex < rowNumber; rowIndex++) { + if (isNull[rowIndex]) dout.addInvalidCol(colId); + else dout.addNumCol(colId, oneColumn[rowIndex]); + } + } + break; + } + } + + /** + * This method writes a column of H2O frame for Orc File column type of boolean, bigint, int, smallint, + * tinyint and date. 
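When a numeric ORC column is forced to categorical (Vec.T_CAT), writeDoublecolumn above and writeLongcolumn below cache the textual form of each distinct value so the bytes are built only once per level. The pattern in isolation, with hypothetical values (in the parser the map lives in _toStringMaps, keyed by column index):

    // Per-column memoization of number-to-bytes conversion for categorical output.
    java.util.HashMap<Long, byte[]> levels = new java.util.HashMap<>();
    long v = 42L;                                     // one cell of a LongColumnVector
    byte[] text = levels.get(v);
    if (text == null) {                               // first time this level is seen
      text = Long.toString(v).getBytes();
      levels.put(v, text);
    }
    // the bytes are then handed to the writer via addStrCol(colId, bufferedString.set(text))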
+ * + * @param vec + * @param colId + * @param rowNumber + * @param dout + */ + private void writeLongcolumn(LongColumnVector vec, int colId, int rowNumber, ParseWriter dout) { + long[] oneColumn = vec.vector; + byte t = _setup.getColumnTypes()[colId]; + switch(t) { + case Vec.T_CAT: + if(_toStringMaps.get(colId) == null) + _toStringMaps.put(colId,new HashMap()); + HashMap map = _toStringMaps.get(colId); + BufferedString bs = new BufferedString(); + if(vec.isRepeating) { + bs.set(Long.toString(oneColumn[0]).getBytes()); + for (int i = 0; i < rowNumber; ++i) + dout.addStrCol(colId, bs); + } else if (vec.noNulls) { + for (int i = 0; i < rowNumber; i++) { + long l = oneColumn[i]; + if(map.get(l) == null) + map.put(l,Long.toString(l).getBytes()); + dout.addStrCol(colId, bs.set(map.get(l))); + } + } else { + for (int i = 0; i < rowNumber; i++) { + boolean [] isNull = vec.isNull; + if (isNull[i]) + dout.addInvalidCol(colId); + else { + long l = oneColumn[i]; + if(map.get(l) == null) + map.put(l,Long.toString(l).getBytes()); + dout.addStrCol(colId, bs.set(map.get(l))); + } + } + } + break; + default: + if(vec.isRepeating) { + for (int i = 0; i < rowNumber; ++i) + dout.addNumCol(colId, oneColumn[0], 0); + } else if (vec.noNulls) { + for (int rowIndex = 0; rowIndex < rowNumber; rowIndex++) { + check_Min_Value(oneColumn[rowIndex], colId, rowNumber, dout); + dout.addNumCol(colId, oneColumn[rowIndex], 0); + } + } else { + for (int rowIndex = 0; rowIndex < rowNumber; rowIndex++) { + boolean [] isNull = vec.isNull; + if (isNull[rowIndex]) + dout.addInvalidCol(colId); + else { + check_Min_Value(oneColumn[rowIndex], colId, rowNumber, dout); + dout.addNumCol(colId, oneColumn[rowIndex], 0); + } + } + } + break; + } + } + + /** + * This method is written to check and make sure any value written to a column of type long + * is more than Long.MIN_VALUE. If this is not true, a warning will be passed to the user. + * + * @param l + * @param cIdx + * @param rowNumber + * @param dout + */ + private void check_Min_Value(long l, int cIdx, int rowNumber, ParseWriter dout) { + if (l <= Long.MIN_VALUE) { + String warning = "Orc Parser: Long.MIN_VALUE: " + l + " is found in column "+cIdx+" row "+rowNumber + + " of stripe "+_cidx +". 
This value is used for sentinel and will not be parsed correctly."; + dout.addError(new ParseWriter.ParseErr(warning, _cidx, rowNumber, -2L)); + } + } + + public static class OrcParseSetup extends ParseSetup { + // expand to include Orc specific fields + transient Reader orcFileReader; + String[] columnTypesString; + boolean[] toInclude; + String[] allColumnNames; + + public OrcParseSetup(int ncols, + String[] columnNames, + byte[] ctypes, + String[][] domains, + String[][] naStrings, + String[][] data, + Reader orcReader, + String[] columntypes, + boolean[] toInclude, + String[] allColNames, ParseWriter.ParseErr[] errs) { + super(OrcParserProvider.ORC_INFO, (byte) '|', true, HAS_HEADER , + ncols, columnNames, ctypes, domains, naStrings, data, errs); + this.orcFileReader = orcReader; + this.columnTypesString = columntypes; + this.toInclude = toInclude; + this.allColumnNames = allColNames; + } + + @Override + protected boolean isCompatible(ParseSetup setupB) { + return super.isCompatible(setupB) && Arrays.equals(getColumnTypes(),setupB.getColumnTypes()); + } + + @Override + protected Parser parser(Key jobKey) { + return new OrcParser(this, jobKey); + } + + public Reader getOrcFileReader() { + return this.orcFileReader; + } + + public String[] getColumnTypesString() { + return this.columnTypesString; + } + + public void setColumnTypeStrings(String[] columnTypeStrings) { + this.columnTypesString = columnTypeStrings; + } + + public boolean[] getToInclude() { return this.toInclude; } + public String[] getAllColNames() { return this.allColumnNames; } + public void setAllColNames(String[] columnNames) { + this.allColumnNames = allColumnNames; + } + + public void setOrcFileReader(Reader orcFileReader) { + this.orcFileReader = orcFileReader; + this.stripesInfo = orcFileReader.getStripes(); + } + private transient List stripesInfo; + public List getStripes() {return stripesInfo;} + } + + // types are flattened in pre-order tree walk, here we just count the number of fields for non-primitve types + // which are ignored for now + static private int countStructFields(ObjectInspector x, ArrayList allColumnNames) { + int res = 1; + switch(x.getCategory()) { + case STRUCT: + StructObjectInspector structObjectInspector = (StructObjectInspector) x; + List allColumns = (List) structObjectInspector.getAllStructFieldRefs(); // column info + for (StructField oneField : allColumns) { + allColumnNames.add(oneField.getFieldName()); + res += countStructFields(oneField.getFieldObjectInspector(),allColumnNames); + } + break; + case LIST: + ListObjectInspector listObjectInspector = (ListObjectInspector) x; + allColumnNames.add("list"); + res += countStructFields(listObjectInspector.getListElementObjectInspector(),allColumnNames); + break; + case MAP: + MapObjectInspector mapObjectInspector = (MapObjectInspector) x; + allColumnNames.add("mapKey"); + res += countStructFields(mapObjectInspector.getMapKeyObjectInspector(),allColumnNames); + allColumnNames.add("mapValue"); + res += countStructFields(mapObjectInspector.getMapValueObjectInspector(),allColumnNames); + break; + case UNION: + UnionObjectInspector unionObjectInspector = (UnionObjectInspector)x; + allColumnNames.add("union"); + for( ObjectInspector xx:unionObjectInspector.getObjectInspectors()) + res += countStructFields(xx,allColumnNames); + break; + case PRIMITIVE:break; + default: throw H2O.unimpl(); + } + return res; + } + /* + * This function will derive information like column names, types and number from + * the inspector. 
+ */ + static OrcParseSetup deriveParseSetup(Reader orcFileReader, StructObjectInspector insp) { + List allColumns = (List) insp.getAllStructFieldRefs(); // grab column info + List allStripes = orcFileReader.getStripes(); // grab stripe information + ArrayList allColNames = new ArrayList<>(); + boolean[] toInclude = new boolean[allColumns.size()+1]; + int supportedFieldCnt = 0 ; + int colIdx = 0; + for (StructField oneField:allColumns) { + allColNames.add(oneField.getFieldName()); + String columnType = oneField.getFieldObjectInspector().getTypeName(); + if (columnType.toLowerCase().contains("decimal")) { + columnType = "decimal"; + } + if (isSupportedSchema(columnType)) { + toInclude[colIdx+1] = true; + supportedFieldCnt++; + } + int cnt = countStructFields(oneField.getFieldObjectInspector(),allColNames); + if(cnt > 1) + toInclude = Arrays.copyOf(toInclude,toInclude.length + cnt-1); + colIdx+=cnt; + } + String [] allNames = allColNames.toArray(new String[allColNames.size()]); + String[] names = new String[supportedFieldCnt]; + + byte[] types = new byte[supportedFieldCnt]; + String[][] domains = new String[supportedFieldCnt][]; + String[] dataPreview = new String[supportedFieldCnt]; + String[] dataTypes = new String[supportedFieldCnt]; + ParseWriter.ParseErr[] errs = new ParseWriter.ParseErr[0]; + + // go through all column information + int columnIndex = 0; + for (StructField oneField : allColumns) { + String columnType = oneField.getFieldObjectInspector().getTypeName(); + if (columnType.toLowerCase().contains("decimal")) + columnType = "decimal"; // get rid of strange attachment + if (isSupportedSchema(columnType)) { + names[columnIndex] = oneField.getFieldName(); + types[columnIndex] = schemaToColumnType(columnType); + dataTypes[columnIndex] = columnType; + columnIndex++; + } else { + errs = ArrayUtils.append(errs, new ParseWriter.ParseErr("Orc Parser: Skipping field: " + + oneField.getFieldName() + " because of unsupported type: " + columnType, -1, -1L, -2L)); + } + } + + // get size of each stripe + long[] stripeSizes = new long[allStripes.size()]; + long fileSize = 0L; + long maxStripeSize = 0L; + + for (int index = 0; index < allStripes.size(); index++) { + long stripeSize = allStripes.get(index).getDataLength(); + + if (stripeSize > maxStripeSize) + maxStripeSize = stripeSize; + + fileSize = fileSize + stripeSize; + stripeSizes[index] = fileSize; + } + OrcParseSetup ps = new OrcParseSetup( + supportedFieldCnt, + names, + types, + domains, + null, + new String[][] { dataPreview }, + orcFileReader, + dataTypes, + toInclude, + allNames, + errs + ); + + return ps; + } +} \ No newline at end of file diff --git a/h2o-parsers/h2o-orc-parser/src/main/java/water/parser/orc/OrcParserProvider.java b/h2o-parsers/h2o-orc-parser/src/main/java/water/parser/orc/OrcParserProvider.java new file mode 100644 index 000000000000..6fcf921f72bf --- /dev/null +++ b/h2o-parsers/h2o-orc-parser/src/main/java/water/parser/orc/OrcParserProvider.java @@ -0,0 +1,138 @@ +package water.parser.orc; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.orc.OrcFile; +import org.apache.hadoop.hive.ql.io.orc.Reader; +import org.apache.hadoop.hive.ql.io.orc.StripeInformation; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import water.DKV; +import water.H2O; +import water.Job; +import water.Key; +import water.fvec.*; +import water.parser.*; +import water.persist.PersistHdfs; + +import java.io.IOException; +import 
java.util.Arrays; +import java.util.List; + +import static water.fvec.FileVec.getPathForKey; + + +/** + * Orc parser provider. + */ +public class OrcParserProvider extends ParserProvider { + + /* Setup for this parser */ + static ParserInfo ORC_INFO = new ParserInfo("ORC", DefaultParserProviders.MAX_CORE_PRIO + 20, true); + + @Override + public ParserInfo info() { + return ORC_INFO; + } + + @Override + public Parser createParser(ParseSetup setup, Key jobKey) { + return new OrcParser(setup, jobKey); + } + + @Override + public ParseSetup guessSetup(ByteVec bv, byte [] bits, byte sep, int ncols, boolean singleQuotes, + int checkHeader, String[] columnNames, byte[] columnTypes, + String[][] domains, String[][] naStrings) { + if(bv instanceof FileVec) + return readSetup((FileVec)bv, columnNames, columnTypes); + throw new UnsupportedOperationException("ORC only works on Files"); + } + + /** + * Use only the first file to setup everything. + * + * @param inputs input keys + * @param requiredSetup user given parser setup + * @return + */ + @Override + public ParseSetup createParserSetup(Key[] inputs, ParseSetup requiredSetup) { + + FileVec f; + Object frameOrVec = DKV.getGet(inputs[0]); + + if (frameOrVec instanceof water.fvec.Frame) + f = (FileVec) ((Frame) frameOrVec).vec(0); + else + f = (FileVec) frameOrVec; + return readSetup(f, requiredSetup.getColumnNames(), requiredSetup.getColumnTypes()); + } + + private Reader getReader(FileVec f) throws IOException { + String strPath = getPathForKey(f._key); + Path path = new Path(strPath); + if(f instanceof HDFSFileVec) + return OrcFile.createReader(PersistHdfs.getFS(strPath), path); + else + return OrcFile.createReader(path, OrcFile.readerOptions(new Configuration())); + } + + /** + * This method will create the readers and others info needed to parse an orc file. + * In addition, it will not over-ride the columnNames, columnTypes that the user + * may want to force upon it. However, we only allow users to set column types to + * enum at this point and ignore all the other requests. 
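The provider above is registered through the standard META-INF/services entry for water.parser.ParserProvider added later in this patch, so it is presumably picked up via java.util.ServiceLoader. A sketch of what that discovery looks like from client code:

    // Sketch only: enumerate parser providers the way a ServiceLoader-backed registry would.
    java.util.ServiceLoader<water.parser.ParserProvider> loader =
        java.util.ServiceLoader.load(water.parser.ParserProvider.class);
    for (water.parser.ParserProvider p : loader)
      System.out.println(p.info());                   // the ORC provider reports its ParserInfo ("ORC")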
+ * + * @param f + * @param columnNames + * @param columnTypes + * @return + */ + public ParseSetup readSetup(FileVec f, String[] columnNames, byte[] columnTypes) { + try { + Reader orcFileReader = getReader(f); + StructObjectInspector insp = (StructObjectInspector) orcFileReader.getObjectInspector(); + OrcParser.OrcParseSetup stp = OrcParser.deriveParseSetup(orcFileReader, insp); + + // change back the columnNames and columnTypes if they are specified already + if (!(columnNames == null) && (stp.getAllColNames().length == columnNames.length)) { // copy column name + stp.setColumnNames(columnNames); + stp.setAllColNames(columnNames); + } + + if (!(columnTypes == null) && (columnTypes.length == stp.getColumnTypes().length)) { // copy enum type only + byte[] old_columnTypes = stp.getColumnTypes(); + String[] old_columnTypeNames = stp.getColumnTypesString(); + for (int index = 0; index < columnTypes.length; index++) { + if (columnTypes[index] == Vec.T_CAT) // only copy the enum types + old_columnTypes[index] = columnTypes[index]; + } + stp.setColumnTypes(old_columnTypes); + stp.setColumnTypeStrings(old_columnTypeNames); + } + + List stripesInfo = orcFileReader.getStripes(); + if(stripesInfo.size() == 0) { // empty file + f.setChunkSize(stp._chunk_size = (int)f.length()); + return stp; + } + f.setNChunks(stripesInfo.size()); + stp._chunk_size = f._chunkSize; + assert f.nChunks() == stripesInfo.size(); // ORC parser needs one-to one mapping between chunk and strip (just ids, offsets do not matter) + return stp; + } catch(IOException ioe) { + throw new RuntimeException(ioe); + } + } + + @Override + public ParseSetup setupLocal(Vec v, ParseSetup setup){ + if(!(v instanceof FileVec)) throw H2O.unimpl("ORC only implemented for HDFS / NFS files"); + try { + ((OrcParser.OrcParseSetup)setup).setOrcFileReader(getReader((FileVec)v)); + + return setup; + + } catch (IOException e) {throw new RuntimeException(e);} + } +} diff --git a/h2o-parsers/h2o-orc-parser/src/main/java/water/parser/orc/OrcUtil.java b/h2o-parsers/h2o-orc-parser/src/main/java/water/parser/orc/OrcUtil.java new file mode 100644 index 000000000000..4983eb775788 --- /dev/null +++ b/h2o-parsers/h2o-orc-parser/src/main/java/water/parser/orc/OrcUtil.java @@ -0,0 +1,72 @@ +package water.parser.orc; + +import water.fvec.Vec; + +/** + * Utilities to work with Orc schema. + */ +public final class OrcUtil { + + /** Return true if the given schema can be transformed + * into h2o type. + * + * @param s orc field name in string + * @return true if the schema can be transformed into H2O type + */ + public static boolean isSupportedSchema(String s) { + + switch (s.toLowerCase()) { + case "boolean": + case "bigint": // long +// case "binary": // removed binary column type support for now + case "char": + case "date": + case "decimal": + case "double": + case "float": + case "int": + case "smallint": + case "string": + case "timestamp": + case "tinyint": + case "varchar": + case "enum": + return true; + default: + return false; + } + } + + /** + * Transform Orc column types into H2O type. + * + * @param s Orc data type + * @return a byte representing H2O column type + * @throws IllegalArgumentException if schema is not supported + */ + public static byte schemaToColumnType(String s) { + switch (s.toLowerCase()) { + case "boolean": + case "smallint": + case "tinyint": + case "bigint": // FIXME: make sure this is fixed by Tomas. 
+ case "int": + case "float": + case "double": + case "decimal": + return Vec.T_NUM; + case "timestamp": + case "date": + return Vec.T_TIME; + case "enum": + return Vec.T_CAT; + case "string": + case "varchar": +// case "binary": // Removed binary column type support for now + case "char": + return Vec.T_STR; + default: + throw new IllegalArgumentException("Unsupported Orc schema type: " + s); + } + } +} \ No newline at end of file diff --git a/h2o-parsers/h2o-orc-parser/src/main/resources/META-INF/services/water.parser.ParserProvider b/h2o-parsers/h2o-orc-parser/src/main/resources/META-INF/services/water.parser.ParserProvider new file mode 100644 index 000000000000..f48f8fd351a5 --- /dev/null +++ b/h2o-parsers/h2o-orc-parser/src/main/resources/META-INF/services/water.parser.ParserProvider @@ -0,0 +1 @@ +water.parser.orc.OrcParserProvider diff --git a/h2o-parsers/h2o-orc-parser/src/test/java/water/parser/ParseTestMultiFileOrc.java b/h2o-parsers/h2o-orc-parser/src/test/java/water/parser/ParseTestMultiFileOrc.java new file mode 100644 index 000000000000..22c89d841116 --- /dev/null +++ b/h2o-parsers/h2o-orc-parser/src/test/java/water/parser/ParseTestMultiFileOrc.java @@ -0,0 +1,55 @@ +package water.parser; + + +import org.junit.BeforeClass; +import org.junit.Test; +import water.TestUtil; +import water.fvec.Frame; + +import static org.junit.Assert.assertTrue; + +/** + * Test suite for orc parser. + * + * This test will attempt to perform multi-file parsing of a csv and orc file and compare + * the frame summary statistics to make sure they are equivalent. + * + * -- Requested by Tomas N. + * + */ +public class ParseTestMultiFileOrc extends TestUtil { + + private double EPSILON = 1e-9; + private long ERRORMARGIN = 1000L; // error margin when compare timestamp. + int totalFilesTested = 0; + int numberWrong = 0; + + private String[] csvDirectories = {"bigdata/laptop/parser/orc/pubdev_3200/air05_csv", + "bigdata/laptop/parser/orc/milsongs_orc_csv", "smalldata/synthetic_perfect_separation"}; + private String[] orcDirectories = {"bigdata/laptop/parser/orc/pubdev_3200/air05_orc", + "bigdata/laptop/parser/orc/milsongs_orc", "smalldata/parser/orc/synthetic_perfect_separation"}; + + @BeforeClass + static public void setup() { TestUtil.stall_till_cloudsize(5); } + + @Test + public void testParseMultiFileOrcs() { + + for (int f_index = 0; f_index < csvDirectories.length; f_index++) { + Frame csv_frame = parse_test_folder(csvDirectories[f_index], "\\N", 0, null); + + byte[] types = csv_frame.types(); + + for (int index = 0; index < types.length; index++) { + if (types[index] == 0) + types[index] = 4; + } + + Frame orc_frame = parse_test_folder(orcDirectories[f_index], null, 0, types); + assertTrue(TestUtil.isIdenticalUpToRelTolerance(csv_frame, orc_frame, 1e-5)); + + csv_frame.delete(); + orc_frame.delete(); + } + } +} \ No newline at end of file diff --git a/h2o-parsers/h2o-orc-parser/src/test/java/water/parser/ParseTestORCCSV.java b/h2o-parsers/h2o-orc-parser/src/test/java/water/parser/ParseTestORCCSV.java new file mode 100644 index 000000000000..89c4f9e0a62d --- /dev/null +++ b/h2o-parsers/h2o-orc-parser/src/test/java/water/parser/ParseTestORCCSV.java @@ -0,0 +1,78 @@ +package water.parser; + + +import org.junit.BeforeClass; +import org.junit.Test; +import water.TestUtil; +import water.fvec.Frame; + +import static org.junit.Assert.assertTrue; + +/** + * Test suite for orc parser. + * + * This test will attempt to parse a bunch of files (orc and csv). 
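Going back to the OrcUtil helpers introduced above, the type mapping can be exercised directly; a few hypothetical checks that follow from the switch statements:

    // Sketch only: expected behavior of the OrcUtil schema mapping.
    assert water.parser.orc.OrcUtil.isSupportedSchema("varchar");
    assert !water.parser.orc.OrcUtil.isSupportedSchema("map<string,int>");   // complex types are skipped
    assert water.parser.orc.OrcUtil.schemaToColumnType("timestamp") == water.fvec.Vec.T_TIME;
    assert water.parser.orc.OrcUtil.schemaToColumnType("decimal") == water.fvec.Vec.T_NUM;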
We compare the frames of these files and make + * sure that they are equivalent. + * + * -- Requested by Tomas N. + * + */ +public class ParseTestORCCSV extends TestUtil { + + private double EPSILON = 1e-9; + private long ERRORMARGIN = 1000L; // error margin when compare timestamp. + int totalFilesTested = 0; + int numberWrong = 0; + + private String[] csvFiles = {"smalldata/parser/orc/orc2csv/TestOrcFile.testDate1900.csv", + "smalldata/parser/orc/orc2csv/TestOrcFile.testDate2038.csv", + "smalldata/parser/orc/orc2csv/orc_split_elim.csv", "smalldata/parser/csv2orc/prostate_NA.csv", + "smalldata/iris/iris.csv", "smalldata/jira/hexdev_29.csv"}; + + private String[] orcFiles = {"smalldata/parser/orc/TestOrcFile.testDate1900.orc", + "smalldata/parser/orc/TestOrcFile.testDate2038.orc", "smalldata/parser/orc/orc_split_elim.orc", + "smalldata/parser/orc/prostate_NA.orc", "smalldata/parser/orc/iris.orc", + "smalldata/parser/orc/hexdev_29.orc"}; + + private Boolean[] forceColumnTypes = {false, false, false, true, true, true}; + + @BeforeClass + static public void setup() { TestUtil.stall_till_cloudsize(5); } + + @Test + public void testParseOrcCsvFiles() { + int f_index = 0; + Frame csv_frame = parse_test_file(csvFiles[f_index], "\\N", 0, null); + Frame orc_frame = null; + + if (forceColumnTypes[f_index]) { + byte[] types = csv_frame.types(); + + for (int index = 0; index < types.length; index++) { + if (types[index] == 0) + types[index] = 3; + } + + orc_frame = parse_test_file(orcFiles[f_index], null, 0, types); + } else { + orc_frame = parse_test_file(orcFiles[f_index], null, 0, null); + } + + + // make sure column types are the same especially the enums + byte[] csv_types = csv_frame.types(); + byte[] orc_types = orc_frame.types(); + + for (int index = 0; index < csv_frame.numCols(); index++) { + if ((csv_types[index] == 4) && (orc_types[index] == 2)) { + orc_frame.replace(index, orc_frame.vec(index).toCategoricalVec().toNumericVec()); + csv_frame.replace(index, csv_frame.vec(index).toNumericVec()); + } + } + + assertTrue(TestUtil.isIdenticalUpToRelTolerance(csv_frame, orc_frame, 1e-5)); + + csv_frame.delete(); + orc_frame.delete(); + } +} \ No newline at end of file diff --git a/h2o-parsers/h2o-orc-parser/src/test/java/water/parser/ParseTestOrc.java b/h2o-parsers/h2o-orc-parser/src/test/java/water/parser/ParseTestOrc.java new file mode 100644 index 000000000000..d0e66af2cdb5 --- /dev/null +++ b/h2o-parsers/h2o-orc-parser/src/test/java/water/parser/ParseTestOrc.java @@ -0,0 +1,434 @@ +package water.parser; + + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.vector.*; +import org.apache.hadoop.hive.ql.io.orc.OrcFile; +import org.apache.hadoop.hive.ql.io.orc.Reader; +import org.apache.hadoop.hive.ql.io.orc.RecordReader; +import org.apache.hadoop.hive.ql.io.orc.StripeInformation; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.joda.time.DateTime; +import org.junit.BeforeClass; +import org.junit.Test; +import water.TestUtil; +import water.fvec.Frame; +import water.fvec.Vec; +import water.util.Log; + +import java.util.*; + +import java.io.File; +import java.io.IOException; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static water.parser.orc.OrcUtil.isSupportedSchema; + +/** + * Test suite for orc 
parser. + * + * This test will build a H2O frame for all orc files found in smalldata/parser/orc directory + * and compare the H2O frame content with the orc file content read with Core Java commands. + * Test is declared a success if the content of H2O frame is the same as the contents read + * by using core Java commands off the Orc file itself. No multi-threading is used in reading + * off the Orc file using core Java commands. + */ +public class ParseTestOrc extends TestUtil { + + private double EPSILON = 1e-9; + private long ERRORMARGIN = 1000L; // error margin when compare timestamp. + int totalFilesTested = 0; + int numberWrong = 0; + BufferedString h2o = new BufferedString(); + BufferedString tempOrc = new BufferedString(); + public static final int DAY_TO_MS = 24*3600*1000; + public static final int ADD_OFFSET = 8*3600*1000; + public static final int HOUR_OFFSET = 3600000; // in ms to offset for leap seconds, years + + // list all orc files in smalldata/parser/orc directory + private String[] allOrcFiles = { + "smalldata/parser/orc/TestOrcFile.columnProjection.orc", + "smalldata/parser/orc/bigint_single_col.orc", + "smalldata/parser/orc/TestOrcFile.emptyFile.orc", + "smalldata/parser/orc/bool_single_col.orc", +// "smalldata/parser/orc/TestOrcFile.metaData.orc", +// "smalldata/parser/orc/decimal.orc", +// "smalldata/parser/orc/TestOrcFile.test1.orc", + "smalldata/parser/orc/demo-11-zlib.orc", + "smalldata/parser/orc/TestOrcFile.testDate1900.orc", + "smalldata/parser/orc/demo-12-zlib.orc", + "smalldata/parser/orc/TestOrcFile.testDate2038.orc", + "smalldata/parser/orc/double_single_col.orc", + "smalldata/parser/orc/TestOrcFile.testMemoryManagementV11.orc", + "smalldata/parser/orc/float_single_col.orc", + "smalldata/parser/orc/TestOrcFile.testMemoryManagementV12.orc", + "smalldata/parser/orc/int_single_col.orc", + "smalldata/parser/orc/TestOrcFile.testPredicatePushdown.orc", + "smalldata/parser/orc/nulls-at-end-snappy.orc", +// "smalldata/parser/orc/TestOrcFile.testSeek.orc", +// "smalldata/parser/orc/orc-file-11-format.orc", + "smalldata/parser/orc/TestOrcFile.testSnappy.orc", + "smalldata/parser/orc/orc_split_elim.orc", + "smalldata/parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc", +// "smalldata/parser/orc/over1k_bloom.orc", + "smalldata/parser/orc/TestOrcFile.testStripeLevelStats.orc", + "smalldata/parser/orc/smallint_single_col.orc", +// "smalldata/parser/orc/TestOrcFile.testTimestamp.orc", + "smalldata/parser/orc/string_single_col.orc", +// "smalldata/parser/orc/TestOrcFile.testUnionAndTimestamp.orc", + "smalldata/parser/orc/tinyint_single_col.orc", + "smalldata/parser/orc/TestOrcFile.testWithoutIndex.orc", +// "smalldata/parser/orc/version1999.orc" + }; + + @BeforeClass + static public void setup() { TestUtil.stall_till_cloudsize(5); } + + @Test + public void testParseAllOrcs() { + Set failedFiles = new TreeSet<>(); + int numOfOrcFiles = allOrcFiles.length; // number of Orc Files to test + + for (int fIndex = 0; fIndex < numOfOrcFiles; fIndex++) + { + +// if ((fIndex == 4) || (fIndex == 6) || (fIndex == 18) || (fIndex == 23) || (fIndex == 28)) +// continue; // do not support metadata from user +// +// if (fIndex == 31) // contain only orc header, no column and no row, total file size is 0. 
+// continue; +// +// if (fIndex == 19) // different column names are used between stripes +// continue; +// +// if (fIndex == 26) // abnormal orc file, no inpsector structure available +// continue; + +// if (fIndex ==30) // problem getting the right column number and then comparison problem +// continue; + +// if (fIndex == 22) // problem with BufferedString retrieval for binary, wait for Tomas +// continue; +// +// if (fIndex == 17) // problem with bigint retrieval, wait for Tomas +// continue; + +// Random rn = new Random(); +// int randNum = rn.nextInt(10); +// +// if (randNum > 3) // skip test for 70% of the time +// continue; + + String fileName = allOrcFiles[fIndex]; + Log.info("Orc Parser parsing " + fileName); + File f = find_test_file_static(fileName); + + if (f != null && f.exists()) { + Configuration conf = new Configuration(); + Path p = new Path(f.toString()); + try { + Reader orcFileReader = OrcFile.createReader(p, OrcFile.readerOptions(conf)); // orc reader + Frame h2oFrame = parse_test_file(fileName); // read one orc file and build a H2O frame + + compareH2OFrame(fileName, failedFiles, h2oFrame, orcFileReader); + + if (h2oFrame != null) // delete frame after done. + h2oFrame.delete(); + + totalFilesTested++; + + } catch (IOException e) { + e.printStackTrace(); + failedFiles.add(fileName); + numberWrong++; + } + + } else { + Log.warn("The following file was not found: " + fileName); + failedFiles.add(fileName); + numberWrong++; + } + } + + if (numberWrong > 0) { + Log.warn("There are errors in your test."); + assertEquals("Number of orc files failed to parse is: " + numberWrong + ", failed files = " + + failedFiles.toString(), 0, numberWrong); + } else { + Log.info("Parser test passed! Number of files parsed is " + totalFilesTested); + } + } + + /** + * This method will take one H2O frame generated by the Orc parser and the fileName of the Orc file + * and attempt to compare the content of the Orc file to the H2O frame. In particular, the following + * are compared: + * - column names; + * - number of columns and rows; + * - content of each row. + * + * If all comparison pass, the test will pass. Otherwise, the test will fail. 
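Both this test and OrcParser.deriveParseSetup size the include mask as column count plus one and set flags at index + 1; the extra leading slot presumably corresponds to the ORC root struct, which is an assumption and not stated in the patch. A compact sketch of the convention:

    // Include-mask convention used above: slot 0 stays false, column i maps to slot i + 1.
    int nCols = 3;                                    // hypothetical column count
    boolean[] toInclude = new boolean[nCols + 1];
    for (int i = 0; i < nCols; i++)
      toInclude[i + 1] = true;                        // include every supported column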
+ * + * @param h2oFrame + * @param orcReader + */ + private void compareH2OFrame(String fileName, Set failedFiles, Frame h2oFrame, Reader orcReader) { + // grab column names, column and row numbers + StructObjectInspector insp = (StructObjectInspector) orcReader.getObjectInspector(); + List allColInfo = (List) insp.getAllStructFieldRefs(); // get info of all cols + + // compare number of columns and rows + int allColNumber = allColInfo.size(); // get and check column number + boolean[] toInclude = new boolean[allColNumber+1]; + + int colNumber = 0 ; + int index1 = 0; + for (StructField oneField:allColInfo) { + String colType = oneField.getFieldObjectInspector().getTypeName(); + + if (colType.toLowerCase().contains("decimal")) + colType = "decimal"; + + if (isSupportedSchema(colType)) { + toInclude[index1 + 1] = true; + colNumber++; + } + + index1++; + } + + assertEquals("Number of columns need to be the same: ", colNumber, h2oFrame.numCols()); + + // compare column names + String[] colNames = new String[colNumber]; + String[] colTypes = new String[colNumber]; + int colIndex = 0; + + for (int index = 0; index < allColNumber; index++) { // get and check column names + String typeName = allColInfo.get(index).getFieldObjectInspector().getTypeName(); + + if (typeName.toLowerCase().contains("decimal")) + typeName = "decimal"; + + if (isSupportedSchema(typeName)) { + colNames[colIndex] = allColInfo.get(index).getFieldName(); + colTypes[colIndex] = typeName; + colIndex++; + } + } + assertArrayEquals("Column names need to be the same: ", colNames, h2oFrame._names); + + // compare one column at a time of the whole row? + compareFrameContents(fileName, failedFiles, h2oFrame, orcReader, colTypes, colNames, toInclude); + + Long totalRowNumber = orcReader.getNumberOfRows(); // get and check row number + assertEquals("Number of rows need to be the same: ", totalRowNumber, (Long) h2oFrame.numRows()); + + } + + + private void compareFrameContents(String fileName, Set failedFiles, Frame h2oFrame, Reader orcReader, + String[] colTypes, String[] colNames, boolean[] toInclude) { + // prepare parameter to read a orc file. +// boolean[] toInclude = new boolean[colNumber+1]; // must equal to number of column+1 +// Arrays.fill(toInclude, true); + + List stripesInfo = orcReader.getStripes(); // get all stripe info + + if (stripesInfo.size() == 0) { // Orc file contains no data + assertEquals("Orc file is empty. 
H2O frame row number should be zero: ", 0, h2oFrame.numRows()); + } else { + Long startRowIndex = 0L; // row index into H2O frame + for (StripeInformation oneStripe : stripesInfo) { + try { + RecordReader perStripe = orcReader.rows(oneStripe.getOffset(), oneStripe.getDataLength(), toInclude, null, + colNames); + VectorizedRowBatch batch = perStripe.nextBatch(null); // read orc file stripes in vectorizedRowBatch + + boolean done = false; + Long rowCounts = 0L; + Long rowNumber = oneStripe.getNumberOfRows(); // row number of current stripe + + while (!done) { + long currentBatchRow = batch.count(); // row number of current batch + + ColumnVector[] dataVectors = batch.cols; + + int colIndex = 0; + for (int cIdx = 0; cIdx < batch.numCols; cIdx++) { // read one column at a time; + if (toInclude[cIdx+1]) { + compare1Cloumn(dataVectors[cIdx], colTypes[colIndex].toLowerCase(), colIndex, currentBatchRow, + h2oFrame.vec(colNames[colIndex]), startRowIndex); + colIndex++; + } + } + + rowCounts = rowCounts + currentBatchRow; // record number of rows of data actually read + startRowIndex = startRowIndex + currentBatchRow; + + if (rowCounts >= rowNumber) // read all rows of the stripe already. + done = true; + + if (!done) // not done yet, get next batch + batch = perStripe.nextBatch(batch); + } + + perStripe.close(); + } catch (Throwable e) { + numberWrong++; + failedFiles.add(fileName); + e.printStackTrace(); + // assertEquals("Test failed! ", true, false); + } + } + } + } + + private void compare1Cloumn(ColumnVector oneColumn, String columnType, int cIdx, long currentBatchRow, + Vec h2oColumn, Long startRowIndex) { + +// if (columnType.contains("bigint")) // cannot handle big integer right now +// return; + + if (columnType.contains("binary")) // binary retrieval problem. Tomas + return; + + switch (columnType) { + case "boolean": + case "bigint": // FIXME: not working right now + case "int": + case "smallint": + case "tinyint": + CompareLongcolumn(oneColumn, oneColumn.isNull, currentBatchRow, h2oColumn, startRowIndex); + break; + case "float": + case "double": + compareDoublecolumn(oneColumn, oneColumn.isNull, currentBatchRow, h2oColumn, startRowIndex); + break; + case "string": //FIXME: not working right now + case "varchar": + case "char": + case "binary": //FIXME: only reading it as string right now. + compareStringcolumn(oneColumn, oneColumn.isNull, currentBatchRow, h2oColumn, startRowIndex, columnType); + break; + case "timestamp": + case "date": + compareTimecolumn(oneColumn, columnType, oneColumn.isNull, currentBatchRow, h2oColumn, startRowIndex); + break; + case "decimal": + compareDecimalcolumn(oneColumn, oneColumn.isNull, currentBatchRow, h2oColumn, startRowIndex); + break; + default: + Log.warn("String, bigint are not tested. 
H2O frame is built for them but cannot be verified."); + } + } + + private void compareDecimalcolumn(ColumnVector oneDecimalColumn, boolean[] isNull, + long currentBatchRow, Vec h2oFrame, Long startRowIndex) { + HiveDecimalWritable[] oneColumn= ((DecimalColumnVector) oneDecimalColumn).vector; + long frameRowIndex = startRowIndex; + + for (int rowIndex = 0; rowIndex < currentBatchRow; rowIndex++) { + if (isNull[rowIndex]) + assertEquals("Na is found: ", true, h2oFrame.isNA(frameRowIndex)); + else + assertEquals("Decimal elements should equal: ", Double.parseDouble(oneColumn[rowIndex].toString()), + h2oFrame.at(frameRowIndex), EPSILON); + + frameRowIndex++; + } + } + + private void compareTimecolumn(ColumnVector oneTSColumn, String columnType, boolean[] isNull, long currentBatchRow, + Vec h2oFrame, Long startRowIndex) { + long[] oneColumn = ((LongColumnVector) oneTSColumn).vector; + long frameRowIndex = startRowIndex; + + for (int rowIndex = 0; rowIndex < currentBatchRow; rowIndex++) { + if (isNull[rowIndex]) + assertEquals("Na is found: ", true, h2oFrame.isNA(frameRowIndex)); + else { + if (columnType.contains("timestamp")) + assertEquals("Numerical elements should equal: ", oneColumn[rowIndex]/1000000, h2oFrame.at8(frameRowIndex), + ERRORMARGIN); + else + assertEquals("Numerical elements should equal: ", correctTimeStamp(oneColumn[rowIndex]), + h2oFrame.at8(frameRowIndex), ERRORMARGIN); + } + + frameRowIndex++; + } + } + + private void compareStringcolumn(ColumnVector oneStringColumn, boolean[] isNull, + long currentBatchRow, Vec h2oFrame, Long startRowIndex, String columnType) { + byte[][] oneColumn = ((BytesColumnVector) oneStringColumn).vector; + int[] stringLength = ((BytesColumnVector) oneStringColumn).length; + int[] stringStart = ((BytesColumnVector) oneStringColumn).start; + long frameRowIndex = startRowIndex; + + for (int rowIndex = 0; rowIndex < currentBatchRow; rowIndex++) { + if (isNull[rowIndex]) + assertEquals("Na is found: ", true, h2oFrame.isNA(frameRowIndex)); + else { + if (!oneStringColumn.isRepeating || rowIndex == 0) + tempOrc.set(oneColumn[rowIndex], stringStart[rowIndex], stringLength[rowIndex]); + h2oFrame.atStr(h2o, frameRowIndex); + assertEquals("isRepeating = " + oneStringColumn.isRepeating + " String/char elements should equal: ", true, tempOrc.equals(h2o)); + } + + frameRowIndex++; + } + } + + private void compareDoublecolumn(ColumnVector oneDoubleColumn, boolean[] isNull, + long currentBatchRow, Vec h2oFrame, Long startRowIndex) { + double[] oneColumn= ((DoubleColumnVector) oneDoubleColumn).vector; + long frameRowIndex = startRowIndex; + + for (int rowIndex = 0; rowIndex < currentBatchRow; rowIndex++) { + if (isNull[rowIndex]) + assertEquals("Na is found: ", true, h2oFrame.isNA(frameRowIndex)); + else + assertEquals("Numerical elements should equal: ", oneColumn[rowIndex], h2oFrame.at(frameRowIndex), EPSILON); + + frameRowIndex++; + } + } + + private void CompareLongcolumn(ColumnVector oneLongColumn, boolean[] isNull, + long currentBatchRow, Vec h2oFrame, Long startRowIndex) { + long[] oneColumn= ((LongColumnVector) oneLongColumn).vector; + long frameRowIndex = startRowIndex; + + for (int rowIndex = 0; rowIndex < currentBatchRow; rowIndex++) { + if (isNull[rowIndex]) + assertEquals("Na is found: ", true, h2oFrame.isNA(frameRowIndex)); + else { + if (h2oFrame.isNA(frameRowIndex)) + continue; + else + assertEquals("Numerical elements should equal: ", oneColumn[rowIndex], h2oFrame.at8(frameRowIndex)); + } + + frameRowIndex++; + } + } + + private long 
correctTimeStamp(long daysSinceEpoch) { + long timestamp = (daysSinceEpoch*DAY_TO_MS+ADD_OFFSET); + + DateTime date = new DateTime(timestamp); + + int hour = date.hourOfDay().get(); + + if (hour == 0) + return timestamp; + else + return (timestamp-hour*HOUR_OFFSET); + } +} \ No newline at end of file diff --git a/h2o-parsers/h2o-orc-parser/testMultiNode.sh b/h2o-parsers/h2o-orc-parser/testMultiNode.sh new file mode 100755 index 000000000000..093991e0d23d --- /dev/null +++ b/h2o-parsers/h2o-orc-parser/testMultiNode.sh @@ -0,0 +1,124 @@ +#!/bin/bash + +# Argument parsing +if [ "$1" = "jacoco" ] +then + JACOCO_ENABLED=true +else + JACOCO_ENABLED=false +fi + +# Clean out any old sandbox, make a new one +OUTDIR=sandbox +rm -fr $OUTDIR; mkdir -p $OUTDIR + +# Check for os +SEP=: +case "`uname`" in + CYGWIN* ) + SEP=";" + ;; +esac + +function cleanup () { + kill -9 ${PID_1} ${PID_2} ${PID_3} ${PID_4} 1> /dev/null 2>&1 + wait 1> /dev/null 2>&1 + RC=`cat $OUTDIR/status.0` + if [ $RC -ne 0 ]; then + cat $OUTDIR/out.0 + echo h2o-orc-parser junit tests FAILED + else + echo h2o-orc-parser junit tests PASSED + fi + exit $RC +} + +trap cleanup SIGTERM SIGINT + +# Find java command +if [ -z "$TEST_JAVA_HOME" ]; then + # Use default + JAVA_CMD="java" +else + # Use test java home + JAVA_CMD="$TEST_JAVA_HOME/bin/java" + # Increase XMX since JAVA_HOME can point to java6 + JAVA6_REGEXP=".*1\.6.*" + if [[ $TEST_JAVA_HOME =~ $JAVA6_REGEXP ]]; then + JAVA_CMD="${JAVA_CMD}" + fi +fi + +MAX_MEM="-Xmx3g" + +# Check if coverage should be run +if [ $JACOCO_ENABLED = true ] +then + AGENT="../../jacoco/jacocoagent.jar" + COVERAGE="-javaagent:$AGENT=destfile=build/jacoco/h2o-parser_orc.exec" + MAX_MEM="-Xmx3g" +else + COVERAGE="" +fi + +# Gradle puts files: +# build/classes/main - Main h2o core classes +# build/classes/test - Test h2o core classes +# build/resources/main - Main resources (e.g. page.html) +JVM="nice $JAVA_CMD -ea $COVERAGE $MAX_MEM -Xms3g -cp build/libs/h2o-orc-parser-test.jar${SEP}build/libs/h2o-orc-parser.jar${SEP}../../h2o-core/build/libs/h2o-core-test.jar${SEP}../../h2o-core/build/libs/h2o-core.jar${SEP}../../h2o-genmodel/build/libs/h2o-genmodel.jar${SEP}../../lib/*" + +echo "$JVM" > $OUTDIR/jvm_cmd.txt +# Ahhh... but the makefile runs the tests skipping the jar'ing step when possible. +# Also, sometimes see test files in the main-class directory, so put the test +# classpath before the main classpath. +#JVM="nice java -ea -cp build/classes/test${SEP}build/classes/main${SEP}../h2o-core/build/classes/test${SEP}../h2o-core/build/classes/main${SEP}../lib/*" + +# Tests +# Must run first, before the cloud locks (because it tests cloud locking) +JUNIT_TESTS_BOOT="" +JUNIT_TESTS_BIG="" + +# Runner +# Default JUnit runner is org.junit.runner.JUnitCore +JUNIT_RUNNER="water.junit.H2OTestRunner" + +# find all java in the src/test directory +# Cut the "./water/MRThrow.java" down to "water/MRThrow.java" +# Cut the "water/MRThrow.java" down to "water/MRThrow" +# Slash/dot "water/MRThrow" becomes "water.MRThrow" + +# On this h2o-algos testMultiNode.sh only, force the tests.txt to be in the same order for all machines. +# If sorted, the result of the cd/grep varies by machine. +# If randomness is desired, replace sort with the unix 'shuf' +# Use /usr/bin/sort because of cygwin on windows. +# Windows has sort.exe which you don't want. Fails? (is it a lineend issue) +(cd src/test/java; /usr/bin/find . 
-name '*.java' | cut -c3- | sed 's/.....$//' | sed -e 's/\//./g') | grep -v $JUNIT_TESTS_BOOT | grep -v $JUNIT_TESTS_BIG | /usr/bin/sort > $OUTDIR/tests.txt + +# Output the comma-separated list of ignored/dooonly tests +# Ignored tests trump do-only tests +echo $IGNORE > $OUTDIR/tests.ignore.txt +echo $DOONLY > $OUTDIR/tests.doonly.txt + +# Launch 4 helper JVMs. All output redir'd at the OS level to sandbox files. +CLUSTER_NAME=junit_cluster_$$ +CLUSTER_BASEPORT=44000 +$JVM water.H2O -name $CLUSTER_NAME -baseport $CLUSTER_BASEPORT -ga_opt_out 1> $OUTDIR/out.1 2>&1 & PID_1=$! +$JVM water.H2O -name $CLUSTER_NAME -baseport $CLUSTER_BASEPORT -ga_opt_out 1> $OUTDIR/out.2 2>&1 & PID_2=$! +$JVM water.H2O -name $CLUSTER_NAME -baseport $CLUSTER_BASEPORT -ga_opt_out 1> $OUTDIR/out.3 2>&1 & PID_3=$! +$JVM water.H2O -name $CLUSTER_NAME -baseport $CLUSTER_BASEPORT -ga_opt_out 1> $OUTDIR/out.4 2>&1 & PID_4=$! + +# If coverage is being run, then pass a system variable flag so that timeout limits are increased. +if [ $JACOCO_ENABLED = true ] +then + JACOCO_FLAG="-Dtest.jacocoEnabled=true" +else + JACOCO_FLAG="" +fi + +# Launch last driver JVM. All output redir'd at the OS level to sandbox files. +echo Running h2o-orc-parser junit tests... +($JVM -Ddoonly.tests=$DOONLY -Dbuild.id=$BUILD_ID -Dignore.tests=$IGNORE -Djob.name=$JOB_NAME -Dgit.commit=$GIT_COMMIT -Dgit.branch=$GIT_BRANCH -Dai.h2o.name=$CLUSTER_NAME -Dai.h2o.baseport=$CLUSTER_BASEPORT -Dai.h2o.ga_opt_out=yes $JACOCO_FLAG $JUNIT_RUNNER `cat $OUTDIR/tests.txt` 2>&1 ; echo $? > $OUTDIR/status.0) 1> $OUTDIR/out.0 2>&1 + +grep EXECUTION $OUTDIR/out.0 | sed -e "s/.*TEST \(.*\) EXECUTION TIME: \(.*\) (Wall.*/\2 \1/" | sort -gr | head -n 10 >> $OUTDIR/out.0 + +cleanup \ No newline at end of file diff --git a/h2o-persist-hdfs/build.gradle b/h2o-persist-hdfs/build.gradle index ef0cb0588aae..80787759147e 100644 --- a/h2o-persist-hdfs/build.gradle +++ b/h2o-persist-hdfs/build.gradle @@ -1,9 +1,13 @@ + description = "H2O Persist HDFS" dependencies { - compile project(":h2o-core") - compile('net.java.dev.jets3t:jets3t:0.6.1') - compile("org.apache.hadoop:hadoop-client:2.0.0-cdh4.3.0") { - transitive = true - } -} + compile project(":h2o-core") + compile('net.java.dev.jets3t:jets3t:0.6.1') + def hadoopVersion = project.hasProperty("doIncludeOrc") && project.doIncludeOrc == "true" ? + orcDefaultHadoopClientVersion : defaultHadoopClientVersion + compile("org.apache.hadoop:hadoop-client:$hadoopVersion") { + // Pull all dependencies to allow run directly from IDE or command line + transitive = true + } +} \ No newline at end of file diff --git a/h2o-persist-hdfs/src/main/java/water/persist/PersistHdfs.java b/h2o-persist-hdfs/src/main/java/water/persist/PersistHdfs.java index 9014ed46ee09..3f1166f99d32 100644 --- a/h2o-persist-hdfs/src/main/java/water/persist/PersistHdfs.java +++ b/h2o-persist-hdfs/src/main/java/water/persist/PersistHdfs.java @@ -12,6 +12,7 @@ import java.io.OutputStream; import java.net.SocketTimeoutException; import java.net.URI; +import java.net.URISyntaxException; import java.util.ArrayList; import java.util.Arrays; import java.util.concurrent.Callable; @@ -29,6 +30,8 @@ import water.util.FileUtils; import water.util.Log; +import static water.fvec.FileVec.getPathForKey; + /** * HDFS persistence layer. */ @@ -38,12 +41,6 @@ public final class PersistHdfs extends Persist { /** Root path of HDFS */ private final Path _iceRoot; - // Returns String with path for given key. 
- private static String getPathForKey(Key k) { - final int off = k._kb[0]==Key.CHK ? Vec.KEY_PREFIX_LEN : 0; - return new String(k._kb,off,k._kb.length-off); - } - // Global HDFS initialization // FIXME: do not share it via classes, but initialize it by object static { @@ -145,13 +142,13 @@ public PersistHdfs(URI uri) { long end, start = System.currentTimeMillis(); final byte[] b = MemoryManager.malloc1(v._max); Key k = v._key; + long skip = k.isChunkKey() ? water.fvec.NFSFileVec.chunkOffset(k) : 0; final Path p = _iceRoot == null?new Path(getPathForKey(k)):new Path(_iceRoot, getIceName(v)); final long skip_ = skip; run(new Callable() { @Override public Object call() throws Exception { FileSystem fs = FileSystem.get(p.toUri(), CONF); - FSDataInputStream s = null; try { // fs.getDefaultBlockSize(p); @@ -324,6 +321,17 @@ public Key uriToKey(URI uri) throws IOException { return HDFSFileVec.make(fstatus[0].getPath().toString(), fstatus[0].getLen()); } + public static FileSystem getFS(String path) throws IOException { + try { + return getFS(new URI(path)); + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } + } + public static FileSystem getFS(URI uri) throws IOException { + return FileSystem.get(uri, PersistHdfs.CONF); + } + // Is there a bucket name without a trailing "/" ? private boolean isBareS3NBucketWithoutTrailingSlash(String s) { String s2 = s.toLowerCase(); diff --git a/h2o-py/h2o/expr.py b/h2o-py/h2o/expr.py index b512a3752edb..206fc1c22a54 100644 --- a/h2o-py/h2o/expr.py +++ b/h2o-py/h2o/expr.py @@ -321,7 +321,8 @@ def _fill_data(self, json): # token NaN, so the default python json decoder does not convert them # to math.nan. Do that now. else: - c['data'] = [float('nan') if x == "NaN" else x for x in c['data']] + if c['data'] and (len(c['data']) > 0): # orc file parse can return frame with zero rows + c['data'] = [float('nan') if x == "NaN" else x for x in c['data']] self._data[c.pop('label')] = c # Label used as the Key return self diff --git a/h2o-py/tests/pyunit_utils/utilsPY.py b/h2o-py/tests/pyunit_utils/utilsPY.py index 4e720ee21a7f..469512d827db 100644 --- a/h2o-py/tests/pyunit_utils/utilsPY.py +++ b/h2o-py/tests/pyunit_utils/utilsPY.py @@ -4,6 +4,12 @@ from builtins import range from past.builtins import basestring import sys, os + +try: # works with python 2.7 not 3 + from StringIO import StringIO +except: # works with python 3 + from io import StringIO + sys.path.insert(1, "../../") import h2o import imp @@ -2522,3 +2528,244 @@ def write_hyper_parameters_json(dir1, dir2, json_filename, hyper_parameters): # save hyper-parameter file in sandbox with open(os.path.join(dir2, json_filename), 'w') as test_file: json.dump(hyper_parameters, test_file) + + +def compare_frames(frame1, frame2, numElements, tol_time=0, tol_numeric=0, strict=False, compare_NA=True): + """ + This function will compare two H2O frames to make sure their dimension, and values in all cells are the same. + It will not compare the column names though. + + :param frame1: H2O frame to be compared + :param frame2: H2O frame to be compared + :param numElements: integer to denote number of rows to compare. Done to reduce compare time. + Set to 0 or negative number if you want to compare all elements. + :param tol_time: optional parameter to limit time value difference. + :param tol_numerica: optional parameter to limit numeric value difference. + :param strict: optional parameter to enforce strict comparison or not. If True, column type must + match in order to pass the test. 
+ :param compare_NA: optional parameter to compare NA or not. For csv file generated from orc file, the + NAs are represented as some other symbol but our CSV will not be able to parse it correctly as NA. + In this case, do not compare the number of NAs. + :return: boolean: True, the two frames are equal and False otherwise. + """ + + # check frame dimensions + rows1, cols1 = frame1.dim + rows2, cols2 = frame2.dim + + assert rows1 == rows2 and cols1 == cols2, "failed dim check! frame 1 rows:{0} frame 2 rows:{1} frame 1 cols:{2} " \ + "frame2 cols:{3}".format(rows1, rows2, cols1, cols2) + + na_frame1 = frame1.isna().sum() + na_frame2 = frame2.isna().sum() + + if compare_NA: # check number of missing values + assert na_frame1 == na_frame2, "failed numbers of NA check! Frame 1 NA number: {0}, frame 2 " \ + "NA number: {1}".format(na_frame1, na_frame2) + + # check column types are the same before proceeding to check each row content. + for col_ind in range(cols1): + + c1_key = frame1.columns[col_ind] + c2_key = frame2.columns[col_ind] + c2_type = frame2.types[c2_key] + c1_type = frame1.types[c1_key] + + print("###### Comparing column: {0} and column type is {1}.".format(col_ind, c1_type)) + + if strict: # every column type must match + assert c1_type == c2_type, "failed column type check! frame1 col type: {0}, frame2 col type: " \ + "{1}".format(c1_type, c2_type) + else: + if str(c2_type) == 'enum': # orc files do not have enum column type. We convert it here + frame1[col_ind].asfactor() + else: + assert c1_type == c2_type, "failed column type check! frame1 col type: {0}, frame2 col type: " \ + "{1}".format(c1_type, c2_type) + # compare string + if (str(c1_type) == 'string') or (str(c1_type) == 'enum'): + compareOneStringColumn(frame1, frame2, col_ind, rows1, numElements) + else: + if str(c2_type) == 'time': # compare time columns + compareOneNumericColumn(frame1, frame2, col_ind, rows1, tol_time, numElements) + else: + compareOneNumericColumn(frame1, frame2, col_ind, rows1, tol_numeric, numElements) + return True + + +def compareOneStringColumn(frame1, frame2, col_ind, rows, numElements): + """ + This function will compare two String columns of two H2O frames to make sure that they are the same. + + :param frame1: H2O frame to be compared + :param frame2: H2O frame to be compared + :param col_ind: integer denoting column index to compare the two frames + :param rows: integer denoting number of rows in the column + :param numElements: integer to denote number of rows to compare. Done to reduce compare time + :return: None. Will throw exceptions if comparison failed. + """ + + row_indices = list(range(rows)) + if numElements > 0: + random.shuffle(row_indices) + else: + numElements = rows + + for ele_ind in range(numElements): + row_ind = row_indices[ele_ind] + + val1 = frame1[row_ind, col_ind] + val2 = frame2[row_ind, col_ind] + + assert val1 == val2, "failed frame values check! frame1 value: {0}, frame2 value: {1} at row {2}, column " \ + "{3}".format(val1, val2, row_ind, col_ind) + + +def compareOneNumericColumn(frame1, frame2, col_ind, rows, tolerance, numElements): + """ + This function compares two numeric columns of two H2O frames to make sure that they are close. + + :param frame1: H2O frame to be compared + :param frame2: H2O frame to be compared + :param col_ind: integer denoting column index to compare the two frames + :param rows: integer denoting number of rows in the column + :param tolerance: double parameter to limit numerical value difference. 
+ :param numElements: integer to denote number of rows to compare. Done to reduce compare time. + :return: None. Will throw exceptions if comparison failed. + """ + + row_indices = [] + if numElements > 0: + row_indices = random.sample(xrange(rows),numElements) + else: + numElements = rows # Compare all elements + list(range(rows)) + + for ele_ind in range(numElements): + row_ind = row_indices[ele_ind] + + val1 = frame1[row_ind, col_ind] + val2 = frame2[row_ind, col_ind] + + if not(math.isnan(val1)) and not(math.isnan(val2)): # both frames contain valid elements + diff = abs(val1-val2) + assert diff <= tolerance, "failed frame values check! frame1 value = {0}, frame2 value = {1}, " \ + "at row {2}, column {3}. The difference is {4}.".format(val1, val2, row_ind, + col_ind, diff) + elif math.isnan(val1) and math.isnan(val2): # both frame contains missing values + continue + else: # something is wrong, one frame got a missing value while the other is fine. + assert 1 == 2, "failed frame values check! frame1 value {0}, frame2 value {1} at row {2}, " \ + "column {3}".format(val1, val2, row_ind, col_ind) + +import warnings + +def expect_warnings(filewithpath, warn_phrase="warn", warn_string_of_interest="warn", number_of_times=1): + """ + This function will execute a command to run and analyze the print outs of + running the command. The goal here is to capture any warnings that we may expect + out of running those commands. + + :param filewithpath: name of file to be parsed with path + :param warn_phrase: capture the warning header, sometimes it is warn or userwarn. + :param warn_string_of_interest: specific warning message string + :param number_of_times: number of warning lines we are expecting. + :return: True if warning was found and False otherwise + """ + + number_warngings = 0 + + buffer = StringIO() # redirect warning messages to string buffer for later analysis + sys.stderr = buffer + + frame = h2o.import_file(path=locate(filewithpath)) + + sys.stderr = sys.__stderr__ # redirect it back to stdout. + try: # for python 2.7 + if len(buffer.buflist) > 0: + for index in range(len(buffer.buflist)): + if (warn_phrase in buffer.buflist[index]) and (warn_string_of_interest in buffer.buflist[index]): + number_warngings = number_warngings+1 + except: # for python 3. + warns = buffer.getvalue() + + if (warn_phrase in warns) and (warn_string_of_interest in warns): + number_warngings = number_warngings+1 + + number_of_times = 1 + + if number_warngings >= number_of_times: + return True + else: + return False + + +def compare_frame_summary(frame1_summary, frame2_summary, compareNames=False, compareTypes=False): + """ + This method is written to compare the frame summary between two frames. + + :param frame1_summary: + :param frame2_summary: + :param compareNames: + :param compareTypes: + :return: + """ + + frame1_column_number = len(frame1_summary) + frame2_column_number = len(frame2_summary) + + assert frame1_column_number == frame2_column_number, "failed column number check! 
+def compare_frame_summary(frame1_summary, frame2_summary, compareNames=False, compareTypes=False):
+    """
+    This method compares the column summaries of two frames.
+
+    :param frame1_summary: column summary of the first frame, as returned by h2o.frame(frame_id)["frames"][0]["columns"]
+    :param frame2_summary: column summary of the second frame, in the same format
+    :param compareNames: if True, compare the column labels as well
+    :param compareTypes: if True, compare the column types as well
+    :return: None.  Will throw exceptions if the comparison failed.
+    """
+
+    frame1_column_number = len(frame1_summary)
+    frame2_column_number = len(frame2_summary)
+
+    assert frame1_column_number == frame2_column_number, "failed column number check!  Frame 1 column number: {0}," \
+                                                         "frame 2 column number: {1}".format(frame1_column_number,
+                                                                                             frame2_column_number)
+
+    for col_index in range(frame1_column_number):  # check summary for each column
+        for key_val in list(frame1_summary[col_index]):
+
+            if not(compareNames) and (str(key_val) == 'label'):
+                continue
+
+            if not(compareTypes) and (str(key_val) == 'type'):
+                continue
+
+            if str(key_val) == 'precision':     # skip comparing precision
+                continue
+
+            val1 = frame1_summary[col_index][key_val]
+            val2 = frame2_summary[col_index][key_val]
+
+            if isinstance(val1, list) or isinstance(val1, dict):
+                if isinstance(val1, dict):
+                    assert val1 == val2, "failed column summary comparison for column {0} and summary " \
+                                         "type {1}, frame 1 value is {2}, frame 2 value is " \
+                                         "{3}".format(col_index, str(key_val), val1, val2)
+                else:
+                    if len(val1) > 0:
+                        # find out whether the list elements are floats
+                        float_found = False
+
+                        for ind in range(len(val1)):
+                            if isinstance(val1[ind], float):
+                                float_found = True
+                                break
+
+                        if float_found:
+                            for ind in range(len(val1)):
+                                if str(val1[ind]) != 'NaN':
+                                    assert abs(val1[ind]-val2[ind]) < 1e-5, "failed column summary comparison for " \
+                                                                            "column {0} and summary type {1}, frame 1" \
+                                                                            " value is {2}, frame 2 value is " \
+                                                                            "{3}".format(col_index, str(key_val),
+                                                                                         val1[ind], val2[ind])
+                        else:
+                            assert val1 == val2, "failed column summary comparison for column {0} and summary" \
+                                                 " type {1}, frame 1 value is {2}, frame 2 value is " \
+                                                 "{3}".format(col_index, str(key_val), val1, val2)
+            else:
+                if isinstance(val1, float):
+                    assert abs(val1-val2) < 1e-5, "failed column summary comparison for column {0} and summary type " \
+                                                  "{1}, frame 1 value is {2}, frame 2 value is " \
+                                                  "{3}".format(col_index, str(key_val), val1, val2)
+                else:
+                    assert val1 == val2, "failed column summary comparison for column {0} and summary type " \
+                                         "{1}, frame 1 value is {2}, frame 2 value is " \
+                                         "{3}".format(col_index, str(key_val), val1, val2)
\ No newline at end of file
diff --git a/h2o-py/tests/testdir_hdfs/index.list b/h2o-py/tests/testdir_hdfs/index.list
index 7e34bea48c88..f3be85484759 100644
--- a/h2o-py/tests/testdir_hdfs/index.list
+++ b/h2o-py/tests/testdir_hdfs/index.list
@@ -1,3 +1,10 @@
 pyunit_INTERNAL_HDFS_basic.py
 pyunit_INTERNAL_HDFS_import_export.py
+pyunit_INTERNAL_HDFS_airlines_orc.py
+pyunit_INTERNAL_HDFS_hexdev_29_import_types_orc.py
+pyunit_INTERNAL_HDFS_iris_import_types_orc.py
+pyunit_INTERNAL_HDFS_milsongs_orc_large.py
+pyunit_INTERNAL_HDFS_orc_parser.py
+pyunit_INTERNAL_HDFS_prostate_orc.py
+pyunit_INTERNAL_HDFS_timestamp_date_orc.py
diff --git a/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_airlines_orc.py b/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_airlines_orc.py
new file mode 100644
index 000000000000..0f004f074dfc
--- /dev/null
+++ b/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_airlines_orc.py
@@ -0,0 +1,81 @@
+from __future__ import print_function
+import sys
+sys.path.insert(1,"../../")
+import h2o
+import time
+from tests import pyunit_utils
+#----------------------------------------------------------------------
+# Purpose:  This test exercises the orc parser in HDFS with data files of
+# significant size split across multiple files.  Basically, we are testing
+# our multi-file parsing of Orc with big data sets.  This test is
+# copied over from Nidhi's R unit test.
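+# Note: the csv copy of this data set comes from Hive, which writes missing values
+# as '\N' by default; that is why the csv import below passes na_strings=['\\N'],
+# while the orc import needs no such hint.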
+#---------------------------------------------------------------------- + + +def hdfs_orc_parser(): + + # Check if we are running inside the H2O network by seeing if we can touch + # the namenode. + hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible() + + if hadoop_namenode_is_accessible: + numElements2Compare = 10 + tol_time = 200 + tol_numeric = 1e-5 + + hdfs_name_node = pyunit_utils.hadoop_namenode() + hdfs_orc_file = "/datasets/airlines_all_orc_parts" + hdfs_csv_file = "/datasets/air_csv_part" + + col_types = ['real', 'real', 'real', 'real', 'real', 'real', 'real', 'real', 'enum', 'real', 'enum', 'real', + 'real', 'enum', 'real', 'real', 'enum', 'enum', 'real', 'enum', 'enum', 'real', 'real', 'real', + 'enum', 'enum', 'enum', 'enum', 'enum', 'enum', 'enum'] + + # import CSV file + print("Import airlines 116M dataset in original csv format from HDFS") + url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file) + + startcsv = time.time() + multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'], col_types=col_types) + endcsv = time.time() + + startcsv1 = time.time() + multi_file_csv1 = h2o.import_file(url_csv) + endcsv1 = time.time() + h2o.remove(multi_file_csv1) + + multi_file_csv.summary() + csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"] + + # import ORC file with same column types as CSV file + print("Import airlines 116M dataset in ORC format from HDFS") + url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file) + + startorc1 = time.time() + multi_file_orc1 = h2o.import_file(url_orc) + endorc1 = time.time() + h2o.remove(multi_file_orc1) + + startorc = time.time() + multi_file_orc = h2o.import_file(url_orc, col_types=col_types) + endorc = time.time() + + multi_file_orc.summary() + orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"] + + print("************** CSV (without column type forcing) parse time is {0}".format(endcsv1-startcsv1)) + print("************** CSV (with column type forcing) parse time is {0}".format(endcsv-startcsv)) + print("************** ORC (without column type forcing) parse time is {0}".format(endorc1-startorc1)) + print("************** ORC (with column type forcing) parse time is {0}".format(endorc-startorc)) + + # compare frame read by orc by forcing column type, + pyunit_utils.compare_frame_summary(csv_summary, orc_summary) + + else: + raise EnvironmentError + + +if __name__ == "__main__": + pyunit_utils.standalone_test(hdfs_orc_parser) +else: + hdfs_orc_parser() \ No newline at end of file diff --git a/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_baddata_orc.py b/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_baddata_orc.py new file mode 100644 index 000000000000..178e402cbcfe --- /dev/null +++ b/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_baddata_orc.py @@ -0,0 +1,44 @@ +from __future__ import print_function +import sys +sys.path.insert(1,"../../") +import h2o +import time +from tests import pyunit_utils +#---------------------------------------------------------------------- +# This test is used to verify if the orc parser warnings from backend is +# passed down to python client when parsing orc files with unsupported +# data types or bad data value. +#---------------------------------------------------------------------- + +def hdfs_orc_parser(): + + # Check if we are running inside the H2O network by seeing if we can touch + # the namenode. 
+ hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible() + + if hadoop_namenode_is_accessible: + hdfs_name_node = pyunit_utils.hadoop_namenode() + + hdfs_orc_file = "/datasets/orc_parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc" + url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file) + assert pyunit_utils.expect_warnings(url_orc, "UserWarning:", "Skipping field:", 1),\ + "Expect warnings from orc parser for file "+url_orc+"!" + + hdfs_orc_file = "/datasets/orc_parser/orc/TestOrcFile.emptyFile.orc" + url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file) + assert pyunit_utils.expect_warnings(url_orc, "UserWarning:", "Skipping field:", 1), \ + "Expect warnings from orc parser for file "+url_orc+"!" + + hdfs_orc_file = "/datasets/orc_parser/orc/nulls-at-end-snappy.orc" + url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file) + assert pyunit_utils.expect_warnings(url_orc, "UserWarning:", "Skipping field:", 1), \ + "Expect warnings from orc parser for file "+url_orc+"!" + + else: + raise EnvironmentError + + +if __name__ == "__main__": + pyunit_utils.standalone_test(hdfs_orc_parser) +else: + hdfs_orc_parser() \ No newline at end of file diff --git a/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_hexdev_29_import_types_orc.py b/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_hexdev_29_import_types_orc.py new file mode 100644 index 000000000000..ed3d8c301e7f --- /dev/null +++ b/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_hexdev_29_import_types_orc.py @@ -0,0 +1,45 @@ +from __future__ import print_function +import sys +sys.path.insert(1,"../../") +import h2o +import time +from tests import pyunit_utils +#---------------------------------------------------------------------- +# Verifying that Python can define features as categorical or continuous +# on import in HDFS. +#---------------------------------------------------------------------- + + +def hdfs_orc_parser(): + + # Check if we are running inside the H2O network by seeing if we can touch + # the namenode. + hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible() + + if hadoop_namenode_is_accessible: + hdfs_name_node = pyunit_utils.hadoop_namenode() + hdfs_orc_file = "/datasets/orc_parser/orc/hexdev_29.orc" + url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file) + hdfs_csv_file = "/datasets/orc_parser/csv/hexdev_29.csv" + url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file) + + numElements2Compare = 0 + tol_time = 200 + tol_numeric = 1e-5 + + ctypes = ["enum"]*3 + h2oframe_csv = h2o.import_file(url_csv, col_types=ctypes) + h2oframe_orc = h2o.import_file(url_orc, col_types=ctypes) + + # compare the two frames + assert pyunit_utils.compare_frames(h2oframe_orc, h2oframe_csv, numElements2Compare, tol_time, tol_numeric, + True), "H2O frame parsed from orc and csv files are different!" 
+ + else: + raise EnvironmentError + + +if __name__ == "__main__": + pyunit_utils.standalone_test(hdfs_orc_parser) +else: + hdfs_orc_parser() \ No newline at end of file diff --git a/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_import_folder_airline_05_orc_large.py b/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_import_folder_airline_05_orc_large.py new file mode 100644 index 000000000000..a1b87c1fc4aa --- /dev/null +++ b/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_import_folder_airline_05_orc_large.py @@ -0,0 +1,73 @@ +from __future__ import print_function +import sys +sys.path.insert(1,"../../") +import h2o +import time +from tests import pyunit_utils +#---------------------------------------------------------------------- +# This test will build a H2O frame from importing the bigdata/laptop/parser/orc/airlines_05p_orc_csv +# from and build another H2O frame from the multi-file orc parser using multiple orc files that are +# saved in the directory bigdata/laptop/parser/orc/airlines_05p_orc. It will compare the two frames +# to make sure they are equal. +#---------------------------------------------------------------------- + + +def hdfs_orc_parser(): + + # Check if we are running inside the H2O network by seeing if we can touch + # the namenode. + hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible() + + if hadoop_namenode_is_accessible: + hdfs_name_node = pyunit_utils.hadoop_namenode() + + hdfs_orc_file = "/datasets/orc_parser/air05_orc" + url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file) + hdfs_csv_file = "/datasets/orc_parser/air05_csv" + url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file) + + startcsv = time.time() + multi_file_csv = h2o.import_file(hdfs_csv_file, na_strings=['\\N']) + endcsv = time.time() + + csv_type_dict = multi_file_csv.types + + multi_file_csv.summary() + csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"] + + col_ind_name = dict() + # change column types from real to enum according to multi_file_csv column types + for key_name in list(csv_type_dict): + col_ind = key_name.split('C') + new_ind = int(str(col_ind[1]))-1 + col_ind_name[new_ind] = key_name + + col_types = [] + for ind in range(len(col_ind_name)): + col_types.append(csv_type_dict[col_ind_name[ind]]) + + startorc1 = time.time() + multi_file_orc1 = h2o.import_file(url_orc) + endorc1 = time.time() + h2o.remove(multi_file_orc1) + + startorc = time.time() + multi_file_orc = h2o.import_file(url_orc,col_types=col_types) + endorc = time.time() + + multi_file_orc.summary() + orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"] + + print("************** CSV parse time is {0}".format(endcsv-startcsv)) + print("************** ORC (without column type forcing) parse time is {0}".format(endorc1-startorc1)) + print("************** ORC (with column type forcing) parse time is {0}".format(endorc-startorc)) + # compare frame read by orc by forcing column type, + pyunit_utils.compare_frame_summary(csv_summary, orc_summary) + else: + raise EnvironmentError + + +if __name__ == "__main__": + pyunit_utils.standalone_test(hdfs_orc_parser) +else: + hdfs_orc_parser() \ No newline at end of file diff --git a/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_import_folder_orc.py b/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_import_folder_orc.py new file mode 100644 index 000000000000..c679aa2ccb82 --- /dev/null +++ b/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_import_folder_orc.py @@ -0,0 +1,46 @@ +from __future__ import 
print_function +import sys +sys.path.insert(1,"../../") +import h2o +import time +from tests import pyunit_utils +#---------------------------------------------------------------------- +# test that h2o.import_file works on a directory of files! +#---------------------------------------------------------------------- + + +def hdfs_orc_parser(): + + # Check if we are running inside the H2O network by seeing if we can touch + # the namenode. + hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible() + + if hadoop_namenode_is_accessible: + hdfs_name_node = pyunit_utils.hadoop_namenode() + + tol_time = 200 # comparing in ms or ns + tol_numeric = 1e-5 # tolerance for comparing other numeric fields + numElements2Compare = 0 # choose number of elements per column to compare. Save test time. + + hdfs_csv_file = "/datasets/orc_parser/synthetic_perfect_separation_csv" + hdfs_orc_file = "/datasets/orc_parser/synthetic_perfect_separation_orc" + + url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file) + url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file) + + + multi_file_csv = h2o.import_file(url_csv) + multi_file_orc = h2o.import_file(url_orc) + + # make sure orc multi-file and single big file create same H2O frame + assert pyunit_utils.compare_frames(multi_file_orc , multi_file_csv, numElements2Compare, tol_time, + tol_numeric,True), "H2O frame parsed from multiple orc and single orc " \ + "files are different!" + else: + raise EnvironmentError + + +if __name__ == "__main__": + pyunit_utils.standalone_test(hdfs_orc_parser) +else: + hdfs_orc_parser() \ No newline at end of file diff --git a/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_iris_import_types_orc.py b/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_iris_import_types_orc.py new file mode 100644 index 000000000000..1196ebb34881 --- /dev/null +++ b/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_iris_import_types_orc.py @@ -0,0 +1,44 @@ +from __future__ import print_function +import sys +sys.path.insert(1,"../../") +import h2o +import time +from tests import pyunit_utils +#---------------------------------------------------------------------- +## Verifying that a user can change a column type to Enum if they like. +#---------------------------------------------------------------------- + + +def hdfs_orc_parser(): + + # Check if we are running inside the H2O network by seeing if we can touch + # the namenode. + hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible() + + if hadoop_namenode_is_accessible: + hdfs_name_node = pyunit_utils.hadoop_namenode() + + numElements2Compare = 100 + tol_time = 200 + tol_numeric = 1e-5 + + hdfs_orc_file = "/datasets/orc_parser/orc/iris.orc" + url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file) + hdfs_csv_file = "/datasets/orc_parser/csv/iris.csv" + url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file) + + h2oframe_csv = h2o.import_file(url_csv) + data_types = ['real', 'real', 'real', 'real', 'enum'] + h2oframe_orc = h2o.import_file(url_orc, col_types = data_types) + + # compare the two frames + assert pyunit_utils.compare_frames(h2oframe_orc, h2oframe_csv, numElements2Compare, tol_time, tol_numeric, + True), "H2O frame parsed from orc and csv files are different!" 
+ else: + raise EnvironmentError + + +if __name__ == "__main__": + pyunit_utils.standalone_test(hdfs_orc_parser) +else: + hdfs_orc_parser() \ No newline at end of file diff --git a/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_milsongs_orc_large.py b/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_milsongs_orc_large.py new file mode 100644 index 000000000000..012bf88a43fe --- /dev/null +++ b/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_milsongs_orc_large.py @@ -0,0 +1,45 @@ +from __future__ import print_function +import sys +sys.path.insert(1,"../../") +import h2o +import time +from tests import pyunit_utils +#---------------------------------------------------------------------- +# This test will build a H2O frame from importing the bigdata/laptop/parser/orc/milsongs_orc_csv +# from and build another H2O frame from the multi-file orc parser using multiple orc files that are +# saved in the directory bigdata/laptop/parser/orc/milsongs_orc. It will compare the two frames +# to make sure they are equal. +#---------------------------------------------------------------------- + + +def hdfs_orc_parser(): + + # Check if we are running inside the H2O network by seeing if we can touch + # the namenode. + hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible() + + if hadoop_namenode_is_accessible: + hdfs_name_node = pyunit_utils.hadoop_namenode() + hdfs_orc_file = "/datasets/orc_parser/milsongs_orc" + url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file) + hdfs_csv_file = "/datasets/orc_parser/milsongs_csv" + url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file) + + multi_file_csv = h2o.import_file(url_csv) + multi_file_orc = h2o.import_file(url_orc) + + multi_file_csv.summary() + csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"] + + multi_file_orc.summary() + orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"] + + pyunit_utils.compare_frame_summary(csv_summary, orc_summary) + else: + raise EnvironmentError + + +if __name__ == "__main__": + pyunit_utils.standalone_test(hdfs_orc_parser) +else: + hdfs_orc_parser() \ No newline at end of file diff --git a/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_orc_parser.py b/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_orc_parser.py new file mode 100644 index 000000000000..41637dbaeb77 --- /dev/null +++ b/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_orc_parser.py @@ -0,0 +1,60 @@ +from __future__ import print_function +import sys +sys.path.insert(1,"../../") +import h2o +from tests import pyunit_utils +#---------------------------------------------------------------------- +# Purpose: This test will test orc-parser in HDFS parsing multiple +# orc files collected by Tom K. +#---------------------------------------------------------------------- + + +def hdfs_orc_parser(): + + # Check if we are running inside the H2O network by seeing if we can touch + # the namenode. 
+ hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible() + + if hadoop_namenode_is_accessible: + numElements2Compare = 10 + tol_time = 200 + tol_numeric = 1e-5 + + hdfs_name_node = pyunit_utils.hadoop_namenode() + + allOrcFiles = ["/datasets/orc_parser/orc/TestOrcFile.columnProjection.orc", + "/datasets/orc_parser/orc/bigint_single_col.orc", + "/datasets/orc_parser/orc/TestOrcFile.emptyFile.orc", + "/datasets/orc_parser/orc/bool_single_col.orc", + "/datasets/orc_parser/orc/demo-11-zlib.orc", + "/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc", + "/datasets/orc_parser/orc/demo-12-zlib.orc", + "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc", + "/datasets/orc_parser/orc/double_single_col.orc", + "/datasets/orc_parser/orc/TestOrcFile.testMemoryManagementV11.orc", + "/datasets/orc_parser/orc/float_single_col.orc", + "/datasets/orc_parser/orc/TestOrcFile.testMemoryManagementV12.orc", + "/datasets/orc_parser/orc/int_single_col.orc", + "/datasets/orc_parser/orc/TestOrcFile.testPredicatePushdown.orc", + "/datasets/orc_parser/orc/nulls-at-end-snappy.orc", + "/datasets/orc_parser/orc/TestOrcFile.testSnappy.orc", + "/datasets/orc_parser/orc/orc_split_elim.orc", + "/datasets/orc_parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc", + "/datasets/orc_parser/orc/TestOrcFile.testStripeLevelStats.orc", + "/datasets/orc_parser/orc/smallint_single_col.orc", + "/datasets/orc_parser/orc/string_single_col.orc", + "/datasets/orc_parser/orc/tinyint_single_col.orc", + "/datasets/orc_parser/orc/TestOrcFile.testWithoutIndex.orc"] + + + for fIndex in range(len(allOrcFiles)): + url_orc = "hdfs://{0}{1}".format(hdfs_name_node, allOrcFiles[fIndex]) + tab_test = h2o.import_file(url_orc) + else: + raise EnvironmentError + + +if __name__ == "__main__": + pyunit_utils.standalone_test(hdfs_orc_parser) +else: + hdfs_orc_parser() \ No newline at end of file diff --git a/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_prostate_orc.py b/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_prostate_orc.py new file mode 100644 index 000000000000..76a1b1aac6dd --- /dev/null +++ b/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_prostate_orc.py @@ -0,0 +1,48 @@ +from __future__ import print_function +import sys +sys.path.insert(1,"../../") +import h2o +import time +from tests import pyunit_utils +#---------------------------------------------------------------------- +# To verify that the orc parser is parsing correctly, we want to take a file we know (prostate_NA.csv), convert +# it to an Orc file (prostate_NA.orc) and build two H2O frames out of them. We compare them and verified that +# they are the same. +# +# Nidhi did this manually in Hive and verified that the parsing is correct. I am automating the test here. +# +#---------------------------------------------------------------------- + + +def hdfs_orc_parser(): + + # Check if we are running inside the H2O network by seeing if we can touch + # the namenode. + hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible() + + if hadoop_namenode_is_accessible: + hdfs_name_node = pyunit_utils.hadoop_namenode() + + tol_time = 200 # comparing in ms or ns + tol_numeric = 1e-5 # tolerance for comparing other numeric fields + numElements2Compare = 10 # choose number of elements per column to compare. Save test time. 
+ + hdfs_orc_file = "/datasets/orc_parser/orc/prostate_NA.orc" + hdfs_csv_file = "/datasets/orc_parser/csv/prostate_NA.csv" + url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file) + url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file) + + h2oOrc = h2o.import_file(url_orc) + h2oCsv = h2o.import_file(url_csv) + + # compare the two frames + assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \ + "H2O frame parsed from orc and csv files are different!" + else: + raise EnvironmentError + + +if __name__ == "__main__": + pyunit_utils.standalone_test(hdfs_orc_parser) +else: + hdfs_orc_parser() \ No newline at end of file diff --git a/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_timestamp_date_orc.py b/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_timestamp_date_orc.py new file mode 100644 index 000000000000..b56b3531a890 --- /dev/null +++ b/h2o-py/tests/testdir_hdfs/pyunit_INTERNAL_HDFS_timestamp_date_orc.py @@ -0,0 +1,57 @@ +from __future__ import print_function +import sys +sys.path.insert(1,"../../") +import h2o +import time +from tests import pyunit_utils +#---------------------------------------------------------------------- +# This test will parse orc files containing timestamp and date information into +# H2O frame. Next, it will take the .csv file generated from the orc file from +# Hive and parse into H2O frame. Finally, we compare the two frames and make sure +# that they are equal. +# +# We want to make sure that we are parsing the date and timestamp +# date correctly from an orc file. Thanks to Nidhi who has imported an orc file +# containing timestamp/date into spark and later into Hive and write it out as +# csv. +# +#---------------------------------------------------------------------- + +def hdfs_orc_parser(): + + # Check if we are running inside the H2O network by seeing if we can touch + # the namenode. + hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible() + + if hadoop_namenode_is_accessible: + hdfs_name_node = pyunit_utils.hadoop_namenode() + + tol_time = 200 # comparing in ms or ns + tol_numeric = 1e-5 # tolerance for comparing other numeric fields + numElements2Compare = 100 # choose number of elements per column to compare. Save test time. + + allOrcFiles = ["/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc", + "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc", + "/datasets/orc_parser/orc/orc_split_elim.orc"] + + allCsvFiles = ["/datasets/orc_parser/csv/TestOrcFile.testDate1900.csv", + "/datasets/orc_parser/csv/TestOrcFile.testDate2038.csv", + "/datasets/orc_parser/csv/orc_split_elim.csv"] + + for fIndex in range(len(allOrcFiles)): + url_orc = "hdfs://{0}{1}".format(hdfs_name_node, allOrcFiles[fIndex]) + url_csv = "hdfs://{0}{1}".format(hdfs_name_node, allCsvFiles[fIndex]) + h2oOrc = h2o.import_file(url_orc) + h2oCsv = h2o.import_file(url_csv) + + # compare the two frames + assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \ + "H2O frame parsed from orc and csv files are different!" 
+    else:
+        raise EnvironmentError
+
+
+if __name__ == "__main__":
+    pyunit_utils.standalone_test(hdfs_orc_parser)
+else:
+    hdfs_orc_parser()
\ No newline at end of file
diff --git a/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser.py b/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser.py
new file mode 100644
index 000000000000..f43d9b15c462
--- /dev/null
+++ b/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser.py
@@ -0,0 +1,53 @@
+from builtins import str
+import sys
+sys.path.insert(1,"../../")
+import h2o
+from tests import pyunit_utils
+from random import randint
+
+"""
+This test takes all orc files collected by Tom K and tries to parse each of them into an H2O frame.
+The files cover the primitive ORC column types as well as compressed (zlib, snappy) and otherwise
+tricky files (empty file, nulls at end, stripe-level stats, and so on).  If every file parses
+successfully, the test passes; otherwise it fails.
+"""
+def orc_parser_test():
+    allOrcFiles = ["smalldata/parser/orc/TestOrcFile.columnProjection.orc",
+                   "smalldata/parser/orc/bigint_single_col.orc",
+                   "smalldata/parser/orc/TestOrcFile.emptyFile.orc",
+                   "smalldata/parser/orc/bool_single_col.orc",
+                   "smalldata/parser/orc/demo-11-zlib.orc",
+                   "smalldata/parser/orc/TestOrcFile.testDate1900.orc",
+                   "smalldata/parser/orc/demo-12-zlib.orc",
+                   "smalldata/parser/orc/TestOrcFile.testDate2038.orc",
+                   "smalldata/parser/orc/double_single_col.orc",
+                   "smalldata/parser/orc/TestOrcFile.testMemoryManagementV11.orc",
+                   "smalldata/parser/orc/float_single_col.orc",
+                   "smalldata/parser/orc/TestOrcFile.testMemoryManagementV12.orc",
+                   "smalldata/parser/orc/int_single_col.orc",
+                   "smalldata/parser/orc/TestOrcFile.testPredicatePushdown.orc",
+                   "smalldata/parser/orc/nulls-at-end-snappy.orc",
+                   "smalldata/parser/orc/TestOrcFile.testSnappy.orc",
+                   "smalldata/parser/orc/orc_split_elim.orc",
+                   "smalldata/parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc",
+                   "smalldata/parser/orc/TestOrcFile.testStripeLevelStats.orc",
+                   "smalldata/parser/orc/smallint_single_col.orc",
+                   "smalldata/parser/orc/string_single_col.orc",
+                   "smalldata/parser/orc/tinyint_single_col.orc",
+                   "smalldata/parser/orc/TestOrcFile.testWithoutIndex.orc"]
+
+    for fIndex in range(len(allOrcFiles)):
+        # simply make sure every orc file in the list can be imported without error
+        tab_test = h2o.import_file(path=pyunit_utils.locate(allOrcFiles[fIndex]))
+
+
+if __name__ == "__main__":
+    pyunit_utils.standalone_test(orc_parser_test)
+else:
+    orc_parser_test()
+
+
+
+
+
+
diff --git a/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_baddata.py b/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_baddata.py
new file mode 100644
index 000000000000..7791bd055a7b
--- /dev/null
+++ b/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_baddata.py
@@ -0,0 +1,27 @@
+import sys
+sys.path.insert(1,"../../")
+from tests import pyunit_utils
+
+def orc_parser_baddata():
+    """
+    This test verifies that orc parser warnings from the backend are passed down to the python client
+    when parsing orc files with unsupported data types or bad data values.
+
+    :return: None.  An assertion error is raised if the expected warnings are not captured.
+    """
+    fileWithPath = "smalldata/parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc"
+    assert pyunit_utils.expect_warnings(fileWithPath, "UserWarning:", "Skipping field:", 1), \
+        "Expect warnings from orc parser for file "+fileWithPath+"!"
+ + fileWithPath = "smalldata/parser/orc/TestOrcFile.emptyFile.orc" + assert pyunit_utils.expect_warnings(fileWithPath, "UserWarning:", "Skipping field:", 4), \ + "Expect warnings from orc parser for file "+fileWithPath+"!" + + fileWithPath = "smalldata/parser/orc/nulls-at-end-snappy.orc" + assert pyunit_utils.expect_warnings(fileWithPath, "UserWarning:", "Long.MIN_VALUE:", 1), \ + "Expect warnings from orc parser for file "+fileWithPath+"!" + +if __name__ == "__main__": + pyunit_utils.standalone_test(orc_parser_baddata) +else: + orc_parser_baddata() diff --git a/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_hexdev_29_import_types.py b/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_hexdev_29_import_types.py new file mode 100644 index 000000000000..397779643022 --- /dev/null +++ b/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_hexdev_29_import_types.py @@ -0,0 +1,30 @@ +import sys +sys.path.insert(1,"../../") +import h2o +from tests import pyunit_utils +################################################################################ +## +## Verifying that Python can define features as categorical or continuous on import +## +################################################################################ + + +def continuous_or_categorical(): + numElements2Compare = 0 + tol_time = 200 + tol_numeric = 1e-5 + + ctypes = ["enum"]*3 + h2oframe_csv = h2o.import_file(pyunit_utils.locate("smalldata/jira/hexdev_29.csv"), col_types=ctypes) + h2oframe_orc = h2o.import_file(pyunit_utils.locate("smalldata/parser/orc/hexdev_29.orc"), col_types=ctypes) + + # compare the two frames + assert pyunit_utils.compare_frames(h2oframe_orc, h2oframe_csv, numElements2Compare, tol_time, tol_numeric, True), \ + "H2O frame parsed from orc and csv files are different!" + + + +if __name__ == "__main__": + pyunit_utils.standalone_test(continuous_or_categorical) +else: + continuous_or_categorical() diff --git a/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_import_folder.py b/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_import_folder.py new file mode 100644 index 000000000000..912ed6a4f2a1 --- /dev/null +++ b/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_import_folder.py @@ -0,0 +1,30 @@ +from __future__ import print_function +import sys +sys.path.insert(1,"../../") +import h2o +from tests import pyunit_utils + +# test that h2o.import_file works on a directory of files! +def import_folder(): + + tol_time = 200 # comparing in ms or ns + tol_numeric = 1e-5 # tolerance for comparing other numeric fields + numElements2Compare = 0 # choose number of elements per column to compare. Save test time. + + multi_file_csv1 = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/orc/synthetic_perfect_seperation_csv/balunbal.csv")) + multi_file_csv2 = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/orc/synthetic_perfect_seperation_csv/unbalbal.csv")) + multi_file_orc = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/orc/synthetic_perfect_separation")) + + # make sure orc multi-file and single big file create same H2O frame + try: + assert pyunit_utils.compare_frames(multi_file_orc , multi_file_csv1, numElements2Compare, tol_time, tol_numeric, + True), "H2O frame parsed from multiple orc and single orc files are different!" + except: + assert pyunit_utils.compare_frames(multi_file_orc , multi_file_csv2, numElements2Compare, tol_time, tol_numeric, + True), "H2O frame parsed from multiple orc and single orc files are different!" 
+ + +if __name__ == "__main__": + pyunit_utils.standalone_test(import_folder) +else: + import_folder() diff --git a/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_import_folder_airline_05p_large.py b/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_import_folder_airline_05p_large.py new file mode 100644 index 000000000000..578c4f27db97 --- /dev/null +++ b/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_import_folder_airline_05p_large.py @@ -0,0 +1,61 @@ +from __future__ import print_function +import sys +sys.path.insert(1,"../../") +import h2o +import time +from tests import pyunit_utils + + +def import_folder(): + """ + This test will build a H2O frame from importing the bigdata/laptop/parser/orc/airlines_05p_orc_csv + from and build another H2O frame from the multi-file orc parser using multiple orc files that are + saved in the directory bigdata/laptop/parser/orc/airlines_05p_orc. It will compare the two frames + to make sure they are equal. + :return: None if passed. Otherwise, an exception will be thrown. + """ + startcsv = time.time() + multi_file_csv = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_csv"), + na_strings=['\\N']) + endcsv = time.time() + + csv_type_dict = multi_file_csv.types + + multi_file_csv.summary() + csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"] + + col_ind_name = dict() + # change column types from real to enum according to multi_file_csv column types + for key_name in list(csv_type_dict): + col_ind = key_name.split('C') + new_ind = int(str(col_ind[1]))-1 + col_ind_name[new_ind] = key_name + + col_types = [] + for ind in range(len(col_ind_name)): + col_types.append(csv_type_dict[col_ind_name[ind]]) + + startorc1 = time.time() + multi_file_orc1 = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_orc")) + endorc1 = time.time() + h2o.remove(multi_file_orc1) + + startorc = time.time() + multi_file_orc = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_orc"), + col_types=col_types) + endorc = time.time() + + multi_file_orc.summary() + orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"] + + print("************** CSV parse time is {0}".format(endcsv-startcsv)) + print("************** ORC (without column type forcing) parse time is {0}".format(endorc1-startorc1)) + print("************** ORC (with column type forcing) parse time is {0}".format(endorc-startorc)) + # compare frame read by orc by forcing column type, + pyunit_utils.compare_frame_summary(csv_summary, orc_summary) + + +if __name__ == "__main__": + pyunit_utils.standalone_test(import_folder) +else: + import_folder() diff --git a/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_import_folder_milsongs_large.py b/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_import_folder_milsongs_large.py new file mode 100644 index 000000000000..adc9209e8ba2 --- /dev/null +++ b/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_import_folder_milsongs_large.py @@ -0,0 +1,30 @@ +from __future__ import print_function +import sys +sys.path.insert(1,"../../") +import h2o +from tests import pyunit_utils + + +def import_folder(): + """ + This test will build a H2O frame from importing the bigdata/laptop/parser/orc/milsongs_orc_csv + from and build another H2O frame from the multi-file orc parser using multiple orc files that are + saved in the directory bigdata/laptop/parser/orc/milsongs_orc. 
It will compare the two frames + to make sure they are equal. + :return: None if passed. Otherwise, an exception will be thrown. + """ + multi_file_csv = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/milsongs_orc_csv")) + multi_file_orc = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/milsongs_orc")) + + multi_file_csv.summary() + csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"] + + multi_file_orc.summary() + orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"] + + pyunit_utils.compare_frame_summary(csv_summary, orc_summary) + +if __name__ == "__main__": + pyunit_utils.standalone_test(import_folder) +else: + import_folder() diff --git a/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_iris_import_types.py b/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_iris_import_types.py new file mode 100644 index 000000000000..62b25ff2cd59 --- /dev/null +++ b/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_iris_import_types.py @@ -0,0 +1,30 @@ +import sys +sys.path.insert(1,"../../") +import h2o +from tests import pyunit_utils +################################################################################ +## +## Verifying that a user can change a column type to Enum if they like. +## +################################################################################ + + +def continuous_or_categorical_orc(): + numElements2Compare = 100 + tol_time = 200 + tol_numeric = 1e-5 + + h2oframe_csv = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris.csv")) + data_types = ['real', 'real', 'real', 'real', 'enum'] + h2oframe_orc = h2o.import_file(pyunit_utils.locate("smalldata/parser/orc/iris.orc"), col_types = data_types) + + # compare the two frames + assert pyunit_utils.compare_frames(h2oframe_orc, h2oframe_csv, numElements2Compare, tol_time, tol_numeric, True), \ + "H2O frame parsed from orc and csv files are different!" + + + +if __name__ == "__main__": + pyunit_utils.standalone_test(continuous_or_categorical_orc) +else: + continuous_or_categorical_orc() diff --git a/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_prostate.py b/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_prostate.py new file mode 100644 index 000000000000..c1322033539a --- /dev/null +++ b/h2o-py/tests/testdir_parser/pyunit_NOFEATURE_orc_parser_prostate.py @@ -0,0 +1,33 @@ +from builtins import str +import sys +sys.path.insert(1,"../../") +import h2o +from tests import pyunit_utils + +def orc_parser_timestamp_date(): + """ + To verify that the orc parser is parsing correctly, we want to take a file we know (prostate_NA.csv), convert + it to an Orc file (prostate_NA.orc) and build two H2O frames out of them. We compare them and verified that + they are the same. + + Nidhi did this manually in Hive and verified that the parsing is correct. I am automating the test here. + + :return: None + """ + + tol_time = 200 # comparing in ms or ns + tol_numeric = 1e-5 # tolerance for comparing other numeric fields + numElements2Compare = 10 # choose number of elements per column to compare. Save test time. + + h2oOrc = h2o.import_file(path=pyunit_utils.locate('smalldata/parser/orc/prostate_NA.orc')) + h2oCsv = h2o.import_file(path=pyunit_utils.locate('smalldata/parser/csv2orc/prostate_NA.csv')) + + # compare the two frames + assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \ + "H2O frame parsed from orc and csv files are different!" 
+ + +if __name__ == "__main__": + pyunit_utils.standalone_test(orc_parser_timestamp_date) +else: + orc_parser_timestamp_date() diff --git a/h2o-py/tests/testdir_parser/pyunit_orc_NOFEATURE_parser_timestamp_date.py b/h2o-py/tests/testdir_parser/pyunit_orc_NOFEATURE_parser_timestamp_date.py new file mode 100644 index 000000000000..49ddc3e6a3d5 --- /dev/null +++ b/h2o-py/tests/testdir_parser/pyunit_orc_NOFEATURE_parser_timestamp_date.py @@ -0,0 +1,49 @@ +from builtins import str +import sys +sys.path.insert(1,"../../") +import h2o +from tests import pyunit_utils + + +def orc_parser_timestamp_date(): + """ + This test will parse orc files containing timestamp and date information into + H2O frame. Next, it will take the .csv file generated from the orc file from + Hive and parse into H2O frame. Finally, we compare the two frames and make sure + that they are equal. + + We want to make sure that we are parsing the date and timestamp + date correctly from an orc file. Thanks to Nidhi who has imported an orc file + containing timestamp/date into spark and later into Hive and write it out as + csv. + + :return: None + """ + + tol_time = 200 # comparing in ms or ns + tol_numeric = 1e-5 # tolerance for comparing other numeric fields + numElements2Compare = 100 # choose number of elements per column to compare. Save test time. + + allOrcFiles = ["smalldata/parser/orc/TestOrcFile.testDate1900.orc", + "smalldata/parser/orc/TestOrcFile.testDate2038.orc", + "smalldata/parser/orc/orc_split_elim.orc"] + + allCsvFiles = ["smalldata/parser/orc/orc2csv/TestOrcFile.testDate1900.csv", + "smalldata/parser/orc/orc2csv/TestOrcFile.testDate2038.csv", + "smalldata/parser/orc/orc2csv/orc_split_elim.csv"] + + for fIndex in range(len(allOrcFiles)): + + h2oOrc = h2o.import_file(path=pyunit_utils.locate(allOrcFiles[fIndex])) + h2oCsv = h2o.import_file(path=pyunit_utils.locate(allCsvFiles[fIndex])) + + # compare the two frames + assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \ + "H2O frame parsed from orc and csv files are different!" + + +if __name__ == "__main__": + pyunit_utils.standalone_test(orc_parser_timestamp_date) +else: + orc_parser_timestamp_date() + diff --git a/h2o-r/tests/testdir_hdfs/runit_INTERNAL_HDFS_airlines_orc.R b/h2o-r/tests/testdir_hdfs/runit_INTERNAL_HDFS_airlines_orc.R new file mode 100644 index 000000000000..d32cb1198079 --- /dev/null +++ b/h2o-r/tests/testdir_hdfs/runit_INTERNAL_HDFS_airlines_orc.R @@ -0,0 +1,67 @@ +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source("../../scripts/h2o-r-test-setup.R") +#---------------------------------------------------------------------- +# Purpose: This tests orc parser on multi-file parsing in HDFS. +#---------------------------------------------------------------------- + +# Check if we are running inside the H2O network by seeing if we can touch +# the namenode. 
+hdfs_name_node <- Sys.getenv(c("NAME_NODE")) +print(hdfs_name_node) + +#myIP <- H2O.IP +#myPort <- H2O.PORT + +hdfs_air_orc = "/datasets/airlines_all_orc_parts" +hdfs_air_original = "/datasets/airlines/airlines_all.csv" + +#h2o.init(ip=myIP, port=myPort, startH2O = FALSE) + +#---------------------------------------------------------------------- + +heading("BEGIN TEST") +check.hdfs_airorc <- function() { + + heading("Import airlines 116M dataset in original csv format ") + url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_air_original) + + print("************** csv parsing time: ") + ptm <- proc.time() + csv.hex <- h2o.importFile(url,destination_frame = "csv.hex") + timepassed = proc.time() - ptm + print(timepassed) + + n <- nrow(csv.hex) + print(paste("Imported n =", n, "rows from csv")) + + heading("Import airlines 116M dataset in ORC format ") + + #print("************** orc parsing time without forcing column types: ") + #ptm <- proc.time() + #orc2.hex <- h2o.importFolder(url,destination_frame = "dd2") + #timepassed = proc.time() - ptm + #print(timepassed) + #h2o.rm(orc2.hex) + + url <- sprintf("hdfs://%s%s", hdfs_name_node, hdfs_air_orc) + print("************** orc parsing time: ") + ptm <- proc.time() + orc.hex <- h2o.importFile(url,destination_frame = "orc.hex",col.names = names(csv.hex), + col.types = c("Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Enum","Numeric", + "Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Enum","Enum","Numeric","Numeric","Numeric","Numeric" + ,"Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Enum","Enum")) + timepassed = proc.time() - ptm + print(timepassed) + + n <- nrow(orc.hex) + print(paste("Imported n =", n, "rows from orc")) + + + expect_equal(dim(orc.hex),dim(csv.hex)) + expect_equal(summary(orc.hex),summary(csv.hex)) + + h2o.rm(orc.hex) # remove file + h2o.rm(csv.hex) +} + +doTest("ORC multifile parse test", check.hdfs_airorc) \ No newline at end of file diff --git a/h2o-r/tests/testdir_parser/runit_NOFEATURE_orc_parser.R b/h2o-r/tests/testdir_parser/runit_NOFEATURE_orc_parser.R new file mode 100644 index 000000000000..f09e0d752f80 --- /dev/null +++ b/h2o-r/tests/testdir_parser/runit_NOFEATURE_orc_parser.R @@ -0,0 +1,40 @@ +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source("../../scripts/h2o-r-test-setup.R") + +# This simple test is used to make sure that orc file parsing works across the REST +# API for R clients. 
+ +test.orc_parser <- function(){ + Options(warn=1) + # all orc files that Tom K has found + allOrcFiles = c("smalldata/parser/orc/TestOrcFile.columnProjection.orc", + "smalldata/parser/orc/bigint_single_col.orc", + "smalldata/parser/orc/TestOrcFile.emptyFile.orc", + "smalldata/parser/orc/bool_single_col.orc", + "smalldata/parser/orc/demo-11-zlib.orc", + "smalldata/parser/orc/TestOrcFile.testDate1900.orc", + "smalldata/parser/orc/demo-12-zlib.orc", + "smalldata/parser/orc/TestOrcFile.testDate2038.orc", + "smalldata/parser/orc/double_single_col.orc", + "smalldata/parser/orc/TestOrcFile.testMemoryManagementV11.orc", + "smalldata/parser/orc/float_single_col.orc", + "smalldata/parser/orc/TestOrcFile.testMemoryManagementV12.orc", + "smalldata/parser/orc/int_single_col.orc", + "smalldata/parser/orc/TestOrcFile.testPredicatePushdown.orc", + "smalldata/parser/orc/nulls-at-end-snappy.orc", + "smalldata/parser/orc/TestOrcFile.testSnappy.orc", + "smalldata/parser/orc/orc_split_elim.orc", + "smalldata/parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc", + "smalldata/parser/orc/TestOrcFile.testStripeLevelStats.orc", + "smalldata/parser/orc/smallint_single_col.orc", + "smalldata/parser/orc/string_single_col.orc", + "smalldata/parser/orc/tinyint_single_col.orc", + "smalldata/parser/orc/TestOrcFile.testWithoutIndex.orc") + + for (temp in 1:length(allOrcFiles)) { + h2oFrame = h2o.importFile(locate(allOrcFiles[temp])) + } + +} + +doTest("Orc parser Test", test.orc_parser ) diff --git a/h2o-r/tests/testdir_parser/runit_NOFEATURE_orc_parser_airlines_05p_large.R b/h2o-r/tests/testdir_parser/runit_NOFEATURE_orc_parser_airlines_05p_large.R new file mode 100644 index 000000000000..be6a9b0ebec4 --- /dev/null +++ b/h2o-r/tests/testdir_parser/runit_NOFEATURE_orc_parser_airlines_05p_large.R @@ -0,0 +1,46 @@ +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source("../../scripts/h2o-r-test-setup.R") +################################################################################ +## +## This tests Orc multifile parser by comparing the summary of the original csv frame with the h2o parsed orc frame +## +################################################################################ + + +test.continuous.or.categorical <- function() { + + + original = h2o.importFile(locate("bigdata/laptop/airlines_all.05p.csv"),destination_frame = "original", + col.types=c("Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Enum","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Enum","Enum","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Enum","Enum")) + print("************** csv parsing time: ") + ptm <- proc.time() + csv = h2o.importFile(locate("bigdata/laptop/parser/orc/pubdev_3200/air05_csv"),destination_frame = "csv",col.names = names(original), + col.types=c("Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Enum","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Enum","Enum","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Enum","Enum")) + timepassed = proc.time() - ptm + print(timepassed) + + print("************** orc parsing time without forcing column types: ") + ptm <- proc.time() + orc2 = h2o.importFile(locate("bigdata/laptop/parser/orc/pubdev_3200/air05_orc"),destination_frame = "orc2",col.names = names(original)) + timepassed = proc.time()-ptm + print(timepassed) + 
h2o.rm(orc2) + + print("************** orc parsing time forcing same column types as csv: ") + ptm <- proc.time() + orc = h2o.importFile(locate("bigdata/laptop/parser/orc/pubdev_3200/air05_orc"),destination_frame = "orc",col.names = names(original), + col.types=c("Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Enum","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Enum","Enum","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Enum","Enum")) + timepassed = proc.time()-ptm + + + print(timepassed) + + expect_equal(summary(csv),summary(original)) + + for(i in 1:ncol(csv)){ + print(i) + expect_equal(summary(csv[,i]),summary(orc[,i])) + } +} + +doTest("Test orc multifile parser", test.continuous.or.categorical) diff --git a/h2o-r/tests/testdir_parser/runit_NOFEATURE_orc_parser_baddata.R b/h2o-r/tests/testdir_parser/runit_NOFEATURE_orc_parser_baddata.R new file mode 100644 index 000000000000..5c7fcd5c91fd --- /dev/null +++ b/h2o-r/tests/testdir_parser/runit_NOFEATURE_orc_parser_baddata.R @@ -0,0 +1,28 @@ +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source("../../scripts/h2o-r-test-setup.R") + +# This test is written to make sure that warnings from Orc Parser are passed to the R client. +# In particular, the first two Orc files contain unsupported column types. +# The third Orc file contains big integer values that are used by sentinel for H2O frame. + +test.orc_parser.bad_data <- function() { + options(warn=1) # make warnings to cause an error + + # These files contain unsupported data types + frame = h2o.importFile(locate("smalldata/parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc")) + expect_warning(h2o.importFile(locate("smalldata/parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc"))) + frame = h2o.importFile(locate("smalldata/parser/orc/TestOrcFile.emptyFile.orc")) + expect_warning(h2o.importFile(locate("smalldata/parser/orc/TestOrcFile.emptyFile.orc"))) + # This file contains big integer value Long.MIN_VALUE that is used for sentinel + frame = h2o.importFile(locate("smalldata/parser/orc/nulls-at-end-snappy.orc")) + expect_warning(h2o.importFile(locate("smalldata/parser/orc/nulls-at-end-snappy.orc"))) + +# b = warnings() # collect all warnings into a list +# print(length(b)) +# if (length(b) < 1) { +# browser() +# # throw("Not all warning messages are passed from Java to R client.") +# } +} + +doTest("Orc Parser: make sure warnings are passed to user.", test.orc_parser.bad_data) diff --git a/h2o-r/tests/testdir_parser/runit_orc_NOFEATURE_parser_milsongs_large.R b/h2o-r/tests/testdir_parser/runit_orc_NOFEATURE_parser_milsongs_large.R new file mode 100644 index 000000000000..11e79c2313fb --- /dev/null +++ b/h2o-r/tests/testdir_parser/runit_orc_NOFEATURE_parser_milsongs_large.R @@ -0,0 +1,33 @@ +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source("../../scripts/h2o-r-test-setup.R") +################################################################################ +## +## This tests Orc multifile parser by comparing the summary of the original csv frame with the h2o parsed orc frame +## on milsongs dataset +################################################################################ + + +test.continuous <- function() { + + + original = h2o.importFile(locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"),destination_frame = "original") + + csv = 
h2o.importFile(locate("bigdata/laptop/parser/orc/milsongs_orc_csv"),destination_frame = "csv",col.names = names(original), + col.types = c("Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric", + "Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric", + "Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric")) + + orc = h2o.importFolder(locate("bigdata/laptop/parser/orc/milsongs_orc"),pattern = "*_0",destination_frame = "orc",col.names = names(original), + col.types = c("Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric", + "Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric", + "Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric")) + + expect_equal(dim(csv),dim(orc)) + + expect_equal(summary(csv),summary(original)) + + expect_equal(summary(csv),summary(orc)) + +} + +doTest("Test orc multifile parser", test.continuous) diff --git a/scripts/run.py b/scripts/run.py index 785353245479..3610ea8b1e0b 100755 --- a/scripts/run.py +++ b/scripts/run.py @@ -2051,7 +2051,7 @@ def usage(): print(" --excludelist A file containing a list of tests to NOT run.") print("") print(" --testgroup Test a group of tests by function:") - print(" pca, glm, kmeans, gbm, rf, deeplearning, algos, golden, munging") + print(" pca, glm, kmeans, gbm, rf, deeplearning, algos, golden, munging, parser") print("") print(" --testsize Sizes (and by extension length) of tests to run:") print(" s=small (seconds), m=medium (a minute or two), l=large (longer), x=xlarge (very big)") diff --git a/scripts/saveTableAsOrc.textile b/scripts/saveTableAsOrc.textile index 36b0403daa18..214420929b2f 100644 --- a/scripts/saveTableAsOrc.textile +++ b/scripts/saveTableAsOrc.textile @@ -1,8 +1,8 @@ set hive.execution.engine=mr; set mapreduce.map.memory.mb=5240; set mapreduce.reduce.memory.mb=5240; -set mapreduce.map.java.opts=-DAMYWANG_MAP=1 -Xmx4G -XX:PermSize=256m -XX:MaxPermSize=256m -XX:+PrintGCDetails -XX:+PrintGCTimeStamps; -set mapreduce.reduce.java.opts=-DAMYWANG_REDUCE=1 -Xmx4G -XX:PermSize=256m -XX:MaxPermSize=256m -XX:+PrintGCDetails -XX:+PrintGCTimeStamps; +set 
mapreduce.map.java.opts=-DAMYWANG_MAP=1 -Xmx4G -XX:PermSize=256m -XX:MaxPermSize=256m -XX:PrintGCDetails -XX:PrintGCTimeStamps; +set mapreduce.reduce.java.opts=-DAMYWANG_REDUCE=1 -Xmx4G -XX:PermSize=256m -XX:MaxPermSize=256m -XX:PrintGCDetails -XX:PrintGCTimeStamps; create table airlines_all_05p( Year INT, Month INT, @@ -37,10 +37,11 @@ IsArrDelayed STRING, IsDepDelayed STRING ) ROW FORMAT DELIMITED -FIELDS TERMINATED BY ',' -location '/apps/hive/warehouse/data/airlines_all_05p'; -load data inpath 'hdfs://mr-0xd6.0xdata.loc:8020/user/amy/airlines_all.05p.csv' into table airlines_all_05p; -create table orc_airlines_all_05p( +FIELDS TERMINATED BY ‘,’ +location ‘/apps/hive/warehouse/data/airlines_all_05p’; +load data inpath ‘hdfs://mr-0xd6.0xdata.loc:8020/user/amy/airlines_all.05p.csv’ into table airlines_all_05p; + +create table csv_airlines_all( Year INT, Month INT, DayofMonth INT, @@ -51,7 +52,7 @@ ArrTime INT, CRSArrTime INT, UniqueCarrier STRING, FlightNum INT, -TailNum INT, +TailNum STRING, ActualElapsedTime INT, CRSElapsedTime INT, AirTime INT, @@ -74,7 +75,151 @@ IsArrDelayed STRING, IsDepDelayed STRING ) ROW FORMAT DELIMITED -FIELDS TERMINATED BY ',' +FIELDS TERMINATED BY ‘,’ STORED AS ORC; INSERT OVERWRITE TABLE orc_airlines_all_05p select * from airlines_all_05p; +The following is from Nidhi: +CREATE EXTERNAL TABLE ta (a1 INT, a2 STRING, a3 STRING, a4 STRING, a5 INT, a6 STRING, a7 INT, a8 INT, a9 INT) +STORED AS ORC; + +LOAD DATA local INPATH ‘/home/wendy/demo-11-zlib.orc’ OVERWRITE INTO TABLE ta; +​ +select * from ta limit 3; +#OK +#1 M M Primary 500 Good 0 0 0 +#2 F M Primary 500 Good 0 0 0 +#3 M S Primary 500 Good 0 0 0 +​ +CREATE EXTERNAL TABLE aa (a1 INT, a2 STRING, a3 STRING, a4 STRING, a5 INT, a6 STRING, a7 INT, a8 INT, a9 INT) +ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘,’ +STORED AS TEXTFILE +LOCATION ‘/user/wendy/from_hive’; +​ +select * from aa limit 3; +#OK +#NULL NULL NULL NULL NULL NULL NULL NULL NULL +#NULL NULL NULL NULL NULL NULL NULL NULL NULL +#NULL NULL NULL NULL NULL NULL NULL NULL NULL +​ +INSERT OVERWRITE TABLE aa SELECT * FROM ta; +​ +select * from aa limit 3; +#OK +#1 M M Primary 500 Good 0 0 0 +#2 F M Primary 500 Good 0 0 0 +#3 M S Primary 500 Good 0 0 0 +#Time taken: 0.079 seconds, Fetched: 3 row(s) + +Convert csv to orc: +CREATE EXTERNAL TABLE da (a1 INT, a2 INT, a3 INT, a4 INT, a5 INT, a6 INT, a7 DOUBLE, a8 DOUBLE, a9 INT) +ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘,’ +STORED AS TEXTFILE; +LOAD DATA local INPATH ‘/home/nidhi/prostate.csv’ OVERWRITE INTO TABLE bb; +​ +select * from da limit 3; +​ +​ +CREATE EXTERNAL TABLE bb (a1 INT, a2 INT, a3 INT, a4 INT, a5 INT, a6 INT, a7 DOUBLE, a8 DOUBLE, a9 INT) +STORED AS ORC +LOCATION ‘/user/nidhi/from_hive_prna’; + +INSERT OVERWRITE TABLE bb +SELECT * +FROM da; +​ +select * from bb limit 3; + +create table for milsongs +create external table milsongs ( +c1 INT, +c2 DOUBLE, +c3 DOUBLE, +c4 DOUBLE, +c5 DOUBLE, +c6 DOUBLE, +c7 DOUBLE, +c8 DOUBLE, +c9 DOUBLE, +c10 DOUBLE, +c11 DOUBLE, +c12 DOUBLE, +c13 DOUBLE, +c14 DOUBLE, +c15 DOUBLE, +c16 DOUBLE, +c17 DOUBLE, +c18 DOUBLE, +c19 DOUBLE, +c20 DOUBLE, +c21 DOUBLE, +c22 DOUBLE, +c23 DOUBLE, +c24 DOUBLE, +c25 DOUBLE, +c26 DOUBLE, +c27 DOUBLE, +c28 DOUBLE, +c29 DOUBLE, +c30 DOUBLE, +c31 DOUBLE, +c32 DOUBLE, +c33 DOUBLE, +c34 DOUBLE, +c35 DOUBLE, +c36 DOUBLE, +c37 DOUBLE, +c38 DOUBLE, +c39 DOUBLE, +c40 DOUBLE, +c41 DOUBLE, +c42 DOUBLE, +c43 DOUBLE, +c44 DOUBLE, +c45 DOUBLE, +c46 DOUBLE, +c47 DOUBLE, +c48 DOUBLE, +c49 DOUBLE, +c50 DOUBLE, +c51 DOUBLE, +c52 DOUBLE, +c53 
DOUBLE, +c54 DOUBLE, +c55 DOUBLE, +c56 DOUBLE, +c57 DOUBLE, +c58 DOUBLE, +c59 DOUBLE, +c60 DOUBLE, +c61 DOUBLE, +c62 DOUBLE, +c63 DOUBLE, +c64 DOUBLE, +c65 DOUBLE, +c66 DOUBLE, +c67 DOUBLE, +c68 DOUBLE, +c69 DOUBLE, +c70 DOUBLE, +c71 DOUBLE, +c72 DOUBLE, +c73 DOUBLE, +c74 DOUBLE, +c75 DOUBLE, +c76 DOUBLE, +c77 DOUBLE, +c78 DOUBLE, +c79 DOUBLE, +c80 DOUBLE, +c81 DOUBLE, +c82 DOUBLE, +c83 DOUBLE, +c84 DOUBLE, +c85 DOUBLE, +c86 DOUBLE, +c87 DOUBLE, +c88 DOUBLE, +c89 DOUBLE, +c90 DOUBLE, +c91 DOUBLE) \ No newline at end of file diff --git a/settings.gradle b/settings.gradle index 8ad4cf477edb..8d2aedcebf2b 100644 --- a/settings.gradle +++ b/settings.gradle @@ -17,6 +17,7 @@ include 'h2o-bindings' include 'h2o-test-integ' include 'h2o-test-accuracy' include 'h2o-avro-parser' +include 'h2o-orc-parser' // Reconfigure scala projects to support cross compilation // The following code will create two projects for each included item: @@ -30,7 +31,7 @@ scalaCrossCompile { // Make structure flat and avoid annoying dummy modules rootProject.children.each { project -> - if (project.name.equals("h2o-avro-parser")) { + if (project.name.equals("h2o-avro-parser") || project.name.equals("h2o-orc-parser")) { String projectDirName = "h2o-parsers/${project.name}" project.projectDir = new File(settingsDir, projectDirName) } @@ -39,16 +40,16 @@ rootProject.children.each { project -> // // Include Hadoop builds only if requested // -if (System.getProperty("user.name").equals("jenkins") - || System.getenv("BUILD_HADOOP") != null - || System.getenv("H2O_TARGET") != null) { +if (System.getProperty("user.name").equals("jenkins") + || System.getenv("BUILD_HADOOP") != null + || System.getenv("H2O_TARGET") != null) { // Default hadoop build targets def allTargets = [ "cdh5.2", "cdh5.3", "cdh5.4.2", "cdh5.5.3", "cdh5.6.0", "cdh5.7.0", "hdp2.1", "hdp2.2", "hdp2.3", "hdp2.4", "mapr3.1.1", "mapr4.0.1", "mapr5.0", "mapr5.1" - ] + ] // Compute targets def targets = System.getenv("H2O_TARGET") != null ? System.getenv("H2O_TARGET").split(",").collect { it.trim() } : allTargets // Include selected/all Hadoop targets @@ -60,4 +61,3 @@ if (System.getProperty("user.name").equals("jenkins") } } } -
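For reference, the csv-to-ORC conversions in the saveTableAsOrc.textile recipe above can also
be done outside of Hive.  A minimal PySpark sketch (not part of this patch; it assumes a
Spark 2.x session with ORC support and uses hypothetical paths):

    # sketch only: convert a Hive-exported csv (NULLs written as \N) into an ORC directory
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("csv2orc").getOrCreate()
    df = spark.read.csv("hdfs:///user/someuser/prostate_NA.csv",
                        header=False, inferSchema=True, nullValue="\\N")
    df.write.mode("overwrite").orc("hdfs:///user/someuser/prostate_NA_orc")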