Merge pull request apache#1 from Parquet/master
update
lukasnalezenec committed Dec 20, 2013
2 parents fd3b05c + ffcc0b8 commit 82b889c
Showing 157 changed files with 7,473 additions and 2,054 deletions.
58 changes: 58 additions & 0 deletions CHANGES.md
@@ -1,5 +1,63 @@
# Parquet #

### Version 1.2.10 ###
* ISSUE [247](https://github.com/Parquet/parquet-mr/pull/247): fix bug: when field index is greater than zero
* ISSUE [244](https://github.com/Parquet/parquet-mr/pull/244): Feature/error handler
* ISSUE [187](https://github.com/Parquet/parquet-mr/pull/187): Plumb OriginalType
* ISSUE [245](https://github.com/Parquet/parquet-mr/pull/245): integrate parquet format 2.0

### Version 1.2.9 ###
* ISSUE [242](https://github.com/Parquet/parquet-mr/pull/242): upgrade elephant-bird version to 4.3
* ISSUE [240](https://github.com/Parquet/parquet-mr/pull/240): fix loader cache
* ISSUE [233](https://github.com/Parquet/parquet-mr/pull/233): use latest stable release of cascading: 2.5.1
* ISSUE [241](https://github.com/Parquet/parquet-mr/pull/241): Update reference to 0.10 in Hive012Binding javadoc
* ISSUE [239](https://github.com/Parquet/parquet-mr/pull/239): Fix hive map and array inspectors with null containers
* ISSUE [234](https://github.com/Parquet/parquet-mr/pull/234): optimize chunk scan; fix compressed size
* ISSUE [237](https://github.com/Parquet/parquet-mr/pull/237): Handle codec not found
* ISSUE [238](https://github.com/Parquet/parquet-mr/pull/238): fix pom version caused by bad merge
* ISSUE [235](https://github.com/Parquet/parquet-mr/pull/235): Do not write pig metadata when pig is not available
* ISSUE [227](https://github.com/Parquet/parquet-mr/pull/227): Breaks parquet-hive up into several submodules, creating infrastructure ...
* ISSUE [229](https://github.com/Parquet/parquet-mr/pull/229): add changelog tool
* ISSUE [236](https://github.com/Parquet/parquet-mr/pull/236): Make cascading a provided dependency

### Version 1.2.8 ###
* ISSUE 228: enable globbing files for parquetTupleScheme, refactor unit tests and rem...
* ISSUE 224: Changing read and write methods in ParquetInputSplit so that they can de...

### Version 1.2.7 ###
* ISSUE 223: refactor encoded values changes and test that resetDictionary works
* ISSUE 222: fix bug: set raw data size to 0 after reset

### Version 1.2.6 ###
* ISSUE 221: make pig, hadoop and log4j jars provided
* ISSUE 220: parquet-hive should ship an uber jar
* ISSUE 213: group parquet-format version in one property
* ISSUE 215: Fix Binary.equals().
* ISSUE 210: ParquetWriter ignores the enable-dictionary and validating flags.
* ISSUE 202: Fix requested schema when recreating splits in hive
* ISSUE 208: Improve dic fall back
* ISSUE 207: Fix offset
* ISSUE 206: Create a "Powered by" page

### Version 1.2.5 ###
* ISSUE 204: ParquetLoader.inputFormatCache as WeakHashMap
* ISSUE 203: add null check for EnumWriteProtocol
* ISSUE 205: use cascading 2.2.0
* ISSUE 199: simplify TupleWriteSupport constructor
* ISSUE 164: Dictionary changes
* ISSUE 196: Fixes to the Hive SerDe
* ISSUE 197: RLE decoder reading past the end of the stream
* ISSUE 188: Added ability to define arbitrary predicate functions
* ISSUE 194: refactor serde to remove some unnecessary boxing and include dictionary awareness
* ISSUE 190: NPE in DictionaryValuesWriter.

### Version 1.2.4 ###
* ISSUE 191: Add compatibility checker for ThriftStruct to check for backward compatibility of two thrift structs

### Version 1.2.3 ###
* ISSUE 186: add parquet-pig-bundle
* ISSUE 184: Update ParquetReader to take Configuration as a constructor argument.
25 changes: 25 additions & 0 deletions PoweredBy.md
@@ -0,0 +1,25 @@
Who's using Parquet?
======
(in alphabetical order)

## Cloudera Impala

<blockquote class="twitter-tweet"><p>We shipped Impala 0.7 (<a href="http://t.co/wxuV0wYShk">http://t.co/wxuV0wYShk</a>) - a whole ton of great new features including DDL, Parquet support and partitioned joins!</p>&mdash; Henry Robinson (@HenryR) <a href="https://twitter.com/HenryR/statuses/324222874011451392">April 16, 2013</a></blockquote>

## Criteo

<blockquote class="twitter-tweet"><p>Parquet: Efficient Columnar Storage for Apache <a href="https://twitter.com/search?q=%23Hadoop&amp;src=hash">#Hadoop</a> <a href="http://t.co/He1xyv6NC3">http://t.co/He1xyv6NC3</a> via <a href="https://twitter.com/cloudera">@cloudera</a> - <a href="https://twitter.com/search?q=%23Criteo&amp;src=hash">#Criteo</a> R&amp;D very happy to contribute!</p>&mdash; Julien SIMON (@julsimon) <a href="https://twitter.com/julsimon/statuses/312114074911666177">March 14, 2013</a></blockquote>

## Salesforce.com

<blockquote class="twitter-tweet"><p>&quot;<a href="https://twitter.com/ParquetFormat">@ParquetFormat</a> at <a href="http://t.co/lro7m7quuc">Salesforce.com</a>&quot; <a href="http://t.co/IFskqF0FP3">http://t.co/IFskqF0FP3</a> via <a href="https://twitter.com/cloudera">@cloudera</a></p>&mdash; Twitter Open Source (@TwitterOSS) <a href="https://twitter.com/TwitterOSS/statuses/392734610116726784">October 22, 2013</a></blockquote>

## Stripe

<blockquote class="twitter-tweet"><p>We&#39;re moving basically all of our archival data at <a href="https://twitter.com/stripe">@stripe</a> into <a href="https://twitter.com/ParquetFormat">@ParquetFormat</a>, and I&#39;m super pleased with how well it&#39;s working out.</p>&mdash; Avi Bryant (@avibryant) <a href="https://twitter.com/avibryant/statuses/391339949250715648">October 18, 2013</a></blockquote>

## Twitter

<blockquote class="twitter-tweet"><p>Converting some data to Parquet on the Twitter clusters. I&#39;m seeing a 28% space saving thanks to the compressibility of the column layout.</p>&mdash; Julien Le Dem (@J_) <a href="https://twitter.com/J_/statuses/315844725611581441">March 24, 2013</a></blockquote>

<script async src="//platform.twitter.com/widgets.js" charset="utf-8"></script>
1 change: 1 addition & 0 deletions README.md
@@ -18,6 +18,7 @@ Parquet is a very active project, and new features are being added quickly; belo
<tr><td>Hive integration</td><td>YES (<a href="https://github.com/Parquet/parquet-mr/pull/28">28</a>)</td><td></td><td></td><td>1.0</td></tr>
<tr><td>Pig integration</td><td>YES</td><td></td><td></td><td>1.0</td></tr>
<tr><td>Cascading integration</td><td>YES</td><td></td><td></td><td>1.0</td></tr>
<tr><td>Crunch integration</td><td>YES (<a href="https://issues.apache.org/jira/browse/CRUNCH-277">CRUNCH-277</a>)</td><td></td><td></td><td>1.0</td></tr>
<tr><td>Impala integration</td><td>YES (non-nested)</td><td></td><td></td><td>1.0</td></tr>
<tr><td>Java Map/Reduce API</td><td>YES</td><td></td><td></td><td>1.0</td></tr>
<tr><td>Native Avro support</td><td>YES</td><td></td><td></td><td>1.0</td></tr>
Expand Down
36 changes: 36 additions & 0 deletions changelog.sh
@@ -0,0 +1,36 @@
#!/bin/bash
echo "github username:" >&2
read username
echo "github password:" >&2
read -s password

curl -f -u "$username:$password" -s "https://api.github.com" > /dev/null
if [ $? -eq 0 ]
then
  echo "login successful" >&2
else
  echo "login failed" >&2
  curl -u "$username:$password" -s "https://api.github.com"
  exit 1
fi

echo "# Parquet #"

git log | grep -E "Merge pull request|prepare release" | while read l
do
  release=`echo $l | grep "\[maven-release-plugin\] prepare release" | cut -d "-" -f 4`
  PR=`echo $l | grep -E -o "Merge pull request #[^ ]*" | cut -d "#" -f 2`
  if [ -n "$release" ]
  then
    echo
    echo "### Version $release ###"
  fi
  if [ -n "$PR" ]
  then
    JSON=`curl -u "$username:$password" -s https://api.github.com/repos/Parquet/parquet-mr/pulls/$PR | tr "\n" " "`
    DESC_RAW=$(echo $JSON | grep -Po '"title":.*?[^\\]",' | cut -d "\"" -f 4- | head -n 1 | sed -e "s/\\\\//g")
    DESC=$(echo ${DESC_RAW%\",})
    echo "* ISSUE [$PR](https://github.com/Parquet/parquet-mr/pull/$PR): ${DESC}"
  fi
done
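The trickiest step in the script above is pulling the pull-request title out of the GitHub API response with plain `grep`/`cut`/`sed` instead of a JSON parser. A minimal sketch of that pipeline against a made-up, abbreviated payload (the real API returns many more fields; note `grep -Po` requires GNU grep):

```shell
# Hypothetical, abbreviated stand-in for the JSON the GitHub pulls API returns.
JSON='{"number": 245, "title": "integrate parquet format 2.0", "state": "closed"}'

# Same extraction the script uses: match the "title" key/value pair, cut out
# the value, then strip the trailing quote-comma and any backslash escapes.
DESC_RAW=$(echo $JSON | grep -Po '"title":.*?[^\\]",' | cut -d "\"" -f 4- | head -n 1 | sed -e "s/\\\\//g")
DESC=$(echo ${DESC_RAW%\",})

echo "* ISSUE [245](https://github.com/Parquet/parquet-mr/pull/245): ${DESC}"
# → * ISSUE [245](https://github.com/Parquet/parquet-mr/pull/245): integrate parquet format 2.0
```

This string surgery is brittle by design (it assumes the title contains no escaped double quotes); a dedicated JSON tool such as `jq` would be the robust choice, but the script avoids that dependency.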

5 changes: 3 additions & 2 deletions parquet-avro/pom.xml
@@ -3,7 +3,7 @@
<groupId>com.twitter</groupId>
<artifactId>parquet</artifactId>
<relativePath>../pom.xml</relativePath>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.11-SNAPSHOT</version>
</parent>

<modelVersion>4.0.0</modelVersion>
@@ -32,7 +32,7 @@
<dependency>
<groupId>com.twitter</groupId>
<artifactId>parquet-format</artifactId>
<version>1.0.0</version>
<version>${parquet.format.version}</version>
</dependency>
<dependency>
<groupId>org.apache.avro</groupId>
@@ -43,6 +43,7 @@
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
9 changes: 6 additions & 3 deletions parquet-cascading/pom.xml
@@ -3,7 +3,7 @@
<groupId>com.twitter</groupId>
<artifactId>parquet</artifactId>
<relativePath>../pom.xml</relativePath>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.11-SNAPSHOT</version>
</parent>

<modelVersion>4.0.0</modelVersion>
@@ -41,11 +41,13 @@
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
<version>${log4j.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.twitter</groupId>
@@ -63,7 +65,8 @@
<dependency>
<groupId>cascading</groupId>
<artifactId>cascading-hadoop</artifactId>
<version>2.1.5</version>
<version>${cascading.version}</version>
<scope>provided</scope>
</dependency>
</dependencies>

@@ -93,7 +93,8 @@ private MessageType readSchema(FlowProcess<JobConf> flowProcess, Tap tap) {
else
hfs = (Hfs) tap;

List<Footer> footers = ParquetFileReader.readFooters(flowProcess.getConfigCopy(), hfs.getPath());
List<Footer> footers = getFooters(flowProcess, hfs);

if(footers.isEmpty()) {
throw new TapException("Could not read Parquet metadata at " + hfs.getPath());
} else {
@@ -104,7 +105,14 @@ private MessageType readSchema(FlowProcess<JobConf> flowProcess, Tap tap) {
}
}

@SuppressWarnings("unchecked")
private List<Footer> getFooters(FlowProcess<JobConf> flowProcess, Hfs hfs) throws IOException {
JobConf jobConf = flowProcess.getConfigCopy();
DeprecatedParquetInputFormat format = new DeprecatedParquetInputFormat();
format.addInputPath(jobConf, hfs.getPath());
return format.getFooters(jobConf);
}

@SuppressWarnings("unchecked")
@Override
public boolean source(FlowProcess<JobConf> fp, SourceCall<Object[], RecordReader> sc)
throws IOException {
Expand Up @@ -46,6 +46,7 @@
import static org.junit.Assert.*;

import parquet.hadoop.thrift.ThriftToParquetFileWriter;
import parquet.hadoop.util.ContextUtil;
import parquet.thrift.test.Name;

import java.io.File;
@@ -117,7 +118,7 @@ private void createFileForRead() throws Exception {

TProtocolFactory protocolFactory = new TCompactProtocol.Factory();
TaskAttemptID taskId = new TaskAttemptID("local", 0, true, 0, 0);
ThriftToParquetFileWriter w = new ThriftToParquetFileWriter(fileToCreate, new TaskAttemptContext(conf, taskId), protocolFactory, Name.class);
ThriftToParquetFileWriter w = new ThriftToParquetFileWriter(fileToCreate, ContextUtil.newTaskAttemptContext(conf, taskId), protocolFactory, Name.class);

final ByteArrayOutputStream baos = new ByteArrayOutputStream();
final TProtocol protocol = protocolFactory.getProtocol(new TIOStreamTransport(baos));
