Commit

merge

lukasnalezenec committed Dec 26, 2013
2 parents 82b889c + 94b2ec0 commit a717bbf
Showing 30 changed files with 2,232 additions and 1 deletion.
3 changes: 2 additions & 1 deletion .gitignore
@@ -13,4 +13,5 @@ target
 *.orig
 *.rej
 dependency-reduced-pom.xml
-.idea/*
+.idea/*
+target/
4 changes: 4 additions & 0 deletions parquet-protobuf/README.md
@@ -0,0 +1,4 @@
parquet-protobuf
================

Protocol Buffers support for the Parquet columnar format
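
A minimal round-trip sketch, assuming a generated protobuf class TestProtobuf.Document with an int64 id field (hypothetical names), and assuming the read support hands back message builders:

    Path path = new Path("/tmp/documents.parquet");

    // Write: the writer derives the Parquet schema from the protobuf class.
    ProtoParquetWriter<TestProtobuf.Document> writer =
        new ProtoParquetWriter<TestProtobuf.Document>(path, TestProtobuf.Document.class);
    writer.write(TestProtobuf.Document.newBuilder().setId(1).build());
    writer.close();

    // Read back: build() turns the returned builder into a message.
    ProtoParquetReader<TestProtobuf.Document.Builder> reader =
        new ProtoParquetReader<TestProtobuf.Document.Builder>(path);
    System.out.println(reader.read().build());
    reader.close();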
157 changes: 157 additions & 0 deletions parquet-protobuf/pom.xml
@@ -0,0 +1,157 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <parent>
    <groupId>com.twitter</groupId>
    <artifactId>parquet</artifactId>
    <relativePath>../pom.xml</relativePath>
    <version>1.2.10-SNAPSHOT</version>
  </parent>

  <modelVersion>4.0.0</modelVersion>

  <artifactId>parquet-protobuf</artifactId>
  <packaging>jar</packaging>

  <properties>
    <elephant-bird.version>3.0.8</elephant-bird.version>
  </properties>

  <name>Parquet Protobuf</name>
  <url>https://github.com/lukasnalezenec/parquet-protobuf.git</url>

  <dependencies>
    <dependency>
      <groupId>com.google.protobuf</groupId>
      <artifactId>protobuf-java</artifactId>
      <version>2.4.1</version>
    </dependency>
    <dependency>
      <groupId>com.twitter</groupId>
      <artifactId>parquet-hadoop</artifactId>
      <version>${project.version}</version>
    </dependency>
    <dependency>
      <groupId>com.twitter.elephantbird</groupId>
      <artifactId>elephant-bird-core</artifactId>
      <version>${elephant-bird.version}</version>
      <exclusions>
        <exclusion>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-core</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
  </dependencies>

  <developers>
    <developer>
      <id>lukasnalezenec</id>
      <name>Lukas Nalezenec</name>
    </developer>
  </developers>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
        <version>${maven-jar-plugin.version}</version>
        <configuration>
          <archive>
            <manifestEntries>
              <git-SHA-1>${buildNumber}</git-SHA-1>
            </manifestEntries>
          </archive>
        </configuration>
        <executions>
          <execution>
            <goals>
              <goal>test-jar</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <artifactSet>
                <includes>
                  <include>org.codehaus.jackson:jackson-mapper-asl</include>
                  <include>org.codehaus.jackson:jackson-core-asl</include>
                </includes>
              </artifactSet>
              <relocations>
                <relocation>
                  <pattern>org.codehaus.jackson</pattern>
                  <shadedPattern>parquet.org.codehaus.jackson</shadedPattern>
                </relocation>
              </relocations>
            </configuration>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <!-- Ensure that the generated test classes are available during test compilation but are not included in the jar -->
        <groupId>org.codehaus.mojo</groupId>
        <artifactId>build-helper-maven-plugin</artifactId>
        <version>1.8</version>
        <executions>
          <execution>
            <id>add-test-sources</id>
            <phase>generate-test-sources</phase>
            <goals>
              <goal>add-test-source</goal>
            </goals>
            <configuration>
              <sources>
                <source>${project.build.directory}/generated-test-sources</source>
              </sources>
            </configuration>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <artifactId>maven-antrun-plugin</artifactId>
        <executions>
          <execution>
            <id>generate-sources</id>
            <phase>generate-test-sources</phase>
            <configuration>
              <tasks>
                <mkdir dir="${project.build.directory}/generated-test-sources"/>
                <mkdir dir="${project.build.directory}/generated-test-sources/java"/>
                <exec failonerror="true" executable="protoc">
                  <arg value="--java_out=${project.build.directory}/generated-test-sources/java"/>
                  <arg value="src/test/resources/TestProtobuf.proto"/>
                  <arg value="-I."/>
                </exec>
              </tasks>
              <sourceRoot>src/main/java</sourceRoot>
              <sourceRoot>target/generated-sources/java</sourceRoot>
            </configuration>
            <goals>
              <goal>run</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>

</project>
34 changes: 34 additions & 0 deletions parquet-protobuf/src/main/java/parquet/proto/ProtoParquetInputFormat.java
@@ -0,0 +1,34 @@
/**
* Copyright 2013 Lukas Nalezenec
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package parquet.proto;

import com.google.protobuf.MessageOrBuilder;
import org.apache.hadoop.conf.Configuration;
import parquet.hadoop.ParquetInputFormat;

/**
 * A Hadoop {@link org.apache.hadoop.mapreduce.InputFormat} for Parquet files
 * containing protocol buffer messages.
 */
public class ProtoParquetInputFormat<T extends MessageOrBuilder> extends ParquetInputFormat<T> {
  public ProtoParquetInputFormat() {
    super(ProtoReadSupport.class);
  }

  public static void setRequestedProjection(Configuration configuration, String requestedProjection) {
    ProtoReadSupport.setRequestedProjection(configuration, requestedProjection);
  }

}
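
Setup on the read side mirrors the output format's usage example; a minimal sketch, where the projection string is an assumption (a Parquet message schema naming only the columns to keep) and TestProtobuf.Document plus the path are hypothetical:

final Configuration conf = new Configuration();
final Job job = new Job(conf, "Parquet reading job");
job.setInputFormatClass(ProtoParquetInputFormat.class);
ProtoParquetInputFormat.setInputPaths(job, new Path("/tmp/documents.parquet"));
// Optional: materialize only a subset of the columns.
ProtoParquetInputFormat.setRequestedProjection(
    ContextUtil.getConfiguration(job), "message Document { optional int64 id; }");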
54 changes: 54 additions & 0 deletions parquet-protobuf/src/main/java/parquet/proto/ProtoParquetOutputFormat.java
@@ -0,0 +1,54 @@
/**
* Copyright 2013 Lukas Nalezenec
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package parquet.proto;

import com.google.protobuf.Message;
import com.google.protobuf.MessageOrBuilder;
import org.apache.hadoop.mapreduce.Job;
import parquet.hadoop.ParquetOutputFormat;
import parquet.hadoop.util.ContextUtil;

/**
 * A Hadoop {@link org.apache.hadoop.mapreduce.OutputFormat} for writing protocol buffer messages to Parquet files.
 * <p/>
 * Usage:
 * <p/>
 * <pre>
 * {@code
 * final Job job = new Job(conf, "Parquet writing job");
 * job.setOutputFormatClass(ProtoParquetOutputFormat.class);
 * ProtoParquetOutputFormat.setOutputPath(job, parquetPath);
 * ProtoParquetOutputFormat.setProtobufferClass(job, YourProtobuffer.class);
 * }
 * </pre>
 *
 * @author Lukas Nalezenec
 */
public class ProtoParquetOutputFormat<T extends MessageOrBuilder> extends ParquetOutputFormat<T> {

  public static void setProtobufferClass(Job job, Class<? extends Message> protoClass) {
    ProtoWriteSupport.setSchema(ContextUtil.getConfiguration(job), protoClass);
  }

  public ProtoParquetOutputFormat(Class<? extends Message> msg) {
    super(new ProtoWriteSupport(msg));
  }

  public ProtoParquetOutputFormat() {
    super(new ProtoWriteSupport());
  }

}
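
The underlying record writer takes a null key and the message as the value, so a mapper feeding this output format emits messages directly; a sketch assuming the hypothetical generated class TestProtobuf.Document:

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical mapper: parses one long id per input line and writes it
// out as a protobuf message; the output format ignores the (null) key.
public class DocumentWritingMapper extends Mapper<LongWritable, Text, Void, TestProtobuf.Document> {
  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws java.io.IOException, InterruptedException {
    TestProtobuf.Document doc = TestProtobuf.Document.newBuilder()
        .setId(Long.parseLong(value.toString().trim()))
        .build();
    context.write(null, doc);
  }
}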
40 changes: 40 additions & 0 deletions parquet-protobuf/src/main/java/parquet/proto/ProtoParquetReader.java
@@ -0,0 +1,40 @@
/**
* Copyright 2012 Twitter, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package parquet.proto;

import com.google.protobuf.MessageOrBuilder;
import org.apache.hadoop.fs.Path;
import parquet.filter.UnboundRecordFilter;
import parquet.hadoop.ParquetReader;
import parquet.hadoop.api.ReadSupport;

import java.io.IOException;

/**
 * Read protocol buffer records from a Parquet file.
 */
public class ProtoParquetReader<T extends MessageOrBuilder> extends ParquetReader<T> {

  public ProtoParquetReader(Path file) throws IOException {
    super(file, (ReadSupport<T>) new ProtoReadSupport());
  }

  public ProtoParquetReader(Path file, UnboundRecordFilter recordFilter) throws IOException {
    super(file, (ReadSupport<T>) new ProtoReadSupport(), recordFilter);
  }

  // TODO: add an option to override the protobuf class stored in the file
}
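
A minimal read loop over a file written by ProtoParquetWriter, assuming the hypothetical generated class TestProtobuf.Document and assuming the read support materializes builders:

ProtoParquetReader<TestProtobuf.Document.Builder> reader =
    new ProtoParquetReader<TestProtobuf.Document.Builder>(new Path("/tmp/documents.parquet"));
TestProtobuf.Document.Builder record;
while ((record = reader.read()) != null) {
  System.out.println(record.build()); // build() yields the immutable message
}
reader.close();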
78 changes: 78 additions & 0 deletions parquet-protobuf/src/main/java/parquet/proto/ProtoParquetWriter.java
@@ -0,0 +1,78 @@
/**
* Copyright 2012 Twitter, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package parquet.proto;

import com.google.protobuf.Message;
import com.google.protobuf.MessageOrBuilder;
import org.apache.hadoop.fs.Path;
import parquet.hadoop.ParquetWriter;
import parquet.hadoop.api.WriteSupport;
import parquet.hadoop.metadata.CompressionCodecName;

import java.io.IOException;

/**
 * Write protocol buffer records to a Parquet file.
 */
public class ProtoParquetWriter<T extends MessageOrBuilder> extends ParquetWriter<T> {

  /**
   * Create a new {@link ProtoParquetWriter}.
   *
   * @param file                 The file name to write to.
   * @param protoMessage         The protocol buffer class of the records to write.
   * @param compressionCodecName Compression codec to use, or CompressionCodecName.UNCOMPRESSED
   * @param blockSize            HDFS block size
   * @param pageSize             See Parquet write-up. Blocks are subdivided into pages for alignment and other purposes.
   * @throws IOException
   */
  public ProtoParquetWriter(Path file, Class<? extends Message> protoMessage,
                            CompressionCodecName compressionCodecName, int blockSize,
                            int pageSize) throws IOException {
    super(file, (WriteSupport<T>) new ProtoWriteSupport(protoMessage),
        compressionCodecName, blockSize, pageSize);
  }

  /**
   * Create a new {@link ProtoParquetWriter}.
   *
   * @param file                 The file name to write to.
   * @param protoMessage         The protocol buffer class of the records to write.
   * @param compressionCodecName Compression codec to use, or CompressionCodecName.UNCOMPRESSED
   * @param blockSize            HDFS block size
   * @param pageSize             See Parquet write-up. Blocks are subdivided into pages for alignment and other purposes.
   * @param enableDictionary     Whether to use a dictionary to compress columns.
   * @throws IOException
   */
  public ProtoParquetWriter(Path file, Class<? extends Message> protoMessage,
                            CompressionCodecName compressionCodecName, int blockSize,
                            int pageSize, boolean enableDictionary) throws IOException {
    super(file, (WriteSupport<T>) new ProtoWriteSupport(protoMessage),
        compressionCodecName, blockSize, pageSize, enableDictionary, false);
  }

  /**
   * Create a new {@link ProtoParquetWriter}. The default block size is 50 MB. The default
   * page size is 1 MB. Default compression is no compression. (Inherited from {@link ParquetWriter})
   *
   * @param file         The file name to write to.
   * @param protoMessage The protocol buffer class of the records to write.
   * @throws IOException
   */
  public ProtoParquetWriter(Path file, Class<? extends Message> protoMessage) throws IOException {
    this(file, protoMessage, CompressionCodecName.UNCOMPRESSED,
        DEFAULT_BLOCK_SIZE, DEFAULT_PAGE_SIZE);
  }

}
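
A sketch of the fully parameterized constructor; the codec and sizes below are illustrative choices, not requirements, and TestProtobuf.Document is the same hypothetical generated class:

ProtoParquetWriter<TestProtobuf.Document> writer =
    new ProtoParquetWriter<TestProtobuf.Document>(
        new Path("/tmp/documents.parquet"), TestProtobuf.Document.class,
        CompressionCodecName.GZIP, // or SNAPPY, LZO, UNCOMPRESSED
        50 * 1024 * 1024,          // block size: the 50 MB default
        1024 * 1024);              // page size: the 1 MB default
writer.write(TestProtobuf.Document.newBuilder().setId(1).build());
writer.close();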