forked from apache/parquet-java
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
30 changed files
with
2,232 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,4 +13,5 @@ target | |
*.orig | ||
*.rej | ||
dependency-reduced-pom.xml | ||
.idea/* | ||
.idea/* | ||
target/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
parquet-protobuf | ||
================ | ||
|
||
protobuffer support for Parquet columnar format |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,157 @@ | ||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
<parent> | ||
<groupId>com.twitter</groupId> | ||
<artifactId>parquet</artifactId> | ||
<relativePath>../pom.xml</relativePath> | ||
<version>1.2.10-SNAPSHOT</version> | ||
</parent> | ||
|
||
<modelVersion>4.0.0</modelVersion> | ||
|
||
<artifactId>parquet-protobuf</artifactId> | ||
<packaging>jar</packaging> | ||
|
||
<properties> | ||
<elephant-bird.version>3.0.8</elephant-bird.version> | ||
</properties> | ||
|
||
|
||
<name>Parquet Protobuf</name> | ||
<url>https://github.com/lukasnalezenec/parquet-protobuf.git</url> | ||
|
||
<dependencies> | ||
<dependency> | ||
<groupId>com.google.protobuf</groupId> | ||
<artifactId>protobuf-java</artifactId> | ||
<version>2.4.1</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>com.twitter</groupId> | ||
<artifactId>parquet-hadoop</artifactId> | ||
<version>${project.version}</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>com.twitter.elephantbird</groupId> | ||
<artifactId>elephant-bird-core</artifactId> | ||
<version>${elephant-bird.version}</version> | ||
<exclusions> | ||
<exclusion> | ||
<groupId>org.apache.hadoop</groupId> | ||
<artifactId>hadoop-core</artifactId> | ||
</exclusion> | ||
</exclusions> | ||
</dependency> | ||
|
||
<dependency> | ||
<groupId>org.apache.hadoop</groupId> | ||
<artifactId>hadoop-client</artifactId> | ||
<version>${hadoop.version}</version> | ||
</dependency> | ||
|
||
</dependencies> | ||
|
||
<developers> | ||
<developer> | ||
<id>lukasnalezenec</id> | ||
<name>Lukas Nalezenec</name> | ||
</developer> | ||
</developers> | ||
|
||
<build> | ||
<plugins> | ||
<plugin> | ||
<groupId>org.apache.maven.plugins</groupId> | ||
<artifactId>maven-jar-plugin</artifactId> | ||
<version>${maven-jar-plugin.version}</version> | ||
<configuration> | ||
<archive> | ||
<manifestEntries> | ||
<git-SHA-1>${buildNumber}</git-SHA-1> | ||
</manifestEntries> | ||
</archive> | ||
</configuration> | ||
<executions> | ||
<execution> | ||
<goals> | ||
<goal>test-jar</goal> | ||
</goals> | ||
</execution> | ||
</executions> | ||
</plugin> | ||
<plugin> | ||
<groupId>org.apache.maven.plugins</groupId> | ||
<artifactId>maven-shade-plugin</artifactId> | ||
<executions> | ||
<execution> | ||
<phase>package</phase> | ||
<goals> | ||
<goal>shade</goal> | ||
</goals> | ||
<configuration> | ||
<artifactSet> | ||
<includes> | ||
<include>org.codehaus.jackson:jackson-mapper-asl</include> | ||
<include>org.codehaus.jackson:jackson-core-asl</include> | ||
</includes> | ||
</artifactSet> | ||
<relocations> | ||
<relocation> | ||
<pattern>org.codehaus.jackson</pattern> | ||
<shadedPattern>parquet.org.codehaus.jackson</shadedPattern> | ||
</relocation> | ||
</relocations> | ||
</configuration> | ||
</execution> | ||
</executions> | ||
</plugin> | ||
<plugin> | ||
<!-- Ensure that the specific classes are available during test compile but not included in jar --> | ||
<groupId>org.codehaus.mojo</groupId> | ||
<artifactId>build-helper-maven-plugin</artifactId> | ||
<version>1.8</version> | ||
<executions> | ||
<execution> | ||
<id>add-test-sources</id> | ||
<phase>generate-test-sources</phase> | ||
<goals> | ||
<goal>add-test-source</goal> | ||
</goals> | ||
<configuration> | ||
<sources> | ||
<source>${project.build.directory}/generated-test-sources</source> | ||
</sources> | ||
</configuration> | ||
</execution> | ||
</executions> | ||
</plugin> | ||
|
||
<plugin> | ||
<artifactId>maven-antrun-plugin</artifactId> | ||
<executions> | ||
<execution> | ||
<id>generate-sources</id> | ||
<phase>generate-test-sources</phase> | ||
<configuration> | ||
<tasks> | ||
<mkdir dir="${project.build.directory}/generated-test-sources"/> | ||
<mkdir dir="${project.build.directory}/generated-test-sources/java"/> | ||
<exec failonerror="true" executable="protoc"> | ||
<arg value="--java_out=${project.build.directory}/generated-test-sources/java"/> | ||
<arg value="src/test/resources/TestProtobuf.proto"/> | ||
<arg value="-I."/> | ||
</exec> | ||
</tasks> | ||
<sourceRoot>src/main/java</sourceRoot> | ||
<sourceRoot>target/generated-sources/java</sourceRoot> | ||
</configuration> | ||
<goals> | ||
<goal>run</goal> | ||
</goals> | ||
</execution> | ||
</executions> | ||
</plugin> | ||
|
||
</plugins> | ||
</build> | ||
|
||
</project> |
34 changes: 34 additions & 0 deletions
34
parquet-protobuf/src/main/java/parquet/proto/ProtoParquetInputFormat.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
/** | ||
* Copyright 2013 Lukas Nalezenec | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package parquet.proto; | ||
|
||
import com.google.protobuf.MessageOrBuilder; | ||
import org.apache.hadoop.conf.Configuration; | ||
import parquet.hadoop.ParquetInputFormat; | ||
|
||
/** | ||
* A Hadoop {@link org.apache.hadoop.mapreduce.InputFormat} for Parquet files. | ||
*/ | ||
public class ProtoParquetInputFormat<T extends MessageOrBuilder> extends ParquetInputFormat<T> { | ||
public ProtoParquetInputFormat() { | ||
super(ProtoReadSupport.class); | ||
} | ||
|
||
public static void setRequestedProjection(Configuration configuration, String requestedProjection) { | ||
ProtoReadSupport.setRequestedProjection(configuration, requestedProjection); | ||
} | ||
|
||
} |
54 changes: 54 additions & 0 deletions
54
parquet-protobuf/src/main/java/parquet/proto/ProtoParquetOutputFormat.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
/** | ||
* Copyright 2013 Lukas Nalezenec | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package parquet.proto; | ||
|
||
import com.google.protobuf.Message; | ||
import com.google.protobuf.MessageOrBuilder; | ||
import org.apache.hadoop.mapreduce.Job; | ||
import parquet.hadoop.ParquetOutputFormat; | ||
import parquet.hadoop.util.ContextUtil; | ||
|
||
/** | ||
* A Hadoop {@link org.apache.hadoop.mapreduce.OutputFormat} for Protobuffer Parquet files. | ||
* <p/> | ||
* Usage: | ||
* <p/> | ||
* <pre> | ||
* {@code | ||
* final Job job = new Job(conf, "Parquet writing job"); | ||
* job.setOutputFormatClass(ProtoParquetOutputFormat.class); | ||
* ProtoParquetOutputFormat.setOutputPath(job, parquetPath); | ||
* ProtoParquetOutputFormat.setProtobufferClass(job, YourProtobuffer.class); | ||
* } | ||
* </pre> | ||
* | ||
* @author Lukas Nalezenec | ||
*/ | ||
public class ProtoParquetOutputFormat<T extends MessageOrBuilder> extends ParquetOutputFormat<T> { | ||
|
||
public static void setProtobufferClass(Job job, Class<? extends Message> protoClass) { | ||
ProtoWriteSupport.setSchema(ContextUtil.getConfiguration(job), protoClass); | ||
} | ||
|
||
public ProtoParquetOutputFormat(Class<? extends Message> msg) { | ||
super(new ProtoWriteSupport(msg)); | ||
} | ||
|
||
public ProtoParquetOutputFormat() { | ||
super(new ProtoWriteSupport()); | ||
} | ||
|
||
} |
40 changes: 40 additions & 0 deletions
40
parquet-protobuf/src/main/java/parquet/proto/ProtoParquetReader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
/** | ||
* Copyright 2012 Twitter, Inc. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package parquet.proto; | ||
|
||
import com.google.protobuf.MessageOrBuilder; | ||
import org.apache.hadoop.fs.Path; | ||
import parquet.filter.UnboundRecordFilter; | ||
import parquet.hadoop.ParquetReader; | ||
import parquet.hadoop.api.ReadSupport; | ||
|
||
import java.io.IOException; | ||
|
||
/** | ||
* Read Avro records from a Parquet file. | ||
*/ | ||
public class ProtoParquetReader<T extends MessageOrBuilder> extends ParquetReader<T> { | ||
|
||
public ProtoParquetReader(Path file) throws IOException { | ||
super(file, (ReadSupport<T>) new ProtoReadSupport()); | ||
} | ||
|
||
public ProtoParquetReader(Path file, UnboundRecordFilter recordFilter) throws IOException { | ||
super(file, (ReadSupport<T>) new ProtoReadSupport(), recordFilter); | ||
} | ||
|
||
//TODO here should be option to override pb from file | ||
} |
78 changes: 78 additions & 0 deletions
78
parquet-protobuf/src/main/java/parquet/proto/ProtoParquetWriter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
/** | ||
* Copyright 2012 Twitter, Inc. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package parquet.proto; | ||
|
||
import com.google.protobuf.Message; | ||
import com.google.protobuf.MessageOrBuilder; | ||
import org.apache.hadoop.fs.Path; | ||
import parquet.hadoop.ParquetWriter; | ||
import parquet.hadoop.api.WriteSupport; | ||
import parquet.hadoop.metadata.CompressionCodecName; | ||
|
||
import java.io.IOException; | ||
|
||
/** | ||
* Write Protobuffer records to a Parquet file. | ||
*/ | ||
public class ProtoParquetWriter<T extends MessageOrBuilder> extends ParquetWriter<T> { | ||
|
||
/** | ||
* Create a new {@link ProtoParquetWriter}. | ||
* | ||
* @param file | ||
* @param compressionCodecName | ||
* @param blockSize | ||
* @param pageSize | ||
* @throws IOException | ||
*/ | ||
public ProtoParquetWriter(Path file, Class<? extends Message> protoMessage, | ||
CompressionCodecName compressionCodecName, int blockSize, | ||
int pageSize) throws IOException { | ||
super(file, (WriteSupport<T>) new ProtoWriteSupport(protoMessage), | ||
compressionCodecName, blockSize, pageSize); | ||
} | ||
|
||
/** | ||
* Create a new {@link ProtoParquetWriter}. | ||
* | ||
* @param file The file name to write to. | ||
* @param compressionCodecName Compression code to use, or CompressionCodecName.UNCOMPRESSED | ||
* @param blockSize HDFS block size | ||
* @param pageSize See parquet write up. Blocks are subdivided into pages for alignment and other purposes. | ||
* @param enableDictionary Whether to use a dictionary to compress columns. | ||
* @throws IOException | ||
*/ | ||
public ProtoParquetWriter(Path file, Class<? extends Message> protoMessage, | ||
CompressionCodecName compressionCodecName, int blockSize, | ||
int pageSize, boolean enableDictionary) throws IOException { | ||
super(file, (WriteSupport<T>) | ||
new ProtoWriteSupport(protoMessage), | ||
compressionCodecName, blockSize, pageSize, enableDictionary, false); | ||
} | ||
|
||
/** | ||
* Create a new {@link ProtoParquetWriter}. The default block size is 50 MB.The default | ||
* page size is 1 MB. Default compression is no compression. (Inherited from {@link ParquetWriter}) | ||
* | ||
* @param file The file name to write to. | ||
* @throws IOException | ||
*/ | ||
public ProtoParquetWriter(Path file, Class<? extends Message> protoMessage) throws IOException { | ||
this(file, protoMessage, CompressionCodecName.UNCOMPRESSED, | ||
DEFAULT_BLOCK_SIZE, DEFAULT_PAGE_SIZE); | ||
} | ||
|
||
} |
Oops, something went wrong.