Commit

merge

lukasnalezenec committed Dec 26, 2013
2 parents 82b889c + 94b2ec0 commit a717bbf
Showing 30 changed files with 2,232 additions and 1 deletion.
3 changes: 2 additions & 1 deletion .gitignore
@@ -13,4 +13,5 @@ target
 *.orig
 *.rej
 dependency-reduced-pom.xml
-.idea/*
+.idea/*
+target/
4 changes: 4 additions & 0 deletions parquet-protobuf/README.md
@@ -0,0 +1,4 @@
parquet-protobuf
================

Protocol Buffers support for the Parquet columnar format
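
A minimal round-trip sketch, assuming a generated protobuf class TestProtobuf.Document with an int64 id field (hypothetical names), and assuming the read support hands back message builders:

    Path path = new Path("/tmp/documents.parquet");

    // Write: the writer derives the Parquet schema from the protobuf class.
    ProtoParquetWriter<TestProtobuf.Document> writer =
        new ProtoParquetWriter<TestProtobuf.Document>(path, TestProtobuf.Document.class);
    writer.write(TestProtobuf.Document.newBuilder().setId(1).build());
    writer.close();

    // Read back: build() turns the returned builder into a message.
    ProtoParquetReader<TestProtobuf.Document.Builder> reader =
        new ProtoParquetReader<TestProtobuf.Document.Builder>(path);
    System.out.println(reader.read().build());
    reader.close();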
157 changes: 157 additions & 0 deletions parquet-protobuf/pom.xml
@@ -0,0 +1,157 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <parent>
    <groupId>com.twitter</groupId>
    <artifactId>parquet</artifactId>
    <relativePath>../pom.xml</relativePath>
    <version>1.2.10-SNAPSHOT</version>
  </parent>

  <modelVersion>4.0.0</modelVersion>

  <artifactId>parquet-protobuf</artifactId>
  <packaging>jar</packaging>

  <properties>
    <elephant-bird.version>3.0.8</elephant-bird.version>
  </properties>

  <name>Parquet Protobuf</name>
  <url>https://github.com/lukasnalezenec/parquet-protobuf.git</url>

  <dependencies>
    <dependency>
      <groupId>com.google.protobuf</groupId>
      <artifactId>protobuf-java</artifactId>
      <version>2.4.1</version>
    </dependency>
    <dependency>
      <groupId>com.twitter</groupId>
      <artifactId>parquet-hadoop</artifactId>
      <version>${project.version}</version>
    </dependency>
    <dependency>
      <groupId>com.twitter.elephantbird</groupId>
      <artifactId>elephant-bird-core</artifactId>
      <version>${elephant-bird.version}</version>
      <exclusions>
        <exclusion>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-core</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
  </dependencies>

  <developers>
    <developer>
      <id>lukasnalezenec</id>
      <name>Lukas Nalezenec</name>
    </developer>
  </developers>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
        <version>${maven-jar-plugin.version}</version>
        <configuration>
          <archive>
            <manifestEntries>
              <git-SHA-1>${buildNumber}</git-SHA-1>
            </manifestEntries>
          </archive>
        </configuration>
        <executions>
          <execution>
            <goals>
              <goal>test-jar</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <artifactSet>
                <includes>
                  <include>org.codehaus.jackson:jackson-mapper-asl</include>
                  <include>org.codehaus.jackson:jackson-core-asl</include>
                </includes>
              </artifactSet>
              <relocations>
                <relocation>
                  <pattern>org.codehaus.jackson</pattern>
                  <shadedPattern>parquet.org.codehaus.jackson</shadedPattern>
                </relocation>
              </relocations>
            </configuration>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <!-- Ensure that the generated test classes are available during test compilation but are not included in the jar -->
        <groupId>org.codehaus.mojo</groupId>
        <artifactId>build-helper-maven-plugin</artifactId>
        <version>1.8</version>
        <executions>
          <execution>
            <id>add-test-sources</id>
            <phase>generate-test-sources</phase>
            <goals>
              <goal>add-test-source</goal>
            </goals>
            <configuration>
              <sources>
                <source>${project.build.directory}/generated-test-sources</source>
              </sources>
            </configuration>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <artifactId>maven-antrun-plugin</artifactId>
        <executions>
          <execution>
            <id>generate-sources</id>
            <phase>generate-test-sources</phase>
            <configuration>
              <tasks>
                <mkdir dir="${project.build.directory}/generated-test-sources"/>
                <mkdir dir="${project.build.directory}/generated-test-sources/java"/>
                <exec failonerror="true" executable="protoc">
                  <arg value="--java_out=${project.build.directory}/generated-test-sources/java"/>
                  <arg value="src/test/resources/TestProtobuf.proto"/>
                  <arg value="-I."/>
                </exec>
              </tasks>
              <sourceRoot>src/main/java</sourceRoot>
              <sourceRoot>target/generated-sources/java</sourceRoot>
            </configuration>
            <goals>
              <goal>run</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>

</project>
34 changes: 34 additions & 0 deletions parquet-protobuf/src/main/java/parquet/proto/ProtoParquetInputFormat.java
@@ -0,0 +1,34 @@
/**
* Copyright 2013 Lukas Nalezenec
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package parquet.proto;

import com.google.protobuf.MessageOrBuilder;
import org.apache.hadoop.conf.Configuration;
import parquet.hadoop.ParquetInputFormat;

/**
 * A Hadoop {@link org.apache.hadoop.mapreduce.InputFormat} for Parquet files
 * containing protocol buffer messages.
 */
public class ProtoParquetInputFormat<T extends MessageOrBuilder> extends ParquetInputFormat<T> {
  public ProtoParquetInputFormat() {
    super(ProtoReadSupport.class);
  }

  public static void setRequestedProjection(Configuration configuration, String requestedProjection) {
    ProtoReadSupport.setRequestedProjection(configuration, requestedProjection);
  }

}
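
Setup on the read side mirrors the output format's usage example; a minimal sketch, where the projection string is an assumption (a Parquet message schema naming only the columns to keep) and TestProtobuf.Document plus the path are hypothetical:

final Configuration conf = new Configuration();
final Job job = new Job(conf, "Parquet reading job");
job.setInputFormatClass(ProtoParquetInputFormat.class);
ProtoParquetInputFormat.setInputPaths(job, new Path("/tmp/documents.parquet"));
// Optional: materialize only a subset of the columns.
ProtoParquetInputFormat.setRequestedProjection(
    ContextUtil.getConfiguration(job), "message Document { optional int64 id; }");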
54 changes: 54 additions & 0 deletions parquet-protobuf/src/main/java/parquet/proto/ProtoParquetOutputFormat.java
@@ -0,0 +1,54 @@
/**
* Copyright 2013 Lukas Nalezenec
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package parquet.proto;

import com.google.protobuf.Message;
import com.google.protobuf.MessageOrBuilder;
import org.apache.hadoop.mapreduce.Job;
import parquet.hadoop.ParquetOutputFormat;
import parquet.hadoop.util.ContextUtil;

/**
 * A Hadoop {@link org.apache.hadoop.mapreduce.OutputFormat} for writing protocol buffer messages to Parquet files.
 * <p/>
 * Usage:
 * <p/>
 * <pre>
 * {@code
 * final Job job = new Job(conf, "Parquet writing job");
 * job.setOutputFormatClass(ProtoParquetOutputFormat.class);
 * ProtoParquetOutputFormat.setOutputPath(job, parquetPath);
 * ProtoParquetOutputFormat.setProtobufferClass(job, YourProtobuffer.class);
 * }
 * </pre>
 *
 * @author Lukas Nalezenec
 */
public class ProtoParquetOutputFormat<T extends MessageOrBuilder> extends ParquetOutputFormat<T> {

  public static void setProtobufferClass(Job job, Class<? extends Message> protoClass) {
    ProtoWriteSupport.setSchema(ContextUtil.getConfiguration(job), protoClass);
  }

  public ProtoParquetOutputFormat(Class<? extends Message> msg) {
    super(new ProtoWriteSupport(msg));
  }

  public ProtoParquetOutputFormat() {
    super(new ProtoWriteSupport());
  }

}
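
The underlying record writer takes a null key and the message as the value, so a mapper feeding this output format emits messages directly; a sketch assuming the hypothetical generated class TestProtobuf.Document:

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical mapper: parses one long id per input line and writes it
// out as a protobuf message; the output format ignores the (null) key.
public class DocumentWritingMapper extends Mapper<LongWritable, Text, Void, TestProtobuf.Document> {
  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws java.io.IOException, InterruptedException {
    TestProtobuf.Document doc = TestProtobuf.Document.newBuilder()
        .setId(Long.parseLong(value.toString().trim()))
        .build();
    context.write(null, doc);
  }
}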
40 changes: 40 additions & 0 deletions parquet-protobuf/src/main/java/parquet/proto/ProtoParquetReader.java
@@ -0,0 +1,40 @@
/**
* Copyright 2012 Twitter, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package parquet.proto;

import com.google.protobuf.MessageOrBuilder;
import org.apache.hadoop.fs.Path;
import parquet.filter.UnboundRecordFilter;
import parquet.hadoop.ParquetReader;
import parquet.hadoop.api.ReadSupport;

import java.io.IOException;

/**
 * Read protocol buffer records from a Parquet file.
 */
public class ProtoParquetReader<T extends MessageOrBuilder> extends ParquetReader<T> {

  public ProtoParquetReader(Path file) throws IOException {
    super(file, (ReadSupport<T>) new ProtoReadSupport());
  }

  public ProtoParquetReader(Path file, UnboundRecordFilter recordFilter) throws IOException {
    super(file, (ReadSupport<T>) new ProtoReadSupport(), recordFilter);
  }

  // TODO: add an option to override the protobuf class stored in the file
}
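
A minimal read loop over a file written by ProtoParquetWriter, assuming the hypothetical generated class TestProtobuf.Document and assuming the read support materializes builders:

ProtoParquetReader<TestProtobuf.Document.Builder> reader =
    new ProtoParquetReader<TestProtobuf.Document.Builder>(new Path("/tmp/documents.parquet"));
TestProtobuf.Document.Builder record;
while ((record = reader.read()) != null) {
  System.out.println(record.build()); // build() yields the immutable message
}
reader.close();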
78 changes: 78 additions & 0 deletions parquet-protobuf/src/main/java/parquet/proto/ProtoParquetWriter.java
@@ -0,0 +1,78 @@
/**
* Copyright 2012 Twitter, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package parquet.proto;

import com.google.protobuf.Message;
import com.google.protobuf.MessageOrBuilder;
import org.apache.hadoop.fs.Path;
import parquet.hadoop.ParquetWriter;
import parquet.hadoop.api.WriteSupport;
import parquet.hadoop.metadata.CompressionCodecName;

import java.io.IOException;

/**
 * Write protocol buffer records to a Parquet file.
 */
public class ProtoParquetWriter<T extends MessageOrBuilder> extends ParquetWriter<T> {

  /**
   * Create a new {@link ProtoParquetWriter}.
   *
   * @param file                 The file name to write to.
   * @param protoMessage         The protocol buffer class of the records to write.
   * @param compressionCodecName Compression codec to use, or CompressionCodecName.UNCOMPRESSED
   * @param blockSize            HDFS block size
   * @param pageSize             See Parquet write-up. Blocks are subdivided into pages for alignment and other purposes.
   * @throws IOException
   */
  public ProtoParquetWriter(Path file, Class<? extends Message> protoMessage,
                            CompressionCodecName compressionCodecName, int blockSize,
                            int pageSize) throws IOException {
    super(file, (WriteSupport<T>) new ProtoWriteSupport(protoMessage),
        compressionCodecName, blockSize, pageSize);
  }

  /**
   * Create a new {@link ProtoParquetWriter}.
   *
   * @param file                 The file name to write to.
   * @param protoMessage         The protocol buffer class of the records to write.
   * @param compressionCodecName Compression codec to use, or CompressionCodecName.UNCOMPRESSED
   * @param blockSize            HDFS block size
   * @param pageSize             See Parquet write-up. Blocks are subdivided into pages for alignment and other purposes.
   * @param enableDictionary     Whether to use a dictionary to compress columns.
   * @throws IOException
   */
  public ProtoParquetWriter(Path file, Class<? extends Message> protoMessage,
                            CompressionCodecName compressionCodecName, int blockSize,
                            int pageSize, boolean enableDictionary) throws IOException {
    super(file, (WriteSupport<T>) new ProtoWriteSupport(protoMessage),
        compressionCodecName, blockSize, pageSize, enableDictionary, false);
  }

  /**
   * Create a new {@link ProtoParquetWriter}. The default block size is 50 MB. The default
   * page size is 1 MB. Default compression is no compression. (Inherited from {@link ParquetWriter})
   *
   * @param file         The file name to write to.
   * @param protoMessage The protocol buffer class of the records to write.
   * @throws IOException
   */
  public ProtoParquetWriter(Path file, Class<? extends Message> protoMessage) throws IOException {
    this(file, protoMessage, CompressionCodecName.UNCOMPRESSED,
        DEFAULT_BLOCK_SIZE, DEFAULT_PAGE_SIZE);
  }

}
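
A sketch of the fully parameterized constructor; the codec and sizes below are illustrative choices, not requirements, and TestProtobuf.Document is the same hypothetical generated class:

ProtoParquetWriter<TestProtobuf.Document> writer =
    new ProtoParquetWriter<TestProtobuf.Document>(
        new Path("/tmp/documents.parquet"), TestProtobuf.Document.class,
        CompressionCodecName.GZIP, // or SNAPPY, LZO, UNCOMPRESSED
        50 * 1024 * 1024,          // block size: the 50 MB default
        1024 * 1024);              // page size: the 1 MB default
writer.write(TestProtobuf.Document.newBuilder().setId(1).build());
writer.close();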