diff --git a/CHANGES.md b/CHANGES.md index cd34e0a0b0..6f2894dce9 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,63 @@ # Parquet # +### Version 1.2.10 ### +* ISSUE [247](https://github.com/Parquet/parquet-mr/pull/247): fix bug: when field index is greater than zero +* ISSUE [244](https://github.com/Parquet/parquet-mr/pull/244): Feature/error handler +* ISSUE [187](https://github.com/Parquet/parquet-mr/pull/187): Plumb OriginalType +* ISSUE [245](https://github.com/Parquet/parquet-mr/pull/245): integrate parquet format 2.0 + +### Version 1.2.9 ### +* ISSUE [242](https://github.com/Parquet/parquet-mr/pull/242): upgrade elephant-bird version to 4.3 +* ISSUE [240](https://github.com/Parquet/parquet-mr/pull/240): fix loader cache +* ISSUE [233](https://github.com/Parquet/parquet-mr/pull/233): use latest stable release of cascading: 2.5.1 +* ISSUE [241](https://github.com/Parquet/parquet-mr/pull/241): Update reference to 0.10 in Hive012Binding javadoc +* ISSUE [239](https://github.com/Parquet/parquet-mr/pull/239): Fix hive map and array inspectors with null containers +* ISSUE [234](https://github.com/Parquet/parquet-mr/pull/234): optimize chunk scan; fix compressed size +* ISSUE [237](https://github.com/Parquet/parquet-mr/pull/237): Handle codec not found +* ISSUE [238](https://github.com/Parquet/parquet-mr/pull/238): fix pom version caused by bad merge +* ISSUE [235](https://github.com/Parquet/parquet-mr/pull/235): Not write pig meta data only when pig is not available +* ISSUE [227](https://github.com/Parquet/parquet-mr/pull/227): Breaks parquet-hive up into several submodules, creating infrastructure ... +* ISSUE [229](https://github.com/Parquet/parquet-mr/pull/229): add changelog tool +* ISSUE [236](https://github.com/Parquet/parquet-mr/pull/236): Make cascading a provided dependency + +### Version 1.2.8 ### +* ISSUE 228: enable globbing files for parquetTupleScheme, refactor unit tests and rem... 
+* ISSUE 224: Changing read and write methods in ParquetInputSplit so that they can de... + +### Version 1.2.8 ### +* ISSUE 228: enable globbing files for parquetTupleScheme, refactor unit tests and rem... +* ISSUE 224: Changing read and write methods in ParquetInputSplit so that they can de... + +### Version 1.2.7 ### +* ISSUE 223: refactor encoded values changes and test that resetDictionary works +* ISSUE 222: fix bug: set raw data size to 0 after reset + +### Version 1.2.6 ### +* ISSUE 221: make pig, hadoop and log4j jars provided +* ISSUE 220: parquet-hive should ship an uber jar +* ISSUE 213: group parquet-format version in one property +* ISSUE 215: Fix Binary.equals(). +* ISSUE 210: ParquetWriter ignores enable dictionary and validating flags. +* ISSUE 202: Fix requested schema when recreating splits in hive +* ISSUE 208: Improve dic fall back +* ISSUE 207: Fix offset +* ISSUE 206: Create a "Powered by" page + +### Version 1.2.5 ### +* ISSUE 204: ParquetLoader.inputFormatCache as WeakHashMap +* ISSUE 203: add null check for EnumWriteProtocol +* ISSUE 205: use cascading 2.2.0 +* ISSUE 199: simplify TupleWriteSupport constructor +* ISSUE 164: Dictionary changes +* ISSUE 196: Fixes to the Hive SerDe +* ISSUE 197: RLE decoder reading past the end of the stream +* ISSUE 188: Added ability to define arbitrary predicate functions +* ISSUE 194: refactor serde to remove some unnecessary boxing and include dictionary awareness +* ISSUE 190: NPE in DictionaryValuesWriter. + +### Version 1.2.4 ### +* ISSUE 191: Add compatibility checker for ThriftStruct to check for backward compatibility of two thrift structs + ### Version 1.2.3 ### * ISSUE 186: add parquet-pig-bundle * ISSUE 184: Update ParquetReader to take Configuration as a constructor argument. diff --git a/PoweredBy.md b/PoweredBy.md new file mode 100644 index 0000000000..f9b5ac5293 --- /dev/null +++ b/PoweredBy.md @@ -0,0 +1,25 @@ +Who's using Parquet? +====== +(in alphabetical order) + +## Cloudera Impala + +

We shipped Impala 0.7 (http://t.co/wxuV0wYShk) - a whole ton of great new features including DDL, Parquet support and partitioned joins!

— Henry Robinson (@HenryR) April 16, 2013
+ +## Criteo + +

Parquet: Efficient Columnar Storage for Apache #Hadoop http://t.co/He1xyv6NC3 via @cloudera - #Criteo R&D very happy to contribute!

— Julien SIMON (@julsimon) March 14, 2013
+ +## Salesforce.com + +

"@ParquetFormat at Salesforce.com" http://t.co/IFskqF0FP3 via @cloudera

— Twitter Open Source (@TwitterOSS) October 22, 2013
+ +## Stripe + +

We're moving basically all of our archival data at @stripe into @ParquetFormat, and I'm super pleased with how well it's working out.

— Avi Bryant (@avibryant) October 18, 2013
+ +## Twitter + +

Converting some data to Parquet on the Twitter clusters. I'm seeing a 28% space saving thanks to the compressibility of the column layout.

— Julien Le Dem (@J_) March 24, 2013
+ + diff --git a/README.md b/README.md index 22b1c1b425..8a281d5499 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Parquet is a very active project, and new features are being added quickly; belo Hive integrationYES (28)1.0 Pig integrationYES1.0 Cascading integrationYES1.0 + Crunch integrationYES (CRUNCH-277)1.0 Impala integrationYES (non-nested)1.0 Java Map/Reduce APIYES1.0 Native Avro supportYES1.0 diff --git a/changelog.sh b/changelog.sh new file mode 100755 index 0000000000..e7e5b1f960 --- /dev/null +++ b/changelog.sh @@ -0,0 +1,36 @@ +echo "github username:" >&2 +read username >&2 +echo "github password:" >&2 +read -s password >&2 + +curl -f -u $username:$password -s "https://api.github.com" > /dev/null +if [ $? == 0 ] +then + echo "login successful" >&2 +else + echo "login failed" >&2 + curl -u $username:$password -s "https://api.github.com" + exit 1 +fi + +echo "# Parquet #" + +git log | grep -E "Merge pull request|prepare release" | while read l +do + release=`echo $l | grep "\[maven-release-plugin\] prepare release" | cut -d "-" -f 4` + PR=`echo $l| grep -E -o "Merge pull request #[^ ]*" | cut -d "#" -f 2` +# echo $l + if [ -n "$release" ] + then + echo + echo "### Version $release ###" + fi + if [ -n "$PR" ] + then + JSON=`curl -u $username:$password -s https://api.github.com/repos/Parquet/parquet-mr/pulls/$PR | tr "\n" " "` + DESC_RAW=$(echo $JSON | grep -Po '"title":.*?[^\\]",' | cut -d "\"" -f 4- | head -n 1 | sed -e "s/\\\\//g") + DESC=$(echo ${DESC_RAW%\",}) + echo "* ISSUE [$PR](https://github.com/Parquet/parquet-mr/pull/$PR): ${DESC}" + fi +done + diff --git a/parquet-avro/pom.xml b/parquet-avro/pom.xml index 33258e5b59..ea3d5effa9 100644 --- a/parquet-avro/pom.xml +++ b/parquet-avro/pom.xml @@ -3,7 +3,7 @@ com.twitter parquet ../pom.xml - 1.2.4-SNAPSHOT + 1.2.11-SNAPSHOT 4.0.0 @@ -32,7 +32,7 @@ com.twitter parquet-format - 1.0.0 + ${parquet.format.version} org.apache.avro @@ -43,6 +43,7 @@ org.apache.hadoop hadoop-client 
${hadoop.version} + provided com.google.guava diff --git a/parquet-cascading/pom.xml b/parquet-cascading/pom.xml index 38e03fc28d..de03a11387 100644 --- a/parquet-cascading/pom.xml +++ b/parquet-cascading/pom.xml @@ -3,7 +3,7 @@ com.twitter parquet ../pom.xml - 1.2.4-SNAPSHOT + 1.2.11-SNAPSHOT 4.0.0 @@ -41,11 +41,13 @@ org.apache.hadoop hadoop-client ${hadoop.version} + provided log4j log4j - 1.2.17 + ${log4j.version} + provided com.twitter @@ -63,7 +65,8 @@ cascading cascading-hadoop - 2.1.5 + ${cascading.version} + provided diff --git a/parquet-cascading/src/main/java/parquet/cascading/ParquetTupleScheme.java b/parquet-cascading/src/main/java/parquet/cascading/ParquetTupleScheme.java index d75b6e76d7..5d229c72d7 100644 --- a/parquet-cascading/src/main/java/parquet/cascading/ParquetTupleScheme.java +++ b/parquet-cascading/src/main/java/parquet/cascading/ParquetTupleScheme.java @@ -93,7 +93,8 @@ private MessageType readSchema(FlowProcess flowProcess, Tap tap) { else hfs = (Hfs) tap; - List