Skip to content

Commit

Permalink
PARQUET-1026: allow unsigned binary stats when min == max
Browse files Browse the repository at this point in the history
When min equals max, this is a special case in which unsigned stats are actually the same as signed stats, since there is only one value.
This is useful when the data is partitioned by that column and there's only one value in the file.
Drill for example takes advantage of this.

Author: Julien Le Dem <[email protected]>

Closes apache#416 from julienledem/min_eq_max and squashes the following commits:

1d71624 [Julien Le Dem] revert package import ordering change
47d89fc [Julien Le Dem] allow unsigned binary stats when min == max
  • Loading branch information
julienledem committed Jun 9, 2017
1 parent 2d3203b commit 352b906
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -340,13 +340,16 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist
org.apache.parquet.column.statistics.Statistics stats = org.apache.parquet.column.statistics.Statistics.getStatsBasedOnType(type);
// If there was no statistics written to the footer, create an empty Statistics object and return

boolean isSet = statistics != null && statistics.isSetMax() && statistics.isSetMin();
boolean maxEqualsMin = isSet ? Arrays.equals(statistics.getMin(), statistics.getMax()) : false;
boolean sortOrdersMatch = SortOrder.SIGNED == typeSortOrder;
// NOTE: See docs in CorruptStatistics for explanation of why this check is needed
// The sort order is checked to avoid returning min/max stats that are not
// valid with the type's sort order. Currently, all stats are aggregated
// using a signed ordering, which isn't valid for strings or unsigned ints.
if (statistics != null && !CorruptStatistics.shouldIgnoreStatistics(createdBy, type) &&
SortOrder.SIGNED == typeSortOrder) {
if (statistics.isSetMax() && statistics.isSetMin()) {
( sortOrdersMatch || maxEqualsMin)) {
if (isSet) {
stats.setMinMaxFromBytes(statistics.min.array(), statistics.max.array());
}
stats.setNumNulls(statistics.null_count);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -534,7 +534,27 @@ public void testIgnoreStatsWithSignedSortOrder() {
Types.required(PrimitiveTypeName.BINARY)
.as(OriginalType.UTF8).named("b"));

Assert.assertTrue("Stats should be empty", convertedStats.isEmpty());
Assert.assertTrue("Stats should be empty: " + convertedStats, convertedStats.isEmpty());
}

@Test
public void testStillUseStatsWithSignedSortOrderIfSingleValue() {
  // Fixture: binary statistics that have only ever seen the value "A",
  // interleaved with three nulls, so the recorded min and max are equal.
  final BinaryStatistics singleValueStats = new BinaryStatistics();
  singleValueStats.incrementNumNulls();
  singleValueStats.updateStats(Binary.fromString("A"));
  singleValueStats.incrementNumNulls();
  singleValueStats.updateStats(Binary.fromString("A"));
  singleValueStats.incrementNumNulls();

  // Round-trip the statistics through the converter for a required
  // BINARY column annotated as UTF8 and named "b".
  final ParquetMetadataConverter metadataConverter = new ParquetMetadataConverter();
  Statistics roundTripped = metadataConverter.fromParquetStatistics(
      Version.FULL_VERSION,
      ParquetMetadataConverter.toParquetStatistics(singleValueStats),
      Types.required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("b"));

  // With a single distinct value the min/max must survive the conversion
  // and must still be byte-for-byte identical to each other.
  Assert.assertFalse("Stats should not be empty: " + roundTripped, roundTripped.isEmpty());
  Assert.assertArrayEquals(
      "min == max: " + roundTripped,
      roundTripped.getMaxBytes(),
      roundTripped.getMinBytes());
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -595,17 +595,18 @@ public void testWriteReadStatisticsAllNulls() throws Exception {
GroupWriteSupport.setSchema(schema, configuration);

ParquetWriter<Group> writer = new ParquetWriter<Group>(path, configuration, new GroupWriteSupport());

Group r1 = new SimpleGroup(schema);
writer.write(r1);
writer.close();

ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);

// assert the statistics object is not empty
assertTrue((readFooter.getBlocks().get(0).getColumns().get(0).getStatistics().isEmpty()) == false);
org.apache.parquet.column.statistics.Statistics stats = readFooter.getBlocks().get(0).getColumns().get(0).getStatistics();
assertFalse("is empty: " + stats, stats.isEmpty());
// assert the number of nulls are correct for the first block
assertEquals(1, (readFooter.getBlocks().get(0).getColumns().get(0).getStatistics().getNumNulls()));
assertEquals("nulls: " + stats, 1, stats.getNumNulls());
}

private void validateFooters(final List<Footer> metadata) {
Expand Down

0 comments on commit 352b906

Please sign in to comment.