Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parquet f32/f64 handle signed zeros in statistics #5048

Merged
merged 1 commit into from
Nov 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 27 additions & 2 deletions parquet/src/column/writer/encoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.

use crate::basic::Encoding;
use crate::basic::{Encoding, Type};
use crate::bloom_filter::Sbbf;
use crate::column::writer::{
compare_greater, fallback_encoding, has_dictionary_support, is_nan, update_max, update_min,
Expand Down Expand Up @@ -308,5 +308,30 @@ where
max = val;
}
}
Some((min.clone(), max.clone()))

// Float/Double statistics have special case for zero.
//
// If computed min is zero, whether negative or positive,
// the spec states that the min should be written as -0.0
// (negative zero)
//
// For max, it has similar logic but will be written as 0.0
// (positive zero)
let min = replace_zero(min, -0.0);
let max = replace_zero(max, 0.0);

Some((min, max))
}

#[inline]
fn replace_zero<T: ParquetValueType>(val: &T, replace: f32) -> T {
match T::PHYSICAL_TYPE {
Type::FLOAT if f32::from_le_bytes(val.as_bytes().try_into().unwrap()) == 0.0 => {
T::try_from_le_slice(&f32::to_le_bytes(replace)).unwrap()
}
Type::DOUBLE if f64::from_le_bytes(val.as_bytes().try_into().unwrap()) == 0.0 => {
T::try_from_le_slice(&f64::to_le_bytes(replace as f64)).unwrap()
}
_ => val.clone(),
}
}
120 changes: 118 additions & 2 deletions parquet/src/column/writer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2111,6 +2111,64 @@ mod tests {
assert!(matches!(stats, Statistics::Float(_)));
}

#[test]
fn test_float_statistics_zero_only() {
let stats = statistics_roundtrip::<FloatType>(&[0.0]);
assert!(stats.has_min_max_set());
assert!(stats.is_min_max_backwards_compatible());
if let Statistics::Float(stats) = stats {
assert_eq!(stats.min(), &-0.0);
assert!(stats.min().is_sign_negative());
assert_eq!(stats.max(), &0.0);
assert!(stats.max().is_sign_positive());
} else {
panic!("expecting Statistics::Float");
}
}

#[test]
fn test_float_statistics_neg_zero_only() {
let stats = statistics_roundtrip::<FloatType>(&[-0.0]);
assert!(stats.has_min_max_set());
assert!(stats.is_min_max_backwards_compatible());
if let Statistics::Float(stats) = stats {
assert_eq!(stats.min(), &-0.0);
assert!(stats.min().is_sign_negative());
assert_eq!(stats.max(), &0.0);
assert!(stats.max().is_sign_positive());
} else {
panic!("expecting Statistics::Float");
}
}

#[test]
fn test_float_statistics_zero_min() {
let stats = statistics_roundtrip::<FloatType>(&[0.0, 1.0, f32::NAN, 2.0]);
assert!(stats.has_min_max_set());
assert!(stats.is_min_max_backwards_compatible());
if let Statistics::Float(stats) = stats {
assert_eq!(stats.min(), &-0.0);
assert!(stats.min().is_sign_negative());
assert_eq!(stats.max(), &2.0);
} else {
panic!("expecting Statistics::Float");
}
}

#[test]
fn test_float_statistics_neg_zero_max() {
let stats = statistics_roundtrip::<FloatType>(&[-0.0, -1.0, f32::NAN, -2.0]);
assert!(stats.has_min_max_set());
assert!(stats.is_min_max_backwards_compatible());
if let Statistics::Float(stats) = stats {
assert_eq!(stats.min(), &-2.0);
assert_eq!(stats.max(), &0.0);
assert!(stats.max().is_sign_positive());
} else {
panic!("expecting Statistics::Float");
}
}

#[test]
fn test_double_statistics_nan_middle() {
let stats = statistics_roundtrip::<DoubleType>(&[1.0, f64::NAN, 2.0]);
Expand All @@ -2120,7 +2178,7 @@ mod tests {
assert_eq!(stats.min(), &1.0);
assert_eq!(stats.max(), &2.0);
} else {
panic!("expecting Statistics::Float");
panic!("expecting Statistics::Double");
}
}

Expand All @@ -2133,7 +2191,7 @@ mod tests {
assert_eq!(stats.min(), &1.0);
assert_eq!(stats.max(), &2.0);
} else {
panic!("expecting Statistics::Float");
panic!("expecting Statistics::Double");
}
}

Expand All @@ -2145,6 +2203,64 @@ mod tests {
assert!(stats.is_min_max_backwards_compatible());
}

#[test]
fn test_double_statistics_zero_only() {
let stats = statistics_roundtrip::<DoubleType>(&[0.0]);
assert!(stats.has_min_max_set());
assert!(stats.is_min_max_backwards_compatible());
if let Statistics::Double(stats) = stats {
assert_eq!(stats.min(), &-0.0);
assert!(stats.min().is_sign_negative());
assert_eq!(stats.max(), &0.0);
assert!(stats.max().is_sign_positive());
} else {
panic!("expecting Statistics::Double");
}
}

#[test]
fn test_double_statistics_neg_zero_only() {
let stats = statistics_roundtrip::<DoubleType>(&[-0.0]);
assert!(stats.has_min_max_set());
assert!(stats.is_min_max_backwards_compatible());
if let Statistics::Double(stats) = stats {
assert_eq!(stats.min(), &-0.0);
assert!(stats.min().is_sign_negative());
assert_eq!(stats.max(), &0.0);
assert!(stats.max().is_sign_positive());
} else {
panic!("expecting Statistics::Double");
}
}

#[test]
fn test_double_statistics_zero_min() {
let stats = statistics_roundtrip::<DoubleType>(&[0.0, 1.0, f64::NAN, 2.0]);
assert!(stats.has_min_max_set());
assert!(stats.is_min_max_backwards_compatible());
if let Statistics::Double(stats) = stats {
assert_eq!(stats.min(), &-0.0);
assert!(stats.min().is_sign_negative());
assert_eq!(stats.max(), &2.0);
} else {
panic!("expecting Statistics::Double");
}
}

#[test]
fn test_double_statistics_neg_zero_max() {
let stats = statistics_roundtrip::<DoubleType>(&[-0.0, -1.0, f64::NAN, -2.0]);
assert!(stats.has_min_max_set());
assert!(stats.is_min_max_backwards_compatible());
if let Statistics::Double(stats) = stats {
assert_eq!(stats.min(), &-2.0);
assert_eq!(stats.max(), &0.0);
assert!(stats.max().is_sign_positive());
} else {
panic!("expecting Statistics::Double");
}
}

#[test]
fn test_compare_greater_byte_array_decimals() {
assert!(!compare_greater_byte_array_decimals(&[], &[],),);
Expand Down
4 changes: 2 additions & 2 deletions parquet/src/data_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -632,15 +632,15 @@ pub(crate) mod private {

/// Return the value as i64 if possible
///
/// This is essentially the same as `std::convert::TryInto<i64>` but can
/// This is essentially the same as `std::convert::TryInto<i64>` but can't be
/// implemented for `f32` and `f64`, types that would fail orphan rules
fn as_i64(&self) -> Result<i64> {
Err(general_err!("Type cannot be converted to i64"))
}

/// Return the value as u64 if possible
///
/// This is essentially the same as `std::convert::TryInto<u64>` but can
/// This is essentially the same as `std::convert::TryInto<u64>` but can't be
/// implemented for `f32` and `f64`, types that would fail orphan rules
fn as_u64(&self) -> Result<u64> {
self.as_i64()
Expand Down