Parquet f32/f64 handle signed zeros in statistics (#5048)

apache · Nov 7, 2023 · ffeda62 · ffeda62
1 parent 20f10dc
commit ffeda62
Show file tree

Hide file tree

Showing 3 changed files with 147 additions and 6 deletions.
diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::basic::Encoding;
+use crate::basic::{Encoding, Type};
 use crate::bloom_filter::Sbbf;
 use crate::column::writer::{
     compare_greater, fallback_encoding, has_dictionary_support, is_nan, update_max, update_min,
@@ -308,5 +308,30 @@ where
             max = val;
         }
     }
-    Some((min.clone(), max.clone()))
+
+    // Float/Double statistics have special case for zero.
+    //
+    // If computed min is zero, whether negative or positive,
+    // the spec states that the min should be written as -0.0
+    // (negative zero)
+    //
+    // For max, it has similar logic but will be written as 0.0
+    // (positive zero)
+    let min = replace_zero(min, -0.0);
+    let max = replace_zero(max, 0.0);
+
+    Some((min, max))
+}
+
+#[inline]
+fn replace_zero<T: ParquetValueType>(val: &T, replace: f32) -> T {
+    match T::PHYSICAL_TYPE {
+        Type::FLOAT if f32::from_le_bytes(val.as_bytes().try_into().unwrap()) == 0.0 => {
+            T::try_from_le_slice(&f32::to_le_bytes(replace)).unwrap()
+        }
+        Type::DOUBLE if f64::from_le_bytes(val.as_bytes().try_into().unwrap()) == 0.0 => {
+            T::try_from_le_slice(&f64::to_le_bytes(replace as f64)).unwrap()
+        }
+        _ => val.clone(),
+    }
 }
diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
@@ -2111,6 +2111,64 @@ mod tests {
         assert!(matches!(stats, Statistics::Float(_)));
     }
 
+    #[test]
+    fn test_float_statistics_zero_only() {
+        let stats = statistics_roundtrip::<FloatType>(&[0.0]);
+        assert!(stats.has_min_max_set());
+        assert!(stats.is_min_max_backwards_compatible());
+        if let Statistics::Float(stats) = stats {
+            assert_eq!(stats.min(), &-0.0);
+            assert!(stats.min().is_sign_negative());
+            assert_eq!(stats.max(), &0.0);
+            assert!(stats.max().is_sign_positive());
+        } else {
+            panic!("expecting Statistics::Float");
+        }
+    }
+
+    #[test]
+    fn test_float_statistics_neg_zero_only() {
+        let stats = statistics_roundtrip::<FloatType>(&[-0.0]);
+        assert!(stats.has_min_max_set());
+        assert!(stats.is_min_max_backwards_compatible());
+        if let Statistics::Float(stats) = stats {
+            assert_eq!(stats.min(), &-0.0);
+            assert!(stats.min().is_sign_negative());
+            assert_eq!(stats.max(), &0.0);
+            assert!(stats.max().is_sign_positive());
+        } else {
+            panic!("expecting Statistics::Float");
+        }
+    }
+
+    #[test]
+    fn test_float_statistics_zero_min() {
+        let stats = statistics_roundtrip::<FloatType>(&[0.0, 1.0, f32::NAN, 2.0]);
+        assert!(stats.has_min_max_set());
+        assert!(stats.is_min_max_backwards_compatible());
+        if let Statistics::Float(stats) = stats {
+            assert_eq!(stats.min(), &-0.0);
+            assert!(stats.min().is_sign_negative());
+            assert_eq!(stats.max(), &2.0);
+        } else {
+            panic!("expecting Statistics::Float");
+        }
+    }
+
+    #[test]
+    fn test_float_statistics_neg_zero_max() {
+        let stats = statistics_roundtrip::<FloatType>(&[-0.0, -1.0, f32::NAN, -2.0]);
+        assert!(stats.has_min_max_set());
+        assert!(stats.is_min_max_backwards_compatible());
+        if let Statistics::Float(stats) = stats {
+            assert_eq!(stats.min(), &-2.0);
+            assert_eq!(stats.max(), &0.0);
+            assert!(stats.max().is_sign_positive());
+        } else {
+            panic!("expecting Statistics::Float");
+        }
+    }
+
     #[test]
     fn test_double_statistics_nan_middle() {
         let stats = statistics_roundtrip::<DoubleType>(&[1.0, f64::NAN, 2.0]);
@@ -2120,7 +2178,7 @@ mod tests {
             assert_eq!(stats.min(), &1.0);
             assert_eq!(stats.max(), &2.0);
         } else {
-            panic!("expecting Statistics::Float");
+            panic!("expecting Statistics::Double");
         }
     }
 
@@ -2133,7 +2191,7 @@ mod tests {
             assert_eq!(stats.min(), &1.0);
             assert_eq!(stats.max(), &2.0);
         } else {
-            panic!("expecting Statistics::Float");
+            panic!("expecting Statistics::Double");
         }
     }
 
@@ -2145,6 +2203,64 @@ mod tests {
         assert!(stats.is_min_max_backwards_compatible());
     }
 
+    #[test]
+    fn test_double_statistics_zero_only() {
+        let stats = statistics_roundtrip::<DoubleType>(&[0.0]);
+        assert!(stats.has_min_max_set());
+        assert!(stats.is_min_max_backwards_compatible());
+        if let Statistics::Double(stats) = stats {
+            assert_eq!(stats.min(), &-0.0);
+            assert!(stats.min().is_sign_negative());
+            assert_eq!(stats.max(), &0.0);
+            assert!(stats.max().is_sign_positive());
+        } else {
+            panic!("expecting Statistics::Double");
+        }
+    }
+
+    #[test]
+    fn test_double_statistics_neg_zero_only() {
+        let stats = statistics_roundtrip::<DoubleType>(&[-0.0]);
+        assert!(stats.has_min_max_set());
+        assert!(stats.is_min_max_backwards_compatible());
+        if let Statistics::Double(stats) = stats {
+            assert_eq!(stats.min(), &-0.0);
+            assert!(stats.min().is_sign_negative());
+            assert_eq!(stats.max(), &0.0);
+            assert!(stats.max().is_sign_positive());
+        } else {
+            panic!("expecting Statistics::Double");
+        }
+    }
+
+    #[test]
+    fn test_double_statistics_zero_min() {
+        let stats = statistics_roundtrip::<DoubleType>(&[0.0, 1.0, f64::NAN, 2.0]);
+        assert!(stats.has_min_max_set());
+        assert!(stats.is_min_max_backwards_compatible());
+        if let Statistics::Double(stats) = stats {
+            assert_eq!(stats.min(), &-0.0);
+            assert!(stats.min().is_sign_negative());
+            assert_eq!(stats.max(), &2.0);
+        } else {
+            panic!("expecting Statistics::Double");
+        }
+    }
+
+    #[test]
+    fn test_double_statistics_neg_zero_max() {
+        let stats = statistics_roundtrip::<DoubleType>(&[-0.0, -1.0, f64::NAN, -2.0]);
+        assert!(stats.has_min_max_set());
+        assert!(stats.is_min_max_backwards_compatible());
+        if let Statistics::Double(stats) = stats {
+            assert_eq!(stats.min(), &-2.0);
+            assert_eq!(stats.max(), &0.0);
+            assert!(stats.max().is_sign_positive());
+        } else {
+            panic!("expecting Statistics::Double");
+        }
+    }
+
     #[test]
     fn test_compare_greater_byte_array_decimals() {
         assert!(!compare_greater_byte_array_decimals(&[], &[],),);

diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs
@@ -632,15 +632,15 @@ pub(crate) mod private {
 
         /// Return the value as i64 if possible
         ///
-        /// This is essentially the same as `std::convert::TryInto<i64>` but can
+        /// This is essentially the same as `std::convert::TryInto<i64>` but can't be
         /// implemented for `f32` and `f64`, types that would fail orphan rules
         fn as_i64(&self) -> Result<i64> {
             Err(general_err!("Type cannot be converted to i64"))
         }
 
         /// Return the value as u64 if possible
         ///
-        /// This is essentially the same as `std::convert::TryInto<u64>` but can
+        /// This is essentially the same as `std::convert::TryInto<u64>` but can't be
         /// implemented for `f32` and `f64`, types that would fail orphan rules
         fn as_u64(&self) -> Result<u64> {
             self.as_i64()