From 1491c6e8f4fd100f53c358e4f3ef1536d9e75090 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Mon, 17 Apr 2023 13:42:07 +0400 Subject: [PATCH 01/80] Fixed timestamp to datetime conversion error (#1470) Fixed timestamp to datetime conversion error on pre-epoch (negative) values and improved test coverage --- src/temporal_conversions.rs | 51 +++++++++++++++++++++----------- tests/it/temporal_conversions.rs | 6 ++-- 2 files changed, 36 insertions(+), 21 deletions(-) diff --git a/src/temporal_conversions.rs b/src/temporal_conversions.rs index 65448aa699..6560a7067e 100644 --- a/src/temporal_conversions.rs +++ b/src/temporal_conversions.rs @@ -120,13 +120,18 @@ pub fn timestamp_ms_to_datetime(v: i64) -> NaiveDateTime { (v % MILLISECONDS * MICROSECONDS) as u32, ) } else { - // note: negative values require 'div_floor' rounding behaviour, which isn't - // yet stabilised (see - https://github.com/rust-lang/rust/issues/88581). let secs_rem = (v / MILLISECONDS, v % MILLISECONDS); - NaiveDateTime::from_timestamp_opt( - secs_rem.0 - (secs_rem.1 != 0) as i64, - (v % MILLISECONDS * MICROSECONDS).unsigned_abs() as u32, - ) + if secs_rem.1 == 0 { + // whole/integer seconds; no adjustment required + NaiveDateTime::from_timestamp_opt(secs_rem.0, (v % MILLISECONDS * MICROSECONDS) as u32) + } else { + // negative values with fractional seconds require 'div_floor' rounding behaviour. + // (which isn't yet stabilised: https://github.com/rust-lang/rust/issues/88581) + NaiveDateTime::from_timestamp_opt( + secs_rem.0 - 1, + (NANOSECONDS + (v % MILLISECONDS * MICROSECONDS)) as u32, + ) + } } .expect("invalid or out-of-range datetime") } @@ -142,13 +147,18 @@ pub fn timestamp_us_to_datetime(v: i64) -> NaiveDateTime { (v % MICROSECONDS * MILLISECONDS) as u32, ) } else { - // note: negative values require 'div_floor' rounding behaviour, which isn't - // yet stabilised (see - https://github.com/rust-lang/rust/issues/88581). let secs_rem = (v / MICROSECONDS, v % MICROSECONDS); - NaiveDateTime::from_timestamp_opt( - secs_rem.0 - (secs_rem.1 != 0) as i64, - (v % MICROSECONDS * MILLISECONDS).unsigned_abs() as u32, - ) + if secs_rem.1 == 0 { + // whole/integer seconds; no adjustment required + NaiveDateTime::from_timestamp_opt(secs_rem.0, (v % MICROSECONDS * MILLISECONDS) as u32) + } else { + // negative values with fractional seconds require 'div_floor' rounding behaviour. + // (which isn't yet stabilised: https://github.com/rust-lang/rust/issues/88581) + NaiveDateTime::from_timestamp_opt( + secs_rem.0 - 1, + (NANOSECONDS + (v % MICROSECONDS * MILLISECONDS)) as u32, + ) + } } .expect("invalid or out-of-range datetime") } @@ -164,13 +174,18 @@ pub fn timestamp_ns_to_datetime(v: i64) -> NaiveDateTime { (v % NANOSECONDS) as u32, ) } else { - // note: negative values require 'div_floor' rounding behaviour, which isn't - // yet stabilised (see - https://github.com/rust-lang/rust/issues/88581). let secs_rem = (v / NANOSECONDS, v % NANOSECONDS); - NaiveDateTime::from_timestamp_opt( - secs_rem.0 - (secs_rem.1 != 0) as i64, - (v % NANOSECONDS).unsigned_abs() as u32, - ) + if secs_rem.1 == 0 { + // whole/integer seconds; no adjustment required + NaiveDateTime::from_timestamp_opt(secs_rem.0, (v % NANOSECONDS) as u32) + } else { + // negative values with fractional seconds require 'div_floor' rounding behaviour. 
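For intuition, a standalone sketch (plain Rust, not part of the patch) of the split this fix performs, mirroring the constants in src/temporal_conversions.rs. Note that v = -1_200 ms must land 0.8s past second -2, while the old `unsigned_abs` expression attached a 0.2s fraction to that second:

const MILLISECONDS: i64 = 1_000;
const MICROSECONDS: i64 = 1_000_000;
const NANOSECONDS: i64 = 1_000_000_000;

// the (seconds, nanoseconds) pair handed to NaiveDateTime::from_timestamp_opt
fn ms_to_secs_nanos(v: i64) -> (i64, u32) {
    let (secs, rem) = (v / MILLISECONDS, v % MILLISECONDS);
    if v >= 0 {
        (secs, (rem * MICROSECONDS) as u32)
    } else if rem == 0 {
        // whole/integer seconds; no adjustment required
        (secs, 0)
    } else {
        // round the seconds down (div_floor) and keep a positive fraction
        (secs - 1, (NANOSECONDS + rem * MICROSECONDS) as u32)
    }
}

fn main() {
    assert_eq!(ms_to_secs_nanos(-1_200), (-2, 800_000_000)); // 23:59:58.800
    assert_eq!(ms_to_secs_nanos(-2_000), (-2, 0));
    assert_eq!(ms_to_secs_nanos(1_200), (1, 200_000_000));
}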
+ // (which isn't yet stabilised: https://github.com/rust-lang/rust/issues/88581) + NaiveDateTime::from_timestamp_opt( + secs_rem.0 - 1, + (NANOSECONDS + (v % NANOSECONDS)) as u32, + ) + } } .expect("invalid or out-of-range datetime") } diff --git a/tests/it/temporal_conversions.rs b/tests/it/temporal_conversions.rs index 883d524f95..1bb206de5a 100644 --- a/tests/it/temporal_conversions.rs +++ b/tests/it/temporal_conversions.rs @@ -150,17 +150,17 @@ fn timestamp_to_datetime() { // negative milliseconds assert_eq!( temporal_conversions::timestamp_ms_to_datetime(ts / 1_000_000), - NaiveDateTime::parse_from_str("1969-07-05T01:02:03.987000000", fmt).unwrap() + NaiveDateTime::parse_from_str("1969-07-05T01:02:03.013000000", fmt).unwrap() ); // negative microseconds assert_eq!( temporal_conversions::timestamp_us_to_datetime(ts / 1_000), - NaiveDateTime::parse_from_str("1969-07-05T01:02:03.987654000", fmt).unwrap() + NaiveDateTime::parse_from_str("1969-07-05T01:02:03.012346000", fmt).unwrap() ); // negative nanoseconds assert_eq!( temporal_conversions::timestamp_ns_to_datetime(ts), - NaiveDateTime::parse_from_str("1969-07-05T01:02:03.987654321", fmt).unwrap() + NaiveDateTime::parse_from_str("1969-07-05T01:02:03.012345679", fmt).unwrap() ); let fmt = "%Y-%m-%dT%H:%M:%S"; From 144a8b865f0652b9fcd8a17e88f8e47cd53dca57 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Wed, 19 Apr 2023 19:33:20 +0400 Subject: [PATCH 02/80] Don't calculate nanoseconds in timestamp conversion when remainder implies integer seconds (#1471) --- src/temporal_conversions.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/temporal_conversions.rs b/src/temporal_conversions.rs index 6560a7067e..a76700f444 100644 --- a/src/temporal_conversions.rs +++ b/src/temporal_conversions.rs @@ -123,7 +123,7 @@ pub fn timestamp_ms_to_datetime(v: i64) -> NaiveDateTime { let secs_rem = (v / MILLISECONDS, v % MILLISECONDS); if secs_rem.1 == 0 { // whole/integer seconds; no adjustment required - NaiveDateTime::from_timestamp_opt(secs_rem.0, (v % MILLISECONDS * MICROSECONDS) as u32) + NaiveDateTime::from_timestamp_opt(secs_rem.0, 0) } else { // negative values with fractional seconds require 'div_floor' rounding behaviour. // (which isn't yet stabilised: https://github.com/rust-lang/rust/issues/88581) @@ -150,7 +150,7 @@ pub fn timestamp_us_to_datetime(v: i64) -> NaiveDateTime { let secs_rem = (v / MICROSECONDS, v % MICROSECONDS); if secs_rem.1 == 0 { // whole/integer seconds; no adjustment required - NaiveDateTime::from_timestamp_opt(secs_rem.0, (v % MICROSECONDS * MILLISECONDS) as u32) + NaiveDateTime::from_timestamp_opt(secs_rem.0, 0) } else { // negative values with fractional seconds require 'div_floor' rounding behaviour. // (which isn't yet stabilised: https://github.com/rust-lang/rust/issues/88581) @@ -177,7 +177,7 @@ pub fn timestamp_ns_to_datetime(v: i64) -> NaiveDateTime { let secs_rem = (v / NANOSECONDS, v % NANOSECONDS); if secs_rem.1 == 0 { // whole/integer seconds; no adjustment required - NaiveDateTime::from_timestamp_opt(secs_rem.0, (v % NANOSECONDS) as u32) + NaiveDateTime::from_timestamp_opt(secs_rem.0, 0) } else { // negative values with fractional seconds require 'div_floor' rounding behaviour. 
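The updated test expectations can be cross-checked with chrono, which these conversion functions are built on; a sketch using the same `from_timestamp_opt` constructor the patch calls:

use chrono::NaiveDateTime;

fn main() {
    // -1_200 ms: floor-divided seconds plus a positive 800 ms fraction
    let dt = NaiveDateTime::from_timestamp_opt(-2, 800_000_000).unwrap();
    assert_eq!(dt.to_string(), "1969-12-31 23:59:58.800");
    // whole negative seconds take the new fast path with a zero fraction
    let dt = NaiveDateTime::from_timestamp_opt(-2, 0).unwrap();
    assert_eq!(dt.to_string(), "1969-12-31 23:59:58");
}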
// (which isn't yet stabilised: https://github.com/rust-lang/rust/issues/88581) From 64d8ec203f991468032025a13a4f971f1f2cfc14 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 27 Apr 2023 10:27:46 +0200 Subject: [PATCH 03/80] Added parquet deserialization of nested null types (#1472) --- src/array/mod.rs | 2 +- src/array/null.rs | 65 +++++++++ src/io/parquet/read/deserialize/nested.rs | 12 ++ .../read/deserialize/{null.rs => null/mod.rs} | 3 + .../parquet/read/deserialize/null/nested.rs | 124 ++++++++++++++++++ src/io/parquet/read/statistics/mod.rs | 5 + src/io/parquet/read/statistics/null.rs | 11 ++ 7 files changed, 221 insertions(+), 1 deletion(-) rename src/io/parquet/read/deserialize/{null.rs => null/mod.rs} (98%) create mode 100644 src/io/parquet/read/deserialize/null/nested.rs create mode 100644 src/io/parquet/read/statistics/null.rs diff --git a/src/array/mod.rs b/src/array/mod.rs index 42b528fc27..f9c320a650 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -728,7 +728,7 @@ pub use fixed_size_binary::{FixedSizeBinaryArray, MutableFixedSizeBinaryArray}; pub use fixed_size_list::{FixedSizeListArray, MutableFixedSizeListArray}; pub use list::{ListArray, ListValuesIter, MutableListArray}; pub use map::MapArray; -pub use null::NullArray; +pub use null::{MutableNullArray, NullArray}; pub use primitive::*; pub use struct_::{MutableStructArray, StructArray}; pub use union::UnionArray; diff --git a/src/array/null.rs b/src/array/null.rs index 5a1471efcf..bcd5c0aff7 100644 --- a/src/array/null.rs +++ b/src/array/null.rs @@ -1,5 +1,8 @@ use crate::{bitmap::Bitmap, datatypes::DataType}; +use std::any::Any; +use crate::array::MutableArray; +use crate::bitmap::MutableBitmap; use crate::{ array::{Array, FromFfi, ToFfi}, datatypes::PhysicalType, @@ -88,6 +91,68 @@ impl Array for NullArray { } } +#[derive(Debug)] +/// A distinct type to disambiguate +/// clashing methods +pub struct MutableNullArray { + inner: NullArray, +} + +impl MutableNullArray { + /// Returns a new [`MutableNullArray`]. + /// # Panics + /// This function errors iff: + /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to [`crate::datatypes::PhysicalType::Null`]. 
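+    /// # Example
+    /// A usage sketch (illustrative only; it relies on the `From` impl and
+    /// the `MutableArray` methods introduced further down in this patch):
+    /// ```ignore
+    /// let mut arr = MutableNullArray::new(DataType::Null, 2);
+    /// arr.push_null();
+    /// assert_eq!(arr.len(), 3);
+    /// let array: NullArray = arr.into();
+    /// ```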
+ pub fn new(data_type: DataType, length: usize) -> Self { + let inner = NullArray::try_new(data_type, length).unwrap(); + Self { inner } + } +} + +impl From for NullArray { + fn from(value: MutableNullArray) -> Self { + value.inner + } +} + +impl MutableArray for MutableNullArray { + fn data_type(&self) -> &DataType { + &DataType::Null + } + + fn len(&self) -> usize { + self.inner.length + } + + fn validity(&self) -> Option<&MutableBitmap> { + None + } + + fn as_box(&mut self) -> Box { + self.inner.clone().boxed() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_mut_any(&mut self) -> &mut dyn Any { + self + } + + fn push_null(&mut self) { + self.inner.length += 1; + } + + fn reserve(&mut self, _additional: usize) { + // no-op + } + + fn shrink_to_fit(&mut self) { + // no-op + } +} + impl std::fmt::Debug for NullArray { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "NullArray({})", self.len()) diff --git a/src/io/parquet/read/deserialize/nested.rs b/src/io/parquet/read/deserialize/nested.rs index 9a2c0232f2..0887751438 100644 --- a/src/io/parquet/read/deserialize/nested.rs +++ b/src/io/parquet/read/deserialize/nested.rs @@ -52,6 +52,18 @@ where use crate::datatypes::PrimitiveType::*; Ok(match field.data_type().to_physical_type() { + Null => { + // physical type is i32 + init.push(InitNested::Primitive(field.is_nullable)); + types.pop(); + primitive(null::NestedIter::new( + columns.pop().unwrap(), + init, + field.data_type().clone(), + num_rows, + chunk_size, + )) + } Boolean => { init.push(InitNested::Primitive(field.is_nullable)); types.pop(); diff --git a/src/io/parquet/read/deserialize/null.rs b/src/io/parquet/read/deserialize/null/mod.rs similarity index 98% rename from src/io/parquet/read/deserialize/null.rs rename to src/io/parquet/read/deserialize/null/mod.rs index e897a045b6..18c745191d 100644 --- a/src/io/parquet/read/deserialize/null.rs +++ b/src/io/parquet/read/deserialize/null/mod.rs @@ -1,8 +1,11 @@ +mod nested; + use parquet2::page::Page; use crate::{array::NullArray, datatypes::DataType}; use super::super::{ArrayIter, Pages}; +pub(super) use nested::NestedIter; /// Converts [`Pages`] to an [`ArrayIter`] pub fn iter_to_arrays<'a, I>( diff --git a/src/io/parquet/read/deserialize/null/nested.rs b/src/io/parquet/read/deserialize/null/nested.rs new file mode 100644 index 0000000000..7f0d33d825 --- /dev/null +++ b/src/io/parquet/read/deserialize/null/nested.rs @@ -0,0 +1,124 @@ +use std::collections::VecDeque; + +use parquet2::page::{DataPage, DictPage}; + +use crate::array::NullArray; +use crate::io::parquet::read::deserialize::utils::DecodedState; +use crate::{datatypes::DataType, error::Result}; + +use super::super::nested_utils::*; +use super::super::utils; +use super::super::Pages; + +impl<'a> utils::PageState<'a> for () { + fn len(&self) -> usize { + 0 + } +} + +#[derive(Debug)] +struct NullDecoder {} + +impl DecodedState for usize { + fn len(&self) -> usize { + *self + } +} + +impl<'a> NestedDecoder<'a> for NullDecoder { + type State = (); + type Dictionary = (); + type DecodedState = usize; + + fn build_state( + &self, + _page: &'a DataPage, + _dict: Option<&'a Self::Dictionary>, + ) -> Result { + Ok(()) + } + + /// Initializes a new state + fn with_capacity(&self, _capacity: usize) -> Self::DecodedState { + 0 + } + + fn push_valid(&self, _state: &mut Self::State, decoded: &mut Self::DecodedState) -> Result<()> { + *decoded += 1; + Ok(()) + } + + fn push_null(&self, decoded: &mut Self::DecodedState) { + let length = decoded; + 
*length += 1; + } + + fn deserialize_dict(&self, _page: &DictPage) -> Self::Dictionary { + unreachable!() + } +} + +/// An iterator adapter over [`Pages`] assumed to be encoded as null arrays +#[derive(Debug)] +pub struct NestedIter +where + I: Pages, +{ + iter: I, + init: Vec, + data_type: DataType, + items: VecDeque<(NestedState, usize)>, + remaining: usize, + chunk_size: Option, + decoder: NullDecoder, +} + +impl NestedIter +where + I: Pages, +{ + pub fn new( + iter: I, + init: Vec, + data_type: DataType, + num_rows: usize, + chunk_size: Option, + ) -> Self { + Self { + iter, + init, + data_type, + items: VecDeque::new(), + chunk_size, + remaining: num_rows, + decoder: NullDecoder {}, + } + } +} + +impl Iterator for NestedIter +where + I: Pages, +{ + type Item = Result<(NestedState, NullArray)>; + + fn next(&mut self) -> Option { + let maybe_state = next( + &mut self.iter, + &mut self.items, + &mut None, + &mut self.remaining, + &self.init, + self.chunk_size, + &self.decoder, + ); + match maybe_state { + utils::MaybeNext::Some(Ok((nested, state))) => { + Some(Ok((nested, NullArray::new(self.data_type.clone(), state)))) + } + utils::MaybeNext::Some(Err(e)) => Some(Err(e)), + utils::MaybeNext::None => None, + utils::MaybeNext::More => self.next(), + } + } +} diff --git a/src/io/parquet/read/statistics/mod.rs b/src/io/parquet/read/statistics/mod.rs index f3c1ed9e8d..8514913d56 100644 --- a/src/io/parquet/read/statistics/mod.rs +++ b/src/io/parquet/read/statistics/mod.rs @@ -26,6 +26,7 @@ mod dictionary; mod fixlen; mod list; mod map; +mod null; mod primitive; mod struct_; mod utf8; @@ -194,6 +195,9 @@ fn make_mutable(data_type: &DataType, capacity: usize) -> Result { + Box::new(MutableNullArray::new(DataType::Null, 0)) as Box + } other => { return Err(Error::NotYetImplemented(format!( "Deserializing parquet stats from {other:?} is still not implemented" @@ -538,6 +542,7 @@ fn push( Utf8 => utf8::push::(from, min, max), LargeUtf8 => utf8::push::(from, min, max), FixedSizeBinary(_) => fixlen::push(from, min, max), + Null => null::push(min, max), other => todo!("{:?}", other), } } diff --git a/src/io/parquet/read/statistics/null.rs b/src/io/parquet/read/statistics/null.rs new file mode 100644 index 0000000000..9102720ebc --- /dev/null +++ b/src/io/parquet/read/statistics/null.rs @@ -0,0 +1,11 @@ +use crate::array::*; +use crate::error::Result; + +pub(super) fn push(min: &mut dyn MutableArray, max: &mut dyn MutableArray) -> Result<()> { + let min = min.as_mut_any().downcast_mut::().unwrap(); + let max = max.as_mut_any().downcast_mut::().unwrap(); + min.push_null(); + max.push_null(); + + Ok(()) +} From 07fd3f639cb983fb7496e6adeac2c2dbe0a3cad0 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 29 Apr 2023 11:39:16 +0200 Subject: [PATCH 04/80] Fixed struct FFI for sliced pyarrow (#1474) --- src/array/struct_/ffi.rs | 29 ++++++++++++++++++++++++++++- src/bitmap/immutable.rs | 28 ++++++++++++++++------------ 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/src/array/struct_/ffi.rs b/src/array/struct_/ffi.rs index b29a342d88..847948630f 100644 --- a/src/array/struct_/ffi.rs +++ b/src/array/struct_/ffi.rs @@ -30,11 +30,38 @@ impl FromFfi for StructArray { let data_type = array.data_type().clone(); let fields = Self::get_fields(&data_type); + let arrow_array = array.array(); let validity = unsafe { array.validity() }?; + let len = arrow_array.len(); + let offset = arrow_array.offset(); let values = (0..fields.len()) .map(|index| { let child = array.child(index)?; - 
ffi::try_from(child) + ffi::try_from(child).map(|arr| { + // there is a discrepancy with how arrow2 exports sliced + // struct array and how pyarrow does it. + // # Pyarrow + // ## struct array len 3 + // * slice 1 by with len 2 + // offset on struct array: 1 + // length on struct array: 2 + // offset on value array: 0 + // length on value array: 3 + // # Arrow2 + // ## struct array len 3 + // * slice 1 by with len 2 + // offset on struct array: 0 + // length on struct array: 3 + // offset on value array: 1 + // length on value array: 2 + // + // this branch will ensure both can round trip + if arr.len() >= (len + offset) { + arr.sliced(offset, len) + } else { + arr + } + }) }) .collect::>>>()?; diff --git a/src/bitmap/immutable.rs b/src/bitmap/immutable.rs index 5d85208782..be1e3a662d 100644 --- a/src/bitmap/immutable.rs +++ b/src/bitmap/immutable.rs @@ -182,19 +182,23 @@ impl Bitmap { /// The caller must ensure that `self.offset + offset + length <= self.len()` #[inline] pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) { - // count the smallest chunk - if length < self.length / 2 { - // count the null values in the slice - self.unset_bits = count_zeros(&self.bytes, self.offset + offset, length); - } else { - // subtract the null count of the chunks we slice off - let start_end = self.offset + offset + length; - let head_count = count_zeros(&self.bytes, self.offset, offset); - let tail_count = count_zeros(&self.bytes, start_end, self.length - length - offset); - self.unset_bits -= head_count + tail_count; + // first guard a no-op slice so that we don't do a bitcount + // if there isn't any data sliced + if !(offset == 0 && length == self.length) { + // count the smallest chunk + if length < self.length / 2 { + // count the null values in the slice + self.unset_bits = count_zeros(&self.bytes, self.offset + offset, length); + } else { + // subtract the null count of the chunks we slice off + let start_end = self.offset + offset + length; + let head_count = count_zeros(&self.bytes, self.offset, offset); + let tail_count = count_zeros(&self.bytes, start_end, self.length - length - offset); + self.unset_bits -= head_count + tail_count; + } + self.offset += offset; + self.length = length; } - self.offset += offset; - self.length = length; } /// Slices `self`, offsetting by `offset` and truncating up to `length` bits. 
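The child-reconciliation rule added in the struct FFI fix reduces to a small predicate; a sketch (plain Rust) using the numbers from the comment above:

// decide whether an imported struct child still needs the parent's slice,
// returning the (offset, length) with which to view the child
fn reconcile(child_len: usize, offset: usize, len: usize) -> (usize, usize) {
    if child_len >= len + offset {
        // pyarrow-style export: the child is unsliced; apply the parent slice
        (offset, len)
    } else {
        // arrow2-style export: the child already carries the slice
        (0, child_len)
    }
}

fn main() {
    // struct array of len 3, sliced by 1 with len 2
    assert_eq!(reconcile(3, 1, 2), (1, 2)); // pyarrow round trip
    assert_eq!(reconcile(2, 1, 2), (0, 2)); // arrow2 round trip
}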
From 8de953d76036aa80eb50e5d7cf81e38bf2bce3d6 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 7 May 2023 10:50:44 +0200 Subject: [PATCH 05/80] Correct for offsets in list json serialization (#1475) --- src/io/json/write/mod.rs | 2 +- src/io/json/write/serialize.rs | 324 ++++++++++++++++++++------------- src/io/ndjson/write/mod.rs | 2 +- 3 files changed, 201 insertions(+), 127 deletions(-) diff --git a/src/io/json/write/mod.rs b/src/io/json/write/mod.rs index 53cc3f20e6..c04b4b51a6 100644 --- a/src/io/json/write/mod.rs +++ b/src/io/json/write/mod.rs @@ -85,7 +85,7 @@ impl<'a> RecordSerializer<'a> { let iterators = chunk .arrays() .iter() - .map(|arr| new_serializer(arr.as_ref())) + .map(|arr| new_serializer(arr.as_ref(), 0, usize::MAX)) .collect(); Self { diff --git a/src/io/json/write/serialize.rs b/src/io/json/write/serialize.rs index 8247609406..dfa104d78e 100644 --- a/src/io/json/write/serialize.rs +++ b/src/io/json/write/serialize.rs @@ -16,95 +16,114 @@ use crate::{array::*, datatypes::DataType, types::NativeType}; use super::utf8; +fn materialize_serializer<'a, I, F, T>( + f: F, + iterator: I, + offset: usize, + take: usize, +) -> Box + 'a + Send + Sync> +where + T: 'a, + I: Iterator + Send + Sync + 'a, + F: FnMut(T, &mut Vec) + Send + Sync + 'a, +{ + if offset > 0 || take < usize::MAX { + Box::new(BufStreamingIterator::new( + iterator.skip(offset).take(take), + f, + vec![], + )) + } else { + Box::new(BufStreamingIterator::new(iterator, f, vec![])) + } +} + fn boolean_serializer<'a>( array: &'a BooleanArray, + offset: usize, + take: usize, ) -> Box + 'a + Send + Sync> { - Box::new(BufStreamingIterator::new( - array.iter(), - |x, buf| match x { - Some(true) => buf.extend_from_slice(b"true"), - Some(false) => buf.extend_from_slice(b"false"), - None => buf.extend_from_slice(b"null"), - }, - vec![], - )) + let f = |x: Option, buf: &mut Vec| match x { + Some(true) => buf.extend_from_slice(b"true"), + Some(false) => buf.extend_from_slice(b"false"), + None => buf.extend_from_slice(b"null"), + }; + materialize_serializer(f, array.iter(), offset, take) } fn primitive_serializer<'a, T: NativeType + ToLexical>( array: &'a PrimitiveArray, + offset: usize, + take: usize, ) -> Box + 'a + Send + Sync> { - Box::new(BufStreamingIterator::new( - array.iter(), - |x, buf| { - if let Some(x) = x { - lexical_to_bytes_mut(*x, buf) - } else { - buf.extend(b"null") - } - }, - vec![], - )) + let f = |x: Option<&T>, buf: &mut Vec| { + if let Some(x) = x { + lexical_to_bytes_mut(*x, buf) + } else { + buf.extend(b"null") + } + }; + materialize_serializer(f, array.iter(), offset, take) } fn float_serializer<'a, T>( array: &'a PrimitiveArray, + offset: usize, + take: usize, ) -> Box + 'a + Send + Sync> where T: num_traits::Float + NativeType + ToLexical, { - Box::new(BufStreamingIterator::new( - array.iter(), - |x, buf| { - if let Some(x) = x { - if T::is_nan(*x) || T::is_infinite(*x) { - buf.extend(b"null") - } else { - lexical_to_bytes_mut(*x, buf) - } - } else { + let f = |x: Option<&T>, buf: &mut Vec| { + if let Some(x) = x { + if T::is_nan(*x) || T::is_infinite(*x) { buf.extend(b"null") + } else { + lexical_to_bytes_mut(*x, buf) } - }, - vec![], - )) + } else { + buf.extend(b"null") + } + }; + + materialize_serializer(f, array.iter(), offset, take) } fn dictionary_utf8_serializer<'a, K: DictionaryKey, O: Offset>( array: &'a DictionaryArray, + offset: usize, + take: usize, ) -> Box + 'a + Send + Sync> { - let iter = array.iter_typed::>().unwrap(); - - Box::new(BufStreamingIterator::new( - iter, - |x, 
buf| { - if let Some(x) = x { - utf8::write_str(buf, x).unwrap(); - } else { - buf.extend_from_slice(b"null") - } - }, - vec![], - )) + let iter = array.iter_typed::>().unwrap().skip(offset); + let f = |x: Option<&str>, buf: &mut Vec| { + if let Some(x) = x { + utf8::write_str(buf, x).unwrap(); + } else { + buf.extend_from_slice(b"null") + } + }; + materialize_serializer(f, iter, offset, take) } fn utf8_serializer<'a, O: Offset>( array: &'a Utf8Array, + offset: usize, + take: usize, ) -> Box + 'a + Send + Sync> { - Box::new(BufStreamingIterator::new( - array.iter(), - |x, buf| { - if let Some(x) = x { - utf8::write_str(buf, x).unwrap(); - } else { - buf.extend_from_slice(b"null") - } - }, - vec![], - )) + let f = |x: Option<&str>, buf: &mut Vec| { + if let Some(x) = x { + utf8::write_str(buf, x).unwrap(); + } else { + buf.extend_from_slice(b"null") + } + }; + materialize_serializer(f, array.iter(), offset, take) } fn struct_serializer<'a>( array: &'a StructArray, + offset: usize, + take: usize, ) -> Box + 'a + Send + Sync> { // {"a": [1, 2, 3], "b": [a, b, c], "c": {"a": [1, 2, 3]}} // [ @@ -117,7 +136,7 @@ fn struct_serializer<'a>( .values() .iter() .map(|x| x.as_ref()) - .map(new_serializer) + .map(|arr| new_serializer(arr, offset, take)) .collect::>(); let names = array.fields().iter().map(|f| f.name.as_str()); @@ -149,6 +168,8 @@ fn struct_serializer<'a>( fn list_serializer<'a, O: Offset>( array: &'a ListArray, + offset: usize, + take: usize, ) -> Box + 'a + Send + Sync> { // [[1, 2], [3]] // [ @@ -156,35 +177,40 @@ fn list_serializer<'a, O: Offset>( // [3] // ] // - let mut serializer = new_serializer(array.values().as_ref()); + let offsets = array.offsets().as_slice(); + let start = offsets[0].to_usize(); + let end = offsets.last().unwrap().to_usize(); + let mut serializer = new_serializer(array.values().as_ref(), start, end - start); - Box::new(BufStreamingIterator::new( - ZipValidity::new_with_validity(array.offsets().buffer().windows(2), array.validity()), - move |offset, buf| { - if let Some(offset) = offset { - let length = (offset[1] - offset[0]).to_usize(); - buf.push(b'['); - let mut is_first_row = true; - for _ in 0..length { - if !is_first_row { - buf.push(b','); - } - is_first_row = false; - buf.extend(serializer.next().unwrap()); + let f = move |offset: Option<&[O]>, buf: &mut Vec| { + if let Some(offset) = offset { + let length = (offset[1] - offset[0]).to_usize(); + buf.push(b'['); + let mut is_first_row = true; + for _ in 0..length { + if !is_first_row { + buf.push(b','); } - buf.push(b']'); - } else { - buf.extend(b"null"); + is_first_row = false; + buf.extend(serializer.next().unwrap()); } - }, - vec![], - )) + buf.push(b']'); + } else { + buf.extend(b"null"); + } + }; + + let iter = + ZipValidity::new_with_validity(array.offsets().buffer().windows(2), array.validity()); + materialize_serializer(f, iter, offset, take) } fn fixed_size_list_serializer<'a>( array: &'a FixedSizeListArray, + offset: usize, + take: usize, ) -> Box + 'a + Send + Sync> { - let mut serializer = new_serializer(array.values().as_ref()); + let mut serializer = new_serializer(array.values().as_ref(), offset, take); Box::new(BufStreamingIterator::new( ZipValidity::new(0..array.len(), array.validity().map(|x| x.iter())), @@ -212,83 +238,126 @@ fn fixed_size_list_serializer<'a>( fn date_serializer<'a, T, F>( array: &'a PrimitiveArray, convert: F, + offset: usize, + take: usize, ) -> Box + 'a + Send + Sync> where T: NativeType, F: Fn(T) -> NaiveDate + 'static + Send + Sync, { - 
Box::new(BufStreamingIterator::new( - array.iter(), - move |x, buf| { - if let Some(x) = x { - let nd = convert(*x); - write!(buf, "\"{nd}\"").unwrap(); - } else { - buf.extend_from_slice(b"null") - } - }, - vec![], - )) + let f = move |x: Option<&T>, buf: &mut Vec| { + if let Some(x) = x { + let nd = convert(*x); + write!(buf, "\"{nd}\"").unwrap(); + } else { + buf.extend_from_slice(b"null") + } + }; + + materialize_serializer(f, array.iter(), offset, take) } fn timestamp_serializer<'a, F>( array: &'a PrimitiveArray, convert: F, + offset: usize, + take: usize, ) -> Box + 'a + Send + Sync> where F: Fn(i64) -> NaiveDateTime + 'static + Send + Sync, { - Box::new(BufStreamingIterator::new( - array.iter(), - move |x, buf| { - if let Some(x) = x { - let ndt = convert(*x); - write!(buf, "\"{ndt}\"").unwrap(); - } else { - buf.extend_from_slice(b"null") - } - }, - vec![], - )) + let f = move |x: Option<&i64>, buf: &mut Vec| { + if let Some(x) = x { + let ndt = convert(*x); + write!(buf, "\"{ndt}\"").unwrap(); + } else { + buf.extend_from_slice(b"null") + } + }; + materialize_serializer(f, array.iter(), offset, take) } pub(crate) fn new_serializer<'a>( array: &'a dyn Array, + offset: usize, + take: usize, ) -> Box + 'a + Send + Sync> { match array.data_type().to_logical_type() { - DataType::Boolean => boolean_serializer(array.as_any().downcast_ref().unwrap()), - DataType::Int8 => primitive_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::Int16 => primitive_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::Int32 => primitive_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::Int64 => primitive_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::UInt8 => primitive_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::UInt16 => primitive_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::UInt32 => primitive_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::UInt64 => primitive_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::Float32 => float_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::Float64 => float_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::Utf8 => utf8_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::LargeUtf8 => utf8_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::Struct(_) => struct_serializer(array.as_any().downcast_ref().unwrap()), + DataType::Boolean => { + boolean_serializer(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::Int8 => { + primitive_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::Int16 => { + primitive_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::Int32 => { + primitive_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::Int64 => { + primitive_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::UInt8 => { + primitive_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::UInt16 => { + primitive_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::UInt32 => { + primitive_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::UInt64 => { + primitive_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::Float32 => { + float_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::Float64 => { + 
float_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::Utf8 => { + utf8_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::LargeUtf8 => { + utf8_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::Struct(_) => { + struct_serializer(array.as_any().downcast_ref().unwrap(), offset, take) + } DataType::FixedSizeList(_, _) => { - fixed_size_list_serializer(array.as_any().downcast_ref().unwrap()) + fixed_size_list_serializer(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::List(_) => { + list_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::LargeList(_) => { + list_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) } - DataType::List(_) => list_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::LargeList(_) => list_serializer::(array.as_any().downcast_ref().unwrap()), other @ DataType::Dictionary(k, v, _) => match (k, &**v) { (IntegerType::UInt32, DataType::LargeUtf8) => { let array = array .as_any() .downcast_ref::>() .unwrap(); - dictionary_utf8_serializer::(array) + dictionary_utf8_serializer::(array, offset, take) } _ => { todo!("Writing {:?} to JSON", other) } }, - DataType::Date32 => date_serializer(array.as_any().downcast_ref().unwrap(), date32_to_date), - DataType::Date64 => date_serializer(array.as_any().downcast_ref().unwrap(), date64_to_date), + DataType::Date32 => date_serializer( + array.as_any().downcast_ref().unwrap(), + date32_to_date, + offset, + take, + ), + DataType::Date64 => date_serializer( + array.as_any().downcast_ref().unwrap(), + date64_to_date, + offset, + take, + ), DataType::Timestamp(tu, tz) => { if tz.is_some() { todo!("still have to implement timezone") @@ -299,7 +368,12 @@ pub(crate) fn new_serializer<'a>( TimeUnit::Millisecond => timestamp_ms_to_datetime, TimeUnit::Second => timestamp_s_to_datetime, }; - timestamp_serializer(array.as_any().downcast_ref().unwrap(), convert) + timestamp_serializer( + array.as_any().downcast_ref().unwrap(), + convert, + offset, + take, + ) } } other => todo!("Writing {:?} to JSON", other), @@ -328,7 +402,7 @@ fn serialize_item(buffer: &mut Vec, record: &[(&str, &[u8])], is_first_row: /// # Implementation /// This operation is CPU-bounded pub(crate) fn serialize(array: &dyn Array, buffer: &mut Vec) { - let mut serializer = new_serializer(array); + let mut serializer = new_serializer(array, 0, usize::MAX); (0..array.len()).for_each(|i| { if i != 0 { diff --git a/src/io/ndjson/write/mod.rs b/src/io/ndjson/write/mod.rs index 0932f7b8ec..45ad52253c 100644 --- a/src/io/ndjson/write/mod.rs +++ b/src/io/ndjson/write/mod.rs @@ -9,7 +9,7 @@ use crate::error::Error; use super::super::json::write::new_serializer; fn serialize(array: &dyn Array, buffer: &mut Vec) { - let mut serializer = new_serializer(array); + let mut serializer = new_serializer(array, 0, usize::MAX); (0..array.len()).for_each(|_| { buffer.extend_from_slice(serializer.next().unwrap()); buffer.push(b'\n'); From c676340939a1301d3530c83e648dd463ab8d8291 Mon Sep 17 00:00:00 2001 From: theadd336 <31495729+theadd336@users.noreply.github.com> Date: Sun, 7 May 2023 04:51:51 -0400 Subject: [PATCH 06/80] Close Underlying Writer when IPC `StreamSink` is Closed (#1463) --- src/io/ipc/write/stream_async.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/io/ipc/write/stream_async.rs b/src/io/ipc/write/stream_async.rs index 8d156ff16a..df651461fe 100644 --- 
a/src/io/ipc/write/stream_async.rs +++ b/src/io/ipc/write/stream_async.rs @@ -2,7 +2,7 @@ use std::{pin::Pin, task::Poll}; -use futures::{future::BoxFuture, AsyncWrite, FutureExt, Sink}; +use futures::{future::BoxFuture, AsyncWrite, AsyncWriteExt, FutureExt, Sink}; use super::super::IpcField; pub use super::common::WriteOptions; @@ -170,6 +170,8 @@ where this.task = Some( async move { write_continuation(&mut writer, 0).await?; + writer.flush().await?; + writer.close().await?; Ok(None) } .boxed(), From bf1a3ceffc4599f5ea31144f121c735d20007b7f Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 8 May 2023 10:31:19 +0200 Subject: [PATCH 07/80] Improved pargquet nested null deserialization (#1477) --- .../parquet/read/deserialize/null/nested.rs | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/io/parquet/read/deserialize/null/nested.rs b/src/io/parquet/read/deserialize/null/nested.rs index 7f0d33d825..705b607398 100644 --- a/src/io/parquet/read/deserialize/null/nested.rs +++ b/src/io/parquet/read/deserialize/null/nested.rs @@ -10,9 +10,9 @@ use super::super::nested_utils::*; use super::super::utils; use super::super::Pages; -impl<'a> utils::PageState<'a> for () { +impl<'a> utils::PageState<'a> for usize { fn len(&self) -> usize { - 0 + *self } } @@ -26,16 +26,19 @@ impl DecodedState for usize { } impl<'a> NestedDecoder<'a> for NullDecoder { - type State = (); - type Dictionary = (); + type State = usize; + type Dictionary = usize; type DecodedState = usize; fn build_state( &self, _page: &'a DataPage, - _dict: Option<&'a Self::Dictionary>, + dict: Option<&'a Self::Dictionary>, ) -> Result { - Ok(()) + if let Some(n) = dict { + return Ok(*n); + } + Ok(1) } /// Initializes a new state @@ -43,8 +46,8 @@ impl<'a> NestedDecoder<'a> for NullDecoder { 0 } - fn push_valid(&self, _state: &mut Self::State, decoded: &mut Self::DecodedState) -> Result<()> { - *decoded += 1; + fn push_valid(&self, state: &mut Self::State, decoded: &mut Self::DecodedState) -> Result<()> { + *decoded += *state; Ok(()) } @@ -53,8 +56,8 @@ impl<'a> NestedDecoder<'a> for NullDecoder { *length += 1; } - fn deserialize_dict(&self, _page: &DictPage) -> Self::Dictionary { - unreachable!() + fn deserialize_dict(&self, page: &DictPage) -> Self::Dictionary { + page.num_values } } From b09e580f075293e9af2879dcc7f6b2d5c8fe520e Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Tue, 9 May 2023 11:19:20 +0200 Subject: [PATCH 08/80] Bumped version to 0.17.1 (#1479) --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 7294dc47df..2025b71777 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "arrow2" -version = "0.17.0" +version = "0.17.1" license = "Apache-2.0" description = "Unofficial implementation of Apache Arrow spec in safe Rust" homepage = "https://github.com/jorgecarleitao/arrow2" From ef2ef5e1135e35e262d105a7ab93813dbbf732b2 Mon Sep 17 00:00:00 2001 From: Hieu Minh Nguyen <38937534+therealhieu@users.noreply.github.com> Date: Tue, 16 May 2023 00:52:41 +0700 Subject: [PATCH 09/80] feat(deps): Bump arrow-rs 39 (#1482) --- Cargo.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 2025b71777..91cba8cd3d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -101,10 +101,10 @@ odbc-api = { version = "0.36", optional = true } ahash = "0.8" # Support conversion to/from arrow-rs -arrow-buffer = { version = "37.0.0", optional = true } -arrow-schema = { version = "37.0.0", 
optional = true } -arrow-data = { version = "37.0.0", optional = true } -arrow-array = { version = "37.0.0", optional = true } +arrow-buffer = { version = "39.0.0", optional = true } +arrow-schema = { version = "39.0.0", optional = true } +arrow-data = { version = "39.0.0", optional = true } +arrow-array = { version = "39.0.0", optional = true } [target.wasm32-unknown-unknown.dependencies] getrandom = { version = "0.2", features = ["js"] } From ce92eed4cd5a303669dfd8c971c6f9af7f62e969 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 15 May 2023 20:06:59 +0200 Subject: [PATCH 10/80] Added `Buffer::is_sliced` (#1480) --- src/buffer/immutable.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/buffer/immutable.rs b/src/buffer/immutable.rs index 98a31cb153..10c7968b0d 100644 --- a/src/buffer/immutable.rs +++ b/src/buffer/immutable.rs @@ -96,6 +96,13 @@ impl Buffer { self.len() == 0 } + /// Returns whether underlying data is sliced. + /// If sliced the [`Buffer`] is backed by + /// more data than the length of `Self`. + pub fn is_sliced(&self) -> bool { + self.data.len() != self.length + } + /// Returns the byte slice stored in this buffer #[inline] pub fn as_slice(&self) -> &[T] { From eed5ebb2b0d18dfbcce363f5d212410f52a49333 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Tue, 16 May 2023 10:17:06 +0200 Subject: [PATCH 11/80] Made `MutableList::try_extend_from_lengths` public (#1486) --- src/array/list/mutable.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/array/list/mutable.rs b/src/array/list/mutable.rs index 881cb620a0..8841ceaf53 100644 --- a/src/array/list/mutable.rs +++ b/src/array/list/mutable.rs @@ -174,8 +174,7 @@ impl MutableListArray { /// - the new offsets are not in monotonic increasing order. /// - any new offset is not in bounds of the backing array. /// - the passed iterator has no upper bound. - #[allow(dead_code)] - pub(crate) fn try_extend_from_lengths(&mut self, iterator: II) -> Result<()> + pub fn try_extend_from_lengths(&mut self, iterator: II) -> Result<()> where II: TrustedLen> + Clone, { From 148ea37467dc8d3375a23deab7c2087c630d4c87 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 20 May 2023 13:57:51 +0200 Subject: [PATCH 12/80] Splitted 'io_json' feature flags in read/write (#1487) --- Cargo.toml | 4 +++- src/io/json/mod.rs | 10 ++-------- src/io/json/read/mod.rs | 8 ++++++++ src/io/mod.rs | 11 ++++++++--- src/io/ndjson/mod.rs | 3 +++ 5 files changed, 24 insertions(+), 12 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 91cba8cd3d..7c03aad27c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -168,7 +168,9 @@ io_csv_async = ["io_csv_read_async"] io_csv_read = ["csv", "lexical-core"] io_csv_read_async = ["csv-async", "lexical-core", "futures"] io_csv_write = ["csv-core", "streaming-iterator", "lexical-core"] -io_json = ["json-deserializer", "streaming-iterator", "fallible-streaming-iterator", "indexmap", "lexical-core"] +io_json = ["io_json_read", "io_json_write"] +io_json_read = ["json-deserializer", "indexmap", "lexical-core"] +io_json_write = ["streaming-iterator", "fallible-streaming-iterator", "lexical-core"] io_ipc = ["arrow-format"] io_ipc_write_async = ["io_ipc", "futures"] io_ipc_read_async = ["io_ipc", "futures", "async-stream"] diff --git a/src/io/json/mod.rs b/src/io/json/mod.rs index ebbdc92b69..42e40a89c7 100644 --- a/src/io/json/mod.rs +++ b/src/io/json/mod.rs @@ -1,12 +1,6 @@ //! Convert data between the Arrow memory format and JSON line-delimited records. 
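The `Buffer::is_sliced` check added above works because slicing keeps the whole backing allocation alive; a usage sketch (assuming arrow2's existing `Buffer::from` and `Buffer::sliced`):

use arrow2::buffer::Buffer;

fn main() {
    let buf = Buffer::<i32>::from(vec![1, 2, 3, 4]);
    assert!(!buf.is_sliced()); // backing length == exposed length
    let sliced = buf.sliced(1, 2); // still backed by all four values
    assert!(sliced.is_sliced());
    assert_eq!(sliced.as_slice(), &[2, 3]);
}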
+#[cfg(feature = "io_json_read")] pub mod read; +#[cfg(feature = "io_json_write")] pub mod write; - -use crate::error::Error; - -impl From for Error { - fn from(error: json_deserializer::Error) -> Self { - Error::ExternalFormat(error.to_string()) - } -} diff --git a/src/io/json/read/mod.rs b/src/io/json/read/mod.rs index 686390df2b..087da38d50 100644 --- a/src/io/json/read/mod.rs +++ b/src/io/json/read/mod.rs @@ -8,3 +8,11 @@ pub(crate) use infer_schema::coerce_data_type; pub use infer_schema::{infer, infer_records_schema}; pub use json_deserializer; + +use crate::error::Error; + +impl From for Error { + fn from(error: json_deserializer::Error) -> Self { + Error::ExternalFormat(error.to_string()) + } +} diff --git a/src/io/mod.rs b/src/io/mod.rs index 69e4657fd7..0dd8e8651d 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -24,10 +24,10 @@ pub mod orc; )] pub mod csv; -#[cfg(feature = "io_json")] +#[cfg(any(feature = "io_json_read", feature = "io_json_write"))] #[cfg_attr(docsrs, doc(cfg(feature = "io_json")))] pub mod json; -#[cfg(feature = "io_json")] +#[cfg(any(feature = "io_json_read", feature = "io_json_write"))] #[cfg_attr(docsrs, doc(cfg(feature = "io_json")))] pub mod ndjson; @@ -55,5 +55,10 @@ pub mod avro; #[cfg_attr(docsrs, doc(cfg(feature = "io_print")))] pub mod print; -#[cfg(any(feature = "io_csv_write", feature = "io_avro", feature = "io_json"))] +#[cfg(any( + feature = "io_csv_write", + feature = "io_avro", + feature = "io_json_write", + feature = "io_json_read" +))] mod iterator; diff --git a/src/io/ndjson/mod.rs b/src/io/ndjson/mod.rs index 1448a5af6e..a77eda1e3d 100644 --- a/src/io/ndjson/mod.rs +++ b/src/io/ndjson/mod.rs @@ -1,3 +1,6 @@ //! APIs to read from and write to NDJSON + +#[cfg(feature = "io_json_read")] pub mod read; +#[cfg(feature = "io_json_write")] pub mod write; From 6f8b5dea4f8d5da637726fb853cc441a2c009b93 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 20 May 2023 14:57:44 +0200 Subject: [PATCH 13/80] Splitted extend_from_slice into extend_from_slice_unchecked (#1488) --- src/array/boolean/mutable.rs | 6 +++++- src/array/growable/boolean.rs | 6 +++++- src/array/growable/utils.rs | 7 +++++-- src/array/physical_binary.rs | 3 ++- src/bitmap/mutable.rs | 27 ++++++++++++++++++++++++--- src/compute/filter.rs | 7 ++++++- 6 files changed, 47 insertions(+), 9 deletions(-) diff --git a/src/array/boolean/mutable.rs b/src/array/boolean/mutable.rs index 4a4ed3f9ce..729ef81d6b 100644 --- a/src/array/boolean/mutable.rs +++ b/src/array/boolean/mutable.rs @@ -559,7 +559,11 @@ impl TryExtendFromSelf for MutableBooleanArray { extend_validity(self.len(), &mut self.validity, &other.validity); let slice = other.values.as_slice(); - self.values.extend_from_slice(slice, 0, other.values.len()); + // safety: invariant offset + length <= slice.len() + unsafe { + self.values + .extend_from_slice_unchecked(slice, 0, other.values.len()); + } Ok(()) } } diff --git a/src/array/growable/boolean.rs b/src/array/growable/boolean.rs index a9fb52ef3e..0cb1213403 100644 --- a/src/array/growable/boolean.rs +++ b/src/array/growable/boolean.rs @@ -63,7 +63,11 @@ impl<'a> Growable<'a> for GrowableBoolean<'a> { let values = array.values(); let (slice, offset, _) = values.as_slice(); - self.values.extend_from_slice(slice, start + offset, len); + // safety: invariant offset + length <= slice.len() + unsafe { + self.values + .extend_from_slice_unchecked(slice, start + offset, len); + } } fn extend_validity(&mut self, additional: usize) { diff --git a/src/array/growable/utils.rs 
b/src/array/growable/utils.rs index 06a85cd9ad..3e0c25a4ee 100644 --- a/src/array/growable/utils.rs +++ b/src/array/growable/utils.rs @@ -7,9 +7,12 @@ pub(super) type ExtendNullBits<'a> = Box ExtendNullBits { if let Some(bitmap) = array.validity() { Box::new(move |validity, start, len| { - assert!(start + len <= bitmap.len()); + debug_assert!(start + len <= bitmap.len()); let (slice, offset, _) = bitmap.as_slice(); - validity.extend_from_slice(slice, start + offset, len); + // safety: invariant offset + length <= slice.len() + unsafe { + validity.extend_from_slice_unchecked(slice, start + offset, len); + } }) } else if use_validity { Box::new(|validity, _, len| { diff --git a/src/array/physical_binary.rs b/src/array/physical_binary.rs index 161371603e..694e61a7ea 100644 --- a/src/array/physical_binary.rs +++ b/src/array/physical_binary.rs @@ -219,7 +219,8 @@ pub(crate) fn extend_validity( if let Some(other) = other { if let Some(validity) = validity { let slice = other.as_slice(); - validity.extend_from_slice(slice, 0, other.len()) + // safety: invariant offset + length <= slice.len() + unsafe { validity.extend_from_slice_unchecked(slice, 0, other.len()) } } else { let mut new_validity = MutableBitmap::from_len_set(length); new_validity.extend_from_slice(other.as_slice(), 0, other.len()); diff --git a/src/bitmap/mutable.rs b/src/bitmap/mutable.rs index 2cfc253b2a..1cc2193917 100644 --- a/src/bitmap/mutable.rs +++ b/src/bitmap/mutable.rs @@ -673,9 +673,15 @@ impl MutableBitmap { /// # Implementation /// When both [`MutableBitmap`]'s length and `offset` are both multiples of 8, /// this function performs a memcopy. Else, it first aligns bit by bit and then performs a memcopy. + /// # Safety + /// Caller must ensure `offset + length <= slice.len() * 8` #[inline] - pub fn extend_from_slice(&mut self, slice: &[u8], offset: usize, length: usize) { - assert!(offset + length <= slice.len() * 8); + pub unsafe fn extend_from_slice_unchecked( + &mut self, + slice: &[u8], + offset: usize, + length: usize, + ) { if length == 0 { return; }; @@ -691,11 +697,26 @@ impl MutableBitmap { debug_assert_eq!(self.length.saturating_add(7) / 8, self.buffer.len()); } + /// Extends the [`MutableBitmap`] from a slice of bytes with optional offset. + /// This is the fastest way to extend a [`MutableBitmap`]. + /// # Implementation + /// When both [`MutableBitmap`]'s length and `offset` are both multiples of 8, + /// this function performs a memcopy. Else, it first aligns bit by bit and then performs a memcopy. + #[inline] + pub fn extend_from_slice(&mut self, slice: &[u8], offset: usize, length: usize) { + assert!(offset + length <= slice.len() * 8); + // safety: invariant is asserted + unsafe { self.extend_from_slice_unchecked(slice, offset, length) } + } + /// Extends the [`MutableBitmap`] from a [`Bitmap`]. #[inline] pub fn extend_from_bitmap(&mut self, bitmap: &Bitmap) { let (slice, offset, length) = bitmap.as_slice(); - self.extend_from_slice(slice, offset, length); + // safety: bitmap.as_slice adheres to the invariant + unsafe { + self.extend_from_slice_unchecked(slice, offset, length); + } } /// Returns the slice of bytes of this [`MutableBitmap`]. 
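The pattern throughout this patch is a single bounds assert at the safe entry point, with callers that already uphold `offset + length <= slice.len() * 8` going through the unchecked variant; a usage sketch:

use arrow2::bitmap::MutableBitmap;

fn main() {
    let mut bitmap = MutableBitmap::new();
    let bytes = [0b0000_1101u8];
    // checked: asserts offset + length <= bytes.len() * 8
    bitmap.extend_from_slice(&bytes, 0, 4);
    // safety: 4 + 4 <= 8, so the invariant holds
    unsafe { bitmap.extend_from_slice_unchecked(&bytes, 4, 4) };
    assert_eq!(bitmap.len(), 8);
}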
diff --git a/src/compute/filter.rs b/src/compute/filter.rs index 0299e5045f..7ba260e702 100644 --- a/src/compute/filter.rs +++ b/src/compute/filter.rs @@ -108,7 +108,12 @@ where std::ptr::copy(chunk.as_ptr(), dst, size); dst = dst.add(size); - new_validity.extend_from_slice(validity_chunk.to_ne_bytes().as_ref(), 0, size); + // safety: invariant offset + length <= slice.len() + new_validity.extend_from_slice_unchecked( + validity_chunk.to_ne_bytes().as_ref(), + 0, + size, + ); } return; } From 6d9290e30d6b4017bef0b2aa25bb0383077e32ad Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 21 May 2023 10:01:02 +0200 Subject: [PATCH 14/80] Added PushUnchecked trait (#1489) --- src/array/fixed_size_list/mutable.rs | 29 ++++++++++++++++++++++++++++ src/array/mod.rs | 9 +++++++++ 2 files changed, 38 insertions(+) diff --git a/src/array/fixed_size_list/mutable.rs b/src/array/fixed_size_list/mutable.rs index d929f75e6e..1e387a2f70 100644 --- a/src/array/fixed_size_list/mutable.rs +++ b/src/array/fixed_size_list/mutable.rs @@ -1,5 +1,6 @@ use std::sync::Arc; +use crate::array::PushUnchecked; use crate::{ array::{ physical_binary::extend_validity, Array, MutableArray, TryExtend, TryExtendFromSelf, @@ -104,6 +105,15 @@ impl MutableFixedSizeListArray { Ok(()) } + #[inline] + /// Needs to be called when a valid value was extended to this array. + /// This is a relatively low level function, prefer `try_push` when you can. + pub fn push_valid(&mut self) { + if let Some(validity) = &mut self.validity { + validity.push(true) + } + } + #[inline] fn push_null(&mut self) { (0..self.size).for_each(|_| self.values.push_null()); @@ -221,6 +231,25 @@ where } } +impl PushUnchecked> for MutableFixedSizeListArray +where + M: MutableArray + Extend>, + I: IntoIterator>, +{ + /// # Safety + /// The caller must ensure that the `I` iterates exactly over `size` + /// items, where `size` is the fixed size width. + #[inline] + unsafe fn push_unchecked(&mut self, item: Option) { + if let Some(items) = item { + self.values.extend(items); + self.push_valid(); + } else { + self.push_null(); + } + } +} + impl TryExtendFromSelf for MutableFixedSizeListArray where M: MutableArray + TryExtendFromSelf, diff --git a/src/array/mod.rs b/src/array/mod.rs index f9c320a650..50eb962b2b 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -751,6 +751,15 @@ pub trait TryPush { fn try_push(&mut self, item: A) -> Result<()>; } +/// A trait describing the ability of a struct to receive new items. +pub trait PushUnchecked { + /// Push a new element that holds the invariants of the struct. + /// # Safety + /// The items must uphold the invariants of the struct + /// Read the specific implementation of the trait to understand what these are. + unsafe fn push_unchecked(&mut self, item: A); +} + /// A trait describing the ability of a struct to extend from a reference of itself. /// Specialization of [`TryExtend`]. 
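The new `PushUnchecked` impl on `MutableFixedSizeListArray` moves the width check to the caller; a sketch (assuming each pushed iterator yields exactly the fixed size):

use arrow2::array::{MutableFixedSizeListArray, MutablePrimitiveArray, PushUnchecked};

fn main() {
    let values = MutablePrimitiveArray::<i32>::new();
    let mut arr = MutableFixedSizeListArray::new(values, 2);
    // safety: each pushed iterator yields exactly 2 items, the fixed width
    unsafe {
        arr.push_unchecked(Some([Some(1), Some(2)]));
        arr.push_unchecked(None::<[Option<i32>; 2]>);
    }
    assert_eq!(arr.len(), 2);
}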
pub trait TryExtendFromSelf { From 38a2e9090554d25706536205fde2fa92fc850667 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Wed, 24 May 2023 07:05:46 +0200 Subject: [PATCH 15/80] Added cast FixedSizeList <-> LargeList (#1490) --- src/compute/cast/mod.rs | 49 +++++++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index e42f769e7e..afbf0fe44e 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -86,9 +86,15 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (FixedSizeList(list_from, _), List(list_to)) => { can_cast_types(&list_from.data_type, &list_to.data_type) } + (FixedSizeList(list_from, _), LargeList(list_to)) => { + can_cast_types(&list_from.data_type, &list_to.data_type) + } (List(list_from), FixedSizeList(list_to, _)) => { can_cast_types(&list_from.data_type, &list_to.data_type) } + (LargeList(list_from), FixedSizeList(list_to, _)) => { + can_cast_types(&list_from.data_type, &list_to.data_type) + } (List(list_from), List(list_to)) => { can_cast_types(&list_from.data_type, &list_to.data_type) } @@ -345,24 +351,24 @@ fn cast_large_to_list(array: &ListArray, to_type: &DataType) -> ListArray( fixed: &FixedSizeListArray, to_type: &DataType, options: CastOptions, -) -> Result> { +) -> Result> { let new_values = cast( fixed.values().as_ref(), - ListArray::::get_child_type(to_type), + ListArray::::get_child_type(to_type), options, )?; let offsets = (0..=fixed.len()) - .map(|ix| (ix * fixed.size()) as i32) + .map(|ix| O::from_as_usize(ix * fixed.size())) .collect::>(); // Safety: offsets _are_ monotonically increasing let offsets = unsafe { Offsets::new_unchecked(offsets) }; - Ok(ListArray::::new( + Ok(ListArray::::new( to_type.clone(), offsets.into(), new_values, @@ -370,20 +376,20 @@ fn cast_fixed_size_list_to_list( )) } -fn cast_list_to_fixed_size_list( - list: &ListArray, +fn cast_list_to_fixed_size_list( + list: &ListArray, inner: &Field, size: usize, options: CastOptions, ) -> Result { let offsets = list.offsets().buffer().iter(); - let expected = (0..list.len()).map(|ix| (ix * size) as i32); + let expected = (0..list.len()).map(|ix| O::from_as_usize(ix * size)); match offsets .zip(expected) .find(|(actual, expected)| *actual != expected) { - Some(_) => Err(Error::NotYetImplemented( + Some(_) => Err(Error::InvalidArgumentError( "incompatible offsets in source list".to_string(), )), None => { @@ -438,17 +444,32 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu (_, Struct(_)) => Err(Error::NotYetImplemented( "Cannot cast to struct from other types".to_string(), )), - (List(_), FixedSizeList(inner, size)) => cast_list_to_fixed_size_list( + (List(_), FixedSizeList(inner, size)) => cast_list_to_fixed_size_list::( array.as_any().downcast_ref().unwrap(), inner.as_ref(), *size, options, ) .map(|x| x.boxed()), - (FixedSizeList(_, _), List(_)) => { - cast_fixed_size_list_to_list(array.as_any().downcast_ref().unwrap(), to_type, options) - .map(|x| x.boxed()) - } + (LargeList(_), FixedSizeList(inner, size)) => cast_list_to_fixed_size_list::( + array.as_any().downcast_ref().unwrap(), + inner.as_ref(), + *size, + options, + ) + .map(|x| x.boxed()), + (FixedSizeList(_, _), List(_)) => cast_fixed_size_list_to_list::( + array.as_any().downcast_ref().unwrap(), + to_type, + options, + ) + .map(|x| x.boxed()), + (FixedSizeList(_, _), LargeList(_)) => cast_fixed_size_list_to_list::( + array.as_any().downcast_ref().unwrap(), + 
to_type, + options, + ) + .map(|x| x.boxed()), (List(_), List(_)) => { cast_list::(array.as_any().downcast_ref().unwrap(), to_type, options) .map(|x| x.boxed()) From a2a9bd7d32f9e107c1fc5ea79508b5e20a4e45b4 Mon Sep 17 00:00:00 2001 From: sundyli <543950155@qq.com> Date: Wed, 24 May 2023 23:02:58 -0700 Subject: [PATCH 16/80] ci: Disable integration-ipc tests (#1492) Disable integration-ipc.yml --- .github/workflows/integration-ipc.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration-ipc.yml b/.github/workflows/integration-ipc.yml index 11aa79bed5..7dcb8b0c7d 100644 --- a/.github/workflows/integration-ipc.yml +++ b/.github/workflows/integration-ipc.yml @@ -1,6 +1,7 @@ name: Integration IPC / Flight -on: [push, pull_request] +# on: [push, pull_request] +on: [] jobs: docker: From f348fd60351092216c611d58e5ae6680bcef2038 Mon Sep 17 00:00:00 2001 From: Howard Zuo Date: Thu, 25 May 2023 07:45:10 -0400 Subject: [PATCH 17/80] Implement `into_inner` for Utf8Array and BinaryArray for reusing Buffer allocations (#1491) --- src/array/binary/mod.rs | 12 ++++++++++++ src/array/utf8/mod.rs | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index b08e78dc6d..7247decb30 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -221,6 +221,18 @@ impl BinaryArray { impl_mut_validity!(); impl_into_array!(); + /// Returns its internal representation + #[must_use] + pub fn into_inner(self) -> (DataType, OffsetsBuffer, Buffer, Option) { + let Self { + data_type, + offsets, + values, + validity, + } = self; + (data_type, offsets, values, validity) + } + /// Try to convert this `BinaryArray` to a `MutableBinaryArray` #[must_use] pub fn into_mut(self) -> Either> { diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index 7392ccdd04..9440ae4330 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -240,6 +240,18 @@ impl Utf8Array { impl_mut_validity!(); impl_into_array!(); + /// Returns its internal representation + #[must_use] + pub fn into_inner(self) -> (DataType, OffsetsBuffer, Buffer, Option) { + let Self { + data_type, + offsets, + values, + validity, + } = self; + (data_type, offsets, values, validity) + } + /// Try to convert this `Utf8Array` to a `MutableUtf8Array` #[must_use] pub fn into_mut(self) -> Either> { From 320a90bf300f388747dba97bf8ddd838af4a41ea Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 25 May 2023 14:18:55 +0200 Subject: [PATCH 18/80] feat: expose `non_null_sum` (#1493) --- src/compute/aggregate/sum.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/compute/aggregate/sum.rs b/src/compute/aggregate/sum.rs index 9156a304c2..614a4cf538 100644 --- a/src/compute/aggregate/sum.rs +++ b/src/compute/aggregate/sum.rs @@ -21,7 +21,8 @@ pub trait Sum { } #[multiversion(targets = "simd")] -fn nonnull_sum(values: &[T]) -> T +/// Compute the sum of a slice +pub fn sum_slice(values: &[T]) -> T where T: NativeType + Simd + Add + std::iter::Sum, T::Simd: Sum + Add, @@ -97,7 +98,7 @@ where } match array.validity() { - None => Some(nonnull_sum(array.values())), + None => Some(sum_slice(array.values())), Some(bitmap) => Some(null_sum(array.values(), bitmap)), } } From 23e6ab937cdbba426bf1b532ab0e84947dafd38a Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 1 Jun 2023 13:34:35 +0200 Subject: [PATCH 19/80] chore: feature gate decimal arithmetic (#1494) --- Cargo.toml | 3 ++- src/compute/arithmetics/mod.rs | 1 + 2 files changed, 3 
insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 7c03aad27c..7e43f68c97 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -213,7 +213,8 @@ io_json_integration = ["hex", "serde", "serde_derive", "serde_json", "io_ipc"] io_print = ["comfy-table"] # the compute kernels. Disabling this significantly reduces compile time. compute_aggregate = ["multiversion"] -compute_arithmetics = ["strength_reduce"] +compute_arithmetics_decimal = ["strength_reduce"] +compute_arithmetics = ["strength_reduce", "compute_arithmetics_decimal"] compute_bitwise = [] compute_boolean = [] compute_boolean_kleene = [] diff --git a/src/compute/arithmetics/mod.rs b/src/compute/arithmetics/mod.rs index 3f09fe4d0b..b1ec2a12bc 100644 --- a/src/compute/arithmetics/mod.rs +++ b/src/compute/arithmetics/mod.rs @@ -13,6 +13,7 @@ //! adjusts the precision and scale to make the resulting value fit. #[forbid(unsafe_code)] pub mod basic; +#[cfg(feature = "compute_arithmetics_decimal")] pub mod decimal; pub mod time; From fbaf35e81d5357ebece1c49ac4e8a93aef26617a Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 1 Jun 2023 14:19:26 +0200 Subject: [PATCH 20/80] chore: update rustc and fix clippy (#1496) --- rust-toolchain.toml | 2 +- src/compute/sort/mod.rs | 56 +++++++++---------- src/ffi/stream.rs | 2 +- .../parquet/read/deserialize/nested_utils.rs | 6 +- tests/it/compute/sort/mod.rs | 14 ++--- 5 files changed, 34 insertions(+), 46 deletions(-) diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 5f7fac67f2..904c6cc5fc 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,2 +1,2 @@ [toolchain] -channel = "nightly-2022-12-05" +channel = "nightly-2023-06-01" diff --git a/src/compute/sort/mod.rs b/src/compute/sort/mod.rs index 92272d057c..be85c9f6cb 100644 --- a/src/compute/sort/mod.rs +++ b/src/compute/sort/mod.rs @@ -6,10 +6,7 @@ use crate::compute::take; use crate::datatypes::*; use crate::error::{Error, Result}; use crate::offset::Offset; -use crate::{ - array::*, - types::{Index, NativeType}, -}; +use crate::{array::*, types::Index}; mod binary; mod boolean; @@ -156,14 +153,14 @@ pub fn sort_to_indices( DataType::List(field) => { let (v, n) = partition_validity(values); match &field.data_type { - DataType::Int8 => Ok(sort_list::(values, v, n, options, limit)), - DataType::Int16 => Ok(sort_list::(values, v, n, options, limit)), - DataType::Int32 => Ok(sort_list::(values, v, n, options, limit)), - DataType::Int64 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt8 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt16 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt32 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt64 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int8 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int16 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int32 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int64 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt8 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt16 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt32 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt64 => Ok(sort_list::(values, v, n, options, limit)), t => Err(Error::NotYetImplemented(format!( "Sort not supported for list type {t:?}" ))), @@ -172,14 +169,14 @@ pub fn sort_to_indices( DataType::LargeList(field) => { let (v, n) = partition_validity(values); match field.data_type() 
{ - DataType::Int8 => Ok(sort_list::(values, v, n, options, limit)), - DataType::Int16 => Ok(sort_list::(values, v, n, options, limit)), - DataType::Int32 => Ok(sort_list::(values, v, n, options, limit)), - DataType::Int64 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt8 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt16 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt32 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt64 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int8 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int16 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int32 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int64 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt8 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt16 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt32 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt64 => Ok(sort_list::(values, v, n, options, limit)), t => Err(Error::NotYetImplemented(format!( "Sort not supported for list type {t:?}" ))), @@ -188,14 +185,14 @@ pub fn sort_to_indices( DataType::FixedSizeList(field, _) => { let (v, n) = partition_validity(values); match field.data_type() { - DataType::Int8 => Ok(sort_list::(values, v, n, options, limit)), - DataType::Int16 => Ok(sort_list::(values, v, n, options, limit)), - DataType::Int32 => Ok(sort_list::(values, v, n, options, limit)), - DataType::Int64 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt8 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt16 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt32 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt64 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int8 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int16 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int32 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int64 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt8 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt16 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt32 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt64 => Ok(sort_list::(values, v, n, options, limit)), t => Err(Error::NotYetImplemented(format!( "Sort not supported for list type {t:?}" ))), @@ -305,7 +302,7 @@ impl Default for SortOptions { } } -fn sort_list( +fn sort_list( values: &dyn Array, value_indices: Vec, null_indices: Vec, @@ -315,7 +312,6 @@ fn sort_list( where I: Index, O: Offset, - T: NativeType + std::cmp::PartialOrd, { let mut valids: Vec<(I, Box)> = values .as_any() diff --git a/src/ffi/stream.rs b/src/ffi/stream.rs index 5e0dabe012..2aeb1b47f5 100644 --- a/src/ffi/stream.rs +++ b/src/ffi/stream.rs @@ -132,7 +132,7 @@ unsafe extern "C" fn get_next(iter: *mut ArrowArrayStream, array: *mut ArrowArra if iter.is_null() { return 2001; } - let mut private = &mut *((*iter).private_data as *mut PrivateData); + let private = &mut *((*iter).private_data as *mut PrivateData); match private.iter.next() { Some(Ok(item)) => { diff --git a/src/io/parquet/read/deserialize/nested_utils.rs b/src/io/parquet/read/deserialize/nested_utils.rs index 211531f6c0..86c7f5bdab 100644 --- a/src/io/parquet/read/deserialize/nested_utils.rs +++ b/src/io/parquet/read/deserialize/nested_utils.rs @@ -452,11 +452,7 @@ fn 
extend_offsets2<'a, D: NestedDecoder<'a>>( let is_valid = nest.is_nullable() && def > cum_sum[depth]; nest.push(length, is_valid); - if nest.is_required() && !is_valid { - is_required = true; - } else { - is_required = false - }; + is_required = nest.is_required() && !is_valid; if depth == max_depth - 1 { // the leaf / primitive diff --git a/tests/it/compute/sort/mod.rs b/tests/it/compute/sort/mod.rs index 2ede887364..736cfbadba 100644 --- a/tests/it/compute/sort/mod.rs +++ b/tests/it/compute/sort/mod.rs @@ -41,11 +41,7 @@ fn string_arrays(data: &[Option<&str>], options: SortOptions, expected_data: &[O assert_eq!(expected, output.as_ref()) } -fn string_dict_arrays( - data: &[Option<&str>], - options: SortOptions, - expected_data: &[Option<&str>], -) { +fn string_dict_arrays(data: &[Option<&str>], options: SortOptions, expected_data: &[Option<&str>]) { let mut input = MutableDictionaryArray::>::new(); input.try_extend(data.iter().copied()).unwrap(); let input = input.into_arc(); @@ -351,7 +347,7 @@ fn strings() { #[test] fn string_dicts() { - string_dict_arrays::( + string_dict_arrays( &[ None, Some("bad"), @@ -374,7 +370,7 @@ fn string_dicts() { ], ); - string_dict_arrays::( + string_dict_arrays( &[ None, Some("bad"), @@ -397,7 +393,7 @@ fn string_dicts() { ], ); - string_dict_arrays::( + string_dict_arrays( &[ None, Some("bad"), @@ -420,7 +416,7 @@ fn string_dicts() { ], ); - string_dict_arrays::( + string_dict_arrays( &[ None, Some("bad"), From 1db6afad063555bb891202aaa5d2da6a5aa45f0b Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 1 Jun 2023 14:45:14 +0200 Subject: [PATCH 21/80] Added cast LargeBinary -> LargeList (#1497) --- src/compute/cast/binary_to.rs | 12 +++++++ src/compute/cast/mod.rs | 59 +++++++++++++++++++++-------------- 2 files changed, 48 insertions(+), 23 deletions(-) diff --git a/src/compute/cast/binary_to.rs b/src/compute/cast/binary_to.rs index d84c7dd1bd..82f827e3f6 100644 --- a/src/compute/cast/binary_to.rs +++ b/src/compute/cast/binary_to.rs @@ -144,3 +144,15 @@ pub fn fixed_size_binary_binary( from.validity().cloned(), ) } + +/// Conversion of binary +pub fn binary_to_list(from: &BinaryArray, to_data_type: DataType) -> ListArray { + let values = from.values().clone(); + let values = PrimitiveArray::new(DataType::UInt8, values, None); + ListArray::::new( + to_data_type, + from.offsets().clone(), + values.boxed(), + from.validity().cloned(), + ) +} diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index afbf0fe44e..d97878d497 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -137,7 +137,14 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Binary, to_type) => { is_numeric(to_type) || matches!(to_type, LargeBinary | Utf8 | LargeUtf8) } - (LargeBinary, to_type) => is_numeric(to_type) || matches!(to_type, Binary | LargeUtf8), + (LargeBinary, to_type) => { + is_numeric(to_type) + || match to_type { + Binary | LargeUtf8 => true, + LargeList(field) => matches!(field.data_type, UInt8), + _ => false, + } + } (FixedSizeBinary(_), to_type) => matches!(to_type, Binary | LargeBinary), (Timestamp(_, _), Utf8) => true, (Timestamp(_, _), LargeUtf8) => true, @@ -684,29 +691,35 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu ))), }, - (LargeBinary, _) => match to_type { - UInt8 => binary_to_primitive_dyn::(array, to_type, options), - UInt16 => binary_to_primitive_dyn::(array, to_type, options), - UInt32 => binary_to_primitive_dyn::(array, to_type, options), - UInt64 => 
binary_to_primitive_dyn::(array, to_type, options), - Int8 => binary_to_primitive_dyn::(array, to_type, options), - Int16 => binary_to_primitive_dyn::(array, to_type, options), - Int32 => binary_to_primitive_dyn::(array, to_type, options), - Int64 => binary_to_primitive_dyn::(array, to_type, options), - Float32 => binary_to_primitive_dyn::(array, to_type, options), - Float64 => binary_to_primitive_dyn::(array, to_type, options), - Binary => { - binary_large_to_binary(array.as_any().downcast_ref().unwrap(), to_type.clone()) - .map(|x| x.boxed()) - } - LargeUtf8 => { - binary_to_utf8::(array.as_any().downcast_ref().unwrap(), to_type.clone()) - .map(|x| x.boxed()) + (LargeBinary, _) => { + match to_type { + UInt8 => binary_to_primitive_dyn::(array, to_type, options), + UInt16 => binary_to_primitive_dyn::(array, to_type, options), + UInt32 => binary_to_primitive_dyn::(array, to_type, options), + UInt64 => binary_to_primitive_dyn::(array, to_type, options), + Int8 => binary_to_primitive_dyn::(array, to_type, options), + Int16 => binary_to_primitive_dyn::(array, to_type, options), + Int32 => binary_to_primitive_dyn::(array, to_type, options), + Int64 => binary_to_primitive_dyn::(array, to_type, options), + Float32 => binary_to_primitive_dyn::(array, to_type, options), + Float64 => binary_to_primitive_dyn::(array, to_type, options), + Binary => { + binary_large_to_binary(array.as_any().downcast_ref().unwrap(), to_type.clone()) + .map(|x| x.boxed()) + } + LargeUtf8 => { + binary_to_utf8::(array.as_any().downcast_ref().unwrap(), to_type.clone()) + .map(|x| x.boxed()) + } + LargeList(inner) if matches!(inner.data_type, DataType::UInt8) => Ok( + binary_to_list::(array.as_any().downcast_ref().unwrap(), to_type.clone()) + .boxed(), + ), + _ => Err(Error::NotYetImplemented(format!( + "Casting from {from_type:?} to {to_type:?} not supported", + ))), } - _ => Err(Error::NotYetImplemented(format!( - "Casting from {from_type:?} to {to_type:?} not supported", - ))), - }, + } (FixedSizeBinary(_), _) => match to_type { Binary => Ok(fixed_size_binary_binary::( array.as_any().downcast_ref().unwrap(), From 1ebb7dbe7ca548d0628a028396bfc7c8eaa684c0 Mon Sep 17 00:00:00 2001 From: Reece Kibble Date: Thu, 1 Jun 2023 21:07:24 +0800 Subject: [PATCH 22/80] Fix missing `UInt64` case for page filtering (#1498) --- src/io/parquet/read/indexes/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/io/parquet/read/indexes/mod.rs b/src/io/parquet/read/indexes/mod.rs index 08a8dde5d8..de7b69b3a4 100644 --- a/src/io/parquet/read/indexes/mod.rs +++ b/src/io/parquet/read/indexes/mod.rs @@ -147,7 +147,8 @@ fn deserialize( .unwrap(); Ok(primitive::deserialize_i32(&index.indexes, data_type).into()) } - PhysicalType::Primitive(PrimitiveType::Int64) => { + PhysicalType::Primitive(PrimitiveType::UInt64) + | PhysicalType::Primitive(PrimitiveType::Int64) => { let index = indexes.pop_front().unwrap(); match index.physical_type() { ParquetPhysicalType::Int64 => { From fb5e4d591c7149df590a330365fae55d2370962f Mon Sep 17 00:00:00 2001 From: sundyli <543950155@qq.com> Date: Thu, 1 Jun 2023 06:07:41 -0700 Subject: [PATCH 23/80] Bump arrow-rs to 40.0.0 (#1495) --- Cargo.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7e43f68c97..79651e5ab1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -101,10 +101,10 @@ odbc-api = { version = "0.36", optional = true } ahash = "0.8" # Support conversion to/from arrow-rs -arrow-buffer = { version = "39.0.0", optional = true } 
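The LargeBinary -> LargeList cast added in #1497 above is zero-copy: `binary_to_list` re-labels the raw byte buffer as UInt8 values and reuses the binary array's offsets and validity unchanged. A minimal sketch of that construction, mirroring the new kernel (the inner field name "item" is an arbitrary choice for illustration):

use arrow2::array::{BinaryArray, ListArray, PrimitiveArray};
use arrow2::datatypes::{DataType, Field};

fn bytes_as_list(from: &BinaryArray<i64>) -> ListArray<i64> {
    let item = Box::new(Field::new("item", DataType::UInt8, true));
    // reinterpret the raw bytes as a UInt8 array (no copy of the buffer)
    let values = PrimitiveArray::new(DataType::UInt8, from.values().clone(), None);
    // reuse the existing offsets and validity to frame the byte lists
    ListArray::new(
        DataType::LargeList(item),
        from.offsets().clone(),
        values.boxed(),
        from.validity().cloned(),
    )
}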
-arrow-schema = { version = "39.0.0", optional = true } -arrow-data = { version = "39.0.0", optional = true } -arrow-array = { version = "39.0.0", optional = true } +arrow-buffer = { version = "^40.0.0", optional = true } +arrow-schema = { version = "^40.0.0", optional = true } +arrow-data = { version = "^40.0.0", optional = true } +arrow-array = { version = "^40.0.0", optional = true } [target.wasm32-unknown-unknown.dependencies] getrandom = { version = "0.2", features = ["js"] } From 82c959a88bae848fa34848707a2e1e632f8e4054 Mon Sep 17 00:00:00 2001 From: Qqwy / Marten Date: Mon, 5 Jun 2023 05:59:05 +0200 Subject: [PATCH 24/80] Relax the checks for the presence of ArrowSchema.name as this field is optional (#1499) --- src/ffi/schema.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/ffi/schema.rs b/src/ffi/schema.rs index 2751583ef1..e41de33e43 100644 --- a/src/ffi/schema.rs +++ b/src/ffi/schema.rs @@ -154,15 +154,18 @@ impl ArrowSchema { } /// returns the name of this schema. + /// + /// Since this field is optional, `""` is returned if it is not set (as per the spec). pub(crate) fn name(&self) -> &str { - assert!(!self.name.is_null()); + if self.name.is_null() { + return ""; + } // safe because the lifetime of `self.name` equals `self` unsafe { CStr::from_ptr(self.name) }.to_str().unwrap() } pub(crate) fn child(&self, index: usize) -> &'static Self { assert!(index < self.n_children as usize); - assert!(!self.name.is_null()); unsafe { self.children.add(index).as_ref().unwrap().as_ref().unwrap() } } From 99e30d3549219934adfee00a60f15f6c0ffb06e4 Mon Sep 17 00:00:00 2001 From: Qqwy / Marten Date: Mon, 5 Jun 2023 05:59:21 +0200 Subject: [PATCH 25/80] Allows to use wrappers other than Box to build ArrowArrayStreamReader. (#1500) --- src/ffi/stream.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ffi/stream.rs b/src/ffi/stream.rs index 2aeb1b47f5..9611935821 100644 --- a/src/ffi/stream.rs +++ b/src/ffi/stream.rs @@ -1,4 +1,5 @@ use std::ffi::{CStr, CString}; +use std::ops::DerefMut; use crate::{array::Array, datatypes::Field, error::Error}; @@ -45,12 +46,12 @@ unsafe fn handle_error(iter: &mut ArrowArrayStream) -> Error { } /// Implements an iterator of [`Array`] consumed from the [C stream interface](https://arrow.apache.org/docs/format/CStreamInterface.html). 
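The relaxed check in #1499 above follows the C data interface, where `ArrowSchema.name` is optional and a null pointer simply means "no name". A standalone sketch of the pattern (`optional_c_str` is a hypothetical helper, not a crate API):

use std::ffi::CStr;
use std::os::raw::c_char;

// SAFETY contract: a non-null `name` must point to a valid, UTF-8,
// NUL-terminated string that outlives the returned borrow.
unsafe fn optional_c_str<'a>(name: *const c_char) -> &'a str {
    if name.is_null() {
        // optional field: absent per the spec, not out-of-spec
        return "";
    }
    CStr::from_ptr(name).to_str().unwrap()
}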
-pub struct ArrowArrayStreamReader { - iter: Box, +pub struct ArrowArrayStreamReader> { + iter: Iter, field: Field, } -impl ArrowArrayStreamReader { +impl> ArrowArrayStreamReader { /// Returns a new [`ArrowArrayStreamReader`] /// # Error /// Errors iff the [`ArrowArrayStream`] is out of specification @@ -60,7 +61,7 @@ impl ArrowArrayStreamReader { /// In particular: /// * The `ArrowArrayStream` fulfills the invariants of the C stream interface /// * The schema `get_schema` produces fulfills the C data interface - pub unsafe fn try_new(mut iter: Box) -> Result { + pub unsafe fn try_new(mut iter: Iter) -> Result { if iter.get_next.is_none() { return Err(Error::OutOfSpec( "The C stream MUST contain a non-null get_next".to_string(), From 0bfc3c05287c500ff42120dd17d5cf800484590b Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 9 Jun 2023 08:55:32 +0200 Subject: [PATCH 26/80] inline MutablePrimitiveArray::push (#1505) --- src/array/primitive/mutable.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/array/primitive/mutable.rs b/src/array/primitive/mutable.rs index ce6c8fa3c9..4432ab2e33 100644 --- a/src/array/primitive/mutable.rs +++ b/src/array/primitive/mutable.rs @@ -132,6 +132,7 @@ impl MutablePrimitiveArray { } /// Adds a new value to the array. + #[inline] pub fn push(&mut self, value: Option) { match value { Some(value) => { From f46e26cfef09f6355e22af2e13c82fb07859d29f Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 12 Jun 2023 10:30:47 +0200 Subject: [PATCH 27/80] feat: add from_inner (#1506) --- src/array/boolean/mod.rs | 17 ++++++++ src/array/primitive/mod.rs | 28 +++++++++++++ src/bitmap/immutable.rs | 85 ++++++++++++++++++++++++++++---------- src/buffer/immutable.rs | 24 +++++++++++ src/buffer/mod.rs | 52 ++++++++++++++++++++++- src/ffi/array.rs | 3 +- 6 files changed, 185 insertions(+), 24 deletions(-) diff --git a/src/array/boolean/mod.rs b/src/array/boolean/mod.rs index 78555572bb..0b634ee90e 100644 --- a/src/array/boolean/mod.rs +++ b/src/array/boolean/mod.rs @@ -354,6 +354,23 @@ impl BooleanArray { } = self; (data_type, values, validity) } + + /// Creates a `[BooleanArray]` from its internal representation. + /// This is the inverted from `[BooleanArray::into_inner]` + /// + /// # Safety + /// Callers must ensure all invariants of this struct are upheld. + pub unsafe fn from_inner_unchecked( + data_type: DataType, + values: Bitmap, + validity: Option, + ) -> Self { + Self { + data_type, + values, + validity, + } + } } impl Array for BooleanArray { diff --git a/src/array/primitive/mod.rs b/src/array/primitive/mod.rs index 9e4ae03881..04b74a3529 100644 --- a/src/array/primitive/mod.rs +++ b/src/array/primitive/mod.rs @@ -289,6 +289,34 @@ impl PrimitiveArray { (data_type, values, validity) } + /// Creates a `[PrimitiveArray]` from its internal representation. + /// This is the inverted from `[PrimitiveArray::into_inner]` + pub fn from_inner( + data_type: DataType, + values: Buffer, + validity: Option, + ) -> Result { + check(&data_type, &values, validity.as_ref().map(|v| v.len()))?; + Ok(unsafe { Self::from_inner_unchecked(data_type, values, validity) }) + } + + /// Creates a `[PrimitiveArray]` from its internal representation. + /// This is the inverted from `[PrimitiveArray::into_inner]` + /// + /// # Safety + /// Callers must ensure all invariants of this struct are upheld. 
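A sketch of the round trip these `into_inner`/`from_inner_unchecked` pairs enable, using the `BooleanArray` variant shown just below; the `unsafe` is justified here because the parts come straight from a valid array:

use arrow2::array::BooleanArray;

fn round_trip(array: BooleanArray) -> BooleanArray {
    // decompose into (DataType, Bitmap, Option<Bitmap>)
    let (data_type, values, validity) = array.into_inner();
    // SAFETY: the parts were produced by `into_inner`, so invariants hold
    unsafe { BooleanArray::from_inner_unchecked(data_type, values, validity) }
}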
+ pub unsafe fn from_inner_unchecked( + data_type: DataType, + values: Buffer, + validity: Option, + ) -> Self { + Self { + data_type, + values, + validity, + } + } + /// Try to convert this [`PrimitiveArray`] to a [`MutablePrimitiveArray`] via copy-on-write semantics. /// /// A [`PrimitiveArray`] is backed by a [`Buffer`] and [`Bitmap`] which are essentially `Arc>`. diff --git a/src/bitmap/immutable.rs b/src/bitmap/immutable.rs index be1e3a662d..2a5e14dc5b 100644 --- a/src/bitmap/immutable.rs +++ b/src/bitmap/immutable.rs @@ -65,6 +65,17 @@ impl Default for Bitmap { } } +pub(super) fn check(bytes: &[u8], offset: usize, length: usize) -> Result<(), Error> { + if offset + length > bytes.len().saturating_mul(8) { + return Err(Error::InvalidArgumentError(format!( + "The offset + length of the bitmap ({}) must be `<=` to the number of bytes times 8 ({})", + offset + length, + bytes.len().saturating_mul(8) + ))); + } + Ok(()) +} + impl Bitmap { /// Initializes an empty [`Bitmap`]. #[inline] @@ -77,13 +88,7 @@ impl Bitmap { /// This function errors iff `length > bytes.len() * 8` #[inline] pub fn try_new(bytes: Vec, length: usize) -> Result { - if length > bytes.len().saturating_mul(8) { - return Err(Error::InvalidArgumentError(format!( - "The length of the bitmap ({}) must be `<=` to the number of bytes times 8 ({})", - length, - bytes.len().saturating_mul(8) - ))); - } + check(&bytes, 0, length)?; let unset_bits = count_zeros(&bytes, 0, length); Ok(Self { length, @@ -117,21 +122,6 @@ impl Bitmap { BitChunks::new(&self.bytes, self.offset, self.length) } - /// Creates a new [`Bitmap`] from [`Bytes`] and a length. - /// # Panic - /// Panics iff `length <= bytes.len() * 8` - #[inline] - pub(crate) fn from_bytes(bytes: Bytes, length: usize) -> Self { - assert!(length <= bytes.len() * 8); - let unset_bits = count_zeros(&bytes, 0, length); - Self { - length, - offset: 0, - bytes: Arc::new(bytes), - unset_bits, - } - } - /// Returns the byte slice of this [`Bitmap`]. /// /// The returned tuple contains: @@ -327,6 +317,57 @@ impl Bitmap { None } } + + /// Returns its internal representation + #[must_use] + pub fn into_inner(self) -> (Arc>, usize, usize, usize) { + let Self { + bytes, + offset, + length, + unset_bits, + } = self; + (bytes, offset, length, unset_bits) + } + + /// Creates a `[Bitmap]` from its internal representation. + /// This is the inverted from `[Bitmap::into_inner]` + /// + /// # Safety + /// The invariants of this struct must be upheld + pub unsafe fn from_inner( + bytes: Arc>, + offset: usize, + length: usize, + unset_bits: usize, + ) -> Result { + check(&bytes, offset, length)?; + Ok(Self { + bytes, + offset, + length, + unset_bits, + }) + } + + /// Creates a `[Bitmap]` from its internal representation. + /// This is the inverted from `[Bitmap::into_inner]` + /// + /// # Safety + /// Callers must ensure all invariants of this struct are upheld. + pub unsafe fn from_inner_unchecked( + bytes: Arc>, + offset: usize, + length: usize, + unset_bits: usize, + ) -> Self { + Self { + bytes, + offset, + length, + unset_bits, + } + } } impl> From
for Bitmap { diff --git a/src/buffer/immutable.rs b/src/buffer/immutable.rs index 10c7968b0d..0da4a41ace 100644 --- a/src/buffer/immutable.rs +++ b/src/buffer/immutable.rs @@ -244,6 +244,30 @@ impl Buffer { pub fn shared_count_weak(&self) -> usize { Arc::weak_count(&self.data) } + + /// Returns its internal representation + #[must_use] + pub fn into_inner(self) -> (Arc>, usize, usize) { + let Self { + data, + offset, + length, + } = self; + (data, offset, length) + } + + /// Creates a `[Bitmap]` from its internal representation. + /// This is the inverted from `[Bitmap::into_inner]` + /// + /// # Safety + /// Callers must ensure all invariants of this struct are upheld. + pub unsafe fn from_inner_unchecked(data: Arc>, offset: usize, length: usize) -> Self { + Self { + data, + offset, + length, + } + } } impl From> for Buffer { diff --git a/src/buffer/mod.rs b/src/buffer/mod.rs index 7ce2677532..46c0a4d64a 100644 --- a/src/buffer/mod.rs +++ b/src/buffer/mod.rs @@ -4,6 +4,7 @@ mod immutable; mod iterator; use crate::ffi::InternalArrowArray; +use std::ops::Deref; pub(crate) enum BytesAllocator { InternalArrowArray(InternalArrowArray), @@ -11,8 +12,57 @@ pub(crate) enum BytesAllocator { #[cfg(feature = "arrow")] Arrow(arrow_buffer::Buffer), } +pub(crate) type BytesInner = foreign_vec::ForeignVec; -pub(crate) type Bytes = foreign_vec::ForeignVec; +/// Bytes representation. +#[repr(transparent)] +pub struct Bytes(BytesInner); + +impl Bytes { + /// Takes ownership of an allocated memory region. + /// # Panics + /// This function panics if and only if pointer is not null + /// # Safety + /// This function is safe if and only if `ptr` is valid for `length` + /// # Implementation + /// This function leaks if and only if `owner` does not deallocate + /// the region `[ptr, ptr+length[` when dropped. + #[inline] + pub(crate) unsafe fn from_foreign(ptr: *const T, length: usize, owner: BytesAllocator) -> Self { + Self(BytesInner::from_foreign(ptr, length, owner)) + } + + /// Returns a `Some` mutable reference of [`Vec`] iff this was initialized + /// from a [`Vec`] and `None` otherwise. 
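The `Bytes` wrapper above is a classic newtype: `repr(transparent)` keeps its layout identical to the inner `ForeignVec`, and `Deref` exposes the slice API without leaking the internals. A self-contained illustration of the pattern (`Wrapper` and its `Vec<u8>` inner are stand-ins, not crate types):

use std::ops::Deref;

#[repr(transparent)]
struct Wrapper(Vec<u8>); // `Vec<u8>` stands in for the crate's inner type

impl Deref for Wrapper {
    type Target = [u8];

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

fn main() {
    let w = Wrapper(vec![1, 2, 3]);
    assert_eq!(w.len(), 3); // slice methods resolve through `Deref`
}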
+ #[inline] + pub(crate) fn get_vec(&mut self) -> Option<&mut Vec> { + self.0.get_vec() + } +} + +impl Deref for Bytes { + type Target = [T]; + + #[inline] + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl From> for Bytes { + #[inline] + fn from(data: Vec) -> Self { + let inner: BytesInner = data.into(); + Bytes(inner) + } +} + +impl From> for Bytes { + #[inline] + fn from(value: BytesInner) -> Self { + Self(value) + } +} #[cfg(feature = "arrow")] pub(crate) fn to_buffer( diff --git a/src/ffi/array.rs b/src/ffi/array.rs index ad1b0568a7..1fa68eabbe 100644 --- a/src/ffi/array.rs +++ b/src/ffi/array.rs @@ -258,10 +258,11 @@ unsafe fn create_bitmap( let len: usize = array.length.try_into().expect("length to fit in `usize`"); let offset: usize = array.offset.try_into().expect("Offset to fit in `usize`"); + let null_count: usize = array.null_count(); let bytes_len = bytes_for(offset + len); let bytes = Bytes::from_foreign(ptr, bytes_len, BytesAllocator::InternalArrowArray(owner)); - Ok(Bitmap::from_bytes(bytes, offset + len).sliced(offset, len)) + Bitmap::from_inner(Arc::new(bytes), offset, len, null_count) } fn buffer_offset(array: &ArrowArray, data_type: &DataType, i: usize) -> usize { From 2d2e7053f9a50810bfe9cecff25ab39089aef98e Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 12 Jun 2023 14:31:15 +0200 Subject: [PATCH 28/80] defer null_count compute (#1507) * defer null_count compute * rogue import --- src/ffi/array.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/ffi/array.rs b/src/ffi/array.rs index 1fa68eabbe..b1c77d7366 100644 --- a/src/ffi/array.rs +++ b/src/ffi/array.rs @@ -1,6 +1,7 @@ //! Contains functionality to load an ArrayData from the C Data Interface use std::sync::Arc; +use crate::bitmap::utils::count_zeros; use crate::buffer::BytesAllocator; use crate::{ array::*, @@ -253,15 +254,22 @@ unsafe fn create_bitmap( data_type: &DataType, owner: InternalArrowArray, index: usize, + // if this is the validity bitmap + // we can use the null count directly + is_validity: bool, ) -> Result { let ptr = get_buffer_ptr(array, data_type, index)?; let len: usize = array.length.try_into().expect("length to fit in `usize`"); - let offset: usize = array.offset.try_into().expect("Offset to fit in `usize`"); - let null_count: usize = array.null_count(); + let offset: usize = array.offset.try_into().expect("offset to fit in `usize`"); let bytes_len = bytes_for(offset + len); let bytes = Bytes::from_foreign(ptr, bytes_len, BytesAllocator::InternalArrowArray(owner)); + let null_count: usize = if is_validity { + array.null_count() + } else { + count_zeros(bytes.as_ref(), offset, len) + }; Bitmap::from_inner(Arc::new(bytes), offset, len, null_count) } @@ -420,7 +428,7 @@ pub trait ArrowArrayRef: std::fmt::Debug { if self.array().null_count() == 0 { Ok(None) } else { - create_bitmap(self.array(), self.data_type(), self.owner(), 0).map(Some) + create_bitmap(self.array(), self.data_type(), self.owner(), 0, true).map(Some) } } @@ -436,7 +444,7 @@ pub trait ArrowArrayRef: std::fmt::Debug { /// * the buffer at position `index` is valid for the declared length /// * the buffers' pointer is not mutable for the lifetime of `owner` unsafe fn bitmap(&self, index: usize) -> Result { - create_bitmap(self.array(), self.data_type(), self.owner(), index) + create_bitmap(self.array(), self.data_type(), self.owner(), index, false) } /// # Safety From 0d568a38850176d710523771a3a8c14b3d93b9ba Mon Sep 17 00:00:00 2001 From: Colin Jermain Date: Mon, 19 Jun 2023 
05:25:10 -0400 Subject: [PATCH 29/80] Adding new constructor for MutableListArray (#1503) --- src/array/list/mutable.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/array/list/mutable.rs b/src/array/list/mutable.rs index 8841ceaf53..2bb39dea87 100644 --- a/src/array/list/mutable.rs +++ b/src/array/list/mutable.rs @@ -142,6 +142,23 @@ impl MutableListArray { Self::new_from(values, data_type, capacity) } + /// Creates a new [`MutableListArray`] from a [`MutableArray`], [`Offsets`] and + /// [`MutableBitmap`]. + pub fn new_from_mutable( + values: M, + offsets: Offsets, + validity: Option, + ) -> Self { + assert_eq!(values.len(), offsets.last().to_usize()); + let data_type = values.data_type().clone(); + Self { + data_type, + offsets, + values, + validity, + } + } + #[inline] /// Needs to be called when a valid value was extended to this array. /// This is a relatively low level function, prefer `try_push` when you can. From 3ab9b61f0ec6656d4613ad077c87bad6fac4682d Mon Sep 17 00:00:00 2001 From: Colin Jermain Date: Mon, 19 Jun 2023 22:19:13 -0400 Subject: [PATCH 30/80] Fixing dtype for MutableListArray (#1509) --- src/array/list/mutable.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/array/list/mutable.rs b/src/array/list/mutable.rs index 2bb39dea87..d24475e86d 100644 --- a/src/array/list/mutable.rs +++ b/src/array/list/mutable.rs @@ -150,7 +150,7 @@ impl MutableListArray { validity: Option, ) -> Self { assert_eq!(values.len(), offsets.last().to_usize()); - let data_type = values.data_type().clone(); + let data_type = ListArray::::default_datatype(values.data_type().clone()); Self { data_type, offsets, From 98287faf826d74982be2c1734b514a4e86ff14d1 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 26 Jun 2023 09:06:58 +0200 Subject: [PATCH 31/80] feat: add json null serialization (#1512) --- src/io/json/write/serialize.rs | 10 ++++++++++ src/util/mod.rs | 2 ++ tests/it/io/json/write.rs | 9 +++++++++ 3 files changed, 21 insertions(+) diff --git a/src/io/json/write/serialize.rs b/src/io/json/write/serialize.rs index dfa104d78e..7d46b82e0f 100644 --- a/src/io/json/write/serialize.rs +++ b/src/io/json/write/serialize.rs @@ -51,6 +51,15 @@ fn boolean_serializer<'a>( materialize_serializer(f, array.iter(), offset, take) } +fn null_serializer( + len: usize, + offset: usize, + take: usize, +) -> Box + Send + Sync> { + let f = |_x: (), buf: &mut Vec| buf.extend_from_slice(b"null"); + materialize_serializer(f, std::iter::repeat(()).take(len), offset, take) +} + fn primitive_serializer<'a, T: NativeType + ToLexical>( array: &'a PrimitiveArray, offset: usize, @@ -376,6 +385,7 @@ pub(crate) fn new_serializer<'a>( ) } } + DataType::Null => null_serializer(array.len(), offset, take), other => todo!("Writing {:?} to JSON", other), } } diff --git a/src/util/mod.rs b/src/util/mod.rs index c88c76e6da..90642b151a 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -5,6 +5,7 @@ feature = "io_csv_write", feature = "io_csv_read", feature = "io_json", + feature = "io_json_write", feature = "compute_cast" ))] mod lexical; @@ -13,6 +14,7 @@ mod lexical; feature = "io_csv_write", feature = "io_csv_read", feature = "io_json", + feature = "io_json_write", feature = "compute_cast" ))] pub use lexical::*; diff --git a/tests/it/io/json/write.rs b/tests/it/io/json/write.rs index 9c8d1313f2..44cc7ef125 100644 --- a/tests/it/io/json/write.rs +++ b/tests/it/io/json/write.rs @@ -26,6 +26,15 @@ fn int32() -> Result<()> { test!(array, expected) } +#[test] +fn null() -> 
Result<()> { + let array = NullArray::new(DataType::Null, 3); + + let expected = r#"[null,null,null]"#; + + test!(array, expected) +} + #[test] fn f32() -> Result<()> { let array = Float32Array::from([ From e923e03d38ed84c1d2ff37867517d31b3ed8aca9 Mon Sep 17 00:00:00 2001 From: Frank Murphy Date: Mon, 26 Jun 2023 03:08:31 -0400 Subject: [PATCH 32/80] Fix list array parsing in pandas record json (#1511) --- src/io/json/read/deserialize.rs | 13 +++++-------- src/io/json/read/infer_schema.rs | 7 +------ tests/it/io/json/read.rs | 29 +++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 14 deletions(-) diff --git a/src/io/json/read/deserialize.rs b/src/io/json/read/deserialize.rs index d4147cd033..d889ce4320 100644 --- a/src/io/json/read/deserialize.rs +++ b/src/io/json/read/deserialize.rs @@ -625,14 +625,11 @@ fn allocate_array(f: &Field) -> Box { f.data_type().clone(), *size, )), - DataType::List(inner) => match inner.data_type() { - DataType::List(_) => Box::new(MutableListArray::::new_from( - allocate_array(inner), - inner.data_type().clone(), - 0, - )), - _ => allocate_array(inner), - }, + DataType::List(inner) => Box::new(MutableListArray::::new_from( + allocate_array(inner), + f.data_type().clone(), + 0, + )), _ => todo!(), } } diff --git a/src/io/json/read/infer_schema.rs b/src/io/json/read/infer_schema.rs index f098bf80d3..c4ba23f56f 100644 --- a/src/io/json/read/infer_schema.rs +++ b/src/io/json/read/infer_schema.rs @@ -38,12 +38,7 @@ pub fn infer_records_schema(json: &Value) -> Result { Ok(Field { name: name.clone(), - data_type: DataType::List(Box::new(Field { - name: format!("{name}-records"), - data_type, - is_nullable: true, - metadata: Metadata::default(), - })), + data_type, is_nullable: true, metadata: Metadata::default(), }) diff --git a/tests/it/io/json/read.rs b/tests/it/io/json/read.rs index 2f78069396..fe91fb378e 100644 --- a/tests/it/io/json/read.rs +++ b/tests/it/io/json/read.rs @@ -164,6 +164,35 @@ fn read_json_fixed_size_records() -> Result<()> { Ok(()) } +#[test] +fn read_json_records_with_schema() -> Result<()> { + let raw = b"[{\"matrix\":[0.0,2.0]},{\"matrix\":[0.0,0.0,2.1,3.0]}]"; + let schema = Schema { + fields: vec![Field::new( + "matrix", + DataType::List(Box::new(Field::new("inner", DataType::Float32, false))), + false, + )], + metadata: Metadata::default(), + }; + + let json = json_deserializer::parse(raw)?; + let actual = read::deserialize_records(&json, &schema)?; + assert_eq!( + format!("{:?}", actual.arrays()[0]), + "ListArray[[0, 2], [0, 0, 2.1, 3]]" + ); + + let schema = read::infer_records_schema(&json)?; + let actual = read::deserialize_records(&json, &schema)?; + assert_eq!( + format!("{:?}", actual.arrays()[0]), + "ListArray[[0, 2], [0, 0, 2.1, 3]]" + ); + + Ok(()) +} + #[test] fn deserialize_timestamp_string_ns() -> Result<()> { let data = br#"["2023-04-07T12:23:34.000000001Z"]"#; From d1240b68ab9cb3d3cb3aaee514a2b376f6e0dfb6 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 26 Jun 2023 12:17:33 +0200 Subject: [PATCH 33/80] perf: don't needlessly trigger bitcount (#1513) --- src/bitmap/immutable.rs | 5 ++++- src/bitmap/mutable.rs | 16 ++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/bitmap/immutable.rs b/src/bitmap/immutable.rs index 2a5e14dc5b..41799e0adb 100644 --- a/src/bitmap/immutable.rs +++ b/src/bitmap/immutable.rs @@ -282,7 +282,10 @@ impl Bitmap { /// Initializes an new [`Bitmap`] filled with unset values. 
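The `new_zeroed` change that follows can skip the popcount entirely: a freshly zeroed bitmap has `unset_bits == length` by construction, so only the byte count needs computing. A sketch of that arithmetic (`zeroed_bitmap_parts` is a hypothetical free function, not a crate API):

fn zeroed_bitmap_parts(length: usize) -> (Vec<u8>, usize) {
    // ceil(length / 8) bytes, guarded against overflow near usize::MAX
    let bytes = vec![0u8; length.saturating_add(7) / 8];
    // every bit starts unset, so the unset count is simply `length`
    (bytes, length)
}

fn main() {
    assert_eq!(zeroed_bitmap_parts(10), (vec![0u8, 0u8], 10));
}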
#[inline] pub fn new_zeroed(length: usize) -> Self { - MutableBitmap::from_len_zeroed(length).into() + // don't use `MutableBitmap::from_len_zeroed().into()` + // it triggers a bitcount + let bytes = vec![0; length.saturating_add(7) / 8]; + unsafe { Bitmap::from_inner_unchecked(Arc::new(bytes.into()), 0, length, length) } } /// Counts the nulls (unset bits) starting from `offset` bits and for `length` bits. diff --git a/src/bitmap/mutable.rs b/src/bitmap/mutable.rs index 1cc2193917..31834f2165 100644 --- a/src/bitmap/mutable.rs +++ b/src/bitmap/mutable.rs @@ -1,5 +1,6 @@ use std::hint::unreachable_unchecked; use std::iter::FromIterator; +use std::sync::Arc; use crate::bitmap::utils::{merge_reversed, set_bit_unchecked}; use crate::error::Error; @@ -336,8 +337,19 @@ impl From for Bitmap { impl From for Option { #[inline] fn from(buffer: MutableBitmap) -> Self { - if buffer.unset_bits() > 0 { - Some(Bitmap::try_new(buffer.buffer, buffer.length).unwrap()) + let unset_bits = buffer.unset_bits(); + if unset_bits > 0 { + // safety: + // invariants of the `MutableBitmap` equal that of `Bitmap` + let bitmap = unsafe { + Bitmap::from_inner_unchecked( + Arc::new(buffer.buffer.into()), + 0, + buffer.length, + unset_bits, + ) + }; + Some(bitmap) } else { None } From 8ee5ad8c774e7355218376caf73d94bce2a769a6 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 3 Jul 2023 14:25:33 +0200 Subject: [PATCH 34/80] tag unsafe function as such (#1516) --- src/array/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/array/mod.rs b/src/array/mod.rs index 50eb962b2b..bbbbedc359 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -528,8 +528,8 @@ macro_rules! impl_sliced { /// The caller must ensure that `offset + length <= self.len()`. #[inline] #[must_use] - pub fn sliced_unchecked(mut self, offset: usize, length: usize) -> Self { - unsafe { self.slice_unchecked(offset, length) }; + pub unsafe fn sliced_unchecked(mut self, offset: usize, length: usize) -> Self { + self.slice_unchecked(offset, length); self } }; From e9386ffa36159abe8f6d82fa4ae2e40d2dd28e95 Mon Sep 17 00:00:00 2001 From: Frank Murphy Date: Sat, 8 Jul 2023 08:18:50 -0400 Subject: [PATCH 35/80] Pandas record support for utf8 and bool arrays (#1517) Co-authored-by: JONBRWN --- src/io/json/read/deserialize.rs | 3 +++ tests/it/io/json/read.rs | 36 +++++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/src/io/json/read/deserialize.rs b/src/io/json/read/deserialize.rs index d889ce4320..048e29a300 100644 --- a/src/io/json/read/deserialize.rs +++ b/src/io/json/read/deserialize.rs @@ -620,6 +620,9 @@ fn allocate_array(f: &Field) -> Box { DataType::Float16 => Box::new(MutablePrimitiveArray::::new()), DataType::Float32 => Box::new(MutablePrimitiveArray::::new()), DataType::Float64 => Box::new(MutablePrimitiveArray::::new()), + DataType::Boolean => Box::new(MutableBooleanArray::new()), + DataType::Utf8 => Box::new(MutableUtf8Array::::new()), + DataType::LargeUtf8 => Box::new(MutableUtf8Array::::new()), DataType::FixedSizeList(inner, size) => Box::new(MutableFixedSizeListArray::<_>::new_from( allocate_array(inner), f.data_type().clone(), diff --git a/tests/it/io/json/read.rs b/tests/it/io/json/read.rs index fe91fb378e..411564190b 100644 --- a/tests/it/io/json/read.rs +++ b/tests/it/io/json/read.rs @@ -43,7 +43,9 @@ fn read_json_records() -> Result<()> { [2, 3], [4, 5, 6] ], - "b": [1, 2, 3] + "b": [1, 2, 3], + "c": ["test"], + "d": [true] }, { "a": [ @@ -53,7 +55,9 @@ fn 
read_json_records() -> Result<()> { ] }, { - "b": [7, 8, 9] + "b": [7, 8, 9], + "c": ["string"], + "d": [false] } ]"#; @@ -96,6 +100,30 @@ fn read_json_records() -> Result<()> { b.try_extend(b_iter).unwrap(); let b_expected: ListArray = b.into(); + let c_iter = vec![vec![Some("test")], vec![Some("string")]]; + + let c_iter = c_iter.into_iter().map(Some); + let mut c = MutableListArray::>::new_with_field( + MutableUtf8Array::::new(), + "item", + true, + ); + + c.try_extend(c_iter).unwrap(); + let c_expected: ListArray = c.into(); + + let d_iter = vec![vec![Some(true)], vec![Some(false)]]; + + let d_iter = d_iter.into_iter().map(Some); + let mut d = MutableListArray::::new_with_field( + MutableBooleanArray::new(), + "item", + true, + ); + + d.try_extend(d_iter).unwrap(); + let d_expected: ListArray = d.into(); + let json = json_deserializer::parse(data)?; let schema = read::infer_records_schema(&json)?; @@ -106,6 +134,10 @@ fn read_json_records() -> Result<()> { (&a_expected, arr.as_ref()) } else if f.name == "b" { (&b_expected, arr.as_ref()) + } else if f.name == "c" { + (&c_expected, arr.as_ref()) + } else if f.name == "d" { + (&d_expected, arr.as_ref()) } else { panic!("unexpected field found: {}", f.name); }; From 589b3f842bb2092cbddc41bea05039898a0db74d Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 15 Jul 2023 11:28:35 +0200 Subject: [PATCH 36/80] arrow2 0.17.3 release (#1520) --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 79651e5ab1..24e4ec9173 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "arrow2" -version = "0.17.1" +version = "0.17.3" license = "Apache-2.0" description = "Unofficial implementation of Apache Arrow spec in safe Rust" homepage = "https://github.com/jorgecarleitao/arrow2" From 031bc7bc9ff720073b553a23696e7288db269229 Mon Sep 17 00:00:00 2001 From: Yang Xiufeng Date: Tue, 18 Jul 2023 15:12:04 +0800 Subject: [PATCH 37/80] Bump arrow-rs version to 43. 
(#1521) --- Cargo.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 24e4ec9173..05d7851811 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -101,10 +101,10 @@ odbc-api = { version = "0.36", optional = true } ahash = "0.8" # Support conversion to/from arrow-rs -arrow-buffer = { version = "^40.0.0", optional = true } -arrow-schema = { version = "^40.0.0", optional = true } -arrow-data = { version = "^40.0.0", optional = true } -arrow-array = { version = "^40.0.0", optional = true } +arrow-buffer = { version = ">=40, <44", optional = true } +arrow-schema = { version = ">=40, <44", optional = true } +arrow-data = { version = ">=40, <44", optional = true } +arrow-array = { version = ">=40, <44", optional = true } [target.wasm32-unknown-unknown.dependencies] getrandom = { version = "0.2", features = ["js"] } From f175c1cf2070735d747d6b355c7e9286caae2c19 Mon Sep 17 00:00:00 2001 From: Frank Murphy Date: Thu, 27 Jul 2023 04:30:31 -0400 Subject: [PATCH 38/80] Sampling tests for parquet round trips (#1519) --- Cargo.toml | 11 +++ tests/it/io/parquet/mod.rs | 3 + tests/it/io/parquet/sample_tests.rs | 119 ++++++++++++++++++++++++++++ 3 files changed, 133 insertions(+) create mode 100644 tests/it/io/parquet/sample_tests.rs diff --git a/Cargo.toml b/Cargo.toml index 05d7851811..ed5882cfd0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -129,6 +129,14 @@ proptest = { version = "1", default_features = false, features = ["std"] } avro-rs = { version = "0.13", features = ["snappy"] } # use for flaky testing rand = "0.8" +# use for generating and testing random data samples +sample-arrow2 = "0.1" +sample-std = "0.1" +sample-test = "0.1" + +# ugly hack needed to match this library in sample_arrow2 +[patch.crates-io] +arrow2 = { path = "." 
} [package.metadata.docs.rs] features = ["full"] @@ -188,6 +196,9 @@ io_parquet_compression = [ "io_parquet_brotli" ] +# sample testing of generated arrow data +io_parquet_sample_test = ["io_parquet"] + # compression backends io_parquet_zstd = ["parquet2/zstd"] io_parquet_snappy = ["parquet2/snappy"] diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs index cdf5b41573..1ad218e0fe 100644 --- a/tests/it/io/parquet/mod.rs +++ b/tests/it/io/parquet/mod.rs @@ -21,6 +21,9 @@ mod read_indexes; mod write; mod write_async; +#[cfg(feature = "io_parquet_sample_test")] +mod sample_tests; + type ArrayStats = (Box, Statistics); fn new_struct( diff --git a/tests/it/io/parquet/sample_tests.rs b/tests/it/io/parquet/sample_tests.rs new file mode 100644 index 0000000000..959f120128 --- /dev/null +++ b/tests/it/io/parquet/sample_tests.rs @@ -0,0 +1,119 @@ +use arrow2::io::parquet::write::*; +use arrow2::{ + chunk::Chunk, + datatypes::{Field, Metadata, Schema}, + error::Result, + io::parquet::read as p_read, +}; +use std::borrow::Borrow; +use std::io::Cursor; + +use sample_arrow2::{ + array::ArbitraryArray, + chunk::{ArbitraryChunk, ChainedChunk}, + datatypes::{sample_flat, ArbitraryDataType}, +}; +use sample_std::{Chance, Random, Regex, Sample}; +use sample_test::sample_test; + +fn deep_chunk(depth: usize, len: usize) -> ArbitraryChunk { + let names = Regex::new("[a-z]{4,8}"); + let data_type = ArbitraryDataType { + struct_branch: 1..3, + names: names.clone(), + // TODO: this breaks the test + // nullable: Chance(0.5), + nullable: Chance(0.0), + flat: sample_flat, + } + .sample_depth(depth); + + let array = ArbitraryArray { + names, + branch: 0..10, + len: len..(len + 1), + null: Chance(0.1), + // TODO: this breaks the test + // is_nullable: true, + is_nullable: false, + }; + + ArbitraryChunk { + // TODO: shrinking appears to be an issue with chunks this large. issues + // currently reproduce on the smaller sizes anyway. 
+ // chunk_len: 10..1000, + chunk_len: 1..10, + array_count: 1..2, + data_type, + array, + } +} + +#[sample_test] +fn round_trip_sample( + #[sample(deep_chunk(5, 100).sample_one())] chained: ChainedChunk, +) -> Result<()> { + sample_test::env_logger_init(); + let chunks = vec![chained.value]; + let name = Regex::new("[a-z]{4, 8}"); + let mut g = Random::new(); + + // TODO: this probably belongs in a helper in sample-arrow2 + let schema = Schema { + fields: chunks + .first() + .unwrap() + .iter() + .map(|arr| { + Field::new( + name.generate(&mut g), + arr.data_type().clone(), + arr.validity().is_some(), + ) + }) + .collect(), + metadata: Metadata::default(), + }; + + let options = WriteOptions { + write_statistics: true, + compression: CompressionOptions::Uncompressed, + version: Version::V2, + data_pagesize_limit: None, + }; + + let encodings: Vec<_> = schema + .borrow() + .fields + .iter() + .map(|field| transverse(field.data_type(), |_| Encoding::Plain)) + .collect(); + + let row_groups = RowGroupIterator::try_new( + chunks.clone().into_iter().map(Ok), + &schema, + options, + encodings, + )?; + + let buffer = Cursor::new(vec![]); + let mut writer = FileWriter::try_new(buffer, schema, options)?; + + for group in row_groups { + writer.write(group?)?; + } + writer.end(None)?; + + let mut buffer = writer.into_inner(); + + let metadata = p_read::read_metadata(&mut buffer)?; + let schema = p_read::infer_schema(&metadata)?; + + let mut reader = p_read::FileReader::new(buffer, metadata.row_groups, schema, None, None, None); + + let result: Vec<_> = reader.collect::>()?; + + assert_eq!(result, chunks); + + Ok(()) +} From 15c5ec1ee8ffd123c305e0072bb959baae54d353 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 27 Jul 2023 13:45:29 +0200 Subject: [PATCH 39/80] feat: add duration type to json writer (#1522) --- src/io/json/write/serialize.rs | 43 +++++++++++++++++++++++++++++++--- src/temporal_conversions.rs | 26 +++++++++++++++++++- 2 files changed, 65 insertions(+), 4 deletions(-) diff --git a/src/io/json/write/serialize.rs b/src/io/json/write/serialize.rs index 7d46b82e0f..abf845714c 100644 --- a/src/io/json/write/serialize.rs +++ b/src/io/json/write/serialize.rs @@ -1,4 +1,4 @@ -use chrono::{NaiveDate, NaiveDateTime}; +use chrono::{Duration, NaiveDate, NaiveDateTime}; use lexical_core::ToLexical; use std::io::Write; use streaming_iterator::StreamingIterator; @@ -8,8 +8,9 @@ use crate::datatypes::{IntegerType, TimeUnit}; use crate::io::iterator::BufStreamingIterator; use crate::offset::Offset; use crate::temporal_conversions::{ - date32_to_date, date64_to_date, timestamp_ms_to_datetime, timestamp_ns_to_datetime, - timestamp_s_to_datetime, timestamp_us_to_datetime, + date32_to_date, date64_to_date, duration_ms_to_duration, duration_ns_to_duration, + duration_s_to_duration, duration_us_to_duration, timestamp_ms_to_datetime, + timestamp_ns_to_datetime, timestamp_s_to_datetime, timestamp_us_to_datetime, }; use crate::util::lexical_to_bytes_mut; use crate::{array::*, datatypes::DataType, types::NativeType}; @@ -266,6 +267,28 @@ where materialize_serializer(f, array.iter(), offset, take) } +fn duration_serializer<'a, T, F>( + array: &'a PrimitiveArray, + convert: F, + offset: usize, + take: usize, +) -> Box + 'a + Send + Sync> +where + T: NativeType, + F: Fn(T) -> Duration + 'static + Send + Sync, +{ + let f = move |x: Option<&T>, buf: &mut Vec| { + if let Some(x) = x { + let duration = convert(*x); + write!(buf, "\"{duration}\"").unwrap(); + } else { + buf.extend_from_slice(b"null") + } + }; + + 
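// The `duration_serializer` above leans on chrono's `Display` impl for
// `Duration` (exercised through `write!`). A minimal standalone sketch of
// one serialized cell, assuming a Millisecond time unit; `duration_json_cell`
// is hypothetical, not part of this crate:
fn duration_json_cell(v: i64) -> String {
    let duration = chrono::Duration::milliseconds(v);
    format!("\"{duration}\"") // same quoted framing the serializer writes
}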
materialize_serializer(f, array.iter(), offset, take) +} + fn timestamp_serializer<'a, F>( array: &'a PrimitiveArray, convert: F, @@ -385,6 +408,20 @@ pub(crate) fn new_serializer<'a>( ) } } + DataType::Duration(tu) => { + let convert = match tu { + TimeUnit::Nanosecond => duration_ns_to_duration, + TimeUnit::Microsecond => duration_us_to_duration, + TimeUnit::Millisecond => duration_ms_to_duration, + TimeUnit::Second => duration_s_to_duration, + }; + duration_serializer( + array.as_any().downcast_ref().unwrap(), + convert, + offset, + take, + ) + } DataType::Null => null_serializer(array.len(), offset, take), other => todo!("Writing {:?} to JSON", other), } diff --git a/src/temporal_conversions.rs b/src/temporal_conversions.rs index a76700f444..48f2078a2a 100644 --- a/src/temporal_conversions.rs +++ b/src/temporal_conversions.rs @@ -2,7 +2,7 @@ use chrono::{ format::{parse, Parsed, StrftimeItems}, - Datelike, FixedOffset, NaiveDate, NaiveDateTime, NaiveTime, + Datelike, Duration, FixedOffset, NaiveDate, NaiveDateTime, NaiveTime, }; use crate::error::Result; @@ -66,6 +66,30 @@ pub fn time32s_to_time(v: i32) -> NaiveTime { NaiveTime::from_num_seconds_from_midnight_opt(v as u32, 0).expect("invalid time") } +/// converts a `i64` representing a `duration(s)` to [`Duration`] +#[inline] +pub fn duration_s_to_duration(v: i64) -> Duration { + Duration::seconds(v) +} + +/// converts a `i64` representing a `duration(ms)` to [`Duration`] +#[inline] +pub fn duration_ms_to_duration(v: i64) -> Duration { + Duration::milliseconds(v) +} + +/// converts a `i64` representing a `duration(us)` to [`Duration`] +#[inline] +pub fn duration_us_to_duration(v: i64) -> Duration { + Duration::microseconds(v) +} + +/// converts a `i64` representing a `duration(ns)` to [`Duration`] +#[inline] +pub fn duration_ns_to_duration(v: i64) -> Duration { + Duration::nanoseconds(v) +} + /// converts a `i32` representing a `time32(ms)` to [`NaiveTime`] #[inline] pub fn time32ms_to_time(v: i32) -> NaiveTime { From 393c3f8c58309388acc6a5d08575b360671e0a0f Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 27 Jul 2023 14:51:10 +0200 Subject: [PATCH 40/80] feat: add temporal conversions that don't panic (#1523) --- src/temporal_conversions.rs | 58 ++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/src/temporal_conversions.rs b/src/temporal_conversions.rs index 48f2078a2a..b706a45b29 100644 --- a/src/temporal_conversions.rs +++ b/src/temporal_conversions.rs @@ -32,14 +32,25 @@ pub const EPOCH_DAYS_FROM_CE: i32 = 719_163; /// converts a `i32` representing a `date32` to [`NaiveDateTime`] #[inline] pub fn date32_to_datetime(v: i32) -> NaiveDateTime { + date32_to_datetime_opt(v).expect("invalid or out-of-range datetime") +} + +/// converts a `i32` representing a `date32` to [`NaiveDateTime`] +#[inline] +pub fn date32_to_datetime_opt(v: i32) -> Option { NaiveDateTime::from_timestamp_opt(v as i64 * SECONDS_IN_DAY, 0) - .expect("invalid or out-of-range datetime") } /// converts a `i32` representing a `date32` to [`NaiveDate`] #[inline] pub fn date32_to_date(days: i32) -> NaiveDate { - NaiveDate::from_num_days_from_ce_opt(EPOCH_DAYS_FROM_CE + days).expect("out-of-range date") + date32_to_date_opt(days).expect("out-of-range date") +} + +/// converts a `i32` representing a `date32` to [`NaiveDate`] +#[inline] +pub fn date32_to_date_opt(days: i32) -> Option { + NaiveDate::from_num_days_from_ce_opt(EPOCH_DAYS_FROM_CE + days) } /// converts a `i64` representing a `date64` to [`NaiveDateTime`] @@ 
-105,6 +116,12 @@ pub fn time32ms_to_time(v: i32) -> NaiveTime { /// converts a `i64` representing a `time64(us)` to [`NaiveTime`] #[inline] pub fn time64us_to_time(v: i64) -> NaiveTime { + time64us_to_time_opt(v).expect("invalid time") +} + +/// converts a `i64` representing a `time64(us)` to [`NaiveTime`] +#[inline] +pub fn time64us_to_time_opt(v: i64) -> Option { NaiveTime::from_num_seconds_from_midnight_opt( // extract seconds from microseconds (v / MICROSECONDS) as u32, @@ -112,30 +129,46 @@ pub fn time64us_to_time(v: i64) -> NaiveTime { // nanoseconds (v % MICROSECONDS * MILLISECONDS) as u32, ) - .expect("invalid time") } /// converts a `i64` representing a `time64(ns)` to [`NaiveTime`] #[inline] pub fn time64ns_to_time(v: i64) -> NaiveTime { + time64ns_to_time_opt(v).expect("invalid time") +} + +/// converts a `i64` representing a `time64(ns)` to [`NaiveTime`] +#[inline] +pub fn time64ns_to_time_opt(v: i64) -> Option { NaiveTime::from_num_seconds_from_midnight_opt( // extract seconds from nanoseconds (v / NANOSECONDS) as u32, // discard extracted seconds (v % NANOSECONDS) as u32, ) - .expect("invalid time") } /// converts a `i64` representing a `timestamp(s)` to [`NaiveDateTime`] #[inline] pub fn timestamp_s_to_datetime(seconds: i64) -> NaiveDateTime { - NaiveDateTime::from_timestamp_opt(seconds, 0).expect("invalid or out-of-range datetime") + timestamp_s_to_datetime_opt(seconds).expect("invalid or out-of-range datetime") +} + +/// converts a `i64` representing a `timestamp(s)` to [`NaiveDateTime`] +#[inline] +pub fn timestamp_s_to_datetime_opt(seconds: i64) -> Option { + NaiveDateTime::from_timestamp_opt(seconds, 0) } /// converts a `i64` representing a `timestamp(ms)` to [`NaiveDateTime`] #[inline] pub fn timestamp_ms_to_datetime(v: i64) -> NaiveDateTime { + timestamp_ms_to_datetime_opt(v).expect("invalid or out-of-range datetime") +} + +/// converts a `i64` representing a `timestamp(ms)` to [`NaiveDateTime`] +#[inline] +pub fn timestamp_ms_to_datetime_opt(v: i64) -> Option { if v >= 0 { NaiveDateTime::from_timestamp_opt( // extract seconds from milliseconds @@ -157,12 +190,17 @@ pub fn timestamp_ms_to_datetime(v: i64) -> NaiveDateTime { ) } } - .expect("invalid or out-of-range datetime") } /// converts a `i64` representing a `timestamp(us)` to [`NaiveDateTime`] #[inline] pub fn timestamp_us_to_datetime(v: i64) -> NaiveDateTime { + timestamp_us_to_datetime_opt(v).expect("invalid or out-of-range datetime") +} + +/// converts a `i64` representing a `timestamp(us)` to [`NaiveDateTime`] +#[inline] +pub fn timestamp_us_to_datetime_opt(v: i64) -> Option { if v >= 0 { NaiveDateTime::from_timestamp_opt( // extract seconds from microseconds @@ -184,12 +222,17 @@ pub fn timestamp_us_to_datetime(v: i64) -> NaiveDateTime { ) } } - .expect("invalid or out-of-range datetime") } /// converts a `i64` representing a `timestamp(ns)` to [`NaiveDateTime`] #[inline] pub fn timestamp_ns_to_datetime(v: i64) -> NaiveDateTime { + timestamp_ns_to_datetime_opt(v).expect("invalid or out-of-range datetime") +} + +/// converts a `i64` representing a `timestamp(ns)` to [`NaiveDateTime`] +#[inline] +pub fn timestamp_ns_to_datetime_opt(v: i64) -> Option { if v >= 0 { NaiveDateTime::from_timestamp_opt( // extract seconds from nanoseconds @@ -211,7 +254,6 @@ pub fn timestamp_ns_to_datetime(v: i64) -> NaiveDateTime { ) } } - .expect("invalid or out-of-range datetime") } /// Converts a timestamp in `time_unit` and `timezone` into [`chrono::DateTime`]. 
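These fallible `_opt` variants make out-of-range values observable instead of aborting the process. A quick illustration using the functions introduced above (`1_500` is just an arbitrary in-range sample):

use arrow2::temporal_conversions::{timestamp_ms_to_datetime, timestamp_ms_to_datetime_opt};

fn main() {
    // in-range values convert as before
    assert!(timestamp_ms_to_datetime_opt(1_500).is_some());
    // far out-of-range values now surface as `None` rather than panicking
    assert!(timestamp_ms_to_datetime_opt(i64::MAX).is_none());
    // the panicking variant is kept for callers relying on the old behaviour
    let _dt = timestamp_ms_to_datetime(1_500);
}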
From de20f2dd18604d1ecefc5fd6385952120014259a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nils=20O=2E=20Sel=C3=A5sdal?= Date: Thu, 27 Jul 2023 15:08:37 +0200 Subject: [PATCH 41/80] Add datetime with timezone support to Json serializer (#1510) --- src/io/json/write/serialize.rs | 89 ++++++++++++++++++++++++++-------- tests/it/io/json/write.rs | 55 +++++++++++++++++++++ 2 files changed, 125 insertions(+), 19 deletions(-) diff --git a/src/io/json/write/serialize.rs b/src/io/json/write/serialize.rs index abf845714c..1a6fc2e4a5 100644 --- a/src/io/json/write/serialize.rs +++ b/src/io/json/write/serialize.rs @@ -7,10 +7,13 @@ use crate::bitmap::utils::ZipValidity; use crate::datatypes::{IntegerType, TimeUnit}; use crate::io::iterator::BufStreamingIterator; use crate::offset::Offset; +#[cfg(feature = "chrono-tz")] +use crate::temporal_conversions::parse_offset_tz; use crate::temporal_conversions::{ date32_to_date, date64_to_date, duration_ms_to_duration, duration_ns_to_duration, - duration_s_to_duration, duration_us_to_duration, timestamp_ms_to_datetime, - timestamp_ns_to_datetime, timestamp_s_to_datetime, timestamp_us_to_datetime, + duration_s_to_duration, duration_us_to_duration, parse_offset, timestamp_ms_to_datetime, + timestamp_ns_to_datetime, timestamp_s_to_datetime, timestamp_to_datetime, + timestamp_us_to_datetime, }; use crate::util::lexical_to_bytes_mut; use crate::{array::*, datatypes::DataType, types::NativeType}; @@ -309,6 +312,51 @@ where materialize_serializer(f, array.iter(), offset, take) } +fn timestamp_tz_serializer<'a>( + array: &'a PrimitiveArray, + time_unit: TimeUnit, + tz: &str, + offset: usize, + take: usize, +) -> Box + 'a + Send + Sync> { + match parse_offset(tz) { + Ok(parsed_tz) => { + let f = move |x: Option<&i64>, buf: &mut Vec| { + if let Some(x) = x { + let dt_str = timestamp_to_datetime(*x, time_unit, &parsed_tz).to_rfc3339(); + write!(buf, "\"{dt_str}\"").unwrap(); + } else { + buf.extend_from_slice(b"null") + } + }; + + materialize_serializer(f, array.iter(), offset, take) + } + #[cfg(feature = "chrono-tz")] + _ => match parse_offset_tz(tz) { + Ok(parsed_tz) => { + let f = move |x: Option<&i64>, buf: &mut Vec| { + if let Some(x) = x { + let dt_str = timestamp_to_datetime(*x, time_unit, &parsed_tz).to_rfc3339(); + write!(buf, "\"{dt_str}\"").unwrap(); + } else { + buf.extend_from_slice(b"null") + } + }; + + materialize_serializer(f, array.iter(), offset, take) + } + _ => { + panic!("Timezone {} is invalid or not supported", tz); + } + }, + #[cfg(not(feature = "chrono-tz"))] + _ => { + panic!("Invalid Offset format (must be [-]00:00) or chrono-tz feature not active"); + } + } +} + pub(crate) fn new_serializer<'a>( array: &'a dyn Array, offset: usize, @@ -390,24 +438,27 @@ pub(crate) fn new_serializer<'a>( offset, take, ), - DataType::Timestamp(tu, tz) => { - if tz.is_some() { - todo!("still have to implement timezone") - } else { - let convert = match tu { - TimeUnit::Nanosecond => timestamp_ns_to_datetime, - TimeUnit::Microsecond => timestamp_us_to_datetime, - TimeUnit::Millisecond => timestamp_ms_to_datetime, - TimeUnit::Second => timestamp_s_to_datetime, - }; - timestamp_serializer( - array.as_any().downcast_ref().unwrap(), - convert, - offset, - take, - ) - } + DataType::Timestamp(tu, None) => { + let convert = match tu { + TimeUnit::Nanosecond => timestamp_ns_to_datetime, + TimeUnit::Microsecond => timestamp_us_to_datetime, + TimeUnit::Millisecond => timestamp_ms_to_datetime, + TimeUnit::Second => timestamp_s_to_datetime, + }; + timestamp_serializer( + 
array.as_any().downcast_ref().unwrap(), + convert, + offset, + take, + ) } + DataType::Timestamp(time_unit, Some(tz)) => timestamp_tz_serializer( + array.as_any().downcast_ref().unwrap(), + *time_unit, + tz, + offset, + take, + ), DataType::Duration(tu) => { let convert = match tu { TimeUnit::Nanosecond => duration_ns_to_duration, diff --git a/tests/it/io/json/write.rs b/tests/it/io/json/write.rs index 44cc7ef125..1e6ede9fec 100644 --- a/tests/it/io/json/write.rs +++ b/tests/it/io/json/write.rs @@ -422,3 +422,58 @@ fn write_timestamp() -> Result<()> { test!(array, expected) } + +#[test] +fn write_timestamp_with_tz_secs() -> Result<()> { + let array = PrimitiveArray::new( + DataType::Timestamp(TimeUnit::Second, Some("UTC".to_owned())), + vec![10i64, 1 << 32, 1 << 33].into(), + None, + ); + + let expected = + r#"["1970-01-01T00:00:10+00:00","2106-02-07T06:28:16+00:00","2242-03-16T12:56:32+00:00"]"#; + test!(array, expected) +} + +#[test] +fn write_timestamp_with_tz_micros() -> Result<()> { + let array = PrimitiveArray::new( + DataType::Timestamp(TimeUnit::Microsecond, Some("+02:00".to_owned())), + vec![ + 10i64 * 1_000_000, + (1 << 32) * 1_000_000, + (1 << 33) * 1_000_000, + 1_234_567_890_123_450, + 1_234_567_890_120_000, + ] + .into(), + None, + ); + // Note, default chrono DateTime string conversion strips off milli/micro/nanoseconds parts + // if they are zero + let expected = r#"["1970-01-01T02:00:10+02:00","2106-02-07T08:28:16+02:00","2242-03-16T14:56:32+02:00","2009-02-14T01:31:30.123450+02:00","2009-02-14T01:31:30.120+02:00"]"#; + + test!(array, expected) +} +#[cfg(feature = "chrono-tz")] +#[test] +fn write_timestamp_with_chrono_tz_millis() -> Result<()> { + let array = PrimitiveArray::new( + DataType::Timestamp(TimeUnit::Millisecond, Some("Europe/Oslo".to_owned())), + vec![ + 10i64 * 1_000, + (1 << 32) * 1_000, + (1 << 33) * 1_000, + 1_234_567_890_123, + 1_239_874_560_120, + ] + .into(), + None, + ); + // Note, default chrono DateTime string conversion strips off milli/micro/nanoseconds parts + // if they are zero + let expected = r#"["1970-01-01T01:00:10+01:00","2106-02-07T07:28:16+01:00","2242-03-16T13:56:32+01:00","2009-02-14T00:31:30.123+01:00","2009-04-16T11:36:00.120+02:00"]"#; + + test!(array, expected) +} From d5c78e7ba45fcebfbafd55a82ba2601ee3ea9617 Mon Sep 17 00:00:00 2001 From: Qqwy / Marten Date: Thu, 27 Jul 2023 15:10:00 +0200 Subject: [PATCH 42/80] ArrowArrayStreamReader::try_new(): Safeguard against released streams (#1501) --- src/ffi/stream.rs | 9 ++++++++- tests/it/ffi/stream.rs | 13 ++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/ffi/stream.rs b/src/ffi/stream.rs index 9611935821..08fcaf0f43 100644 --- a/src/ffi/stream.rs +++ b/src/ffi/stream.rs @@ -54,7 +54,8 @@ pub struct ArrowArrayStreamReader> { impl> ArrowArrayStreamReader { /// Returns a new [`ArrowArrayStreamReader`] /// # Error - /// Errors iff the [`ArrowArrayStream`] is out of specification + /// Errors iff the [`ArrowArrayStream`] is out of specification, + /// or was already released prior to calling this function. /// # Safety /// This method is intrinsically `unsafe` since it assumes that the `ArrowArrayStream` /// contains a valid Arrow C stream interface. 
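Per the C stream interface, a struct whose `release` callback is null has already been released, so construction must fail fast rather than erroring on first use. A minimal sketch of that rule, with `CStream` as a stand-in for the real `ArrowArrayStream`:

struct CStream {
    // a null (`None`) release callback marks an already-released stream
    release: Option<unsafe extern "C" fn()>,
}

fn ensure_live(stream: &CStream) -> Result<(), String> {
    if stream.release.is_none() {
        return Err("The C stream was already released".to_string());
    }
    Ok(())
}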
@@ -62,6 +63,12 @@ impl> ArrowArrayStreamReader { /// * The `ArrowArrayStream` fulfills the invariants of the C stream interface /// * The schema `get_schema` produces fulfills the C data interface pub unsafe fn try_new(mut iter: Iter) -> Result { + if iter.release.is_none() { + return Err(Error::InvalidArgumentError( + "The C stream was already released".to_string(), + )); + }; + if iter.get_next.is_none() { return Err(Error::OutOfSpec( "The C stream MUST contain a non-null get_next".to_string(), diff --git a/tests/it/ffi/stream.rs b/tests/it/ffi/stream.rs index 44d0e1e7cc..53887d4362 100644 --- a/tests/it/ffi/stream.rs +++ b/tests/it/ffi/stream.rs @@ -1,6 +1,6 @@ use arrow2::array::*; use arrow2::datatypes::Field; -use arrow2::{error::Result, ffi}; +use arrow2::{error::Error, error::Result, ffi}; fn _test_round_trip(arrays: Vec>) -> Result<()> { let field = Field::new("a", arrays[0].data_type().clone(), true); @@ -30,3 +30,14 @@ fn round_trip() -> Result<()> { _test_round_trip(vec![array.clone(), array.clone(), array]) } + +#[test] +fn stream_reader_try_new_invalid_argument_error_on_released_stream() { + let released_stream = Box::new(ffi::ArrowArrayStream::empty()); + let reader = unsafe { ffi::ArrowArrayStreamReader::try_new(released_stream) }; + // poor man's assert_matches: + match reader { + Err(Error::InvalidArgumentError(_)) => {} + _ => panic!("ArrowArrayStreamReader::try_new did not return an InvalidArgumentError"), + } +} From 92050ec64877fe1348116e0f5dc6e06b949c0519 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 6 Aug 2023 09:59:37 +0200 Subject: [PATCH 43/80] feat: better error message when reader feather v1 (#1528) --- src/ffi/array.rs | 14 +++++++++++--- src/io/ipc/mod.rs | 3 ++- src/io/ipc/read/file.rs | 9 ++++++--- src/io/ipc/read/file_async.rs | 4 ++-- src/io/ipc/write/file_async.rs | 6 +++--- src/io/ipc/write/writer.rs | 6 +++--- 6 files changed, 27 insertions(+), 15 deletions(-) diff --git a/src/ffi/array.rs b/src/ffi/array.rs index b1c77d7366..4057d0be8f 100644 --- a/src/ffi/array.rs +++ b/src/ffi/array.rs @@ -209,7 +209,7 @@ unsafe fn get_buffer_ptr( let ptr = *buffers.add(index); if ptr.is_null() { return Err(Error::oos(format!( - "An array of type {data_type:?} + "An array of type {data_type:?} must have a non-null buffer {index}" ))); } @@ -235,9 +235,14 @@ unsafe fn create_buffer( owner: InternalArrowArray, index: usize, ) -> Result> { + let len = buffer_len(array, data_type, index)?; + + if len == 0 { + return Ok(Buffer::new()); + } + let ptr = get_buffer_ptr(array, data_type, index)?; - let len = buffer_len(array, data_type, index)?; let offset = buffer_offset(array, data_type, index); let bytes = Bytes::from_foreign(ptr, len, BytesAllocator::InternalArrowArray(owner)); @@ -258,9 +263,12 @@ unsafe fn create_bitmap( // we can use the null count directly is_validity: bool, ) -> Result { + let len: usize = array.length.try_into().expect("length to fit in `usize`"); + if len == 0 { + return Ok(Bitmap::new()); + } let ptr = get_buffer_ptr(array, data_type, index)?; - let len: usize = array.length.try_into().expect("length to fit in `usize`"); let offset: usize = array.offset.try_into().expect("offset to fit in `usize`"); let bytes_len = bytes_for(offset + len); let bytes = Bytes::from_foreign(ptr, bytes_len, BytesAllocator::InternalArrowArray(owner)); diff --git a/src/io/ipc/mod.rs b/src/io/ipc/mod.rs index e618a92687..2bb233a147 100644 --- a/src/io/ipc/mod.rs +++ b/src/io/ipc/mod.rs @@ -80,7 +80,8 @@ pub mod append; pub mod read; pub mod write; -const 
ARROW_MAGIC: [u8; 6] = [b'A', b'R', b'R', b'O', b'W', b'1']; +const ARROW_MAGIC_V1: [u8; 4] = [b'F', b'E', b'A', b'1']; +const ARROW_MAGIC_V2: [u8; 6] = [b'A', b'R', b'R', b'O', b'W', b'1']; pub(crate) const CONTINUATION_MARKER: [u8; 4] = [0xff; 4]; /// Struct containing `dictionary_id` and nested `IpcField`, allowing users diff --git a/src/io/ipc/read/file.rs b/src/io/ipc/read/file.rs index 341cdfeba6..dd4a5852c7 100644 --- a/src/io/ipc/read/file.rs +++ b/src/io/ipc/read/file.rs @@ -8,7 +8,7 @@ use crate::datatypes::Schema; use crate::error::{Error, Result}; use crate::io::ipc::IpcSchema; -use super::super::{ARROW_MAGIC, CONTINUATION_MARKER}; +use super::super::{ARROW_MAGIC_V1, ARROW_MAGIC_V2, CONTINUATION_MARKER}; use super::common::*; use super::schema::fb_to_schema; use super::Dictionaries; @@ -151,7 +151,7 @@ fn read_footer_len(reader: &mut R) -> Result<(u64, usize)> { reader.read_exact(&mut footer)?; let footer_len = i32::from_le_bytes(footer[..4].try_into().unwrap()); - if footer[4..] != ARROW_MAGIC { + if footer[4..] != ARROW_MAGIC_V2 { return Err(Error::from(OutOfSpecKind::InvalidFooter)); } let footer_len = footer_len @@ -215,7 +215,10 @@ pub fn read_file_metadata(reader: &mut R) -> Result(reader: &mut R) -> Re reader.read_exact(&mut footer).await?; let footer_len = i32::from_le_bytes(footer[..4].try_into().unwrap()); - if footer[4..] != ARROW_MAGIC { + if footer[4..] != ARROW_MAGIC_V2 { return Err(Error::from(OutOfSpecKind::InvalidFooter)); } footer_len diff --git a/src/io/ipc/write/file_async.rs b/src/io/ipc/write/file_async.rs index 02dd5a4c7f..6bf7753664 100644 --- a/src/io/ipc/write/file_async.rs +++ b/src/io/ipc/write/file_async.rs @@ -11,7 +11,7 @@ use super::schema::serialize_schema; use super::{default_ipc_fields, schema_to_bytes, Record}; use crate::datatypes::*; use crate::error::{Error, Result}; -use crate::io::ipc::{IpcField, ARROW_MAGIC}; +use crate::io::ipc::{IpcField, ARROW_MAGIC_V2}; type WriteOutput = (usize, Option, Vec, Option); @@ -105,7 +105,7 @@ where } async fn start(mut writer: W, encoded: EncodedData) -> Result> { - writer.write_all(&ARROW_MAGIC[..]).await?; + writer.write_all(&ARROW_MAGIC_V2[..]).await?; writer.write_all(&[0, 0]).await?; let (meta, data) = write_message(&mut writer, encoded).await?; @@ -149,7 +149,7 @@ where writer .write_all(&(footer.len() as i32).to_le_bytes()) .await?; - writer.write_all(&ARROW_MAGIC).await?; + writer.write_all(&ARROW_MAGIC_V2).await?; writer.close().await?; Ok((0, None, vec![], None)) diff --git a/src/io/ipc/write/writer.rs b/src/io/ipc/write/writer.rs index 1637f8ea7c..b92f1b2ba8 100644 --- a/src/io/ipc/write/writer.rs +++ b/src/io/ipc/write/writer.rs @@ -4,7 +4,7 @@ use arrow_format::ipc::planus::Builder; use super::{ super::IpcField, - super::ARROW_MAGIC, + super::ARROW_MAGIC_V2, common::{DictionaryTracker, EncodedData, WriteOptions}, common_sync::{write_continuation, write_message}, default_ipc_fields, schema, schema_to_bytes, @@ -114,7 +114,7 @@ impl FileWriter { return Err(Error::oos("The IPC file can only be started once")); } // write magic to header - self.writer.write_all(&ARROW_MAGIC[..])?; + self.writer.write_all(&ARROW_MAGIC_V2[..])?; // create an 8-byte boundary after the header self.writer.write_all(&[0, 0])?; // write the schema, set the written bytes to the schema @@ -205,7 +205,7 @@ impl FileWriter { self.writer.write_all(footer_data)?; self.writer .write_all(&(footer_data.len() as i32).to_le_bytes())?; - self.writer.write_all(&ARROW_MAGIC)?; + self.writer.write_all(&ARROW_MAGIC_V2)?; 
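+        // Trailer layout: footer flatbuffer, then its length as a
+        // little-endian i32, then the 6-byte `ARROW1` magic.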
self.writer.flush()?; self.state = State::Finished; From 2ecd3e823f63884ca77b146a8cd8fcdea9f328fd Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 7 Aug 2023 08:55:11 +0200 Subject: [PATCH 44/80] fix oob if in .get (#1529) --- src/array/mod.rs | 11 ++++++++++- src/io/ipc/read/file.rs | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/array/mod.rs b/src/array/mod.rs index bbbbedc359..04b7b2c8e3 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -73,9 +73,18 @@ pub trait Array: Send + Sync + dyn_clone::DynClone + 'static { /// Panics iff `i >= self.len()`. #[inline] fn is_null(&self, i: usize) -> bool { + assert!(i < self.len()); + unsafe { self.is_null_unchecked(i) } + } + + /// Returns whether slot `i` is null. + /// # Safety + /// The caller must ensure `i < self.len()` + #[inline] + unsafe fn is_null_unchecked(&self, i: usize) -> bool { self.validity() .as_ref() - .map(|x| !x.get_bit(i)) + .map(|x| !x.get_bit_unchecked(i)) .unwrap_or(false) } diff --git a/src/io/ipc/read/file.rs b/src/io/ipc/read/file.rs index dd4a5852c7..e95b37e44d 100644 --- a/src/io/ipc/read/file.rs +++ b/src/io/ipc/read/file.rs @@ -216,7 +216,7 @@ pub fn read_file_metadata(reader: &mut R) -> Result Date: Thu, 10 Aug 2023 11:09:09 +0200 Subject: [PATCH 45/80] feat: handle unaligned pointers in FFI (#1535) --- src/ffi/array.rs | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/ffi/array.rs b/src/ffi/array.rs index 4057d0be8f..1a25b98510 100644 --- a/src/ffi/array.rs +++ b/src/ffi/array.rs @@ -213,13 +213,7 @@ unsafe fn get_buffer_ptr( must have a non-null buffer {index}" ))); } - if ptr.align_offset(std::mem::align_of::()) != 0 { - return Err(Error::oos(format!( - "An ArrowArray of type {data_type:?} - must have buffer {index} aligned to type {}", - std::any::type_name::() - ))); - } + // note: we can't prove that this pointer is not mutably shared - part of the safety invariant Ok(ptr as *mut T) } @@ -241,12 +235,21 @@ unsafe fn create_buffer( return Ok(Buffer::new()); } - let ptr = get_buffer_ptr(array, data_type, index)?; - let offset = buffer_offset(array, data_type, index); - let bytes = Bytes::from_foreign(ptr, len, BytesAllocator::InternalArrowArray(owner)); + let ptr: *mut T = get_buffer_ptr(array, data_type, index)?; - Ok(Buffer::from_bytes(bytes).sliced(offset, len - offset)) + // We have to check alignment. + // This is the zero-copy path. + if ptr.align_offset(std::mem::align_of::()) == 0 { + let bytes = Bytes::from_foreign(ptr, len, BytesAllocator::InternalArrowArray(owner)); + Ok(Buffer::from_bytes(bytes).sliced(offset, len - offset)) + } + // This is the path where alignment isn't correct. + // We copy the data to a new vec + else { + let buf = std::slice::from_raw_parts(ptr, len - offset).to_vec(); + Ok(Buffer::from(buf)) + } } /// returns the buffer `i` of `array` interpreted as a [`Bitmap`]. @@ -269,6 +272,8 @@ unsafe fn create_bitmap( } let ptr = get_buffer_ptr(array, data_type, index)?; + // Pointer of u8 has alignment 1, so we don't have to check alignment. 
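+        // (This is why `create_bitmap` needs no copying fallback: the
+        // unaligned path added to `create_buffer` above only matters
+        // for wider native types.)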
+ let offset: usize = array.offset.try_into().expect("offset to fit in `usize`"); let bytes_len = bytes_for(offset + len); let bytes = Bytes::from_foreign(ptr, bytes_len, BytesAllocator::InternalArrowArray(owner)); From a760a4ce847fc0fec781f26908b05efb2cafc1c8 Mon Sep 17 00:00:00 2001 From: Sebastian Holmin Date: Mon, 14 Aug 2023 10:57:03 +0200 Subject: [PATCH 46/80] Update multiversion to 0.7.3 (#1536) Fixes https://github.com/calebzulawski/multiversion/issues/37. --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index ed5882cfd0..cafbe8360e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -92,7 +92,7 @@ serde_json = { version = "^1.0", features = ["preserve_order"], optional = true strength_reduce = { version = "0.2", optional = true } # For instruction multiversioning -multiversion = { version = "0.7.1", optional = true } +multiversion = { version = "0.7.3", optional = true } # For support for odbc odbc-api = { version = "0.36", optional = true } From 36e905d9e23e595693f33e44e514215074b6199b Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 14 Aug 2023 14:10:28 +0200 Subject: [PATCH 47/80] arrow2 0.17.4 release (#1537) --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index cafbe8360e..911e8e3960 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "arrow2" -version = "0.17.3" +version = "0.17.4" license = "Apache-2.0" description = "Unofficial implementation of Apache Arrow spec in safe Rust" homepage = "https://github.com/jorgecarleitao/arrow2" From 076ceee9dec15834943c574cb4343f0237a51447 Mon Sep 17 00:00:00 2001 From: RinChanNOW Date: Tue, 15 Aug 2023 18:59:46 +0800 Subject: [PATCH 48/80] Bump version of arrow-rs. (#1540) * Bump version of arrow-rs. * Remove version limit. --- Cargo.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 911e8e3960..0b8a4ff7e6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -101,10 +101,10 @@ odbc-api = { version = "0.36", optional = true } ahash = "0.8" # Support conversion to/from arrow-rs -arrow-buffer = { version = ">=40, <44", optional = true } -arrow-schema = { version = ">=40, <44", optional = true } -arrow-data = { version = ">=40, <44", optional = true } -arrow-array = { version = ">=40, <44", optional = true } +arrow-buffer = { version = ">=40", optional = true } +arrow-schema = { version = ">=40", optional = true } +arrow-data = { version = ">=40", optional = true } +arrow-array = { version = ">=40", optional = true } [target.wasm32-unknown-unknown.dependencies] getrandom = { version = "0.2", features = ["js"] } From 86b8a5b34a743f042095c25e77b02ac39f25b866 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Wed, 16 Aug 2023 08:58:49 +0200 Subject: [PATCH 49/80] feat: expose (utf8|binary)_substring (#1539) --- src/compute/substring.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/compute/substring.rs b/src/compute/substring.rs index 7a20114ed1..2919b3037b 100644 --- a/src/compute/substring.rs +++ b/src/compute/substring.rs @@ -24,7 +24,13 @@ use crate::{ offset::{Offset, Offsets}, }; -fn utf8_substring(array: &Utf8Array, start: O, length: &Option) -> Utf8Array { +/// Returns a Utf8Array with a substring starting from `start` and with optional length `length` of each of the elements in `array`. +/// `start` can be negative, in which case the start counts from the end of the string. 
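+///
+/// # Example
+///
+/// A rough sketch of the expected behaviour (doctest marked `ignore`,
+/// since the re-export path and the `compute_substring` feature flag
+/// are assumptions here):
+///
+/// ```ignore
+/// use arrow2::array::Utf8Array;
+/// use arrow2::compute::substring::utf8_substring;
+///
+/// let array = Utf8Array::<i32>::from_slice(["hello", "world"]);
+/// // a negative `start` counts from the end of each string
+/// let out = utf8_substring(&array, -3, &Some(2));
+/// assert_eq!(out, Utf8Array::<i32>::from_slice(["ll", "rl"]));
+/// ```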
+pub fn utf8_substring( + array: &Utf8Array, + start: O, + length: &Option, +) -> Utf8Array { let length = length.map(|v| v.to_usize()); let iter = array.values_iter().map(|str_val| { @@ -68,7 +74,9 @@ fn utf8_substring(array: &Utf8Array, start: O, length: &Option) new.with_validity(array.validity().cloned()) } -fn binary_substring( +/// Returns a BinaryArray with a substring starting from `start` and with optional length `length` of each of the elements in `array`. +/// `start` can be negative, in which case the start counts from the end of the string. +pub fn binary_substring( array: &BinaryArray, start: O, length: &Option, From c1446fb17229f1e5e703a8623b0946c6c8f5c3df Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Wed, 16 Aug 2023 08:58:59 +0200 Subject: [PATCH 50/80] chore: update dependencies (#1542) --- Cargo.toml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 0b8a4ff7e6..0f3f9ec27b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,7 @@ bench = false [dependencies] foreign_vec = "0.1.0" -either = "1.6" +either = "1.9" num-traits = "0.2" dyn-clone = "1" bytemuck = { version = "1", features = ["derive"] } @@ -29,10 +29,10 @@ ethnum = "1" # crate provides HashMap that assumes pre-hashed values. hash_hasher = "^2.0.3" # For SIMD utf8 validation -simdutf8 = "0.1.3" +simdutf8 = "0.1.4" # A Rust port of SwissTable -hashbrown = { version = "0.13", default-features = false, optional = true } +hashbrown = { version = "0.14", default-features = false, optional = true } # for timezone support chrono-tz = { version = "0.8", optional = true } @@ -46,8 +46,8 @@ csv-core = { version = "0.1", optional = true } # for csv async io csv-async = { version = "^1.1", optional = true } -regex = { version = "^1.3", optional = true } -regex-syntax = { version = "^0.6", optional = true } +regex = { version = "1.9", optional = true } +regex-syntax = { version = "0.7", optional = true } streaming-iterator = { version = "0.1", optional = true } fallible-streaming-iterator = { version = "0.1", optional = true } @@ -62,7 +62,7 @@ arrow-format = { version = "0.8", optional = true, features = ["ipc"] } hex = { version = "^0.4", optional = true } # for IPC compression -lz4 = { version = "1.23.1", optional = true } +lz4 = { version = "1.24", optional = true } zstd = { version = "0.12", optional = true } rand = { version = "0.8", optional = true } From 5720d9a2450823b68f2174807af74a44c058b0b8 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sat, 19 Aug 2023 10:05:20 +0200 Subject: [PATCH 51/80] Fix `any`/`all` for Kleene logic (#1545) --- src/compute/boolean.rs | 42 +++++++++++++++++-- src/compute/boolean_kleene.rs | 66 +++++++++++++++++++++++++----- tests/it/compute/boolean.rs | 9 ++-- tests/it/compute/boolean_kleene.rs | 4 +- 4 files changed, 101 insertions(+), 20 deletions(-) diff --git a/src/compute/boolean.rs b/src/compute/boolean.rs index b8200ef5a9..e34b90c637 100644 --- a/src/compute/boolean.rs +++ b/src/compute/boolean.rs @@ -230,11 +230,28 @@ pub fn or_scalar(array: &BooleanArray, scalar: &BooleanScalar) -> BooleanArray { } } -/// Returns whether any of the values in the array is `true` +/// Returns whether any of the values in the array are `true`. +/// +/// Null values are ignored. 
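+/// A null entry therefore never makes the result `true`; use the
+/// Kleene variant in `boolean_kleene::any` for null-aware semantics.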
+/// +/// # Example +/// +/// ``` +/// use arrow2::array::BooleanArray; +/// use arrow2::compute::boolean::any; +/// +/// let a = BooleanArray::from(&[Some(true), Some(false)]); +/// let b = BooleanArray::from(&[Some(false), Some(false)]); +/// let c = BooleanArray::from(&[None, Some(false)]); +/// +/// assert_eq!(any(&a), true); +/// assert_eq!(any(&b), false); +/// assert_eq!(any(&c), false); +/// ``` pub fn any(array: &BooleanArray) -> bool { if array.is_empty() { false - } else if array.validity().is_some() { + } else if array.null_count() > 0 { array.into_iter().any(|v| v == Some(true)) } else { let vals = array.values(); @@ -242,12 +259,29 @@ pub fn any(array: &BooleanArray) -> bool { } } -/// Check if all of the values in the array are `true` +/// Returns whether all values in the array are `true`. +/// +/// Null values are ignored. +/// +/// # Example +/// +/// ``` +/// use arrow2::array::BooleanArray; +/// use arrow2::compute::boolean::all; +/// +/// let a = BooleanArray::from(&[Some(true), Some(true)]); +/// let b = BooleanArray::from(&[Some(false), Some(true)]); +/// let c = BooleanArray::from(&[None, Some(true)]); +/// +/// assert_eq!(all(&a), true); +/// assert_eq!(all(&b), false); +/// assert_eq!(all(&c), true); +/// ``` pub fn all(array: &BooleanArray) -> bool { if array.is_empty() { true } else if array.null_count() > 0 { - false + !array.into_iter().any(|v| v == Some(false)) } else { let vals = array.values(); vals.unset_bits() == 0 diff --git a/src/compute/boolean_kleene.rs b/src/compute/boolean_kleene.rs index fc6b717543..b19efeaa78 100644 --- a/src/compute/boolean_kleene.rs +++ b/src/compute/boolean_kleene.rs @@ -234,26 +234,70 @@ pub fn and_scalar(array: &BooleanArray, scalar: &BooleanScalar) -> BooleanArray } } -/// Returns whether any of the values in the array is `true` -pub fn any(array: &BooleanArray) -> bool { +/// Returns whether any of the values in the array are `true`. +/// +/// The output is unknown (`None`) if the array contains any null values and +/// no `true` values. +/// +/// # Example +/// +/// ``` +/// use arrow2::array::BooleanArray; +/// use arrow2::compute::boolean_kleene::any; +/// +/// let a = BooleanArray::from(&[Some(true), Some(false)]); +/// let b = BooleanArray::from(&[Some(false), Some(false)]); +/// let c = BooleanArray::from(&[None, Some(false)]); +/// +/// assert_eq!(any(&a), Some(true)); +/// assert_eq!(any(&b), Some(false)); +/// assert_eq!(any(&c), None); +/// ``` +pub fn any(array: &BooleanArray) -> Option { if array.is_empty() { - false - } else if array.validity().is_some() { - array.into_iter().any(|v| v == Some(true)) + Some(false) + } else if array.null_count() > 0 { + if array.into_iter().any(|v| v == Some(true)) { + Some(true) + } else { + None + } } else { let vals = array.values(); - vals.unset_bits() != vals.len() + Some(vals.unset_bits() != vals.len()) } } -/// Returns whether all values in the array are `true` -pub fn all(array: &BooleanArray) -> bool { +/// Returns whether all values in the array are `true`. +/// +/// The output is unknown (`None`) if the array contains any null values and +/// no `false` values. 
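+/// A single `false` value is decisive, so the result is `Some(false)`
+/// even when nulls are also present.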
+/// +/// # Example +/// +/// ``` +/// use arrow2::array::BooleanArray; +/// use arrow2::compute::boolean_kleene::all; +/// +/// let a = BooleanArray::from(&[Some(true), Some(true)]); +/// let b = BooleanArray::from(&[Some(false), Some(true)]); +/// let c = BooleanArray::from(&[None, Some(true)]); +/// +/// assert_eq!(all(&a), Some(true)); +/// assert_eq!(all(&b), Some(false)); +/// assert_eq!(all(&c), None); +/// ``` +pub fn all(array: &BooleanArray) -> Option { if array.is_empty() { - true + Some(true) } else if array.null_count() > 0 { - false + if array.into_iter().any(|v| v == Some(false)) { + Some(false) + } else { + None + } } else { let vals = array.values(); - vals.unset_bits() == 0 + Some(vals.unset_bits() == 0) } } diff --git a/tests/it/compute/boolean.rs b/tests/it/compute/boolean.rs index 8c505a164f..ae4c0fde85 100644 --- a/tests/it/compute/boolean.rs +++ b/tests/it/compute/boolean.rs @@ -429,21 +429,24 @@ fn test_any_all() { assert!(!any(&array)); assert!(!all(&array)); let array = BooleanArray::from(&[None, Some(true), Some(true)]); - assert!(!all(&array)); assert!(any(&array)); + assert!(all(&array)); let array = BooleanArray::from_iter(std::iter::repeat(false).take(10).map(Some)); assert!(!any(&array)); assert!(!all(&array)); let array = BooleanArray::from_iter(std::iter::repeat(true).take(10).map(Some)); - assert!(all(&array)); assert!(any(&array)); + assert!(all(&array)); let array = BooleanArray::from_iter([true, false, true, true].map(Some)); - assert!(!all(&array)); assert!(any(&array)); + assert!(!all(&array)); let array = BooleanArray::from(&[Some(true)]); assert!(any(&array)); assert!(all(&array)); let array = BooleanArray::from(&[Some(false)]); assert!(!any(&array)); assert!(!all(&array)); + let array = BooleanArray::from(&[]); + assert!(!any(&array)); + assert!(all(&array)); } diff --git a/tests/it/compute/boolean_kleene.rs b/tests/it/compute/boolean_kleene.rs index 8dac6e63c4..902e5b425a 100644 --- a/tests/it/compute/boolean_kleene.rs +++ b/tests/it/compute/boolean_kleene.rs @@ -218,6 +218,6 @@ fn array_or_none() { #[test] fn array_empty() { let array = BooleanArray::from(&[]); - assert!(!any(&array)); - assert!(all(&array)); + assert_eq!(any(&array), Some(false)); + assert_eq!(all(&array), Some(true)); } From 7edf5f9e359e0ed02e9d0c6b9318b06964d805f0 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Sat, 19 Aug 2023 10:05:45 +0200 Subject: [PATCH 52/80] fix: slice values in list to fixed-size list cast (#1544) --- src/compute/cast/mod.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index d97878d497..14622f9b03 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -400,7 +400,11 @@ fn cast_list_to_fixed_size_list( "incompatible offsets in source list".to_string(), )), None => { - let new_values = cast(list.values().as_ref(), inner.data_type(), options)?; + let sliced_values = list.values().sliced( + list.offsets().first().to_usize(), + list.offsets().range().to_usize(), + ); + let new_values = cast(sliced_values.as_ref(), inner.data_type(), options)?; Ok(FixedSizeListArray::new( DataType::FixedSizeList(Box::new(inner.clone()), size), new_values, From 2b3e2a9e83725a557d78b90cd39298c5bef0ca4a Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 20 Aug 2023 11:04:58 +0200 Subject: [PATCH 53/80] feat: add fallible extend to mutable arrays (#1546) --- src/array/binary/mutable.rs | 12 ++++++++++++ src/array/binary/mutable_values.rs | 12 ++++++++++++ src/array/utf8/mutable.rs | 
12 ++++++++++++ src/array/utf8/mutable_values.rs | 12 ++++++++++++ 4 files changed, 48 insertions(+) diff --git a/src/array/binary/mutable.rs b/src/array/binary/mutable.rs index 25bbe286c2..db0312fb54 100644 --- a/src/array/binary/mutable.rs +++ b/src/array/binary/mutable.rs @@ -404,6 +404,18 @@ impl MutableBinaryArray { let (offsets, values) = values_iter(iterator); Self::try_new(Self::default_data_type(), offsets, values, None).unwrap() } + + /// Extend with a fallible iterator + pub fn extend_fallible(&mut self, iter: I) -> std::result::Result<(), E> + where + E: std::error::Error, + I: IntoIterator, E>>, + T: AsRef<[u8]>, + { + let mut iter = iter.into_iter(); + self.reserve(iter.size_hint().0, 0); + iter.try_for_each(|x| Ok(self.push(x?))) + } } impl> Extend> for MutableBinaryArray { diff --git a/src/array/binary/mutable_values.rs b/src/array/binary/mutable_values.rs index 59f42b238f..53b2d93a9a 100644 --- a/src/array/binary/mutable_values.rs +++ b/src/array/binary/mutable_values.rs @@ -314,6 +314,18 @@ impl MutableBinaryValuesArray { } Ok(array) } + + /// Extend with a fallible iterator + pub fn extend_fallible(&mut self, iter: I) -> std::result::Result<(), E> + where + E: std::error::Error, + I: IntoIterator>, + T: AsRef<[u8]>, + { + let mut iter = iter.into_iter(); + self.reserve(iter.size_hint().0, 0); + iter.try_for_each(|x| Ok(self.push(x?))) + } } impl> Extend for MutableBinaryValuesArray { diff --git a/src/array/utf8/mutable.rs b/src/array/utf8/mutable.rs index ca2013eac7..6f5b7973ed 100644 --- a/src/array/utf8/mutable.rs +++ b/src/array/utf8/mutable.rs @@ -483,6 +483,18 @@ impl MutableUtf8Array { pub fn from_iter_values, I: Iterator>(iterator: I) -> Self { MutableUtf8ValuesArray::from_iter(iterator).into() } + + /// Extend with a fallible iterator + pub fn extend_fallible(&mut self, iter: I) -> std::result::Result<(), E> + where + E: std::error::Error, + I: IntoIterator, E>>, + T: AsRef, + { + let mut iter = iter.into_iter(); + self.reserve(iter.size_hint().0, 0); + iter.try_for_each(|x| Ok(self.push(x?))) + } } impl> Extend> for MutableUtf8Array { diff --git a/src/array/utf8/mutable_values.rs b/src/array/utf8/mutable_values.rs index fc8708667d..c70c870388 100644 --- a/src/array/utf8/mutable_values.rs +++ b/src/array/utf8/mutable_values.rs @@ -359,6 +359,18 @@ impl MutableUtf8ValuesArray { } Ok(array) } + + /// Extend with a fallible iterator + pub fn extend_fallible(&mut self, iter: I) -> std::result::Result<(), E> + where + E: std::error::Error, + I: IntoIterator>, + T: AsRef, + { + let mut iter = iter.into_iter(); + self.reserve(iter.size_hint().0, 0); + iter.try_for_each(|x| Ok(self.push(x?))) + } } impl> Extend for MutableUtf8ValuesArray { From 3d7d9acdda34f36d904e362de411792714794331 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Sun, 20 Aug 2023 19:15:58 +0800 Subject: [PATCH 54/80] feat: Support cast to large list. 
(#1547) --- src/compute/cast/mod.rs | 14 ++++++++++++++ tests/it/compute/cast.rs | 23 +++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index 14622f9b03..a9d27aa12d 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -104,6 +104,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (List(list_from), LargeList(list_to)) if list_from == list_to => true, (LargeList(list_from), List(list_to)) if list_from == list_to => true, (_, List(list_to)) => can_cast_types(from_type, &list_to.data_type), + (_, LargeList(list_to)) => can_cast_types(from_type, &list_to.data_type), (Dictionary(_, from_value_type, _), Dictionary(_, to_value_type, _)) => { can_cast_types(from_value_type, to_value_type) } @@ -509,6 +510,19 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu Ok(Box::new(list_array)) } + (_, LargeList(to)) => { + // cast primitive to list's primitive + let values = cast(array, &to.data_type, options)?; + // create offsets, where if array.len() = 2, we have [0,1,2] + let offsets = (0..=array.len() as i64).collect::>(); + // Safety: offsets _are_ monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }; + + let list_array = ListArray::::new(to_type.clone(), offsets.into(), values, None); + + Ok(Box::new(list_array)) + } + (Dictionary(index_type, ..), _) => match_integer_type!(index_type, |$T| { dictionary_cast_dyn::<$T>(array, to_type, options) }), diff --git a/tests/it/compute/cast.rs b/tests/it/compute/cast.rs index 01cb31d2f2..d8a9ecfce1 100644 --- a/tests/it/compute/cast.rs +++ b/tests/it/compute/cast.rs @@ -1,5 +1,6 @@ use arrow2::array::*; use arrow2::compute::cast::{can_cast_types, cast, CastOptions}; +use arrow2::datatypes::DataType::LargeList; use arrow2::datatypes::*; use arrow2::types::{days_ms, months_days_ns, NativeType}; @@ -120,6 +121,28 @@ fn i32_to_i32() { assert_eq!(c, &expected); } +#[test] +fn i32_to_large_list_i32() { + let array = Int32Array::from_slice([5, 6, 7, 8, 9]); + let b = cast( + &array, + &LargeList(Box::new(Field::new("item", DataType::Int32, true))), + CastOptions::default(), + ) + .unwrap(); + + let arr = b.as_any().downcast_ref::>().unwrap(); + assert_eq!(&[0, 1, 2, 3, 4, 5], arr.offsets().as_slice()); + let values = arr.values(); + let c = values + .as_any() + .downcast_ref::>() + .unwrap(); + + let expected = Int32Array::from_slice([5, 6, 7, 8, 9]); + assert_eq!(c, &expected); +} + #[test] fn i32_to_list_i32() { let array = Int32Array::from_slice([5, 6, 7, 8, 9]); From 697f7fb2bda471fb67ea9ae145975c477345d84b Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 20 Aug 2023 14:35:59 +0200 Subject: [PATCH 55/80] fix clippy (#1548) --- src/array/binary/mutable.rs | 5 ++++- src/array/binary/mutable_values.rs | 5 ++++- src/array/utf8/mutable.rs | 5 ++++- src/array/utf8/mutable_values.rs | 5 ++++- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/array/binary/mutable.rs b/src/array/binary/mutable.rs index db0312fb54..32a6f17acb 100644 --- a/src/array/binary/mutable.rs +++ b/src/array/binary/mutable.rs @@ -414,7 +414,10 @@ impl MutableBinaryArray { { let mut iter = iter.into_iter(); self.reserve(iter.size_hint().0, 0); - iter.try_for_each(|x| Ok(self.push(x?))) + iter.try_for_each(|x| { + self.push(x?); + Ok(()) + }) } } diff --git a/src/array/binary/mutable_values.rs b/src/array/binary/mutable_values.rs index 53b2d93a9a..3e14d9c578 100644 --- a/src/array/binary/mutable_values.rs 
+++ b/src/array/binary/mutable_values.rs @@ -324,7 +324,10 @@ impl MutableBinaryValuesArray { { let mut iter = iter.into_iter(); self.reserve(iter.size_hint().0, 0); - iter.try_for_each(|x| Ok(self.push(x?))) + iter.try_for_each(|x| { + self.push(x?); + Ok(()) + }) } } diff --git a/src/array/utf8/mutable.rs b/src/array/utf8/mutable.rs index 6f5b7973ed..108fe8e474 100644 --- a/src/array/utf8/mutable.rs +++ b/src/array/utf8/mutable.rs @@ -493,7 +493,10 @@ impl MutableUtf8Array { { let mut iter = iter.into_iter(); self.reserve(iter.size_hint().0, 0); - iter.try_for_each(|x| Ok(self.push(x?))) + iter.try_for_each(|x| { + self.push(x?); + Ok(()) + }) } } diff --git a/src/array/utf8/mutable_values.rs b/src/array/utf8/mutable_values.rs index c70c870388..dce8b09e4c 100644 --- a/src/array/utf8/mutable_values.rs +++ b/src/array/utf8/mutable_values.rs @@ -369,7 +369,10 @@ impl MutableUtf8ValuesArray { { let mut iter = iter.into_iter(); self.reserve(iter.size_hint().0, 0); - iter.try_for_each(|x| Ok(self.push(x?))) + iter.try_for_each(|x| { + self.push(x?); + Ok(()) + }) } } From ba6a882bc1542b0b899774b696ebea77482b5c31 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Mon, 21 Aug 2023 15:36:01 +0800 Subject: [PATCH 56/80] fix: LargeBinary to LargeList should be taken as a special (#1550) --- src/compute/cast/mod.rs | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index a9d27aa12d..8f89151a06 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -104,7 +104,6 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (List(list_from), LargeList(list_to)) if list_from == list_to => true, (LargeList(list_from), List(list_to)) if list_from == list_to => true, (_, List(list_to)) => can_cast_types(from_type, &list_to.data_type), - (_, LargeList(list_to)) => can_cast_types(from_type, &list_to.data_type), (Dictionary(_, from_value_type, _), Dictionary(_, to_value_type, _)) => { can_cast_types(from_value_type, to_value_type) } @@ -151,7 +150,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Timestamp(_, _), LargeUtf8) => true, (_, Utf8) => is_numeric(from_type) || from_type == &Binary, (_, LargeUtf8) => is_numeric(from_type) || from_type == &LargeBinary, - + (_, LargeList(list_to)) => can_cast_types(from_type, &list_to.data_type), (_, Binary) => is_numeric(from_type), (_, LargeBinary) => is_numeric(from_type), @@ -510,19 +509,6 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu Ok(Box::new(list_array)) } - (_, LargeList(to)) => { - // cast primitive to list's primitive - let values = cast(array, &to.data_type, options)?; - // create offsets, where if array.len() = 2, we have [0,1,2] - let offsets = (0..=array.len() as i64).collect::>(); - // Safety: offsets _are_ monotonically increasing - let offsets = unsafe { Offsets::new_unchecked(offsets) }; - - let list_array = ListArray::::new(to_type.clone(), offsets.into(), values, None); - - Ok(Box::new(list_array)) - } - (Dictionary(index_type, ..), _) => match_integer_type!(index_type, |$T| { dictionary_cast_dyn::<$T>(array, to_type, options) }), @@ -754,6 +740,19 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu ))), }, + (_, LargeList(to)) => { + // cast primitive to list's primitive + let values = cast(array, &to.data_type, options)?; + // create offsets, where if array.len() = 2, we have [0,1,2] + let offsets = (0..=array.len() as 
i64).collect::>(); + // Safety: offsets _are_ monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }; + + let list_array = ListArray::::new(to_type.clone(), offsets.into(), values, None); + + Ok(Box::new(list_array)) + } + (_, Binary) => match from_type { UInt8 => primitive_to_binary_dyn::(array), UInt16 => primitive_to_binary_dyn::(array), From f609d0c0cc138f00f297f05e8fe23f6bf195938c Mon Sep 17 00:00:00 2001 From: Yijun Zhao Date: Tue, 29 Aug 2023 20:24:57 +0800 Subject: [PATCH 57/80] Support nested decimal read write (#1553) * add decimal and decimal256 supports to array_to_page_nested * add decimal and decimal256 supports to array_to_page_nested * support nested decimal 256 * fix reviewer comments * add tests * fix tests --- parquet_integration/write_parquet.py | 6 + .../deserialize/fixed_size_binary/basic.rs | 34 ++-- .../read/deserialize/fixed_size_binary/mod.rs | 2 + .../deserialize/fixed_size_binary/nested.rs | 189 ++++++++++++++++++ src/io/parquet/read/deserialize/nested.rs | 159 +++++++++++++++ src/io/parquet/write/mod.rs | 133 ++++++++++++ tests/it/io/parquet/mod.rs | 86 ++++++-- tests/it/io/parquet/read.rs | 12 ++ tests/it/io/parquet/write.rs | 24 +++ 9 files changed, 609 insertions(+), 36 deletions(-) create mode 100644 src/io/parquet/read/deserialize/fixed_size_binary/nested.rs diff --git a/parquet_integration/write_parquet.py b/parquet_integration/write_parquet.py index acfd819d57..a7f7560fc5 100644 --- a/parquet_integration/write_parquet.py +++ b/parquet_integration/write_parquet.py @@ -178,6 +178,8 @@ def case_nested() -> Tuple[dict, pa.Schema, str]: [""], ] + decimal_nullable = [[Decimal(n) if n is not None else None for n in sublist] if sublist is not None else None for sublist in items_nullable] + list_struct_nullable = [ [{"a": "a"}, {"a": "b"}], None, @@ -222,6 +224,8 @@ def case_nested() -> Tuple[dict, pa.Schema, str]: pa.field("list_bool", pa.list_(pa.bool_())), pa.field("list_utf8", pa.list_(pa.utf8())), pa.field("list_large_binary", pa.list_(pa.large_binary())), + pa.field("list_decimal", pa.list_(pa.decimal128(9, 0))), + pa.field("list_decimal256", pa.list_(pa.decimal256(9, 0))), pa.field("list_nested_i64", pa.list_(pa.list_(pa.int64()))), pa.field("list_nested_inner_required_i64", pa.list_(pa.list_(pa.int64()))), pa.field( @@ -251,6 +255,8 @@ def case_nested() -> Tuple[dict, pa.Schema, str]: "list_bool": boolean, "list_utf8": string, "list_large_binary": string, + "list_decimal": decimal_nullable, + "list_decimal256": decimal_nullable, "list_nested_i64": items_nested, "list_nested_inner_required_i64": items_required_nested, "list_nested_inner_required_required_i64": items_required_nested_2, diff --git a/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs b/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs index ab47aa98cf..c77ff5f027 100644 --- a/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs +++ b/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs @@ -19,16 +19,16 @@ use super::super::utils::{ use super::super::Pages; use super::utils::FixedSizeBinary; -type Dict = Vec; +pub(super) type Dict = Vec; #[derive(Debug)] -struct Optional<'a> { - values: std::slice::ChunksExact<'a, u8>, - validity: OptionalPageValidity<'a>, +pub(super) struct Optional<'a> { + pub(super) values: std::slice::ChunksExact<'a, u8>, + pub(super) validity: OptionalPageValidity<'a>, } impl<'a> Optional<'a> { - fn try_new(page: &'a DataPage, size: usize) -> Result { + pub(super) fn try_new(page: &'a DataPage, size: usize) -> 
Result { let (_, _, values) = split_buffer(page)?; let values = values.chunks_exact(size); @@ -41,12 +41,12 @@ impl<'a> Optional<'a> { } #[derive(Debug)] -struct Required<'a> { +pub(super) struct Required<'a> { pub values: std::slice::ChunksExact<'a, u8>, } impl<'a> Required<'a> { - fn new(page: &'a DataPage, size: usize) -> Self { + pub(super) fn new(page: &'a DataPage, size: usize) -> Self { let values = page.buffer(); assert_eq!(values.len() % size, 0); let values = values.chunks_exact(size); @@ -60,7 +60,7 @@ impl<'a> Required<'a> { } #[derive(Debug)] -struct FilteredRequired<'a> { +pub(super) struct FilteredRequired<'a> { pub values: SliceFilteredIter>, } @@ -83,13 +83,13 @@ impl<'a> FilteredRequired<'a> { } #[derive(Debug)] -struct RequiredDictionary<'a> { +pub(super) struct RequiredDictionary<'a> { pub values: hybrid_rle::HybridRleDecoder<'a>, - dict: &'a Dict, + pub dict: &'a Dict, } impl<'a> RequiredDictionary<'a> { - fn try_new(page: &'a DataPage, dict: &'a Dict) -> Result { + pub(super) fn try_new(page: &'a DataPage, dict: &'a Dict) -> Result { let values = dict_indices_decoder(page)?; Ok(Self { dict, values }) @@ -102,14 +102,14 @@ impl<'a> RequiredDictionary<'a> { } #[derive(Debug)] -struct OptionalDictionary<'a> { - values: hybrid_rle::HybridRleDecoder<'a>, - validity: OptionalPageValidity<'a>, - dict: &'a Dict, +pub(super) struct OptionalDictionary<'a> { + pub(super) values: hybrid_rle::HybridRleDecoder<'a>, + pub(super) validity: OptionalPageValidity<'a>, + pub(super) dict: &'a Dict, } impl<'a> OptionalDictionary<'a> { - fn try_new(page: &'a DataPage, dict: &'a Dict) -> Result { + pub(super) fn try_new(page: &'a DataPage, dict: &'a Dict) -> Result { let values = dict_indices_decoder(page)?; Ok(Self { @@ -267,7 +267,7 @@ impl<'a> Decoder<'a> for BinaryDecoder { } } -fn finish( +pub fn finish( data_type: &DataType, values: FixedSizeBinary, validity: MutableBitmap, diff --git a/src/io/parquet/read/deserialize/fixed_size_binary/mod.rs b/src/io/parquet/read/deserialize/fixed_size_binary/mod.rs index 0ed9e60eac..c48bfe276b 100644 --- a/src/io/parquet/read/deserialize/fixed_size_binary/mod.rs +++ b/src/io/parquet/read/deserialize/fixed_size_binary/mod.rs @@ -1,6 +1,8 @@ mod basic; mod dictionary; +mod nested; mod utils; pub use basic::Iter; pub use dictionary::{DictIter, NestedDictIter}; +pub use nested::NestedIter; diff --git a/src/io/parquet/read/deserialize/fixed_size_binary/nested.rs b/src/io/parquet/read/deserialize/fixed_size_binary/nested.rs new file mode 100644 index 0000000000..5cef9eabfc --- /dev/null +++ b/src/io/parquet/read/deserialize/fixed_size_binary/nested.rs @@ -0,0 +1,189 @@ +use std::collections::VecDeque; + +use parquet2::{ + encoding::Encoding, + page::{DataPage, DictPage}, + schema::Repetition, +}; + +use super::super::utils::{not_implemented, MaybeNext, PageState}; +use super::utils::FixedSizeBinary; +use crate::array::FixedSizeBinaryArray; +use crate::io::parquet::read::deserialize::fixed_size_binary::basic::{ + finish, Dict, Optional, OptionalDictionary, Required, RequiredDictionary, +}; +use crate::io::parquet::read::deserialize::nested_utils::{next, NestedDecoder}; +use crate::io::parquet::read::deserialize::utils::Pushable; +use crate::io::parquet::read::{InitNested, NestedState}; +use crate::{bitmap::MutableBitmap, datatypes::DataType, error::Result, io::parquet::read::Pages}; + +#[derive(Debug)] +enum State<'a> { + Optional(Optional<'a>), + Required(Required<'a>), + RequiredDictionary(RequiredDictionary<'a>), + 
OptionalDictionary(OptionalDictionary<'a>), +} + +impl<'a> PageState<'a> for State<'a> { + fn len(&self) -> usize { + match self { + State::Optional(state) => state.validity.len(), + State::Required(state) => state.len(), + State::RequiredDictionary(state) => state.len(), + State::OptionalDictionary(state) => state.validity.len(), + } + } +} + +#[derive(Debug, Default)] +struct BinaryDecoder { + size: usize, +} + +impl<'a> NestedDecoder<'a> for BinaryDecoder { + type State = State<'a>; + type Dictionary = Dict; + type DecodedState = (FixedSizeBinary, MutableBitmap); + + fn build_state( + &self, + page: &'a DataPage, + dict: Option<&'a Self::Dictionary>, + ) -> Result { + let is_optional = + page.descriptor.primitive_type.field_info.repetition == Repetition::Optional; + let is_filtered = page.selected_rows().is_some(); + + match (page.encoding(), dict, is_optional, is_filtered) { + (Encoding::Plain, _, true, false) => { + Ok(State::Optional(Optional::try_new(page, self.size)?)) + } + (Encoding::Plain, _, false, false) => { + Ok(State::Required(Required::new(page, self.size))) + } + (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false, false) => { + RequiredDictionary::try_new(page, dict).map(State::RequiredDictionary) + } + (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), true, false) => { + OptionalDictionary::try_new(page, dict).map(State::OptionalDictionary) + } + _ => Err(not_implemented(page)), + } + } + + fn with_capacity(&self, capacity: usize) -> Self::DecodedState { + ( + FixedSizeBinary::with_capacity(capacity, self.size), + MutableBitmap::with_capacity(capacity), + ) + } + + fn push_valid(&self, state: &mut Self::State, decoded: &mut Self::DecodedState) -> Result<()> { + let (values, validity) = decoded; + match state { + State::Optional(page) => { + let value = page.values.by_ref().next().unwrap_or_default(); + values.push(value); + validity.push(true); + } + State::Required(page) => { + let value = page.values.by_ref().next().unwrap_or_default(); + values.push(value); + } + State::RequiredDictionary(page) => { + let item = page + .values + .by_ref() + .next() + .map(|index| { + let index = index.unwrap() as usize; + &page.dict[index * self.size..(index + 1) * self.size] + }) + .unwrap_or_default(); + values.push(item); + } + State::OptionalDictionary(page) => { + let item = page + .values + .by_ref() + .next() + .map(|index| { + let index = index.unwrap() as usize; + &page.dict[index * self.size..(index + 1) * self.size] + }) + .unwrap_or_default(); + values.push(item); + validity.push(true); + } + } + Ok(()) + } + + fn push_null(&self, decoded: &mut Self::DecodedState) { + let (values, validity) = decoded; + values.push_null(); + validity.push(false); + } + + fn deserialize_dict(&self, page: &DictPage) -> Self::Dictionary { + page.buffer.clone() + } +} + +pub struct NestedIter { + iter: I, + data_type: DataType, + size: usize, + init: Vec, + items: VecDeque<(NestedState, (FixedSizeBinary, MutableBitmap))>, + dict: Option, + chunk_size: Option, + remaining: usize, +} + +impl NestedIter { + pub fn new( + iter: I, + init: Vec, + data_type: DataType, + num_rows: usize, + chunk_size: Option, + ) -> Self { + let size = FixedSizeBinaryArray::get_size(&data_type); + Self { + iter, + data_type, + size, + init, + items: VecDeque::new(), + dict: None, + chunk_size, + remaining: num_rows, + } + } +} + +impl Iterator for NestedIter { + type Item = Result<(NestedState, FixedSizeBinaryArray)>; + + fn next(&mut self) -> Option { + let maybe_state = next( + 
&mut self.iter, + &mut self.items, + &mut self.dict, + &mut self.remaining, + &self.init, + self.chunk_size, + &BinaryDecoder { size: self.size }, + ); + match maybe_state { + MaybeNext::Some(Ok((nested, decoded))) => { + Some(Ok((nested, finish(&self.data_type, decoded.0, decoded.1)))) + } + MaybeNext::Some(Err(e)) => Some(Err(e)), + MaybeNext::None => None, + MaybeNext::More => self.next(), + } + } +} diff --git a/src/io/parquet/read/deserialize/nested.rs b/src/io/parquet/read/deserialize/nested.rs index 0887751438..ff27ca007b 100644 --- a/src/io/parquet/read/deserialize/nested.rs +++ b/src/io/parquet/read/deserialize/nested.rs @@ -1,5 +1,7 @@ +use ethnum::I256; use parquet2::schema::types::PrimitiveType; +use crate::array::PrimitiveArray; use crate::{ datatypes::{DataType, Field}, error::{Error, Result}, @@ -261,6 +263,163 @@ where }); Box::new(iter) as _ } + DataType::Decimal(_, _) => { + init.push(InitNested::Primitive(field.is_nullable)); + let type_ = types.pop().unwrap(); + match type_.physical_type { + PhysicalType::Int32 => primitive(primitive::NestedIter::new( + columns.pop().unwrap(), + init, + field.data_type.clone(), + num_rows, + chunk_size, + |x: i32| x as i128, + )), + PhysicalType::Int64 => primitive(primitive::NestedIter::new( + columns.pop().unwrap(), + init, + field.data_type.clone(), + num_rows, + chunk_size, + |x: i64| x as i128, + )), + PhysicalType::FixedLenByteArray(n) if n > 16 => { + return Err(Error::InvalidArgumentError(format!( + "Can't decode Decimal128 type from `FixedLenByteArray` of len {n}" + ))) + } + PhysicalType::FixedLenByteArray(n) => { + let iter = fixed_size_binary::NestedIter::new( + columns.pop().unwrap(), + init, + DataType::FixedSizeBinary(n), + num_rows, + chunk_size, + ); + // Convert the fixed length byte array to Decimal. + let iter = iter.map(move |x| { + let (mut nested, array) = x?; + let values = array + .values() + .chunks_exact(n) + .map(|value: &[u8]| super::super::convert_i128(value, n)) + .collect::>(); + let validity = array.validity().cloned(); + + let array: Box = Box::new(PrimitiveArray::::try_new( + field.data_type.clone(), + values.into(), + validity, + )?); + + let _ = nested.nested.pop().unwrap(); // the primitive + + Ok((nested, array)) + }); + Box::new(iter) + } + _ => { + return Err(Error::nyi(format!( + "Deserializing type for Decimal {:?} from parquet", + type_.physical_type + ))) + } + } + } + DataType::Decimal256(_, _) => { + init.push(InitNested::Primitive(field.is_nullable)); + let type_ = types.pop().unwrap(); + match type_.physical_type { + PhysicalType::Int32 => primitive(primitive::NestedIter::new( + columns.pop().unwrap(), + init, + field.data_type.clone(), + num_rows, + chunk_size, + |x: i32| i256(I256::new(x as i128)), + )), + PhysicalType::Int64 => primitive(primitive::NestedIter::new( + columns.pop().unwrap(), + init, + field.data_type.clone(), + num_rows, + chunk_size, + |x: i64| i256(I256::new(x as i128)), + )), + PhysicalType::FixedLenByteArray(n) if n <= 16 => { + let iter = fixed_size_binary::NestedIter::new( + columns.pop().unwrap(), + init, + DataType::FixedSizeBinary(n), + num_rows, + chunk_size, + ); + // Convert the fixed length byte array to Decimal. 
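+                    // Parquet stores these as `n` big-endian,
+                    // two's-complement bytes; `convert_i128`
+                    // sign-extends them into an i128 (the simple,
+                    // non-nested path does the same conversion).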
+ let iter = iter.map(move |x| { + let (mut nested, array) = x?; + let values = array + .values() + .chunks_exact(n) + .map(|value| i256(I256::new(super::super::convert_i128(value, n)))) + .collect::>(); + let validity = array.validity().cloned(); + + let array: Box = Box::new(PrimitiveArray::::try_new( + field.data_type.clone(), + values.into(), + validity, + )?); + + let _ = nested.nested.pop().unwrap(); // the primitive + + Ok((nested, array)) + }); + Box::new(iter) as _ + } + + PhysicalType::FixedLenByteArray(n) if n <= 32 => { + let iter = fixed_size_binary::NestedIter::new( + columns.pop().unwrap(), + init, + DataType::FixedSizeBinary(n), + num_rows, + chunk_size, + ); + // Convert the fixed length byte array to Decimal. + let iter = iter.map(move |x| { + let (mut nested, array) = x?; + let values = array + .values() + .chunks_exact(n) + .map(super::super::convert_i256) + .collect::>(); + let validity = array.validity().cloned(); + + let array: Box = Box::new(PrimitiveArray::::try_new( + field.data_type.clone(), + values.into(), + validity, + )?); + + let _ = nested.nested.pop().unwrap(); // the primitive + + Ok((nested, array)) + }); + Box::new(iter) as _ + } + PhysicalType::FixedLenByteArray(n) => { + return Err(Error::InvalidArgumentError(format!( + "Can't decode Decimal256 type from from `FixedLenByteArray` of len {n}" + ))) + } + _ => { + return Err(Error::nyi(format!( + "Deserializing type for Decimal {:?} from parquet", + type_.physical_type + ))) + } + } + } DataType::Struct(fields) => { let columns = fields .iter() diff --git a/src/io/parquet/write/mod.rs b/src/io/parquet/write/mod.rs index a0040a9a0d..7889ea04fa 100644 --- a/src/io/parquet/write/mod.rs +++ b/src/io/parquet/write/mod.rs @@ -677,6 +677,139 @@ fn array_to_page_nested( let array = array.as_any().downcast_ref().unwrap(); primitive::nested_array_to_page::(array, options, type_, nested) } + Decimal(precision, _) => { + let type_ = type_; + let precision = *precision; + let array = array + .as_any() + .downcast_ref::>() + .unwrap(); + if precision <= 9 { + let values = array + .values() + .iter() + .map(|x| *x as i32) + .collect::>() + .into(); + + let array = + PrimitiveArray::::new(DataType::Int32, values, array.validity().cloned()); + primitive::nested_array_to_page::(&array, options, type_, nested) + } else if precision <= 18 { + let values = array + .values() + .iter() + .map(|x| *x as i64) + .collect::>() + .into(); + + let array = + PrimitiveArray::::new(DataType::Int64, values, array.validity().cloned()); + primitive::nested_array_to_page::(&array, options, type_, nested) + } else { + let size = decimal_length_from_precision(precision); + + let statistics = if options.write_statistics { + let stats = + fixed_len_bytes::build_statistics_decimal(array, type_.clone(), size); + Some(stats) + } else { + None + }; + + let mut values = Vec::::with_capacity(size * array.len()); + array.values().iter().for_each(|x| { + let bytes = &x.to_be_bytes()[16 - size..]; + values.extend_from_slice(bytes) + }); + let array = FixedSizeBinaryArray::new( + DataType::FixedSizeBinary(size), + values.into(), + array.validity().cloned(), + ); + fixed_len_bytes::array_to_page(&array, options, type_, statistics) + } + } + Decimal256(precision, _) => { + let type_ = type_; + let precision = *precision; + let array = array + .as_any() + .downcast_ref::>() + .unwrap(); + if precision <= 9 { + let values = array + .values() + .iter() + .map(|x| x.0.as_i32()) + .collect::>() + .into(); + + let array = + 
PrimitiveArray::::new(DataType::Int32, values, array.validity().cloned()); + primitive::nested_array_to_page::(&array, options, type_, nested) + } else if precision <= 18 { + let values = array + .values() + .iter() + .map(|x| x.0.as_i64()) + .collect::>() + .into(); + + let array = + PrimitiveArray::::new(DataType::Int64, values, array.validity().cloned()); + primitive::nested_array_to_page::(&array, options, type_, nested) + } else if precision <= 38 { + let size = decimal_length_from_precision(precision); + let statistics = if options.write_statistics { + let stats = fixed_len_bytes::build_statistics_decimal256_with_i128( + array, + type_.clone(), + size, + ); + Some(stats) + } else { + None + }; + + let mut values = Vec::::with_capacity(size * array.len()); + array.values().iter().for_each(|x| { + let bytes = &x.0.low().to_be_bytes()[16 - size..]; + values.extend_from_slice(bytes) + }); + let array = FixedSizeBinaryArray::new( + DataType::FixedSizeBinary(size), + values.into(), + array.validity().cloned(), + ); + fixed_len_bytes::array_to_page(&array, options, type_, statistics) + } else { + let size = 32; + let array = array + .as_any() + .downcast_ref::>() + .unwrap(); + let statistics = if options.write_statistics { + let stats = + fixed_len_bytes::build_statistics_decimal256(array, type_.clone(), size); + Some(stats) + } else { + None + }; + let mut values = Vec::::with_capacity(size * array.len()); + array.values().iter().for_each(|x| { + let bytes = &x.to_be_bytes(); + values.extend_from_slice(bytes) + }); + let array = FixedSizeBinaryArray::new( + DataType::FixedSizeBinary(size), + values.into(), + array.validity().cloned(), + ); + + fixed_len_bytes::array_to_page(&array, options, type_, statistics) + } + } other => Err(Error::NotYetImplemented(format!( "Writing nested parquet pages for data type {other:?}" ))), diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs index 1ad218e0fe..4539d21a33 100644 --- a/tests/it/io/parquet/mod.rs +++ b/tests/it/io/parquet/mod.rs @@ -63,10 +63,9 @@ pub fn read_column(mut reader: R, column: &str) -> Result Box { @@ -131,26 +130,26 @@ pub fn pyarrow_nested_edge(column: &str) -> Box { } pub fn pyarrow_nested_nullable(column: &str) -> Box { + let i64_values = &[ + Some(0), + Some(1), + Some(2), + None, + Some(3), + Some(4), + Some(5), + Some(6), + Some(7), + Some(8), + Some(9), + Some(10), + ]; let offsets = vec![0, 2, 2, 5, 8, 8, 11, 11, 12].try_into().unwrap(); let values = match column { "list_int64" => { // [[0, 1], None, [2, None, 3], [4, 5, 6], [], [7, 8, 9], None, [10]] - PrimitiveArray::::from(&[ - Some(0), - Some(1), - Some(2), - None, - Some(3), - Some(4), - Some(5), - Some(6), - Some(7), - Some(8), - Some(9), - Some(10), - ]) - .boxed() + PrimitiveArray::::from(i64_values).boxed() } "list_int64_required" | "list_int64_optional_required" | "list_int64_required_required" => { // [[0, 1], None, [2, 0, 3], [4, 5, 6], [], [7, 8, 9], None, [10]] @@ -241,6 +240,21 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { Some(b"bbb".to_vec()), Some(b"".to_vec()), ])), + "list_decimal" => { + let values = i64_values + .iter() + .map(|x| x.map(|x| x as i128)) + .collect::>(); + Box::new(PrimitiveArray::::from(values).to(DataType::Decimal(9, 0))) + } + "list_decimal256" => { + let values = i64_values + .iter() + .map(|x| x.map(|x| i256(x.as_i256()))) + .collect::>(); + let array = PrimitiveArray::::from(values).to(DataType::Decimal256(9, 0)); + Box::new(array) + } "list_nested_i64" | "list_nested_inner_required_i64" | 
"list_nested_inner_required_required_i64" => Box::new(NullArray::new(DataType::Null, 1)), @@ -422,6 +436,8 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { "list_bool" => Field::new("item", DataType::Boolean, true), "list_utf8" => Field::new("item", DataType::Utf8, true), "list_large_binary" => Field::new("item", DataType::LargeBinary, true), + "list_decimal" => Field::new("item", DataType::Decimal(9, 0), true), + "list_decimal256" => Field::new("item", DataType::Decimal256(9, 0), true), "list_struct_nullable" => Field::new("item", values.data_type().clone(), true), "list_struct_list_nullable" => Field::new("item", values.data_type().clone(), true), other => unreachable!("{}", other), @@ -868,6 +884,38 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { min_value: new_list(Box::new(BinaryArray::::from_slice([b""])), true).boxed(), max_value: new_list(Box::new(BinaryArray::::from_slice([b"ccc"])), true).boxed(), }, + "list_decimal" => Statistics { + distinct_count: new_list(UInt64Array::from([None]).boxed(), true).boxed(), + null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), + min_value: new_list( + Box::new(Int128Array::from_slice([0]).to(DataType::Decimal(9, 0))), + true, + ) + .boxed(), + max_value: new_list( + Box::new(Int128Array::from_slice([10]).to(DataType::Decimal(9, 0))), + true, + ) + .boxed(), + }, + "list_decimal256" => Statistics { + distinct_count: new_list(UInt64Array::from([None]).boxed(), true).boxed(), + null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), + min_value: new_list( + Box::new( + Int256Array::from_slice([i256(0.as_i256())]).to(DataType::Decimal256(9, 0)), + ), + true, + ) + .boxed(), + max_value: new_list( + Box::new( + Int256Array::from_slice([i256(10.as_i256())]).to(DataType::Decimal256(9, 0)), + ), + true, + ) + .boxed(), + }, "list_int64" => Statistics { distinct_count: new_list(UInt64Array::from([None]).boxed(), true).boxed(), null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), diff --git a/tests/it/io/parquet/read.rs b/tests/it/io/parquet/read.rs index 786bdf6f96..a2237b4926 100644 --- a/tests/it/io/parquet/read.rs +++ b/tests/it/io/parquet/read.rs @@ -62,6 +62,8 @@ fn test_pyarrow_integration( "list_nested_i64", "list_utf8", "list_bool", + "list_decimal", + "list_decimal256", "list_nested_inner_required_required_i64", "list_nested_inner_required_i64", // pyarrow counts null struct items as nulls @@ -322,6 +324,16 @@ fn v1_nested_large_binary() -> Result<()> { test_pyarrow_integration("list_large_binary", 1, "nested", false, false, None) } +#[test] +fn v2_nested_decimal_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal", 2, "nested", false, false, None) +} + +#[test] +fn v2_nested_decimal256_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal256", 2, "nested", false, false, None) +} + #[test] fn v2_nested_nested() -> Result<()> { test_pyarrow_integration("list_nested_i64", 2, "nested", false, false, None) diff --git a/tests/it/io/parquet/write.rs b/tests/it/io/parquet/write.rs index 439710eb24..5fda011374 100644 --- a/tests/it/io/parquet/write.rs +++ b/tests/it/io/parquet/write.rs @@ -404,6 +404,30 @@ fn list_struct_nullable() -> Result<()> { ) } +#[test] +fn list_decimal_nullable() -> Result<()> { + round_trip_opt_stats( + "list_decimal", + "nested", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal256_nullable() -> Result<()> { + round_trip_opt_stats( + 
"list_decimal256", + "nested", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + #[test] fn v1_nested_struct_list_nullable() -> Result<()> { round_trip_opt_stats( From a9139196f0f1498959a804d533fe2397cd36fb2a Mon Sep 17 00:00:00 2001 From: Jay Chia <17691182+jaychia@users.noreply.github.com> Date: Sat, 2 Sep 2023 17:10:03 -0700 Subject: [PATCH 58/80] Correctly coerce Parquet Int96 timestamps into requested TimeUnits (#1532) * Add correct coercion logic when reading int96 timestamps into specified timeunits * Refactor to better inline nested function call in Iter::new * Fix static check issues - immediately wrap and return an ArrayIter object * Fix lints --------- Co-authored-by: Jay Chia --- src/io/parquet/read/deserialize/simple.rs | 77 ++++++++++++++++++++--- tests/it/io/parquet/read.rs | 58 +++++++++++++++++ 2 files changed, 125 insertions(+), 10 deletions(-) diff --git a/src/io/parquet/read/deserialize/simple.rs b/src/io/parquet/read/deserialize/simple.rs index b4b614980e..d19296a4b7 100644 --- a/src/io/parquet/read/deserialize/simple.rs +++ b/src/io/parquet/read/deserialize/simple.rs @@ -391,6 +391,44 @@ fn unifiy_timestmap_unit( } } +#[inline] +pub fn int96_to_i64_us(value: [u32; 3]) -> i64 { + const JULIAN_DAY_OF_EPOCH: i64 = 2_440_588; + const SECONDS_PER_DAY: i64 = 86_400; + const MICROS_PER_SECOND: i64 = 1_000_000; + + let day = value[2] as i64; + let microseconds = (((value[1] as i64) << 32) + value[0] as i64) / 1_000; + let seconds = (day - JULIAN_DAY_OF_EPOCH) * SECONDS_PER_DAY; + + seconds * MICROS_PER_SECOND + microseconds +} + +#[inline] +pub fn int96_to_i64_ms(value: [u32; 3]) -> i64 { + const JULIAN_DAY_OF_EPOCH: i64 = 2_440_588; + const SECONDS_PER_DAY: i64 = 86_400; + const MILLIS_PER_SECOND: i64 = 1_000; + + let day = value[2] as i64; + let milliseconds = (((value[1] as i64) << 32) + value[0] as i64) / 1_000_000; + let seconds = (day - JULIAN_DAY_OF_EPOCH) * SECONDS_PER_DAY; + + seconds * MILLIS_PER_SECOND + milliseconds +} + +#[inline] +pub fn int96_to_i64_s(value: [u32; 3]) -> i64 { + const JULIAN_DAY_OF_EPOCH: i64 = 2_440_588; + const SECONDS_PER_DAY: i64 = 86_400; + + let day = value[2] as i64; + let seconds = (((value[1] as i64) << 32) + value[0] as i64) / 1_000_000_000; + let day_seconds = (day - JULIAN_DAY_OF_EPOCH) * SECONDS_PER_DAY; + + day_seconds + seconds +} + fn timestamp<'a, I: Pages + 'a>( pages: I, physical_type: &PhysicalType, @@ -401,16 +439,35 @@ fn timestamp<'a, I: Pages + 'a>( time_unit: TimeUnit, ) -> Result> { if physical_type == &PhysicalType::Int96 { - let iter = primitive::Iter::new(pages, data_type, num_rows, chunk_size, int96_to_i64_ns); - let logical_type = PrimitiveLogicalType::Timestamp { - unit: ParquetTimeUnit::Nanoseconds, - is_adjusted_to_utc: false, - }; - let (factor, is_multiplier) = unifiy_timestmap_unit(&Some(logical_type), time_unit); - return match (factor, is_multiplier) { - (1, _) => Ok(dyn_iter(iden(iter))), - (a, true) => Ok(dyn_iter(op(iter, move |x| x * a))), - (a, false) => Ok(dyn_iter(op(iter, move |x| x / a))), + return match time_unit { + TimeUnit::Nanosecond => Ok(dyn_iter(iden(primitive::Iter::new( + pages, + data_type, + num_rows, + chunk_size, + int96_to_i64_ns, + )))), + TimeUnit::Microsecond => Ok(dyn_iter(iden(primitive::Iter::new( + pages, + data_type, + num_rows, + chunk_size, + int96_to_i64_us, + )))), + TimeUnit::Millisecond => Ok(dyn_iter(iden(primitive::Iter::new( + pages, + data_type, + num_rows, + chunk_size, + int96_to_i64_ms, + )))), + TimeUnit::Second => 
Ok(dyn_iter(iden(primitive::Iter::new( + pages, + data_type, + num_rows, + chunk_size, + int96_to_i64_s, + )))), }; }; diff --git a/tests/it/io/parquet/read.rs b/tests/it/io/parquet/read.rs index a2237b4926..8f45eb874d 100644 --- a/tests/it/io/parquet/read.rs +++ b/tests/it/io/parquet/read.rs @@ -782,3 +782,61 @@ fn invalid_utf8() -> Result<()> { ); Ok(()) } + +#[test] +fn read_int96_timestamps() -> Result<()> { + use std::collections::BTreeMap; + + let timestamp_data = &[ + 0x50, 0x41, 0x52, 0x31, 0x15, 0x04, 0x15, 0x48, 0x15, 0x3c, 0x4c, 0x15, 0x06, 0x15, 0x00, + 0x12, 0x00, 0x00, 0x24, 0x00, 0x00, 0x0d, 0x01, 0x08, 0x9f, 0xd5, 0x1f, 0x0d, 0x0a, 0x44, + 0x00, 0x00, 0x59, 0x68, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x14, + 0xfb, 0x2a, 0x00, 0x15, 0x00, 0x15, 0x14, 0x15, 0x18, 0x2c, 0x15, 0x06, 0x15, 0x10, 0x15, + 0x06, 0x15, 0x06, 0x1c, 0x00, 0x00, 0x00, 0x0a, 0x24, 0x02, 0x00, 0x00, 0x00, 0x06, 0x01, + 0x02, 0x03, 0x24, 0x00, 0x26, 0x9e, 0x01, 0x1c, 0x15, 0x06, 0x19, 0x35, 0x10, 0x00, 0x06, + 0x19, 0x18, 0x0a, 0x74, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x73, 0x15, 0x02, + 0x16, 0x06, 0x16, 0x9e, 0x01, 0x16, 0x96, 0x01, 0x26, 0x60, 0x26, 0x08, 0x29, 0x2c, 0x15, + 0x04, 0x15, 0x00, 0x15, 0x02, 0x00, 0x15, 0x00, 0x15, 0x10, 0x15, 0x02, 0x00, 0x00, 0x00, + 0x15, 0x04, 0x19, 0x2c, 0x35, 0x00, 0x18, 0x06, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x15, + 0x02, 0x00, 0x15, 0x06, 0x25, 0x02, 0x18, 0x0a, 0x74, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, + 0x6d, 0x70, 0x73, 0x00, 0x16, 0x06, 0x19, 0x1c, 0x19, 0x1c, 0x26, 0x9e, 0x01, 0x1c, 0x15, + 0x06, 0x19, 0x35, 0x10, 0x00, 0x06, 0x19, 0x18, 0x0a, 0x74, 0x69, 0x6d, 0x65, 0x73, 0x74, + 0x61, 0x6d, 0x70, 0x73, 0x15, 0x02, 0x16, 0x06, 0x16, 0x9e, 0x01, 0x16, 0x96, 0x01, 0x26, + 0x60, 0x26, 0x08, 0x29, 0x2c, 0x15, 0x04, 0x15, 0x00, 0x15, 0x02, 0x00, 0x15, 0x00, 0x15, + 0x10, 0x15, 0x02, 0x00, 0x00, 0x00, 0x16, 0x9e, 0x01, 0x16, 0x06, 0x26, 0x08, 0x16, 0x96, + 0x01, 0x14, 0x00, 0x00, 0x28, 0x20, 0x70, 0x61, 0x72, 0x71, 0x75, 0x65, 0x74, 0x2d, 0x63, + 0x70, 0x70, 0x2d, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x20, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, + 0x6e, 0x20, 0x31, 0x32, 0x2e, 0x30, 0x2e, 0x30, 0x19, 0x1c, 0x1c, 0x00, 0x00, 0x00, 0x95, + 0x00, 0x00, 0x00, 0x50, 0x41, 0x52, 0x31, + ]; + + let parse = |time_unit: TimeUnit| { + let mut reader = Cursor::new(timestamp_data); + let metadata = read_metadata(&mut reader)?; + let schema = arrow2::datatypes::Schema { + fields: vec![arrow2::datatypes::Field::new( + "timestamps", + arrow2::datatypes::DataType::Timestamp(time_unit, None), + false, + )], + metadata: BTreeMap::new(), + }; + let reader = FileReader::new(reader, metadata.row_groups, schema, Some(5), None, None); + reader.collect::>>() + }; + + // This data contains int96 timestamps in the year 1000 and 3000, which are out of range for + // Timestamp(TimeUnit::Nanoseconds) and will cause a panic in dev builds/overflow in release builds + // However, the code should work for the Microsecond/Millisecond time units + for time_unit in [ + arrow2::datatypes::TimeUnit::Microsecond, + arrow2::datatypes::TimeUnit::Millisecond, + arrow2::datatypes::TimeUnit::Second, + ] { + parse(time_unit).expect("Should not error"); + } + std::panic::catch_unwind(|| parse(arrow2::datatypes::TimeUnit::Nanosecond)) + .expect_err("Should be a panic error"); + + Ok(()) +} From f6135f57bf08cceb1be9c2f7d104e297bdedba48 Mon Sep 17 00:00:00 2001 From: sundyli <543950155@qq.com> Date: Sat, 2 Sep 2023 19:19:26 -0700 Subject: [PATCH 59/80] chore: fix clippy (#1558) --- 
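The new `int96_to_i64_*` helpers all split the Int96 the same way: words `[0]` and `[1]` form a little-endian 64-bit count of nanoseconds within the day, word `[2]` is the Julian day number. The unit-specific variants divide the in-day nanoseconds before scaling, which is what keeps far-past and far-future dates (like the year-1000 and year-3000 values in the test above) inside `i64` range, whereas the nanosecond path overflows outside roughly 1677-2262. A compact sketch of the shared arithmetic, with constants copied from the patch:

```rust
// Sketch of the Int96 decomposition used by the helpers above; only the
// final scaling differs per time unit.
const JULIAN_DAY_OF_EPOCH: i64 = 2_440_588; // Julian day of 1970-01-01
const SECONDS_PER_DAY: i64 = 86_400;

fn int96_split(value: [u32; 3]) -> (i64, i64) {
    let day_seconds = (value[2] as i64 - JULIAN_DAY_OF_EPOCH) * SECONDS_PER_DAY;
    let nanos_in_day = ((value[1] as i64) << 32) + value[0] as i64;
    (day_seconds, nanos_in_day)
}

fn main() {
    // midnight at the Unix epoch
    let (secs, nanos) = int96_split([0, 0, 2_440_588]);
    assert_eq!((secs, nanos), (0, 0));

    // microsecond path: divide the nanos first, then scale the seconds, so
    // the multiplication never sees the full nanosecond magnitude
    let micros = secs * 1_000_000 + nanos / 1_000;
    assert_eq!(micros, 0);
}
```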
src/compute/cast/primitive_to.rs | 8 ++++---- src/compute/temporal.rs | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/compute/cast/primitive_to.rs b/src/compute/cast/primitive_to.rs index 585e826cdd..30b265e2d5 100644 --- a/src/compute/cast/primitive_to.rs +++ b/src/compute/cast/primitive_to.rs @@ -433,7 +433,7 @@ where x.map(|x| { let datetime = timestamp_ns_to_datetime(*x); let offset = timezone.offset_from_utc_datetime(&datetime); - chrono::DateTime::::from_utc(datetime, offset).to_rfc3339() + chrono::DateTime::::from_naive_utc_and_offset(datetime, offset).to_rfc3339() }) }); Utf8Array::from_trusted_len_iter(iter) @@ -443,7 +443,7 @@ where x.map(|x| { let datetime = timestamp_us_to_datetime(*x); let offset = timezone.offset_from_utc_datetime(&datetime); - chrono::DateTime::::from_utc(datetime, offset).to_rfc3339() + chrono::DateTime::::from_naive_utc_and_offset(datetime, offset).to_rfc3339() }) }); Utf8Array::from_trusted_len_iter(iter) @@ -453,7 +453,7 @@ where x.map(|x| { let datetime = timestamp_ms_to_datetime(*x); let offset = timezone.offset_from_utc_datetime(&datetime); - chrono::DateTime::::from_utc(datetime, offset).to_rfc3339() + chrono::DateTime::::from_naive_utc_and_offset(datetime, offset).to_rfc3339() }) }); Utf8Array::from_trusted_len_iter(iter) @@ -463,7 +463,7 @@ where x.map(|x| { let datetime = timestamp_s_to_datetime(*x); let offset = timezone.offset_from_utc_datetime(&datetime); - chrono::DateTime::::from_utc(datetime, offset).to_rfc3339() + chrono::DateTime::::from_naive_utc_and_offset(datetime, offset).to_rfc3339() }) }); Utf8Array::from_trusted_len_iter(iter) diff --git a/src/compute/temporal.rs b/src/compute/temporal.rs index d1fd6cc735..60e573da4b 100644 --- a/src/compute/temporal.rs +++ b/src/compute/temporal.rs @@ -288,7 +288,9 @@ where let op = |x| { let datetime = timestamp_s_to_datetime(x); let offset = timezone.offset_from_utc_datetime(&datetime); - extract(chrono::DateTime::::from_utc(datetime, offset)) + extract(chrono::DateTime::::from_naive_utc_and_offset( + datetime, offset, + )) }; unary(array, op, A::PRIMITIVE.into()) } @@ -296,7 +298,9 @@ where let op = |x| { let datetime = timestamp_ms_to_datetime(x); let offset = timezone.offset_from_utc_datetime(&datetime); - extract(chrono::DateTime::::from_utc(datetime, offset)) + extract(chrono::DateTime::::from_naive_utc_and_offset( + datetime, offset, + )) }; unary(array, op, A::PRIMITIVE.into()) } @@ -304,7 +308,9 @@ where let op = |x| { let datetime = timestamp_us_to_datetime(x); let offset = timezone.offset_from_utc_datetime(&datetime); - extract(chrono::DateTime::::from_utc(datetime, offset)) + extract(chrono::DateTime::::from_naive_utc_and_offset( + datetime, offset, + )) }; unary(array, op, A::PRIMITIVE.into()) } @@ -312,7 +318,9 @@ where let op = |x| { let datetime = timestamp_ns_to_datetime(x); let offset = timezone.offset_from_utc_datetime(&datetime); - extract(chrono::DateTime::::from_utc(datetime, offset)) + extract(chrono::DateTime::::from_naive_utc_and_offset( + datetime, offset, + )) }; unary(array, op, A::PRIMITIVE.into()) } From cf9ec83318513928a10fce1a7c02033cec99fdf2 Mon Sep 17 00:00:00 2001 From: Ivan Smirnov Date: Sun, 3 Sep 2023 03:21:23 +0100 Subject: [PATCH 60/80] MutableDictionaryArray rewrite: use values stored in the array instead of the hash->hash map (#1555) --- Cargo.toml | 4 +- src/array/dictionary/mod.rs | 1 + src/array/dictionary/mutable.rs | 135 ++++++++--------- src/array/dictionary/value_map.rs | 207 +++++++++++++++++++++++++++ 
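The clippy fix above is purely mechanical: chrono deprecated `DateTime::from_utc` in favour of `from_naive_utc_and_offset`, which takes the naive UTC instant plus the offset to render it in. A minimal sketch of the replacement call, assuming chrono 0.4.31 or later:

```rust
use chrono::{DateTime, NaiveDateTime, Utc};

// Equivalent to the old `DateTime::<Utc>::from_utc(naive, Utc)`.
fn to_rfc3339(naive_utc: NaiveDateTime) -> String {
    DateTime::<Utc>::from_naive_utc_and_offset(naive_utc, Utc).to_rfc3339()
}

fn main() {
    let epoch = NaiveDateTime::from_timestamp_opt(0, 0).unwrap();
    assert_eq!(to_rfc3339(epoch), "1970-01-01T00:00:00+00:00");
}
```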
src/array/indexable.rs | 197 +++++++++++++++++++++++++ src/array/mod.rs | 4 +- src/compute/cast/primitive_to.rs | 4 +- tests/it/array/dictionary/mutable.rs | 15 -- 8 files changed, 469 insertions(+), 98 deletions(-) create mode 100644 src/array/dictionary/value_map.rs create mode 100644 src/array/indexable.rs diff --git a/Cargo.toml b/Cargo.toml index 0f3f9ec27b..1bb20a6955 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ hash_hasher = "^2.0.3" simdutf8 = "0.1.4" # A Rust port of SwissTable -hashbrown = { version = "0.14", default-features = false, optional = true } +hashbrown = { version = "0.14", default-features = false, features = ["ahash"] } # for timezone support chrono-tz = { version = "0.8", optional = true } @@ -243,7 +243,7 @@ compute_merge_sort = ["itertools", "compute_sort"] compute_nullif = ["compute_comparison"] compute_partition = ["compute_sort"] compute_regex_match = ["regex"] -compute_sort = ["compute_take", "hashbrown"] +compute_sort = ["compute_take"] compute_substring = [] compute_take = [] compute_temporal = [] diff --git a/src/array/dictionary/mod.rs b/src/array/dictionary/mod.rs index f7d4a0f43d..a4be1ea210 100644 --- a/src/array/dictionary/mod.rs +++ b/src/array/dictionary/mod.rs @@ -20,6 +20,7 @@ mod iterator; mod mutable; use crate::array::specification::check_indexes_unchecked; mod typed_iterator; +mod value_map; use crate::array::dictionary::typed_iterator::{DictValue, DictionaryValuesIterTyped}; pub use iterator::*; diff --git a/src/array/dictionary/mutable.rs b/src/array/dictionary/mutable.rs index 444de34bcc..16a00a1357 100644 --- a/src/array/dictionary/mutable.rs +++ b/src/array/dictionary/mutable.rs @@ -1,15 +1,15 @@ -use std::hash::{Hash, Hasher}; -use std::{collections::hash_map::DefaultHasher, sync::Arc}; - -use hash_hasher::HashedMap; +use std::hash::Hash; +use std::sync::Arc; +use crate::array::indexable::{AsIndexed, Indexable}; use crate::{ array::{primitive::MutablePrimitiveArray, Array, MutableArray, TryExtend, TryPush}, bitmap::MutableBitmap, datatypes::DataType, - error::{Error, Result}, + error::Result, }; +use super::value_map::ValueMap; use super::{DictionaryArray, DictionaryKey}; /// A mutable, strong-typed version of [`DictionaryArray`]. @@ -30,55 +30,29 @@ use super::{DictionaryArray, DictionaryKey}; #[derive(Debug)] pub struct MutableDictionaryArray { data_type: DataType, + map: ValueMap, + // invariant: `max(keys) < map.values().len()` keys: MutablePrimitiveArray, - map: HashedMap, - // invariant: `keys.len() <= values.len()` - values: M, } impl From> for DictionaryArray { - fn from(mut other: MutableDictionaryArray) -> Self { + fn from(other: MutableDictionaryArray) -> Self { // Safety - the invariant of this struct ensures that this is up-held unsafe { DictionaryArray::::try_new_unchecked( other.data_type, other.keys.into(), - other.values.as_box(), + other.map.into_boxed().as_box(), ) .unwrap() } } } -impl From for MutableDictionaryArray { - fn from(values: M) -> Self { - Self { - data_type: DataType::Dictionary( - K::KEY_TYPE, - Box::new(values.data_type().clone()), - false, - ), - keys: MutablePrimitiveArray::::new(), - map: HashedMap::default(), - values, - } - } -} - impl MutableDictionaryArray { /// Creates an empty [`MutableDictionaryArray`]. 
pub fn new() -> Self { - let values = M::default(); - Self { - data_type: DataType::Dictionary( - K::KEY_TYPE, - Box::new(values.data_type().clone()), - false, - ), - keys: MutablePrimitiveArray::::new(), - map: HashedMap::default(), - values, - } + Self::try_empty(M::default()).unwrap() } } @@ -89,22 +63,34 @@ impl Default for MutableDictionaryA } impl MutableDictionaryArray { - /// Returns whether the value should be pushed to the values or not - fn try_push_valid(&mut self, value: &T) -> Result { - let mut hasher = DefaultHasher::new(); - value.hash(&mut hasher); - let hash = hasher.finish(); - match self.map.get(&hash) { - Some(key) => { - self.keys.push(Some(*key)); - Ok(false) - } - None => { - let key = K::try_from(self.map.len()).map_err(|_| Error::Overflow)?; - self.map.insert(hash, key); - self.keys.push(Some(key)); - Ok(true) - } + /// Creates an empty [`MutableDictionaryArray`] from a given empty values array. + /// # Errors + /// Errors if the array is non-empty. + pub fn try_empty(values: M) -> Result { + Ok(Self::from_value_map(ValueMap::::try_empty(values)?)) + } + + /// Creates an empty [`MutableDictionaryArray`] preloaded with a given dictionary of values. + /// Indices associated with those values are automatically assigned based on the order of + /// the values. + /// # Errors + /// Errors if there's more values than the maximum value of `K`. + pub fn from_values(values: M) -> Result + where + M: Indexable, + M::Type: Eq + Hash, + { + Ok(Self::from_value_map(ValueMap::::from_values(values)?)) + } + + fn from_value_map(value_map: ValueMap) -> Self { + let keys = MutablePrimitiveArray::::new(); + let data_type = + DataType::Dictionary(K::KEY_TYPE, Box::new(value_map.data_type().clone()), false); + Self { + data_type, + map: value_map, + keys, } } @@ -113,14 +99,9 @@ impl MutableDictionaryArray { self.keys.push(None) } - /// returns a mutable reference to the inner values. - fn mut_values(&mut self) -> &mut M { - &mut self.values - } - /// returns a reference to the inner values. pub fn values(&self) -> &M { - &self.values + self.map.values() } /// converts itself into [`Arc`] @@ -142,15 +123,10 @@ impl MutableDictionaryArray { /// Shrinks the capacity of the [`MutableDictionaryArray`] to fit its current length. pub fn shrink_to_fit(&mut self) { - self.values.shrink_to_fit(); + self.map.shrink_to_fit(); self.keys.shrink_to_fit(); } - /// Returns the dictionary map - pub fn map(&self) -> &HashedMap { - &self.map - } - /// Returns the dictionary keys pub fn keys(&self) -> &MutablePrimitiveArray { &self.keys @@ -160,7 +136,7 @@ impl MutableDictionaryArray { DictionaryArray::::try_new( self.data_type.clone(), std::mem::take(&mut self.keys).into(), - self.values.as_box(), + self.map.take_into(), ) .unwrap() } @@ -208,17 +184,20 @@ impl MutableArray for MutableDictio } } -impl TryExtend> for MutableDictionaryArray +impl TryExtend> for MutableDictionaryArray where K: DictionaryKey, - M: MutableArray + TryExtend>, + M: MutableArray + Indexable + TryExtend>, + T: AsIndexed, + M::Type: Eq + Hash, { fn try_extend>>(&mut self, iter: II) -> Result<()> { for value in iter { if let Some(value) = value { - if self.try_push_valid(&value)? 
{ - self.mut_values().try_extend(std::iter::once(Some(value)))?; - } + let key = self + .map + .try_push_valid(value, |arr, v| arr.try_extend(std::iter::once(Some(v))))?; + self.keys.try_push(Some(key))?; } else { self.push_null(); } @@ -230,19 +209,19 @@ where impl TryPush> for MutableDictionaryArray where K: DictionaryKey, - M: MutableArray + TryPush>, - T: Hash, + M: MutableArray + Indexable + TryPush>, + T: AsIndexed, + M::Type: Eq + Hash, { fn try_push(&mut self, item: Option) -> Result<()> { if let Some(value) = item { - if self.try_push_valid(&value)? { - self.values.try_push(Some(value)) - } else { - Ok(()) - } + let key = self + .map + .try_push_valid(value, |arr, v| arr.try_push(Some(v)))?; + self.keys.try_push(Some(key))?; } else { self.push_null(); - Ok(()) } + Ok(()) } } diff --git a/src/array/dictionary/value_map.rs b/src/array/dictionary/value_map.rs new file mode 100644 index 0000000000..35603de4cf --- /dev/null +++ b/src/array/dictionary/value_map.rs @@ -0,0 +1,207 @@ +use std::borrow::Borrow; +use std::fmt::{self, Debug}; +use std::hash::{Hash, Hasher}; +use std::pin::Pin; +use std::ptr::NonNull; + +use hashbrown::{Equivalent, HashMap}; + +use crate::array::Array; +use crate::{ + array::indexable::{AsIndexed, Indexable}, + array::MutableArray, + datatypes::DataType, + error::{Error, Result}, +}; + +use super::DictionaryKey; + +struct NonNullSend(NonNull); + +// safety: these pointers are for internal self-referential purposes to pinned array only +unsafe impl Send for NonNullSend {} +unsafe impl Sync for NonNullSend {} + +impl From<&M> for NonNullSend { + #[inline] + fn from(reference: &M) -> Self { + Self(NonNull::from(reference)) + } +} + +struct ValueRef { + array: NonNullSend, + index: usize, +} + +impl ValueRef { + #[inline] + pub fn new(array: &Pin>, index: usize) -> Self { + Self { + array: NonNullSend::from(Pin::get_ref(array.as_ref())), + index, + } + } + + #[inline] + pub fn get_array(&self) -> &M { + // safety: the invariant of the struct + unsafe { self.array.0.as_ref() } + } + + #[inline] + pub unsafe fn get_unchecked(&self) -> M::Value<'_> + where + M: Indexable, + { + self.get_array().value_unchecked_at(self.index) + } + + #[inline] + pub unsafe fn equals(&self, other: &M::Type) -> bool + where + M: Indexable, + M::Type: Eq, + { + self.get_unchecked().borrow() == other + } +} + +impl PartialEq for ValueRef +where + M::Type: PartialEq, +{ + #[inline] + fn eq(&self, other: &Self) -> bool { + // safety: the way these value refs are constructed, they are always within bounds + unsafe { + self.get_unchecked() + .borrow() + .eq(other.get_unchecked().borrow()) + } + } +} + +impl Eq for ValueRef where for<'a> M::Type: Eq {} + +impl Hash for ValueRef +where + M::Type: Hash, +{ + #[inline] + fn hash(&self, state: &mut H) { + // safety: the way these value refs are constructed, they are always within bounds + unsafe { self.get_unchecked().borrow().hash(state) } + } +} + +// To avoid blanket implementation issues with `Equivalent` trait (we only use hashbrown +// instead of the default HashMap to avoid blanket implementation problems with Borrow). 
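The motivation for replacing `HashedMap<u64, K>` is visible in the removed `try_push_valid` above: it mapped a value's 64-bit hash straight to a key and never compared the value itself, so two distinct values with colliding hashes would silently share one dictionary entry. The rewrite keys deduplication on the values already stored in the array. A toy illustration of the two lookup disciplines (illustrative only, not the crate's code):

```rust
use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};

fn hash64<T: Hash + ?Sized>(value: &T) -> u64 {
    let mut h = DefaultHasher::new();
    value.hash(&mut h);
    h.finish()
}

fn main() {
    // old discipline: hash -> key; on a genuine 64-bit collision a different
    // value would be handed this key and its bytes never stored at all
    let mut by_hash: HashMap<u64, usize> = HashMap::new();
    by_hash.insert(hash64("A"), 0);
    assert_eq!(by_hash.get(&hash64("A")), Some(&0));

    // new discipline: the hash only narrows the search; the verdict is an
    // equality check against the value kept in the values array
    let values = vec!["A".to_string()];
    let key = values.iter().position(|v| v == "A");
    assert_eq!(key, Some(0));
}
```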
+#[derive(Hash)] +struct Wrapped<'a, T: ?Sized>(&'a T); + +impl<'a, M: Indexable> Equivalent> for Wrapped<'a, M::Type> +where + M::Type: Eq, +{ + #[inline] + fn equivalent(&self, key: &ValueRef) -> bool { + // safety: invariant of the struct + unsafe { key.equals(self.0) } + } +} + +pub struct ValueMap { + values: Pin>, + map: HashMap, K>, +} + +impl ValueMap { + pub fn try_empty(values: M) -> Result { + if !values.is_empty() { + return Err(Error::InvalidArgumentError( + "initializing value map with non-empty values array".into(), + )); + } + Ok(Self { + values: Box::pin(values), + map: HashMap::default(), + }) + } + + pub fn from_values(values: M) -> Result + where + M: Indexable, + M::Type: Eq + Hash, + { + let values = Box::pin(values); + let map = (0..values.len()) + .map(|i| { + let key = K::try_from(i).map_err(|_| Error::Overflow)?; + Ok((ValueRef::new(&values, i), key)) + }) + .collect::>()?; + Ok(Self { values, map }) + } + + pub fn data_type(&self) -> &DataType { + Pin::get_ref(self.values.as_ref()).data_type() + } + + pub fn into_boxed(self) -> Box { + // safety: we unpin the pointer but the value map is dropped along with all + // the value references that might refer to the pinned array + unsafe { Pin::into_inner_unchecked(self.values) } + } + + pub fn take_into(&mut self) -> Box { + // safety: we unpin the pointer but the value map is manually cleared + let arr = unsafe { self.values.as_mut().get_unchecked_mut().as_box() }; + self.map.clear(); + arr + } + + #[inline] + pub fn values(&self) -> &M { + &self.values + } + + /// Try to insert a value and return its index (it may or may not get inserted). + pub fn try_push_valid( + &mut self, + value: V, + mut push: impl FnMut(&mut M, V) -> Result<()>, + ) -> Result + where + M: Indexable, + V: AsIndexed, + M::Type: Eq + Hash, + { + if let Some(&key) = self.map.get(&Wrapped(value.as_indexed())) { + return Ok(key); + } + let index = self.values.len(); + let key = K::try_from(index).map_err(|_| Error::Overflow)?; + // safety: we don't move the data out of the mutable pinned reference + unsafe { + push(self.values.as_mut().get_unchecked_mut(), value)?; + } + debug_assert_eq!(self.values.len(), index + 1); + self.map.insert(ValueRef::new(&self.values, index), key); + debug_assert_eq!(self.values.len(), self.map.len()); + Ok(key) + } + + pub fn shrink_to_fit(&mut self) { + // safety: we don't move the data out of the mutable pinned reference + unsafe { + self.values.as_mut().get_unchecked_mut().shrink_to_fit(); + } + } +} + +impl Debug for ValueMap { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + Pin::get_ref(self.values.as_ref()).fmt(f) + } +} diff --git a/src/array/indexable.rs b/src/array/indexable.rs new file mode 100644 index 0000000000..76001bfcf5 --- /dev/null +++ b/src/array/indexable.rs @@ -0,0 +1,197 @@ +use std::borrow::Borrow; + +use crate::{ + array::{ + MutableArray, MutableBinaryArray, MutableBinaryValuesArray, MutableBooleanArray, + MutableFixedSizeBinaryArray, MutablePrimitiveArray, MutableUtf8Array, + MutableUtf8ValuesArray, + }, + offset::Offset, + types::NativeType, +}; + +/// Trait for arrays that can be indexed directly to extract a value. +pub trait Indexable { + /// The type of the element at index `i`; may be a reference type or a value type. + type Value<'a>: Borrow + where + Self: 'a; + + type Type: ?Sized; + + /// Returns the element at index `i`. + /// # Panic + /// May panic if `i >= self.len()`. + fn value_at(&self, index: usize) -> Self::Value<'_>; + + /// Returns the element at index `i`. 
+ /// # Safety + /// Assumes that the `i < self.len`. + #[inline] + unsafe fn value_unchecked_at(&self, index: usize) -> Self::Value<'_> { + self.value_at(index) + } +} + +pub trait AsIndexed { + fn as_indexed(&self) -> &M::Type; +} + +impl Indexable for MutableBooleanArray { + type Value<'a> = bool; + type Type = bool; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.values().get(i) + } +} + +impl AsIndexed for bool { + #[inline] + fn as_indexed(&self) -> &bool { + self + } +} + +impl Indexable for MutableBinaryArray { + type Value<'a> = &'a [u8]; + type Type = [u8]; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + // TODO: add .value() / .value_unchecked() to MutableBinaryArray? + assert!(i < self.len()); + unsafe { self.value_unchecked_at(i) } + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + // TODO: add .value() / .value_unchecked() to MutableBinaryArray? + // soundness: the invariant of the function + let (start, end) = self.offsets().start_end_unchecked(i); + // soundness: the invariant of the struct + self.values().get_unchecked(start..end) + } +} + +impl AsIndexed> for &[u8] { + #[inline] + fn as_indexed(&self) -> &[u8] { + self + } +} + +impl Indexable for MutableBinaryValuesArray { + type Value<'a> = &'a [u8]; + type Type = [u8]; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.value(i) + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + self.value_unchecked(i) + } +} + +impl AsIndexed> for &[u8] { + #[inline] + fn as_indexed(&self) -> &[u8] { + self + } +} + +impl Indexable for MutableFixedSizeBinaryArray { + type Value<'a> = &'a [u8]; + type Type = [u8]; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.value(i) + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + // soundness: the invariant of the struct + self.value_unchecked(i) + } +} + +impl AsIndexed for &[u8] { + #[inline] + fn as_indexed(&self) -> &[u8] { + self + } +} + +// TODO: should NativeType derive from Hash? +impl Indexable for MutablePrimitiveArray { + type Value<'a> = T; + type Type = T; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + assert!(i < self.len()); + // TODO: add Length trait? 
(for both Array and MutableArray) + unsafe { self.value_unchecked_at(i) } + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + *self.values().get_unchecked(i) + } +} + +impl AsIndexed> for T { + #[inline] + fn as_indexed(&self) -> &T { + self + } +} + +impl Indexable for MutableUtf8Array { + type Value<'a> = &'a str; + type Type = str; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.value(i) + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + self.value_unchecked(i) + } +} + +impl> AsIndexed> for V { + #[inline] + fn as_indexed(&self) -> &str { + self.as_ref() + } +} + +impl Indexable for MutableUtf8ValuesArray { + type Value<'a> = &'a str; + type Type = str; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.value(i) + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + self.value_unchecked(i) + } +} + +impl> AsIndexed> for V { + #[inline] + fn as_indexed(&self) -> &str { + self.as_ref() + } +} diff --git a/src/array/mod.rs b/src/array/mod.rs index 04b7b2c8e3..1575130989 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -720,8 +720,10 @@ mod utf8; mod equal; mod ffi; mod fmt; -pub mod growable; +mod indexable; mod iterator; + +pub mod growable; pub mod ord; pub(crate) use iterator::ArrayAccessor; diff --git a/src/compute/cast/primitive_to.rs b/src/compute/cast/primitive_to.rs index 30b265e2d5..110288817a 100644 --- a/src/compute/cast/primitive_to.rs +++ b/src/compute/cast/primitive_to.rs @@ -306,9 +306,9 @@ pub fn primitive_to_dictionary( from: &PrimitiveArray, ) -> Result> { let iter = from.iter().map(|x| x.copied()); - let mut array = MutableDictionaryArray::::from(MutablePrimitiveArray::::from( + let mut array = MutableDictionaryArray::::try_empty(MutablePrimitiveArray::::from( from.data_type().clone(), - )); + ))?; array.try_extend(iter)?; Ok(array.into()) diff --git a/tests/it/array/dictionary/mutable.rs b/tests/it/array/dictionary/mutable.rs index b6103dcccf..1b54a92647 100644 --- a/tests/it/array/dictionary/mutable.rs +++ b/tests/it/array/dictionary/mutable.rs @@ -1,8 +1,5 @@ use arrow2::array::*; use arrow2::error::Result; -use hash_hasher::HashedMap; -use std::collections::hash_map::DefaultHasher; -use std::hash::{Hash, Hasher}; #[test] fn primitive() -> Result<()> { @@ -61,16 +58,4 @@ fn push_utf8() { expected_keys.push(Some(0)); expected_keys.push(Some(1)); assert_eq!(*new.keys(), expected_keys); - - let expected_map = ["A", "B", "C"] - .iter() - .enumerate() - .map(|(index, value)| { - let mut hasher = DefaultHasher::new(); - value.hash(&mut hasher); - let hash = hasher.finish(); - (hash, index as i32) - }) - .collect::>(); - assert_eq!(*new.map(), expected_map); } From b2017d7cc3611cd9d578bc675ebc3fe176d3a907 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 3 Sep 2023 14:30:49 +0200 Subject: [PATCH 61/80] Revert "MutableDictionaryArray rewrite: use values stored in the array instead of the hash->hash map" (#1559) --- Cargo.toml | 4 +- src/array/dictionary/mod.rs | 1 - src/array/dictionary/mutable.rs | 135 +++++++++-------- src/array/dictionary/value_map.rs | 207 --------------------------- src/array/indexable.rs | 197 ------------------------- src/array/mod.rs | 4 +- src/compute/cast/primitive_to.rs | 4 +- tests/it/array/dictionary/mutable.rs | 15 ++ 8 files changed, 98 insertions(+), 469 deletions(-) delete mode 100644 src/array/dictionary/value_map.rs delete mode 100644 src/array/indexable.rs diff --git 
a/Cargo.toml b/Cargo.toml index 1bb20a6955..0f3f9ec27b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ hash_hasher = "^2.0.3" simdutf8 = "0.1.4" # A Rust port of SwissTable -hashbrown = { version = "0.14", default-features = false, features = ["ahash"] } +hashbrown = { version = "0.14", default-features = false, optional = true } # for timezone support chrono-tz = { version = "0.8", optional = true } @@ -243,7 +243,7 @@ compute_merge_sort = ["itertools", "compute_sort"] compute_nullif = ["compute_comparison"] compute_partition = ["compute_sort"] compute_regex_match = ["regex"] -compute_sort = ["compute_take"] +compute_sort = ["compute_take", "hashbrown"] compute_substring = [] compute_take = [] compute_temporal = [] diff --git a/src/array/dictionary/mod.rs b/src/array/dictionary/mod.rs index a4be1ea210..f7d4a0f43d 100644 --- a/src/array/dictionary/mod.rs +++ b/src/array/dictionary/mod.rs @@ -20,7 +20,6 @@ mod iterator; mod mutable; use crate::array::specification::check_indexes_unchecked; mod typed_iterator; -mod value_map; use crate::array::dictionary::typed_iterator::{DictValue, DictionaryValuesIterTyped}; pub use iterator::*; diff --git a/src/array/dictionary/mutable.rs b/src/array/dictionary/mutable.rs index 16a00a1357..444de34bcc 100644 --- a/src/array/dictionary/mutable.rs +++ b/src/array/dictionary/mutable.rs @@ -1,15 +1,15 @@ -use std::hash::Hash; -use std::sync::Arc; +use std::hash::{Hash, Hasher}; +use std::{collections::hash_map::DefaultHasher, sync::Arc}; + +use hash_hasher::HashedMap; -use crate::array::indexable::{AsIndexed, Indexable}; use crate::{ array::{primitive::MutablePrimitiveArray, Array, MutableArray, TryExtend, TryPush}, bitmap::MutableBitmap, datatypes::DataType, - error::Result, + error::{Error, Result}, }; -use super::value_map::ValueMap; use super::{DictionaryArray, DictionaryKey}; /// A mutable, strong-typed version of [`DictionaryArray`]. @@ -30,29 +30,55 @@ use super::{DictionaryArray, DictionaryKey}; #[derive(Debug)] pub struct MutableDictionaryArray { data_type: DataType, - map: ValueMap, - // invariant: `max(keys) < map.values().len()` keys: MutablePrimitiveArray, + map: HashedMap, + // invariant: `keys.len() <= values.len()` + values: M, } impl From> for DictionaryArray { - fn from(other: MutableDictionaryArray) -> Self { + fn from(mut other: MutableDictionaryArray) -> Self { // Safety - the invariant of this struct ensures that this is up-held unsafe { DictionaryArray::::try_new_unchecked( other.data_type, other.keys.into(), - other.map.into_boxed().as_box(), + other.values.as_box(), ) .unwrap() } } } +impl From for MutableDictionaryArray { + fn from(values: M) -> Self { + Self { + data_type: DataType::Dictionary( + K::KEY_TYPE, + Box::new(values.data_type().clone()), + false, + ), + keys: MutablePrimitiveArray::::new(), + map: HashedMap::default(), + values, + } + } +} + impl MutableDictionaryArray { /// Creates an empty [`MutableDictionaryArray`]. pub fn new() -> Self { - Self::try_empty(M::default()).unwrap() + let values = M::default(); + Self { + data_type: DataType::Dictionary( + K::KEY_TYPE, + Box::new(values.data_type().clone()), + false, + ), + keys: MutablePrimitiveArray::::new(), + map: HashedMap::default(), + values, + } } } @@ -63,34 +89,22 @@ impl Default for MutableDictionaryA } impl MutableDictionaryArray { - /// Creates an empty [`MutableDictionaryArray`] from a given empty values array. - /// # Errors - /// Errors if the array is non-empty. 
- pub fn try_empty(values: M) -> Result { - Ok(Self::from_value_map(ValueMap::::try_empty(values)?)) - } - - /// Creates an empty [`MutableDictionaryArray`] preloaded with a given dictionary of values. - /// Indices associated with those values are automatically assigned based on the order of - /// the values. - /// # Errors - /// Errors if there's more values than the maximum value of `K`. - pub fn from_values(values: M) -> Result - where - M: Indexable, - M::Type: Eq + Hash, - { - Ok(Self::from_value_map(ValueMap::::from_values(values)?)) - } - - fn from_value_map(value_map: ValueMap) -> Self { - let keys = MutablePrimitiveArray::::new(); - let data_type = - DataType::Dictionary(K::KEY_TYPE, Box::new(value_map.data_type().clone()), false); - Self { - data_type, - map: value_map, - keys, + /// Returns whether the value should be pushed to the values or not + fn try_push_valid(&mut self, value: &T) -> Result { + let mut hasher = DefaultHasher::new(); + value.hash(&mut hasher); + let hash = hasher.finish(); + match self.map.get(&hash) { + Some(key) => { + self.keys.push(Some(*key)); + Ok(false) + } + None => { + let key = K::try_from(self.map.len()).map_err(|_| Error::Overflow)?; + self.map.insert(hash, key); + self.keys.push(Some(key)); + Ok(true) + } } } @@ -99,9 +113,14 @@ impl MutableDictionaryArray { self.keys.push(None) } + /// returns a mutable reference to the inner values. + fn mut_values(&mut self) -> &mut M { + &mut self.values + } + /// returns a reference to the inner values. pub fn values(&self) -> &M { - self.map.values() + &self.values } /// converts itself into [`Arc`] @@ -123,10 +142,15 @@ impl MutableDictionaryArray { /// Shrinks the capacity of the [`MutableDictionaryArray`] to fit its current length. pub fn shrink_to_fit(&mut self) { - self.map.shrink_to_fit(); + self.values.shrink_to_fit(); self.keys.shrink_to_fit(); } + /// Returns the dictionary map + pub fn map(&self) -> &HashedMap { + &self.map + } + /// Returns the dictionary keys pub fn keys(&self) -> &MutablePrimitiveArray { &self.keys @@ -136,7 +160,7 @@ impl MutableDictionaryArray { DictionaryArray::::try_new( self.data_type.clone(), std::mem::take(&mut self.keys).into(), - self.map.take_into(), + self.values.as_box(), ) .unwrap() } @@ -184,20 +208,17 @@ impl MutableArray for MutableDictio } } -impl TryExtend> for MutableDictionaryArray +impl TryExtend> for MutableDictionaryArray where K: DictionaryKey, - M: MutableArray + Indexable + TryExtend>, - T: AsIndexed, - M::Type: Eq + Hash, + M: MutableArray + TryExtend>, { fn try_extend>>(&mut self, iter: II) -> Result<()> { for value in iter { if let Some(value) = value { - let key = self - .map - .try_push_valid(value, |arr, v| arr.try_extend(std::iter::once(Some(v))))?; - self.keys.try_push(Some(key))?; + if self.try_push_valid(&value)? { + self.mut_values().try_extend(std::iter::once(Some(value)))?; + } } else { self.push_null(); } @@ -209,19 +230,19 @@ where impl TryPush> for MutableDictionaryArray where K: DictionaryKey, - M: MutableArray + Indexable + TryPush>, - T: AsIndexed, - M::Type: Eq + Hash, + M: MutableArray + TryPush>, + T: Hash, { fn try_push(&mut self, item: Option) -> Result<()> { if let Some(value) = item { - let key = self - .map - .try_push_valid(value, |arr, v| arr.try_push(Some(v)))?; - self.keys.try_push(Some(key))?; + if self.try_push_valid(&value)? 
{ + self.values.try_push(Some(value)) + } else { + Ok(()) + } } else { self.push_null(); + Ok(()) } - Ok(()) } } diff --git a/src/array/dictionary/value_map.rs b/src/array/dictionary/value_map.rs deleted file mode 100644 index 35603de4cf..0000000000 --- a/src/array/dictionary/value_map.rs +++ /dev/null @@ -1,207 +0,0 @@ -use std::borrow::Borrow; -use std::fmt::{self, Debug}; -use std::hash::{Hash, Hasher}; -use std::pin::Pin; -use std::ptr::NonNull; - -use hashbrown::{Equivalent, HashMap}; - -use crate::array::Array; -use crate::{ - array::indexable::{AsIndexed, Indexable}, - array::MutableArray, - datatypes::DataType, - error::{Error, Result}, -}; - -use super::DictionaryKey; - -struct NonNullSend(NonNull); - -// safety: these pointers are for internal self-referential purposes to pinned array only -unsafe impl Send for NonNullSend {} -unsafe impl Sync for NonNullSend {} - -impl From<&M> for NonNullSend { - #[inline] - fn from(reference: &M) -> Self { - Self(NonNull::from(reference)) - } -} - -struct ValueRef { - array: NonNullSend, - index: usize, -} - -impl ValueRef { - #[inline] - pub fn new(array: &Pin>, index: usize) -> Self { - Self { - array: NonNullSend::from(Pin::get_ref(array.as_ref())), - index, - } - } - - #[inline] - pub fn get_array(&self) -> &M { - // safety: the invariant of the struct - unsafe { self.array.0.as_ref() } - } - - #[inline] - pub unsafe fn get_unchecked(&self) -> M::Value<'_> - where - M: Indexable, - { - self.get_array().value_unchecked_at(self.index) - } - - #[inline] - pub unsafe fn equals(&self, other: &M::Type) -> bool - where - M: Indexable, - M::Type: Eq, - { - self.get_unchecked().borrow() == other - } -} - -impl PartialEq for ValueRef -where - M::Type: PartialEq, -{ - #[inline] - fn eq(&self, other: &Self) -> bool { - // safety: the way these value refs are constructed, they are always within bounds - unsafe { - self.get_unchecked() - .borrow() - .eq(other.get_unchecked().borrow()) - } - } -} - -impl Eq for ValueRef where for<'a> M::Type: Eq {} - -impl Hash for ValueRef -where - M::Type: Hash, -{ - #[inline] - fn hash(&self, state: &mut H) { - // safety: the way these value refs are constructed, they are always within bounds - unsafe { self.get_unchecked().borrow().hash(state) } - } -} - -// To avoid blanket implementation issues with `Equivalent` trait (we only use hashbrown -// instead of the default HashMap to avoid blanket implementation problems with Borrow). 
-#[derive(Hash)] -struct Wrapped<'a, T: ?Sized>(&'a T); - -impl<'a, M: Indexable> Equivalent> for Wrapped<'a, M::Type> -where - M::Type: Eq, -{ - #[inline] - fn equivalent(&self, key: &ValueRef) -> bool { - // safety: invariant of the struct - unsafe { key.equals(self.0) } - } -} - -pub struct ValueMap { - values: Pin>, - map: HashMap, K>, -} - -impl ValueMap { - pub fn try_empty(values: M) -> Result { - if !values.is_empty() { - return Err(Error::InvalidArgumentError( - "initializing value map with non-empty values array".into(), - )); - } - Ok(Self { - values: Box::pin(values), - map: HashMap::default(), - }) - } - - pub fn from_values(values: M) -> Result - where - M: Indexable, - M::Type: Eq + Hash, - { - let values = Box::pin(values); - let map = (0..values.len()) - .map(|i| { - let key = K::try_from(i).map_err(|_| Error::Overflow)?; - Ok((ValueRef::new(&values, i), key)) - }) - .collect::>()?; - Ok(Self { values, map }) - } - - pub fn data_type(&self) -> &DataType { - Pin::get_ref(self.values.as_ref()).data_type() - } - - pub fn into_boxed(self) -> Box { - // safety: we unpin the pointer but the value map is dropped along with all - // the value references that might refer to the pinned array - unsafe { Pin::into_inner_unchecked(self.values) } - } - - pub fn take_into(&mut self) -> Box { - // safety: we unpin the pointer but the value map is manually cleared - let arr = unsafe { self.values.as_mut().get_unchecked_mut().as_box() }; - self.map.clear(); - arr - } - - #[inline] - pub fn values(&self) -> &M { - &self.values - } - - /// Try to insert a value and return its index (it may or may not get inserted). - pub fn try_push_valid( - &mut self, - value: V, - mut push: impl FnMut(&mut M, V) -> Result<()>, - ) -> Result - where - M: Indexable, - V: AsIndexed, - M::Type: Eq + Hash, - { - if let Some(&key) = self.map.get(&Wrapped(value.as_indexed())) { - return Ok(key); - } - let index = self.values.len(); - let key = K::try_from(index).map_err(|_| Error::Overflow)?; - // safety: we don't move the data out of the mutable pinned reference - unsafe { - push(self.values.as_mut().get_unchecked_mut(), value)?; - } - debug_assert_eq!(self.values.len(), index + 1); - self.map.insert(ValueRef::new(&self.values, index), key); - debug_assert_eq!(self.values.len(), self.map.len()); - Ok(key) - } - - pub fn shrink_to_fit(&mut self) { - // safety: we don't move the data out of the mutable pinned reference - unsafe { - self.values.as_mut().get_unchecked_mut().shrink_to_fit(); - } - } -} - -impl Debug for ValueMap { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - Pin::get_ref(self.values.as_ref()).fmt(f) - } -} diff --git a/src/array/indexable.rs b/src/array/indexable.rs deleted file mode 100644 index 76001bfcf5..0000000000 --- a/src/array/indexable.rs +++ /dev/null @@ -1,197 +0,0 @@ -use std::borrow::Borrow; - -use crate::{ - array::{ - MutableArray, MutableBinaryArray, MutableBinaryValuesArray, MutableBooleanArray, - MutableFixedSizeBinaryArray, MutablePrimitiveArray, MutableUtf8Array, - MutableUtf8ValuesArray, - }, - offset::Offset, - types::NativeType, -}; - -/// Trait for arrays that can be indexed directly to extract a value. -pub trait Indexable { - /// The type of the element at index `i`; may be a reference type or a value type. - type Value<'a>: Borrow - where - Self: 'a; - - type Type: ?Sized; - - /// Returns the element at index `i`. - /// # Panic - /// May panic if `i >= self.len()`. 
- fn value_at(&self, index: usize) -> Self::Value<'_>; - - /// Returns the element at index `i`. - /// # Safety - /// Assumes that the `i < self.len`. - #[inline] - unsafe fn value_unchecked_at(&self, index: usize) -> Self::Value<'_> { - self.value_at(index) - } -} - -pub trait AsIndexed { - fn as_indexed(&self) -> &M::Type; -} - -impl Indexable for MutableBooleanArray { - type Value<'a> = bool; - type Type = bool; - - #[inline] - fn value_at(&self, i: usize) -> Self::Value<'_> { - self.values().get(i) - } -} - -impl AsIndexed for bool { - #[inline] - fn as_indexed(&self) -> &bool { - self - } -} - -impl Indexable for MutableBinaryArray { - type Value<'a> = &'a [u8]; - type Type = [u8]; - - #[inline] - fn value_at(&self, i: usize) -> Self::Value<'_> { - // TODO: add .value() / .value_unchecked() to MutableBinaryArray? - assert!(i < self.len()); - unsafe { self.value_unchecked_at(i) } - } - - #[inline] - unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { - // TODO: add .value() / .value_unchecked() to MutableBinaryArray? - // soundness: the invariant of the function - let (start, end) = self.offsets().start_end_unchecked(i); - // soundness: the invariant of the struct - self.values().get_unchecked(start..end) - } -} - -impl AsIndexed> for &[u8] { - #[inline] - fn as_indexed(&self) -> &[u8] { - self - } -} - -impl Indexable for MutableBinaryValuesArray { - type Value<'a> = &'a [u8]; - type Type = [u8]; - - #[inline] - fn value_at(&self, i: usize) -> Self::Value<'_> { - self.value(i) - } - - #[inline] - unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { - self.value_unchecked(i) - } -} - -impl AsIndexed> for &[u8] { - #[inline] - fn as_indexed(&self) -> &[u8] { - self - } -} - -impl Indexable for MutableFixedSizeBinaryArray { - type Value<'a> = &'a [u8]; - type Type = [u8]; - - #[inline] - fn value_at(&self, i: usize) -> Self::Value<'_> { - self.value(i) - } - - #[inline] - unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { - // soundness: the invariant of the struct - self.value_unchecked(i) - } -} - -impl AsIndexed for &[u8] { - #[inline] - fn as_indexed(&self) -> &[u8] { - self - } -} - -// TODO: should NativeType derive from Hash? -impl Indexable for MutablePrimitiveArray { - type Value<'a> = T; - type Type = T; - - #[inline] - fn value_at(&self, i: usize) -> Self::Value<'_> { - assert!(i < self.len()); - // TODO: add Length trait? 
(for both Array and MutableArray) - unsafe { self.value_unchecked_at(i) } - } - - #[inline] - unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { - *self.values().get_unchecked(i) - } -} - -impl AsIndexed> for T { - #[inline] - fn as_indexed(&self) -> &T { - self - } -} - -impl Indexable for MutableUtf8Array { - type Value<'a> = &'a str; - type Type = str; - - #[inline] - fn value_at(&self, i: usize) -> Self::Value<'_> { - self.value(i) - } - - #[inline] - unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { - self.value_unchecked(i) - } -} - -impl> AsIndexed> for V { - #[inline] - fn as_indexed(&self) -> &str { - self.as_ref() - } -} - -impl Indexable for MutableUtf8ValuesArray { - type Value<'a> = &'a str; - type Type = str; - - #[inline] - fn value_at(&self, i: usize) -> Self::Value<'_> { - self.value(i) - } - - #[inline] - unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { - self.value_unchecked(i) - } -} - -impl> AsIndexed> for V { - #[inline] - fn as_indexed(&self) -> &str { - self.as_ref() - } -} diff --git a/src/array/mod.rs b/src/array/mod.rs index 1575130989..04b7b2c8e3 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -720,10 +720,8 @@ mod utf8; mod equal; mod ffi; mod fmt; -mod indexable; -mod iterator; - pub mod growable; +mod iterator; pub mod ord; pub(crate) use iterator::ArrayAccessor; diff --git a/src/compute/cast/primitive_to.rs b/src/compute/cast/primitive_to.rs index 110288817a..30b265e2d5 100644 --- a/src/compute/cast/primitive_to.rs +++ b/src/compute/cast/primitive_to.rs @@ -306,9 +306,9 @@ pub fn primitive_to_dictionary( from: &PrimitiveArray, ) -> Result> { let iter = from.iter().map(|x| x.copied()); - let mut array = MutableDictionaryArray::::try_empty(MutablePrimitiveArray::::from( + let mut array = MutableDictionaryArray::::from(MutablePrimitiveArray::::from( from.data_type().clone(), - ))?; + )); array.try_extend(iter)?; Ok(array.into()) diff --git a/tests/it/array/dictionary/mutable.rs b/tests/it/array/dictionary/mutable.rs index 1b54a92647..b6103dcccf 100644 --- a/tests/it/array/dictionary/mutable.rs +++ b/tests/it/array/dictionary/mutable.rs @@ -1,5 +1,8 @@ use arrow2::array::*; use arrow2::error::Result; +use hash_hasher::HashedMap; +use std::collections::hash_map::DefaultHasher; +use std::hash::{Hash, Hasher}; #[test] fn primitive() -> Result<()> { @@ -58,4 +61,16 @@ fn push_utf8() { expected_keys.push(Some(0)); expected_keys.push(Some(1)); assert_eq!(*new.keys(), expected_keys); + + let expected_map = ["A", "B", "C"] + .iter() + .enumerate() + .map(|(index, value)| { + let mut hasher = DefaultHasher::new(); + value.hash(&mut hasher); + let hash = hasher.finish(); + (hash, index as i32) + }) + .collect::>(); + assert_eq!(*new.map(), expected_map); } From 87ab84460d86d5b729cfcf148406542ba40df48f Mon Sep 17 00:00:00 2001 From: Ivan Smirnov Date: Mon, 4 Sep 2023 19:23:56 +0100 Subject: [PATCH 62/80] 2nd (safe) rewrite of MutableDictionaryArray (#1561) --- Cargo.toml | 4 +- src/array/dictionary/mod.rs | 4 +- src/array/dictionary/mutable.rs | 151 ++++++++++---------- src/array/dictionary/value_map.rs | 127 +++++++++++++++++ src/array/indexable.rs | 197 +++++++++++++++++++++++++++ src/array/mod.rs | 5 +- src/compute/cast/primitive_to.rs | 4 +- tests/it/array/dictionary/mutable.rs | 104 ++++++++++++-- 8 files changed, 498 insertions(+), 98 deletions(-) create mode 100644 src/array/dictionary/value_map.rs create mode 100644 src/array/indexable.rs diff --git a/Cargo.toml b/Cargo.toml index 0f3f9ec27b..1bb20a6955 
100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ hash_hasher = "^2.0.3" simdutf8 = "0.1.4" # A Rust port of SwissTable -hashbrown = { version = "0.14", default-features = false, optional = true } +hashbrown = { version = "0.14", default-features = false, features = ["ahash"] } # for timezone support chrono-tz = { version = "0.8", optional = true } @@ -243,7 +243,7 @@ compute_merge_sort = ["itertools", "compute_sort"] compute_nullif = ["compute_comparison"] compute_partition = ["compute_sort"] compute_regex_match = ["regex"] -compute_sort = ["compute_take", "hashbrown"] +compute_sort = ["compute_take"] compute_substring = [] compute_take = [] compute_temporal = [] diff --git a/src/array/dictionary/mod.rs b/src/array/dictionary/mod.rs index f7d4a0f43d..3a23e670a1 100644 --- a/src/array/dictionary/mod.rs +++ b/src/array/dictionary/mod.rs @@ -1,3 +1,4 @@ +use std::hash::Hash; use std::hint::unreachable_unchecked; use crate::{ @@ -20,6 +21,7 @@ mod iterator; mod mutable; use crate::array::specification::check_indexes_unchecked; mod typed_iterator; +mod value_map; use crate::array::dictionary::typed_iterator::{DictValue, DictionaryValuesIterTyped}; pub use iterator::*; @@ -33,7 +35,7 @@ use super::{new_null_array, specification::check_indexes}; /// /// Any implementation of this trait must ensure that `always_fits_usize` only /// returns `true` if all values succeeds on `value::try_into::().unwrap()`. -pub unsafe trait DictionaryKey: NativeType + TryInto + TryFrom { +pub unsafe trait DictionaryKey: NativeType + TryInto + TryFrom + Hash { /// The corresponding [`IntegerType`] of this key const KEY_TYPE: IntegerType; diff --git a/src/array/dictionary/mutable.rs b/src/array/dictionary/mutable.rs index 444de34bcc..b48a57a945 100644 --- a/src/array/dictionary/mutable.rs +++ b/src/array/dictionary/mutable.rs @@ -1,15 +1,15 @@ -use std::hash::{Hash, Hasher}; -use std::{collections::hash_map::DefaultHasher, sync::Arc}; - -use hash_hasher::HashedMap; +use std::hash::Hash; +use std::sync::Arc; +use crate::array::indexable::{AsIndexed, Indexable}; use crate::{ array::{primitive::MutablePrimitiveArray, Array, MutableArray, TryExtend, TryPush}, bitmap::MutableBitmap, datatypes::DataType, - error::{Error, Result}, + error::Result, }; +use super::value_map::ValueMap; use super::{DictionaryArray, DictionaryKey}; /// A mutable, strong-typed version of [`DictionaryArray`]. @@ -30,55 +30,29 @@ use super::{DictionaryArray, DictionaryKey}; #[derive(Debug)] pub struct MutableDictionaryArray { data_type: DataType, + map: ValueMap, + // invariant: `max(keys) < map.values().len()` keys: MutablePrimitiveArray, - map: HashedMap, - // invariant: `keys.len() <= values.len()` - values: M, } impl From> for DictionaryArray { - fn from(mut other: MutableDictionaryArray) -> Self { + fn from(other: MutableDictionaryArray) -> Self { // Safety - the invariant of this struct ensures that this is up-held unsafe { DictionaryArray::::try_new_unchecked( other.data_type, other.keys.into(), - other.values.as_box(), + other.map.into_values().as_box(), ) .unwrap() } } } -impl From for MutableDictionaryArray { - fn from(values: M) -> Self { - Self { - data_type: DataType::Dictionary( - K::KEY_TYPE, - Box::new(values.data_type().clone()), - false, - ), - keys: MutablePrimitiveArray::::new(), - map: HashedMap::default(), - values, - } - } -} - impl MutableDictionaryArray { /// Creates an empty [`MutableDictionaryArray`]. 
pub fn new() -> Self { - let values = M::default(); - Self { - data_type: DataType::Dictionary( - K::KEY_TYPE, - Box::new(values.data_type().clone()), - false, - ), - keys: MutablePrimitiveArray::::new(), - map: HashedMap::default(), - values, - } + Self::try_empty(M::default()).unwrap() } } @@ -89,38 +63,61 @@ impl Default for MutableDictionaryA } impl MutableDictionaryArray { - /// Returns whether the value should be pushed to the values or not - fn try_push_valid(&mut self, value: &T) -> Result { - let mut hasher = DefaultHasher::new(); - value.hash(&mut hasher); - let hash = hasher.finish(); - match self.map.get(&hash) { - Some(key) => { - self.keys.push(Some(*key)); - Ok(false) - } - None => { - let key = K::try_from(self.map.len()).map_err(|_| Error::Overflow)?; - self.map.insert(hash, key); - self.keys.push(Some(key)); - Ok(true) - } + /// Creates an empty [`MutableDictionaryArray`] from a given empty values array. + /// # Errors + /// Errors if the array is non-empty. + pub fn try_empty(values: M) -> Result { + Ok(Self::from_value_map(ValueMap::::try_empty(values)?)) + } + + /// Creates an empty [`MutableDictionaryArray`] preloaded with a given dictionary of values. + /// Indices associated with those values are automatically assigned based on the order of + /// the values. + /// # Errors + /// Errors if there's more values than the maximum value of `K` or if values are not unique. + pub fn from_values(values: M) -> Result + where + M: Indexable, + M::Type: Eq + Hash, + { + Ok(Self::from_value_map(ValueMap::::from_values(values)?)) + } + + fn from_value_map(value_map: ValueMap) -> Self { + let keys = MutablePrimitiveArray::::new(); + let data_type = + DataType::Dictionary(K::KEY_TYPE, Box::new(value_map.data_type().clone()), false); + Self { + data_type, + map: value_map, + keys, } } + /// Creates an empty [`MutableDictionaryArray`] retaining the same dictionary as the current + /// mutable dictionary array, but with no data. This may come useful when serializing the + /// array into multiple chunks, where there's a requirement that the dictionary is the same. + /// No copying is performed, the value map is moved over to the new array. + pub fn into_empty(self) -> Self { + Self::from_value_map(self.map) + } + + /// Same as `into_empty` but clones the inner value map instead of taking full ownership. + pub fn to_empty(&self) -> Self + where + M: Clone, + { + Self::from_value_map(self.map.clone()) + } + /// pushes a null value pub fn push_null(&mut self) { self.keys.push(None) } - /// returns a mutable reference to the inner values. - fn mut_values(&mut self) -> &mut M { - &mut self.values - } - /// returns a reference to the inner values. pub fn values(&self) -> &M { - &self.values + self.map.values() } /// converts itself into [`Arc`] @@ -142,15 +139,10 @@ impl MutableDictionaryArray { /// Shrinks the capacity of the [`MutableDictionaryArray`] to fit its current length. 
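The new `into_empty`/`to_empty` constructors above target chunked serialization, where every chunk must be written against the same dictionary: finish one chunk, keep the value map, start the next with no keys. A usage sketch (API names from this patch; the chunking flow itself is illustrative):

```rust
use arrow2::array::{
    Array, DictionaryArray, MutableArray, MutableDictionaryArray, MutableUtf8Array, TryPush,
};

fn main() -> arrow2::error::Result<()> {
    let mut chunk = MutableDictionaryArray::<i32, MutableUtf8Array<i32>>::new();
    chunk.try_push(Some("x"))?;
    chunk.try_push(Some("y"))?;

    // same dictionary ("x", "y"), zero keys -- ready for the next chunk
    let mut next = chunk.to_empty();
    let first: DictionaryArray<i32> = chunk.into();
    assert_eq!(first.len(), 2);

    next.try_push(Some("y"))?; // reuses the existing key, adds no value
    assert_eq!(next.values().len(), 2);
    Ok(())
}
```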
pub fn shrink_to_fit(&mut self) { - self.values.shrink_to_fit(); + self.map.shrink_to_fit(); self.keys.shrink_to_fit(); } - /// Returns the dictionary map - pub fn map(&self) -> &HashedMap { - &self.map - } - /// Returns the dictionary keys pub fn keys(&self) -> &MutablePrimitiveArray { &self.keys @@ -160,7 +152,7 @@ impl MutableDictionaryArray { DictionaryArray::::try_new( self.data_type.clone(), std::mem::take(&mut self.keys).into(), - self.values.as_box(), + self.map.take_into(), ) .unwrap() } @@ -208,17 +200,20 @@ impl MutableArray for MutableDictio } } -impl TryExtend> for MutableDictionaryArray +impl TryExtend> for MutableDictionaryArray where K: DictionaryKey, - M: MutableArray + TryExtend>, + M: MutableArray + Indexable + TryExtend>, + T: AsIndexed, + M::Type: Eq + Hash, { fn try_extend>>(&mut self, iter: II) -> Result<()> { for value in iter { if let Some(value) = value { - if self.try_push_valid(&value)? { - self.mut_values().try_extend(std::iter::once(Some(value)))?; - } + let key = self + .map + .try_push_valid(value, |arr, v| arr.try_extend(std::iter::once(Some(v))))?; + self.keys.try_push(Some(key))?; } else { self.push_null(); } @@ -230,19 +225,19 @@ where impl TryPush> for MutableDictionaryArray where K: DictionaryKey, - M: MutableArray + TryPush>, - T: Hash, + M: MutableArray + Indexable + TryPush>, + T: AsIndexed, + M::Type: Eq + Hash, { fn try_push(&mut self, item: Option) -> Result<()> { if let Some(value) = item { - if self.try_push_valid(&value)? { - self.values.try_push(Some(value)) - } else { - Ok(()) - } + let key = self + .map + .try_push_valid(value, |arr, v| arr.try_push(Some(v)))?; + self.keys.try_push(Some(key))?; } else { self.push_null(); - Ok(()) } + Ok(()) } } diff --git a/src/array/dictionary/value_map.rs b/src/array/dictionary/value_map.rs new file mode 100644 index 0000000000..35b59aaa2a --- /dev/null +++ b/src/array/dictionary/value_map.rs @@ -0,0 +1,127 @@ +use std::borrow::Borrow; +use std::fmt::{self, Debug}; +use std::hash::{BuildHasher, Hash, Hasher}; + +use hashbrown::hash_map::RawEntryMut; +use hashbrown::HashMap; + +use crate::array::Array; +use crate::{ + array::indexable::{AsIndexed, Indexable}, + array::MutableArray, + datatypes::DataType, + error::{Error, Result}, +}; + +use super::DictionaryKey; + +#[derive(Clone)] +pub struct ValueMap { + values: M, + map: HashMap, // NB: *only* use insert_hashed_nocheck() and no other hashmap API +} + +impl ValueMap { + pub fn try_empty(values: M) -> Result { + if !values.is_empty() { + return Err(Error::InvalidArgumentError( + "initializing value map with non-empty values array".into(), + )); + } + Ok(Self { + values, + map: HashMap::default(), + }) + } + + pub fn from_values(values: M) -> Result + where + M: Indexable, + M::Type: Eq + Hash, + { + let mut map = HashMap::with_capacity(values.len()); + for index in 0..values.len() { + let key = K::try_from(index).map_err(|_| Error::Overflow)?; + // safety: we only iterate within bounds + let value = unsafe { values.value_unchecked_at(index) }; + let mut hasher = map.hasher().build_hasher(); + value.borrow().hash(&mut hasher); + let hash = hasher.finish(); + match map.raw_entry_mut().from_hash(hash, |_| true) { + RawEntryMut::Occupied(_) => { + return Err(Error::InvalidArgumentError( + "duplicate value in dictionary values array".into(), + )) + } + RawEntryMut::Vacant(entry) => { + entry.insert_hashed_nocheck(hash, key, ()); // NB: don't use .insert() here! 
+                }
+            }
+        }
+        Ok(Self { values, map })
+    }
+
+    pub fn data_type(&self) -> &DataType {
+        self.values.data_type()
+    }
+
+    pub fn into_values(self) -> M {
+        self.values
+    }
+
+    pub fn take_into(&mut self) -> Box<dyn Array> {
+        let arr = self.values.as_box();
+        self.map.clear();
+        arr
+    }
+
+    #[inline]
+    pub fn values(&self) -> &M {
+        &self.values
+    }
+
+    /// Try to insert a value and return its index (it may or may not get inserted).
+    pub fn try_push_valid<V>(
+        &mut self,
+        value: V,
+        mut push: impl FnMut(&mut M, V) -> Result<()>,
+    ) -> Result<K>
+    where
+        M: Indexable,
+        V: AsIndexed<M>,
+        M::Type: Eq + Hash,
+    {
+        let mut hasher = self.map.hasher().build_hasher();
+        value.as_indexed().hash(&mut hasher);
+        let hash = hasher.finish();
+
+        Ok(
+            match self.map.raw_entry_mut().from_hash(hash, |key| {
+                // safety: `K::try_from` succeeded when this key was inserted, so it is a valid index
+                let index = unsafe { key.as_usize() };
+                // safety: invariant of the struct, it's always in bounds since we maintain it
+                let stored_value = unsafe { self.values.value_unchecked_at(index) };
+                stored_value.borrow() == value.as_indexed()
+            }) {
+                RawEntryMut::Occupied(entry) => *entry.key(),
+                RawEntryMut::Vacant(entry) => {
+                    let index = self.values.len();
+                    let key = K::try_from(index).map_err(|_| Error::Overflow)?;
+                    entry.insert_hashed_nocheck(hash, key, ()); // NB: don't use .insert() here!
+                    push(&mut self.values, value)?;
+                    key
+                }
+            },
+        )
+    }
+
+    pub fn shrink_to_fit(&mut self) {
+        self.values.shrink_to_fit();
+    }
+}
+
+impl<K: DictionaryKey, M: MutableArray> Debug for ValueMap<K, M> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        self.values.fmt(f)
+    }
+}
diff --git a/src/array/indexable.rs b/src/array/indexable.rs
new file mode 100644
index 0000000000..76001bfcf5
--- /dev/null
+++ b/src/array/indexable.rs
@@ -0,0 +1,197 @@
+use std::borrow::Borrow;
+
+use crate::{
+    array::{
+        MutableArray, MutableBinaryArray, MutableBinaryValuesArray, MutableBooleanArray,
+        MutableFixedSizeBinaryArray, MutablePrimitiveArray, MutableUtf8Array,
+        MutableUtf8ValuesArray,
+    },
+    offset::Offset,
+    types::NativeType,
+};
+
+/// Trait for arrays that can be indexed directly to extract a value.
+pub trait Indexable {
+    /// The type of the element at index `i`; may be a reference type or a value type.
+    type Value<'a>: Borrow<Self::Type>
+    where
+        Self: 'a;
+
+    type Type: ?Sized;
+
+    /// Returns the element at index `i`.
+    /// # Panics
+    /// May panic if `i >= self.len()`.
+    fn value_at(&self, index: usize) -> Self::Value<'_>;
+
+    /// Returns the element at index `i`.
+    /// # Safety
+    /// Assumes that `i < self.len()`.
+    #[inline]
+    unsafe fn value_unchecked_at(&self, index: usize) -> Self::Value<'_> {
+        self.value_at(index)
+    }
+}
+
+pub trait AsIndexed<M: Indexable> {
+    fn as_indexed(&self) -> &M::Type;
+}
+
+impl Indexable for MutableBooleanArray {
+    type Value<'a> = bool;
+    type Type = bool;
+
+    #[inline]
+    fn value_at(&self, i: usize) -> Self::Value<'_> {
+        self.values().get(i)
+    }
+}
+
+impl AsIndexed<MutableBooleanArray> for bool {
+    #[inline]
+    fn as_indexed(&self) -> &bool {
+        self
+    }
+}
+
+impl<O: Offset> Indexable for MutableBinaryArray<O> {
+    type Value<'a> = &'a [u8];
+    type Type = [u8];
+
+    #[inline]
+    fn value_at(&self, i: usize) -> Self::Value<'_> {
+        // TODO: add .value() / .value_unchecked() to MutableBinaryArray?
+        assert!(i < self.len());
+        unsafe { self.value_unchecked_at(i) }
+    }
+
+    #[inline]
+    unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> {
+        // TODO: add .value() / .value_unchecked() to MutableBinaryArray?
+ // soundness: the invariant of the function + let (start, end) = self.offsets().start_end_unchecked(i); + // soundness: the invariant of the struct + self.values().get_unchecked(start..end) + } +} + +impl AsIndexed> for &[u8] { + #[inline] + fn as_indexed(&self) -> &[u8] { + self + } +} + +impl Indexable for MutableBinaryValuesArray { + type Value<'a> = &'a [u8]; + type Type = [u8]; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.value(i) + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + self.value_unchecked(i) + } +} + +impl AsIndexed> for &[u8] { + #[inline] + fn as_indexed(&self) -> &[u8] { + self + } +} + +impl Indexable for MutableFixedSizeBinaryArray { + type Value<'a> = &'a [u8]; + type Type = [u8]; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.value(i) + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + // soundness: the invariant of the struct + self.value_unchecked(i) + } +} + +impl AsIndexed for &[u8] { + #[inline] + fn as_indexed(&self) -> &[u8] { + self + } +} + +// TODO: should NativeType derive from Hash? +impl Indexable for MutablePrimitiveArray { + type Value<'a> = T; + type Type = T; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + assert!(i < self.len()); + // TODO: add Length trait? (for both Array and MutableArray) + unsafe { self.value_unchecked_at(i) } + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + *self.values().get_unchecked(i) + } +} + +impl AsIndexed> for T { + #[inline] + fn as_indexed(&self) -> &T { + self + } +} + +impl Indexable for MutableUtf8Array { + type Value<'a> = &'a str; + type Type = str; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.value(i) + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + self.value_unchecked(i) + } +} + +impl> AsIndexed> for V { + #[inline] + fn as_indexed(&self) -> &str { + self.as_ref() + } +} + +impl Indexable for MutableUtf8ValuesArray { + type Value<'a> = &'a str; + type Type = str; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.value(i) + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + self.value_unchecked(i) + } +} + +impl> AsIndexed> for V { + #[inline] + fn as_indexed(&self) -> &str { + self.as_ref() + } +} diff --git a/src/array/mod.rs b/src/array/mod.rs index 04b7b2c8e3..02735c3d0b 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -720,8 +720,11 @@ mod utf8; mod equal; mod ffi; mod fmt; -pub mod growable; +#[doc(hidden)] +pub mod indexable; mod iterator; + +pub mod growable; pub mod ord; pub(crate) use iterator::ArrayAccessor; diff --git a/src/compute/cast/primitive_to.rs b/src/compute/cast/primitive_to.rs index 30b265e2d5..110288817a 100644 --- a/src/compute/cast/primitive_to.rs +++ b/src/compute/cast/primitive_to.rs @@ -306,9 +306,9 @@ pub fn primitive_to_dictionary( from: &PrimitiveArray, ) -> Result> { let iter = from.iter().map(|x| x.copied()); - let mut array = MutableDictionaryArray::::from(MutablePrimitiveArray::::from( + let mut array = MutableDictionaryArray::::try_empty(MutablePrimitiveArray::::from( from.data_type().clone(), - )); + ))?; array.try_extend(iter)?; Ok(array.into()) diff --git a/tests/it/array/dictionary/mutable.rs b/tests/it/array/dictionary/mutable.rs index b6103dcccf..9570339893 100644 --- a/tests/it/array/dictionary/mutable.rs +++ b/tests/it/array/dictionary/mutable.rs 
@@ -1,8 +1,11 @@ +use std::borrow::Borrow; +use std::collections::HashSet; +use std::fmt::Debug; +use std::hash::Hash; + +use arrow2::array::indexable::{AsIndexed, Indexable}; use arrow2::array::*; use arrow2::error::Result; -use hash_hasher::HashedMap; -use std::collections::hash_map::DefaultHasher; -use std::hash::{Hash, Hasher}; #[test] fn primitive() -> Result<()> { @@ -61,16 +64,89 @@ fn push_utf8() { expected_keys.push(Some(0)); expected_keys.push(Some(1)); assert_eq!(*new.keys(), expected_keys); +} + +#[test] +fn into_empty() { + let mut new: MutableDictionaryArray> = MutableDictionaryArray::new(); + for value in [Some("A"), Some("B"), None, Some("C"), Some("A"), Some("B")] { + new.try_push(value).unwrap(); + } + let values = new.values().clone(); + let empty = new.into_empty(); + assert_eq!(empty.values(), &values); + assert!(empty.is_empty()); +} + +#[test] +fn from_values() { + let mut new: MutableDictionaryArray> = MutableDictionaryArray::new(); + for value in [Some("A"), Some("B"), None, Some("C"), Some("A"), Some("B")] { + new.try_push(value).unwrap(); + } + let mut values = new.values().clone(); + let empty = MutableDictionaryArray::::from_values(values.clone()).unwrap(); + assert_eq!(empty.values(), &values); + assert!(empty.is_empty()); + values.push(Some("A")); + assert!(MutableDictionaryArray::::from_values(values).is_err()); +} - let expected_map = ["A", "B", "C"] - .iter() - .enumerate() - .map(|(index, value)| { - let mut hasher = DefaultHasher::new(); - value.hash(&mut hasher); - let hash = hasher.finish(); - (hash, index as i32) - }) - .collect::>(); - assert_eq!(*new.map(), expected_map); +#[test] +fn try_empty() { + let mut values = MutableUtf8Array::::new(); + MutableDictionaryArray::::try_empty(values.clone()).unwrap(); + values.push(Some("A")); + assert!(MutableDictionaryArray::::try_empty(values.clone()).is_err()); +} + +fn test_push_ex(values: Vec, gen: impl Fn(usize) -> T) +where + M: MutableArray + Indexable + TryPush> + TryExtend> + Default + 'static, + M::Type: Eq + Hash + Debug, + T: AsIndexed + Default + Clone + Eq + Hash, +{ + for is_extend in [false, true] { + let mut set = HashSet::new(); + let mut arr = MutableDictionaryArray::::new(); + macro_rules! 
push { + ($v:expr) => { + if is_extend { + arr.try_extend(std::iter::once($v)) + } else { + arr.try_push($v) + } + }; + } + arr.push_null(); + push!(None).unwrap(); + assert_eq!(arr.len(), 2); + assert_eq!(arr.values().len(), 0); + for (i, v) in values.iter().cloned().enumerate() { + push!(Some(v.clone())).unwrap(); + let is_dup = !set.insert(v.clone()); + if !is_dup { + assert_eq!(arr.values().value_at(i).borrow(), v.as_indexed()); + assert_eq!(arr.keys().value_at(arr.keys().len() - 1), i as u8); + } + assert_eq!(arr.values().len(), set.len()); + assert_eq!(arr.len(), 3 + i); + } + for i in 0..256 - set.len() { + push!(Some(gen(i))).unwrap(); + } + assert!(push!(Some(gen(256))).is_err()); + } +} + +#[test] +fn test_push_utf8_ex() { + test_push_ex::, _>(vec!["a".into(), "b".into(), "a".into()], |i| { + i.to_string() + }) +} + +#[test] +fn test_push_i64_ex() { + test_push_ex::, _>(vec![10, 20, 30, 20], |i| 1000 + i as i64); } From 767834e7c54e39f5b540e5c7341a5b8007a2345f Mon Sep 17 00:00:00 2001 From: Ivan Smirnov Date: Wed, 6 Sep 2023 14:07:24 +0100 Subject: [PATCH 63/80] Fix: native hashed-hash for MutableDictionaryArray (#1564) --- src/array/dictionary/value_map.rs | 82 ++++++++++++++++++++++------ tests/it/array/dictionary/mutable.rs | 17 ++++++ 2 files changed, 82 insertions(+), 17 deletions(-) diff --git a/src/array/dictionary/value_map.rs b/src/array/dictionary/value_map.rs index 35b59aaa2a..eb0f8790ca 100644 --- a/src/array/dictionary/value_map.rs +++ b/src/array/dictionary/value_map.rs @@ -1,6 +1,6 @@ use std::borrow::Borrow; use std::fmt::{self, Debug}; -use std::hash::{BuildHasher, Hash, Hasher}; +use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher}; use hashbrown::hash_map::RawEntryMut; use hashbrown::HashMap; @@ -15,10 +15,54 @@ use crate::{ use super::DictionaryKey; +/// Hasher for pre-hashed values; similar to `hash_hasher` but with native endianness. +/// +/// We know that we'll only use it for `u64` values, so we can avoid endian conversion. +/// +/// Invariant: hash of a u64 value is always equal to itself. 
+#[derive(Copy, Clone, Default)]
+pub struct PassthroughHasher(u64);
+
+impl Hasher for PassthroughHasher {
+    #[inline]
+    fn write_u64(&mut self, value: u64) {
+        self.0 = value;
+    }
+
+    fn write(&mut self, _: &[u8]) {
+        unreachable!();
+    }
+
+    #[inline]
+    fn finish(&self) -> u64 {
+        self.0
+    }
+}
+
+#[derive(Clone)]
+pub struct Hashed<K> {
+    hash: u64,
+    key: K,
+}
+
+#[inline]
+fn ahash_hash<T: Hash + ?Sized>(value: &T) -> u64 {
+    let mut hasher = BuildHasherDefault::<ahash::AHasher>::default().build_hasher();
+    value.hash(&mut hasher);
+    hasher.finish()
+}
+
+impl<K> Hash for Hashed<K> {
+    #[inline]
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.hash.hash(state)
+    }
+}
+
 #[derive(Clone)]
 pub struct ValueMap<K: DictionaryKey, M: MutableArray> {
-    values: M,
-    map: HashMap<K, ()>, // NB: *only* use insert_hashed_nocheck() and no other hashmap API
+    pub values: M,
+    pub map: HashMap<Hashed<K>, (), BuildHasherDefault<PassthroughHasher>>, // NB: *only* use insert_hashed_nocheck() and no other hashmap API
 }
 
 impl<K: DictionaryKey, M: MutableArray> ValueMap<K, M> {
@@ -39,22 +83,28 @@ impl ValueMap {
         M: Indexable,
         M::Type: Eq + Hash,
     {
-        let mut map = HashMap::with_capacity(values.len());
+        let mut map = HashMap::<Hashed<K>, _, _>::with_capacity_and_hasher(
+            values.len(),
+            BuildHasherDefault::<PassthroughHasher>::default(),
+        );
         for index in 0..values.len() {
            let key = K::try_from(index).map_err(|_| Error::Overflow)?;
             // safety: we only iterate within bounds
             let value = unsafe { values.value_unchecked_at(index) };
-            let mut hasher = map.hasher().build_hasher();
-            value.borrow().hash(&mut hasher);
-            let hash = hasher.finish();
-            match map.raw_entry_mut().from_hash(hash, |_| true) {
+            let hash = ahash_hash(value.borrow());
+            match map.raw_entry_mut().from_hash(hash, |item| {
+                // safety: invariant of the struct, it's always in bounds since we maintain it
+                let stored_value = unsafe { values.value_unchecked_at(item.key.as_usize()) };
+                stored_value.borrow() == value.borrow()
+            }) {
                 RawEntryMut::Occupied(_) => {
                     return Err(Error::InvalidArgumentError(
                         "duplicate value in dictionary values array".into(),
                     ))
                 }
                 RawEntryMut::Vacant(entry) => {
-                    entry.insert_hashed_nocheck(hash, key, ()); // NB: don't use .insert() here!
+                    // NB: don't use .insert() here!
+                    entry.insert_hashed_nocheck(hash, Hashed { hash, key }, ());
                 }
             }
         }
@@ -91,24 +141,22 @@ impl ValueMap {
         V: AsIndexed<M>,
         M::Type: Eq + Hash,
     {
-        let mut hasher = self.map.hasher().build_hasher();
-        value.as_indexed().hash(&mut hasher);
-        let hash = hasher.finish();
-
+        let hash = ahash_hash(value.as_indexed());
         Ok(
-            match self.map.raw_entry_mut().from_hash(hash, |key| {
+            match self.map.raw_entry_mut().from_hash(hash, |item| {
                 // safety: `K::try_from` succeeded when this key was inserted, so it is a valid index
-                let index = unsafe { key.as_usize() };
+                let index = unsafe { item.key.as_usize() };
                 // safety: invariant of the struct, it's always in bounds since we maintain it
                 let stored_value = unsafe { self.values.value_unchecked_at(index) };
                 stored_value.borrow() == value.as_indexed()
             }) {
-                RawEntryMut::Occupied(entry) => *entry.key(),
+                RawEntryMut::Occupied(entry) => entry.key().key,
                 RawEntryMut::Vacant(entry) => {
                     let index = self.values.len();
                     let key = K::try_from(index).map_err(|_| Error::Overflow)?;
-                    entry.insert_hashed_nocheck(hash, key, ()); // NB: don't use .insert() here!
+                    entry.insert_hashed_nocheck(hash, Hashed { hash, key }, ()); // NB: don't use .insert() here!
push(&mut self.values, value)?; + debug_assert_eq!(self.values.len(), index + 1); key } }, diff --git a/tests/it/array/dictionary/mutable.rs b/tests/it/array/dictionary/mutable.rs index 9570339893..a7845114d9 100644 --- a/tests/it/array/dictionary/mutable.rs +++ b/tests/it/array/dictionary/mutable.rs @@ -150,3 +150,20 @@ fn test_push_utf8_ex() { fn test_push_i64_ex() { test_push_ex::, _>(vec![10, 20, 30, 20], |i| 1000 + i as i64); } + +#[test] +fn test_big_dict() { + let n = 10; + let strings = (0..10).map(|i| i.to_string()).collect::>(); + let mut arr = MutableDictionaryArray::>::new(); + for s in &strings { + arr.try_push(Some(s)).unwrap(); + } + assert_eq!(arr.values().len(), n); + for _ in 0..10_000 { + for s in &strings { + arr.try_push(Some(s)).unwrap(); + } + } + assert_eq!(arr.values().len(), n); +} From fb7b5fe3f61764da41a37124eee3d808a9409fb6 Mon Sep 17 00:00:00 2001 From: Jay Chia <17691182+jaychia@users.noreply.github.com> Date: Wed, 6 Sep 2023 18:36:11 -0700 Subject: [PATCH 64/80] Add SchemaInferenceOptions options to infer_schema and option to configure int96 inference (#1533) Co-authored-by: Jay Chia --- src/io/parquet/read/schema/convert.rs | 126 +++++++++++++++++++++----- src/io/parquet/read/schema/mod.rs | 34 ++++++- 2 files changed, 132 insertions(+), 28 deletions(-) diff --git a/src/io/parquet/read/schema/convert.rs b/src/io/parquet/read/schema/convert.rs index 821d510764..007797bd9d 100644 --- a/src/io/parquet/read/schema/convert.rs +++ b/src/io/parquet/read/schema/convert.rs @@ -1,4 +1,4 @@ -//! This module has a single entry point, [`parquet_to_arrow_schema`]. +//! This module has entry points, [`parquet_to_arrow_schema`] and the more configurable [`parquet_to_arrow_schema_with_options`]. use parquet2::schema::{ types::{ FieldInfo, GroupConvertedType, GroupLogicalType, IntegerType, ParquetType, PhysicalType, @@ -8,11 +8,23 @@ use parquet2::schema::{ }; use crate::datatypes::{DataType, Field, IntervalUnit, TimeUnit}; +use crate::io::parquet::read::schema::SchemaInferenceOptions; /// Converts [`ParquetType`]s to a [`Field`], ignoring parquet fields that do not contain /// any physical column. 
pub fn parquet_to_arrow_schema(fields: &[ParquetType]) -> Vec { - fields.iter().filter_map(to_field).collect::>() + parquet_to_arrow_schema_with_options(fields, &None) +} + +/// Like [`parquet_to_arrow_schema`] but with configurable options which affect the behavior of schema inference +pub fn parquet_to_arrow_schema_with_options( + fields: &[ParquetType], + options: &Option, +) -> Vec { + fields + .iter() + .filter_map(|f| to_field(f, options.as_ref().unwrap_or(&Default::default()))) + .collect::>() } fn from_int32( @@ -169,7 +181,10 @@ fn from_fixed_len_byte_array( } /// Maps a [`PhysicalType`] with optional metadata to a [`DataType`] -fn to_primitive_type_inner(primitive_type: &PrimitiveType) -> DataType { +fn to_primitive_type_inner( + primitive_type: &PrimitiveType, + options: &SchemaInferenceOptions, +) -> DataType { match primitive_type.physical_type { PhysicalType::Boolean => DataType::Boolean, PhysicalType::Int32 => { @@ -178,7 +193,7 @@ fn to_primitive_type_inner(primitive_type: &PrimitiveType) -> DataType { PhysicalType::Int64 => { from_int64(primitive_type.logical_type, primitive_type.converted_type) } - PhysicalType::Int96 => DataType::Timestamp(TimeUnit::Nanosecond, None), + PhysicalType::Int96 => DataType::Timestamp(options.int96_coerce_to_timeunit, None), PhysicalType::Float => DataType::Float32, PhysicalType::Double => DataType::Float64, PhysicalType::ByteArray => { @@ -195,8 +210,8 @@ fn to_primitive_type_inner(primitive_type: &PrimitiveType) -> DataType { /// Entry point for converting parquet primitive type to arrow type. /// /// This function takes care of repetition. -fn to_primitive_type(primitive_type: &PrimitiveType) -> DataType { - let base_type = to_primitive_type_inner(primitive_type); +fn to_primitive_type(primitive_type: &PrimitiveType, options: &SchemaInferenceOptions) -> DataType { + let base_type = to_primitive_type_inner(primitive_type, options); if primitive_type.field_info.repetition == Repetition::Repeated { DataType::List(Box::new(Field::new( @@ -214,23 +229,27 @@ fn non_repeated_group( converted_type: &Option, fields: &[ParquetType], parent_name: &str, + options: &SchemaInferenceOptions, ) -> Option { debug_assert!(!fields.is_empty()); match (logical_type, converted_type) { - (Some(GroupLogicalType::List), _) => to_list(fields, parent_name), - (None, Some(GroupConvertedType::List)) => to_list(fields, parent_name), - (Some(GroupLogicalType::Map), _) => to_list(fields, parent_name), + (Some(GroupLogicalType::List), _) => to_list(fields, parent_name, options), + (None, Some(GroupConvertedType::List)) => to_list(fields, parent_name, options), + (Some(GroupLogicalType::Map), _) => to_list(fields, parent_name, options), (None, Some(GroupConvertedType::Map) | Some(GroupConvertedType::MapKeyValue)) => { - to_map(fields) + to_map(fields, options) } - _ => to_struct(fields), + _ => to_struct(fields, options), } } /// Converts a parquet group type to an arrow [`DataType::Struct`]. /// Returns [`None`] if all its fields are empty -fn to_struct(fields: &[ParquetType]) -> Option { - let fields = fields.iter().filter_map(to_field).collect::>(); +fn to_struct(fields: &[ParquetType], options: &SchemaInferenceOptions) -> Option { + let fields = fields + .iter() + .filter_map(|f| to_field(f, options)) + .collect::>(); if fields.is_empty() { None } else { @@ -240,8 +259,8 @@ fn to_struct(fields: &[ParquetType]) -> Option { /// Converts a parquet group type to an arrow [`DataType::Struct`]. 
/// Returns [`None`] if all its fields are empty -fn to_map(fields: &[ParquetType]) -> Option { - let inner = to_field(&fields[0])?; +fn to_map(fields: &[ParquetType], options: &SchemaInferenceOptions) -> Option { + let inner = to_field(&fields[0], options)?; Some(DataType::Map(Box::new(inner), false)) } @@ -254,16 +273,17 @@ fn to_group_type( converted_type: &Option, fields: &[ParquetType], parent_name: &str, + options: &SchemaInferenceOptions, ) -> Option { debug_assert!(!fields.is_empty()); if field_info.repetition == Repetition::Repeated { Some(DataType::List(Box::new(Field::new( &field_info.name, - to_struct(fields)?, + to_struct(fields, options)?, is_nullable(field_info), )))) } else { - non_repeated_group(logical_type, converted_type, fields, parent_name) + non_repeated_group(logical_type, converted_type, fields, parent_name, options) } } @@ -279,10 +299,10 @@ pub(crate) fn is_nullable(field_info: &FieldInfo) -> bool { /// Converts parquet schema to arrow field. /// Returns `None` iff the parquet type has no associated primitive types, /// i.e. if it is a column-less group type. -fn to_field(type_: &ParquetType) -> Option { +fn to_field(type_: &ParquetType, options: &SchemaInferenceOptions) -> Option { Some(Field::new( &type_.get_field_info().name, - to_data_type(type_)?, + to_data_type(type_, options)?, is_nullable(type_.get_field_info()), )) } @@ -291,11 +311,15 @@ fn to_field(type_: &ParquetType) -> Option { /// /// To fully understand this algorithm, please refer to /// [parquet doc](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md). -fn to_list(fields: &[ParquetType], parent_name: &str) -> Option { +fn to_list( + fields: &[ParquetType], + parent_name: &str, + options: &SchemaInferenceOptions, +) -> Option { let item = fields.first().unwrap(); let item_type = match item { - ParquetType::PrimitiveType(primitive) => Some(to_primitive_type_inner(primitive)), + ParquetType::PrimitiveType(primitive) => Some(to_primitive_type_inner(primitive, options)), ParquetType::GroupType { fields, .. } => { if fields.len() == 1 && item.name() != "array" @@ -303,9 +327,9 @@ fn to_list(fields: &[ParquetType], parent_name: &str) -> Option { { // extract the repetition field let nested_item = fields.first().unwrap(); - to_data_type(nested_item) + to_data_type(nested_item, options) } else { - to_struct(fields) + to_struct(fields, options) } } }?; @@ -346,9 +370,12 @@ fn to_list(fields: &[ParquetType], parent_name: &str) -> Option { /// /// If this schema is a group type and none of its children is reserved in the /// conversion, the result is Ok(None). 
-pub(crate) fn to_data_type(type_: &ParquetType) -> Option { +pub(crate) fn to_data_type( + type_: &ParquetType, + options: &SchemaInferenceOptions, +) -> Option { match type_ { - ParquetType::PrimitiveType(primitive) => Some(to_primitive_type(primitive)), + ParquetType::PrimitiveType(primitive) => Some(to_primitive_type(primitive, options)), ParquetType::GroupType { field_info, logical_type, @@ -364,6 +391,7 @@ pub(crate) fn to_data_type(type_: &ParquetType) -> Option { converted_type, fields, &field_info.name, + options, ) } } @@ -973,4 +1001,52 @@ mod tests { assert_eq!(arrow_fields, fields); Ok(()) } + + #[test] + fn test_int96_options() -> Result<()> { + for tu in [ + TimeUnit::Second, + TimeUnit::Microsecond, + TimeUnit::Millisecond, + TimeUnit::Nanosecond, + ] { + let message_type = " + message arrow_schema { + REQUIRED INT96 int96_field; + OPTIONAL GROUP int96_list (LIST) { + REPEATED GROUP list { + OPTIONAL INT96 element; + } + } + REQUIRED GROUP int96_struct { + REQUIRED INT96 int96_field; + } + } + "; + let coerced_to = DataType::Timestamp(tu, None); + let arrow_fields = vec![ + Field::new("int96_field", coerced_to.clone(), false), + Field::new( + "int96_list", + DataType::List(Box::new(Field::new("element", coerced_to.clone(), true))), + true, + ), + Field::new( + "int96_struct", + DataType::Struct(vec![Field::new("int96_field", coerced_to.clone(), false)]), + false, + ), + ]; + + let parquet_schema = SchemaDescriptor::try_from_message(message_type)?; + let fields = parquet_to_arrow_schema_with_options( + parquet_schema.fields(), + &Some(SchemaInferenceOptions { + int96_coerce_to_timeunit: tu, + }), + ); + assert_eq!(arrow_fields, fields); + } + Ok(()) + } } diff --git a/src/io/parquet/read/schema/mod.rs b/src/io/parquet/read/schema/mod.rs index d47055ef6a..6a1e49ae7e 100644 --- a/src/io/parquet/read/schema/mod.rs +++ b/src/io/parquet/read/schema/mod.rs @@ -1,11 +1,11 @@ //! APIs to handle Parquet <-> Arrow schemas. -use crate::datatypes::Schema; +use crate::datatypes::{Schema, TimeUnit}; use crate::error::Result; mod convert; mod metadata; -pub use convert::parquet_to_arrow_schema; +pub use convert::{parquet_to_arrow_schema, parquet_to_arrow_schema_with_options}; pub use metadata::read_schema_from_metadata; pub use parquet2::metadata::{FileMetaData, KeyValue, SchemaDescriptor}; pub use parquet2::schema::types::ParquetType; @@ -14,6 +14,26 @@ pub(crate) use convert::*; use self::metadata::parse_key_value_metadata; +/// Options when inferring schemas from Parquet +pub struct SchemaInferenceOptions { + /// When inferring schemas from the Parquet INT96 timestamp type, this is the corresponding TimeUnit + /// in the inferred Arrow Timestamp type. + /// + /// This defaults to `TimeUnit::Nanosecond`, but INT96 timestamps outside of the range of years 1678-2262, + /// will overflow when parsed as `Timestamp(TimeUnit::Nanosecond)`. Setting this to a lower resolution + /// (e.g. TimeUnit::Milliseconds) will result in loss of precision, but support a larger range of dates + /// without overflowing when parsing the data. + pub int96_coerce_to_timeunit: TimeUnit, +} + +impl Default for SchemaInferenceOptions { + fn default() -> Self { + SchemaInferenceOptions { + int96_coerce_to_timeunit: TimeUnit::Nanosecond, + } + } +} + /// Infers a [`Schema`] from parquet's [`FileMetaData`]. This first looks for the metadata key /// `"ARROW:schema"`; if it does not exist, it converts the parquet types declared in the /// file's parquet schema to Arrow's equivalent. 
@@ -21,11 +41,19 @@ use self::metadata::parse_key_value_metadata;
 /// This function errors iff the key `"ARROW:schema"` exists but is not correctly encoded,
 /// indicating that the file's arrow metadata was incorrectly written.
 pub fn infer_schema(file_metadata: &FileMetaData) -> Result<Schema> {
+    infer_schema_with_options(file_metadata, &None)
+}
+
+/// Like [`infer_schema`] but with configurable options which affect the behavior of inference
+pub fn infer_schema_with_options(
+    file_metadata: &FileMetaData,
+    options: &Option<SchemaInferenceOptions>,
+) -> Result<Schema> {
     let mut metadata = parse_key_value_metadata(file_metadata.key_value_metadata());
     let schema = read_schema_from_metadata(&mut metadata)?;
     Ok(schema.unwrap_or_else(|| {
-        let fields = parquet_to_arrow_schema(file_metadata.schema().fields());
+        let fields = parquet_to_arrow_schema_with_options(file_metadata.schema().fields(), options);
         Schema { fields, metadata }
     }))
 }

From 7c93e358fc400bf3c0c0219c22eefc6b38fc2d12 Mon Sep 17 00:00:00 2001
From: Weijie Guo
Date: Mon, 11 Sep 2023 18:03:34 +0800
Subject: [PATCH 65/80] fix: More types supports cast to LargeList (#1567)

---
 src/compute/cast/mod.rs | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs
index 8f89151a06..688291dd12 100644
--- a/src/compute/cast/mod.rs
+++ b/src/compute/cast/mod.rs
@@ -104,6 +104,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
         (List(list_from), LargeList(list_to)) if list_from == list_to => true,
         (LargeList(list_from), List(list_to)) if list_from == list_to => true,
         (_, List(list_to)) => can_cast_types(from_type, &list_to.data_type),
+        (_, LargeList(list_to)) if from_type != &LargeBinary => {
+            can_cast_types(from_type, &list_to.data_type)
+        }
         (Dictionary(_, from_value_type, _), Dictionary(_, to_value_type, _)) => {
             can_cast_types(from_value_type, to_value_type)
         }
@@ -150,7 +153,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
         (Timestamp(_, _), LargeUtf8) => true,
         (_, Utf8) => is_numeric(from_type) || from_type == &Binary,
         (_, LargeUtf8) => is_numeric(from_type) || from_type == &LargeBinary,
-        (_, LargeList(list_to)) => can_cast_types(from_type, &list_to.data_type),
+
         (_, Binary) => is_numeric(from_type),
 
         (_, LargeBinary) => is_numeric(from_type),
@@ -509,6 +512,19 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu
             Ok(Box::new(list_array))
         }
 
+        (_, LargeList(to)) if from_type != &LargeBinary => {
+            // cast primitive to list's primitive
+            let values = cast(array, &to.data_type, options)?;
+            // create offsets, where if array.len() = 2, we have [0,1,2]
+            let offsets = (0..=array.len() as i64).collect::<Vec<_>>();
+            // Safety: offsets _are_ monotonically increasing
+            let offsets = unsafe { Offsets::new_unchecked(offsets) };
+
+            let list_array = ListArray::<i64>::new(to_type.clone(), offsets.into(), values, None);
+
+            Ok(Box::new(list_array))
+        }
+
         (Dictionary(index_type, ..), _) => match_integer_type!(index_type, |$T| {
             dictionary_cast_dyn::<$T>(array, to_type, options)
         }),
@@ -740,19 +756,6 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu
             ))),
         },
 
-        (_, LargeList(to)) => {
-            // cast primitive to list's primitive
-            let values = cast(array, &to.data_type, options)?;
-            // create offsets, where if array.len() = 2, we have [0,1,2]
-            let offsets = (0..=array.len() as i64).collect::<Vec<_>>();
-            // Safety: offsets _are_ monotonically increasing
-            let offsets = unsafe {
Offsets::new_unchecked(offsets) }; - - let list_array = ListArray::::new(to_type.clone(), offsets.into(), values, None); - - Ok(Box::new(list_array)) - } - (_, Binary) => match from_type { UInt8 => primitive_to_binary_dyn::(array), UInt16 => primitive_to_binary_dyn::(array), From 231a6fa61c3aad9d766165557501e28d73cf6b9a Mon Sep 17 00:00:00 2001 From: sundyli <543950155@qq.com> Date: Thu, 21 Sep 2023 19:51:56 -0700 Subject: [PATCH 66/80] fix parquet datatype conversion from arrow (#1570) --- src/io/csv/read_utils.rs | 4 ++-- src/io/odbc/read/deserialize.rs | 4 ++-- src/io/parquet/read/schema/convert.rs | 16 ++++++++++++++-- src/io/parquet/write/schema.rs | 4 ++-- src/temporal_conversions.rs | 12 ++++++------ 5 files changed, 26 insertions(+), 14 deletions(-) diff --git a/src/io/csv/read_utils.rs b/src/io/csv/read_utils.rs index d23f6c1197..cb91417ed1 100644 --- a/src/io/csv/read_utils.rs +++ b/src/io/csv/read_utils.rs @@ -136,7 +136,7 @@ fn deserialize_datetime(string: &str, tz: &T) -> Option( Timestamp(time_unit, None) => deserialize_primitive(rows, column, datatype, |bytes| { to_utf8(bytes) .and_then(|x| x.parse::().ok()) - .map(|x| x.timestamp_nanos()) + .map(|x| x.timestamp_nanos_opt().unwrap()) .map(|x| match time_unit { TimeUnit::Second => x / 1_000_000_000, TimeUnit::Millisecond => x / 1_000_000, diff --git a/src/io/odbc/read/deserialize.rs b/src/io/odbc/read/deserialize.rs index 3e18fa279b..be0a548e1a 100644 --- a/src/io/odbc/read/deserialize.rs +++ b/src/io/odbc/read/deserialize.rs @@ -264,12 +264,12 @@ fn timestamp_ms(timestamp: &odbc_api::sys::Timestamp) -> i64 { fn timestamp_us(timestamp: &odbc_api::sys::Timestamp) -> i64 { timestamp_to_naive(timestamp) - .map(|x| x.timestamp_nanos() / 1000) + .map(|x| x.timestamp_nanos_opt().unwrap() / 1000) .unwrap_or(0) } fn timestamp_ns(timestamp: &odbc_api::sys::Timestamp) -> i64 { timestamp_to_naive(timestamp) - .map(|x| x.timestamp_nanos()) + .map(|x| x.timestamp_nanos_opt().unwrap()) .unwrap_or(0) } diff --git a/src/io/parquet/read/schema/convert.rs b/src/io/parquet/read/schema/convert.rs index 007797bd9d..dea14ca86b 100644 --- a/src/io/parquet/read/schema/convert.rs +++ b/src/io/parquet/read/schema/convert.rs @@ -165,10 +165,18 @@ fn from_fixed_len_byte_array( ) -> DataType { match (logical_type, converted_type) { (Some(PrimitiveLogicalType::Decimal(precision, scale)), _) => { - DataType::Decimal(precision, scale) + if length < 32 { + DataType::Decimal(precision, scale) + } else { + DataType::Decimal256(precision, scale) + } } (None, Some(PrimitiveConvertedType::Decimal(precision, scale))) => { - DataType::Decimal(precision, scale) + if length < 32 { + DataType::Decimal(precision, scale) + } else { + DataType::Decimal256(precision, scale) + } } (None, Some(PrimitiveConvertedType::Interval)) => { // There is currently no reliable way of determining which IntervalUnit @@ -451,11 +459,15 @@ mod tests { message test_schema { REQUIRED BYTE_ARRAY binary; REQUIRED FIXED_LEN_BYTE_ARRAY (20) fixed_binary; + REQUIRED FIXED_LEN_BYTE_ARRAY (7) decimal_128 (Decimal(16, 2)) ; + REQUIRED FIXED_LEN_BYTE_ARRAY (32) decimal_256 (Decimal(44, 2)) ; } "; let expected = vec![ Field::new("binary", DataType::Binary, false), Field::new("fixed_binary", DataType::FixedSizeBinary(20), false), + Field::new("decimal_128", DataType::Decimal(16, 2), false), + Field::new("decimal_256", DataType::Decimal256(44, 2), false), ]; let parquet_schema = SchemaDescriptor::try_from_message(message)?; diff --git a/src/io/parquet/write/schema.rs b/src/io/parquet/write/schema.rs 
index 48dd853ea4..69af988d82 100644 --- a/src/io/parquet/write/schema.rs +++ b/src/io/parquet/write/schema.rs @@ -333,8 +333,8 @@ pub fn to_parquet_type(field: &Field) -> Result { name, PhysicalType::FixedLenByteArray(32), repetition, - None, - None, + Some(PrimitiveConvertedType::Decimal(precision, scale)), + logical_type, None, )?) } diff --git a/src/temporal_conversions.rs b/src/temporal_conversions.rs index b706a45b29..f2864c3417 100644 --- a/src/temporal_conversions.rs +++ b/src/temporal_conversions.rs @@ -363,7 +363,7 @@ pub fn utf8_to_timestamp_scalar( TimeUnit::Second => x.timestamp(), TimeUnit::Millisecond => x.timestamp_millis(), TimeUnit::Microsecond => x.timestamp_micros(), - TimeUnit::Nanosecond => x.timestamp_nanos(), + TimeUnit::Nanosecond => x.timestamp_nanos_opt().unwrap(), }) .ok() } else { @@ -390,7 +390,7 @@ pub fn utf8_to_naive_timestamp_scalar(value: &str, fmt: &str, tu: &TimeUnit) -> TimeUnit::Second => x.timestamp(), TimeUnit::Millisecond => x.timestamp_millis(), TimeUnit::Microsecond => x.timestamp_micros(), - TimeUnit::Nanosecond => x.timestamp_nanos(), + TimeUnit::Nanosecond => x.timestamp_nanos_opt().unwrap(), }) .ok() } @@ -515,8 +515,8 @@ pub fn add_naive_interval(timestamp: i64, time_unit: TimeUnit, interval: months_ match time_unit { TimeUnit::Second => new_datetime_tz.timestamp_millis() / 1000, TimeUnit::Millisecond => new_datetime_tz.timestamp_millis(), - TimeUnit::Microsecond => new_datetime_tz.timestamp_nanos() / 1000, - TimeUnit::Nanosecond => new_datetime_tz.timestamp_nanos(), + TimeUnit::Microsecond => new_datetime_tz.timestamp_nanos_opt().unwrap() / 1000, + TimeUnit::Nanosecond => new_datetime_tz.timestamp_nanos_opt().unwrap(), } } @@ -544,7 +544,7 @@ pub fn add_interval( match time_unit { TimeUnit::Second => new_datetime_tz.timestamp_millis() / 1000, TimeUnit::Millisecond => new_datetime_tz.timestamp_millis(), - TimeUnit::Microsecond => new_datetime_tz.timestamp_nanos() / 1000, - TimeUnit::Nanosecond => new_datetime_tz.timestamp_nanos(), + TimeUnit::Microsecond => new_datetime_tz.timestamp_nanos_opt().unwrap() / 1000, + TimeUnit::Nanosecond => new_datetime_tz.timestamp_nanos_opt().unwrap(), } } From 8880501b07405bc9d3b75210ca883eecacffb8e1 Mon Sep 17 00:00:00 2001 From: zhyass Date: Sat, 23 Sep 2023 19:00:34 +0800 Subject: [PATCH 67/80] fix typo in merge_sort comment (#1571) --- src/compute/merge_sort/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compute/merge_sort/mod.rs b/src/compute/merge_sort/mod.rs index 598ba66d31..f57b09bb4a 100644 --- a/src/compute/merge_sort/mod.rs +++ b/src/compute/merge_sort/mod.rs @@ -74,7 +74,7 @@ use crate::error::Result; /// This is used to keep track of contiguous blocks of slots. /// An array of MergeSlice, `[MergeSlice]`, represents inter-leaved array slices. /// For example, `[(0, 0, 2), (1, 0, 1), (0, 2, 3)]` represents 2 arrays (a0 and a1) arranged as follows: -/// `[a0[0..2], a1[0..1], a0[2..3]]` +/// `[a0[0..2], a1[0..1], a0[2..5]]` /// This representation is useful when building arrays in memory as it allows to memcopy slices of arrays. /// This is particularly useful in merge-sort because sorted arrays (passed to the merge-sort) are more likely /// to have contiguous blocks of sorted elements (than by random). 
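A note on the comment fixed in the patch above: a `MergeSlice` is evidently an `(array_index, start, len)` triple, so `(0, 2, 3)` denotes a slice of length 3 starting at offset 2 of array 0, i.e. `a0[2..5]` — hence the correction from `a0[2..3]`. A minimal sketch of how such interleaved slices are materialized; the `gather` helper and the `i32` element type are illustrative assumptions, not the crate's actual merge-sort code:

fn gather(sources: &[&[i32]], slices: &[(usize, usize, usize)]) -> Vec<i32> {
    let mut out = Vec::new();
    for &(array, start, len) in slices {
        // each (array, start, len) triple copies sources[array][start..start + len]
        out.extend_from_slice(&sources[array][start..start + len]);
    }
    out
}

fn main() {
    let a0: &[i32] = &[0, 1, 2, 3, 4];
    let a1: &[i32] = &[10];
    // [(0, 0, 2), (1, 0, 1), (0, 2, 3)] == [a0[0..2], a1[0..1], a0[2..5]]
    assert_eq!(
        gather(&[a0, a1], &[(0, 0, 2), (1, 0, 1), (0, 2, 3)]),
        vec![0, 1, 10, 2, 3, 4]
    );
}

Because sorted inputs tend to produce long runs from the same source array, representing the result as a few such triples lets the merge copy whole blocks instead of individual elements.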
From 6271f48e4d8d1bf2b43ef1da81f6d9f681e38d63 Mon Sep 17 00:00:00 2001 From: Yijun Zhao Date: Thu, 28 Sep 2023 11:05:46 +0800 Subject: [PATCH 68/80] Add test for list_nested_decimal (#1572) --- parquet_integration/write_parquet.py | 12 +++++ tests/it/io/parquet/mod.rs | 73 ++++++++++++++++++++++++++++ tests/it/io/parquet/read.rs | 5 ++ 3 files changed, 90 insertions(+) diff --git a/parquet_integration/write_parquet.py b/parquet_integration/write_parquet.py index a7f7560fc5..072b59c775 100644 --- a/parquet_integration/write_parquet.py +++ b/parquet_integration/write_parquet.py @@ -179,6 +179,16 @@ def case_nested() -> Tuple[dict, pa.Schema, str]: ] decimal_nullable = [[Decimal(n) if n is not None else None for n in sublist] if sublist is not None else None for sublist in items_nullable] + decimal_nested = [ + [[Decimal(0), Decimal(1)]], + None, + [[Decimal(2), None], [Decimal(3)]], + [[Decimal(4), Decimal(5)], [Decimal(6)]], + [], + [[Decimal(7)], None, [Decimal(9)]], + [[], [None], None], + [[Decimal(10)]], + ] list_struct_nullable = [ [{"a": "a"}, {"a": "b"}], @@ -227,6 +237,7 @@ def case_nested() -> Tuple[dict, pa.Schema, str]: pa.field("list_decimal", pa.list_(pa.decimal128(9, 0))), pa.field("list_decimal256", pa.list_(pa.decimal256(9, 0))), pa.field("list_nested_i64", pa.list_(pa.list_(pa.int64()))), + pa.field("list_nested_decimal", pa.list_(pa.list_(pa.decimal128(9, 0)))), pa.field("list_nested_inner_required_i64", pa.list_(pa.list_(pa.int64()))), pa.field( "list_nested_inner_required_required_i64", pa.list_(pa.list_(pa.int64())) @@ -258,6 +269,7 @@ def case_nested() -> Tuple[dict, pa.Schema, str]: "list_decimal": decimal_nullable, "list_decimal256": decimal_nullable, "list_nested_i64": items_nested, + "list_nested_decimal": decimal_nested, "list_nested_inner_required_i64": items_required_nested, "list_nested_inner_required_required_i64": items_required_nested_2, "list_struct_nullable": list_struct_nullable, diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs index 4539d21a33..94d6cdf77e 100644 --- a/tests/it/io/parquet/mod.rs +++ b/tests/it/io/parquet/mod.rs @@ -256,6 +256,7 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { Box::new(array) } "list_nested_i64" + | "list_nested_decimal" | "list_nested_inner_required_i64" | "list_nested_inner_required_required_i64" => Box::new(NullArray::new(DataType::Null, 1)), "struct_list_nullable" => pyarrow_nested_nullable("list_utf8"), @@ -389,6 +390,48 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { let array: ListArray = a.into(); Box::new(array) } + "list_nested_decimal" => { + // [ + // [[Decimal(0), Decimal(1)]], + // None, + // [[Decimal(2), None], [Decimal(3)]], + // [[Decimal(4), Decimal(5)], [Decimal(6)]], + // [], + // [[Decimal(7)], None, [Decimal(9)]], + // [[], [None], None], + // [[Decimal(10)]], + // ] + + let data = [ + Some(vec![Some(vec![Some(0), Some(1)])]), + None, + Some(vec![Some(vec![Some(2), None]), Some(vec![Some(3)])]), + Some(vec![Some(vec![Some(4), Some(5)]), Some(vec![Some(6)])]), + Some(vec![]), + Some(vec![Some(vec![Some(7)]), None, Some(vec![Some(9)])]), + Some(vec![Some(vec![]), Some(vec![None]), None]), + Some(vec![Some(vec![Some(10)])]), + ]; + + let inner_array = MutablePrimitiveArray::::from(DataType::Decimal(9, 0)); + let middle_array = MutableListArray::>::new_from( + inner_array.clone(), + ListArray::::default_datatype(inner_array.data_type().clone()), + 0, + ); + let mut outer_array = MutableListArray::< + i32, + MutableListArray>, + >::new_from( + middle_array.clone(), + 
ListArray::::default_datatype(middle_array.data_type().clone()), + 0, + ); + + outer_array.try_extend(data).unwrap(); + let array: ListArray = outer_array.into(); + Box::new(array) + } "list_nested_inner_required_i64" => { let data = [ Some(vec![Some(vec![Some(0), Some(1)])]), @@ -948,6 +991,36 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { ) .boxed(), }, + "list_nested_decimal" => Statistics { + distinct_count: new_list( + new_list(UInt64Array::from([None]).boxed(), true).boxed(), + true, + ) + .boxed(), + null_count: new_list( + new_list(Box::new(UInt64Array::from_slice([7])), true).boxed(), + true, + ) + .boxed(), + min_value: new_list( + new_list( + Box::new(Int128Array::from_slice([0]).to(DataType::Decimal(9, 0))), + true, + ) + .boxed(), + true, + ) + .boxed(), + max_value: new_list( + new_list( + Box::new(Int128Array::from_slice([10]).to(DataType::Decimal(9, 0))), + true, + ) + .boxed(), + true, + ) + .boxed(), + }, "list_nested_inner_required_required_i64" => Statistics { distinct_count: UInt64Array::from([None]).boxed(), null_count: UInt64Array::from([Some(0)]).boxed(), diff --git a/tests/it/io/parquet/read.rs b/tests/it/io/parquet/read.rs index 8f45eb874d..7689f1532f 100644 --- a/tests/it/io/parquet/read.rs +++ b/tests/it/io/parquet/read.rs @@ -339,6 +339,11 @@ fn v2_nested_nested() -> Result<()> { test_pyarrow_integration("list_nested_i64", 2, "nested", false, false, None) } +#[test] +fn v2_nested_nested_decimal() -> Result<()> { + test_pyarrow_integration("list_nested_decimal", 2, "nested", false, false, None) +} + #[test] fn v2_nested_nested_required() -> Result<()> { test_pyarrow_integration( From 63e99ad2828669134fe4ca6f8685c75f986a9732 Mon Sep 17 00:00:00 2001 From: Jk Xu <54522439+Dousir9@users.noreply.github.com> Date: Wed, 4 Oct 2023 09:39:32 +0800 Subject: [PATCH 69/80] Improve bitmap slice unchecked (#1574) --- src/bitmap/immutable.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/bitmap/immutable.rs b/src/bitmap/immutable.rs index 41799e0adb..c453a6f31a 100644 --- a/src/bitmap/immutable.rs +++ b/src/bitmap/immutable.rs @@ -172,11 +172,15 @@ impl Bitmap { /// The caller must ensure that `self.offset + offset + length <= self.len()` #[inline] pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) { - // first guard a no-op slice so that we don't do a bitcount - // if there isn't any data sliced - if !(offset == 0 && length == self.length) { - // count the smallest chunk - if length < self.length / 2 { + // we don't do a bitcount in the following cases: + // 1. if there isn't any data sliced. + // 2. if this [`Bitmap`] is all true or all false. + if !(offset == 0 && length == self.length || self.unset_bits == 0) { + // if `self.unset_bits == self.length` is false, we count the smallest chunk + // and do a bitcount. + if self.unset_bits == self.length { + self.unset_bits = length; + } else if length < self.length / 2 { // count the null values in the slice self.unset_bits = count_zeros(&self.bytes, self.offset + offset, length); } else { @@ -186,9 +190,9 @@ impl Bitmap { let tail_count = count_zeros(&self.bytes, start_end, self.length - length - offset); self.unset_bits -= head_count + tail_count; } - self.offset += offset; - self.length = length; } + self.offset += offset; + self.length = length; } /// Slices `self`, offsetting by `offset` and truncating up to `length` bits. 
From ced09386227974e178ad0deeb57c433229e640c3 Mon Sep 17 00:00:00 2001
From: Paul C
Date: Fri, 6 Oct 2023 19:22:05 -0500
Subject: [PATCH 70/80] fix: fix deserialization of parquets with large string list columns causing stack overflow (#1575)

---
 .../parquet/read/deserialize/binary/nested.rs | 34 +++++++++++--------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/src/io/parquet/read/deserialize/binary/nested.rs b/src/io/parquet/read/deserialize/binary/nested.rs
index 76d58f9c49..7aa4244163 100644
--- a/src/io/parquet/read/deserialize/binary/nested.rs
+++ b/src/io/parquet/read/deserialize/binary/nested.rs
@@ -170,22 +170,26 @@ impl Iterator for NestedIter {
     type Item = Result<(NestedState, Box<dyn Array>)>;
 
     fn next(&mut self) -> Option<Self::Item> {
-        let maybe_state = next(
-            &mut self.iter,
-            &mut self.items,
-            &mut self.dict,
-            &mut self.remaining,
-            &self.init,
-            self.chunk_size,
-            &BinaryDecoder::<O>::default(),
-        );
-        match maybe_state {
-            MaybeNext::Some(Ok((nested, decoded))) => {
-                Some(finish(&self.data_type, decoded.0, decoded.1).map(|array| (nested, array)))
+        loop {
+            let maybe_state = next(
+                &mut self.iter,
+                &mut self.items,
+                &mut self.dict,
+                &mut self.remaining,
+                &self.init,
+                self.chunk_size,
+                &BinaryDecoder::<O>::default(),
+            );
+            match maybe_state {
+                MaybeNext::Some(Ok((nested, decoded))) => {
+                    return Some(
+                        finish(&self.data_type, decoded.0, decoded.1).map(|array| (nested, array)),
+                    )
+                }
+                MaybeNext::Some(Err(e)) => return Some(Err(e)),
+                MaybeNext::None => return None,
+                MaybeNext::More => continue, // Using continue in a loop instead of calling next helps prevent stack overflow.
             }
-            MaybeNext::Some(Err(e)) => Some(Err(e)),
-            MaybeNext::None => None,
-            MaybeNext::More => self.next(),
         }
     }
 }

From 420936ed69205fe34d33babbd0ab04817e623649 Mon Sep 17 00:00:00 2001
From: Ryan Marcus
Date: Fri, 6 Oct 2023 20:24:18 -0400
Subject: [PATCH 71/80] Fixed typo (#1576)

---
 src/scalar/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/scalar/mod.rs b/src/scalar/mod.rs
index e3404e4eaa..aab5ed929f 100644
--- a/src/scalar/mod.rs
+++ b/src/scalar/mod.rs
@@ -31,7 +31,7 @@ mod union;
 pub use union::UnionScalar;
 
 /// Trait object declaring an optional value with a [`DataType`].
-/// This strait is often used in APIs that accept multiple scalar types.
+/// This trait is often used in APIs that accept multiple scalar types.
pub trait Scalar: std::fmt::Debug + Send + Sync + dyn_clone::DynClone + 'static { /// convert itself to fn as_any(&self) -> &dyn Any; From 710d6b3d76ebd968651fb9541815210147221091 Mon Sep 17 00:00:00 2001 From: Yijun Zhao Date: Sat, 7 Oct 2023 09:59:49 +0800 Subject: [PATCH 72/80] fix: fix nested decimal read and write (#1573) --- parquet_integration/write_parquet.py | 20 +- .../deserialize/fixed_size_binary/basic.rs | 8 +- .../deserialize/fixed_size_binary/nested.rs | 2 +- src/io/parquet/write/fixed_len_bytes.rs | 32 +++- src/io/parquet/write/mod.rs | 6 +- tests/it/io/parquet/mod.rs | 164 +++++++++++++++- tests/it/io/parquet/read.rs | 88 ++++++++- tests/it/io/parquet/write.rs | 176 +++++++++++++++++- 8 files changed, 467 insertions(+), 29 deletions(-) diff --git a/parquet_integration/write_parquet.py b/parquet_integration/write_parquet.py index 072b59c775..2e0e4b332b 100644 --- a/parquet_integration/write_parquet.py +++ b/parquet_integration/write_parquet.py @@ -234,8 +234,14 @@ def case_nested() -> Tuple[dict, pa.Schema, str]: pa.field("list_bool", pa.list_(pa.bool_())), pa.field("list_utf8", pa.list_(pa.utf8())), pa.field("list_large_binary", pa.list_(pa.large_binary())), - pa.field("list_decimal", pa.list_(pa.decimal128(9, 0))), - pa.field("list_decimal256", pa.list_(pa.decimal256(9, 0))), + pa.field("list_decimal_9", pa.list_(pa.decimal128(9, 0))), + pa.field("list_decimal_18", pa.list_(pa.decimal128(18, 0))), + pa.field("list_decimal_26", pa.list_(pa.decimal128(26, 0))), + pa.field("list_decimal256_9", pa.list_(pa.decimal256(9, 0))), + pa.field("list_decimal256_18", pa.list_(pa.decimal256(18, 0))), + pa.field("list_decimal256_26", pa.list_(pa.decimal256(26, 0))), + pa.field("list_decimal256_39", pa.list_(pa.decimal256(39, 0))), + pa.field("list_decimal256_76", pa.list_(pa.decimal256(76, 0))), pa.field("list_nested_i64", pa.list_(pa.list_(pa.int64()))), pa.field("list_nested_decimal", pa.list_(pa.list_(pa.decimal128(9, 0)))), pa.field("list_nested_inner_required_i64", pa.list_(pa.list_(pa.int64()))), @@ -266,8 +272,14 @@ def case_nested() -> Tuple[dict, pa.Schema, str]: "list_bool": boolean, "list_utf8": string, "list_large_binary": string, - "list_decimal": decimal_nullable, - "list_decimal256": decimal_nullable, + "list_decimal_9": decimal_nullable, + "list_decimal_18": decimal_nullable, + "list_decimal_26": decimal_nullable, + "list_decimal256_9": decimal_nullable, + "list_decimal256_18": decimal_nullable, + "list_decimal256_26": decimal_nullable, + "list_decimal256_39": decimal_nullable, + "list_decimal256_76": decimal_nullable, "list_nested_i64": items_nested, "list_nested_decimal": decimal_nested, "list_nested_inner_required_i64": items_required_nested, diff --git a/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs b/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs index c77ff5f027..913d1e6be4 100644 --- a/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs +++ b/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs @@ -46,11 +46,11 @@ pub(super) struct Required<'a> { } impl<'a> Required<'a> { - pub(super) fn new(page: &'a DataPage, size: usize) -> Self { - let values = page.buffer(); + pub(super) fn try_new(page: &'a DataPage, size: usize) -> Result { + let (_, _, values) = split_buffer(page)?; assert_eq!(values.len() % size, 0); let values = values.chunks_exact(size); - Self { values } + Ok(Self { values }) } #[inline] @@ -171,7 +171,7 @@ impl<'a> Decoder<'a> for BinaryDecoder { Ok(State::Optional(Optional::try_new(page, self.size)?)) } 
(Encoding::Plain, _, false, false) => { - Ok(State::Required(Required::new(page, self.size))) + Ok(State::Required(Required::try_new(page, self.size)?)) } (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false, false) => { RequiredDictionary::try_new(page, dict).map(State::RequiredDictionary) diff --git a/src/io/parquet/read/deserialize/fixed_size_binary/nested.rs b/src/io/parquet/read/deserialize/fixed_size_binary/nested.rs index 5cef9eabfc..19552447f9 100644 --- a/src/io/parquet/read/deserialize/fixed_size_binary/nested.rs +++ b/src/io/parquet/read/deserialize/fixed_size_binary/nested.rs @@ -60,7 +60,7 @@ impl<'a> NestedDecoder<'a> for BinaryDecoder { Ok(State::Optional(Optional::try_new(page, self.size)?)) } (Encoding::Plain, _, false, false) => { - Ok(State::Required(Required::new(page, self.size))) + Ok(State::Required(Required::try_new(page, self.size)?)) } (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false, false) => { RequiredDictionary::try_new(page, dict).map(State::RequiredDictionary) diff --git a/src/io/parquet/write/fixed_len_bytes.rs b/src/io/parquet/write/fixed_len_bytes.rs index 91b641da17..789b822a73 100644 --- a/src/io/parquet/write/fixed_len_bytes.rs +++ b/src/io/parquet/write/fixed_len_bytes.rs @@ -6,6 +6,7 @@ use parquet2::{ }; use super::{binary::ord_binary, utils, WriteOptions}; +use crate::io::parquet::write::{nested, Nested}; use crate::types::i256; use crate::{ array::{Array, FixedSizeBinaryArray, PrimitiveArray}, @@ -62,7 +63,36 @@ pub fn array_to_page( ) } -pub(super) fn build_statistics( +pub fn nested_array_to_page( + array: &FixedSizeBinaryArray, + options: WriteOptions, + type_: PrimitiveType, + statistics: Option, + nested: &[Nested], +) -> Result { + let is_optional = is_nullable(&type_.field_info); + + let mut buffer = vec![]; + let (repetition_levels_byte_length, definition_levels_byte_length) = + nested::write_rep_and_def(options.version, nested, &mut buffer)?; + + encode_plain(array, is_optional, &mut buffer); + + utils::build_plain_page( + buffer, + nested::num_values(nested), + nested[0].len(), + array.null_count(), + repetition_levels_byte_length, + definition_levels_byte_length, + statistics.map(|x| serialize_statistics(&x)), + type_, + options, + Encoding::Plain, + ) +} + +pub fn build_statistics( array: &FixedSizeBinaryArray, primitive_type: PrimitiveType, ) -> FixedLenStatistics { diff --git a/src/io/parquet/write/mod.rs b/src/io/parquet/write/mod.rs index 7889ea04fa..d4134f27df 100644 --- a/src/io/parquet/write/mod.rs +++ b/src/io/parquet/write/mod.rs @@ -727,7 +727,7 @@ fn array_to_page_nested( values.into(), array.validity().cloned(), ); - fixed_len_bytes::array_to_page(&array, options, type_, statistics) + fixed_len_bytes::nested_array_to_page(&array, options, type_, statistics, nested) } } Decimal256(precision, _) => { @@ -782,7 +782,7 @@ fn array_to_page_nested( values.into(), array.validity().cloned(), ); - fixed_len_bytes::array_to_page(&array, options, type_, statistics) + fixed_len_bytes::nested_array_to_page(&array, options, type_, statistics, nested) } else { let size = 32; let array = array @@ -807,7 +807,7 @@ fn array_to_page_nested( array.validity().cloned(), ); - fixed_len_bytes::array_to_page(&array, options, type_, statistics) + fixed_len_bytes::nested_array_to_page(&array, options, type_, statistics, nested) } } other => Err(Error::NotYetImplemented(format!( diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs index 94d6cdf77e..4803cc9c52 100644 --- 
a/tests/it/io/parquet/mod.rs +++ b/tests/it/io/parquet/mod.rs @@ -240,14 +240,28 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { Some(b"bbb".to_vec()), Some(b"".to_vec()), ])), - "list_decimal" => { + "list_decimal_9" => { let values = i64_values .iter() .map(|x| x.map(|x| x as i128)) .collect::>(); Box::new(PrimitiveArray::::from(values).to(DataType::Decimal(9, 0))) } - "list_decimal256" => { + "list_decimal_18" => { + let values = i64_values + .iter() + .map(|x| x.map(|x| x as i128)) + .collect::>(); + Box::new(PrimitiveArray::::from(values).to(DataType::Decimal(18, 0))) + } + "list_decimal_26" => { + let values = i64_values + .iter() + .map(|x| x.map(|x| x as i128)) + .collect::>(); + Box::new(PrimitiveArray::::from(values).to(DataType::Decimal(26, 0))) + } + "list_decimal256_9" => { let values = i64_values .iter() .map(|x| x.map(|x| i256(x.as_i256()))) @@ -255,6 +269,38 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { let array = PrimitiveArray::::from(values).to(DataType::Decimal256(9, 0)); Box::new(array) } + "list_decimal256_18" => { + let values = i64_values + .iter() + .map(|x| x.map(|x| i256(x.as_i256()))) + .collect::>(); + let array = PrimitiveArray::::from(values).to(DataType::Decimal256(18, 0)); + Box::new(array) + } + "list_decimal256_26" => { + let values = i64_values + .iter() + .map(|x| x.map(|x| i256(x.as_i256()))) + .collect::>(); + let array = PrimitiveArray::::from(values).to(DataType::Decimal256(26, 0)); + Box::new(array) + } + "list_decimal256_39" => { + let values = i64_values + .iter() + .map(|x| x.map(|x| i256(x.as_i256()))) + .collect::>(); + let array = PrimitiveArray::::from(values).to(DataType::Decimal256(39, 0)); + Box::new(array) + } + "list_decimal256_76" => { + let values = i64_values + .iter() + .map(|x| x.map(|x| i256(x.as_i256()))) + .collect::>(); + let array = PrimitiveArray::::from(values).to(DataType::Decimal256(76, 0)); + Box::new(array) + } "list_nested_i64" | "list_nested_decimal" | "list_nested_inner_required_i64" @@ -479,8 +525,14 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { "list_bool" => Field::new("item", DataType::Boolean, true), "list_utf8" => Field::new("item", DataType::Utf8, true), "list_large_binary" => Field::new("item", DataType::LargeBinary, true), - "list_decimal" => Field::new("item", DataType::Decimal(9, 0), true), - "list_decimal256" => Field::new("item", DataType::Decimal256(9, 0), true), + "list_decimal_9" => Field::new("item", DataType::Decimal(9, 0), true), + "list_decimal_18" => Field::new("item", DataType::Decimal(18, 0), true), + "list_decimal_26" => Field::new("item", DataType::Decimal(26, 0), true), + "list_decimal256_9" => Field::new("item", DataType::Decimal256(9, 0), true), + "list_decimal256_18" => Field::new("item", DataType::Decimal256(18, 0), true), + "list_decimal256_26" => Field::new("item", DataType::Decimal256(26, 0), true), + "list_decimal256_39" => Field::new("item", DataType::Decimal256(39, 0), true), + "list_decimal256_76" => Field::new("item", DataType::Decimal256(76, 0), true), "list_struct_nullable" => Field::new("item", values.data_type().clone(), true), "list_struct_list_nullable" => Field::new("item", values.data_type().clone(), true), other => unreachable!("{}", other), @@ -927,7 +979,7 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { min_value: new_list(Box::new(BinaryArray::::from_slice([b""])), true).boxed(), max_value: new_list(Box::new(BinaryArray::::from_slice([b"ccc"])), true).boxed(), }, - "list_decimal" => Statistics { + 
"list_decimal_9" => Statistics { distinct_count: new_list(UInt64Array::from([None]).boxed(), true).boxed(), null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), min_value: new_list( @@ -941,7 +993,35 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { ) .boxed(), }, - "list_decimal256" => Statistics { + "list_decimal_18" => Statistics { + distinct_count: new_list(UInt64Array::from([None]).boxed(), true).boxed(), + null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), + min_value: new_list( + Box::new(Int128Array::from_slice([0]).to(DataType::Decimal(18, 0))), + true, + ) + .boxed(), + max_value: new_list( + Box::new(Int128Array::from_slice([10]).to(DataType::Decimal(18, 0))), + true, + ) + .boxed(), + }, + "list_decimal_26" => Statistics { + distinct_count: new_list(UInt64Array::from([None]).boxed(), true).boxed(), + null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), + min_value: new_list( + Box::new(Int128Array::from_slice([0]).to(DataType::Decimal(26, 0))), + true, + ) + .boxed(), + max_value: new_list( + Box::new(Int128Array::from_slice([10]).to(DataType::Decimal(26, 0))), + true, + ) + .boxed(), + }, + "list_decimal256_9" => Statistics { distinct_count: new_list(UInt64Array::from([None]).boxed(), true).boxed(), null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), min_value: new_list( @@ -959,6 +1039,78 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { ) .boxed(), }, + "list_decimal256_18" => Statistics { + distinct_count: new_list(UInt64Array::from([None]).boxed(), true).boxed(), + null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), + min_value: new_list( + Box::new( + Int256Array::from_slice([i256(0.as_i256())]).to(DataType::Decimal256(18, 0)), + ), + true, + ) + .boxed(), + max_value: new_list( + Box::new( + Int256Array::from_slice([i256(10.as_i256())]).to(DataType::Decimal256(18, 0)), + ), + true, + ) + .boxed(), + }, + "list_decimal256_26" => Statistics { + distinct_count: new_list(UInt64Array::from([None]).boxed(), true).boxed(), + null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), + min_value: new_list( + Box::new( + Int256Array::from_slice([i256(0.as_i256())]).to(DataType::Decimal256(26, 0)), + ), + true, + ) + .boxed(), + max_value: new_list( + Box::new( + Int256Array::from_slice([i256(10.as_i256())]).to(DataType::Decimal256(26, 0)), + ), + true, + ) + .boxed(), + }, + "list_decimal256_39" => Statistics { + distinct_count: new_list(UInt64Array::from([None]).boxed(), true).boxed(), + null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), + min_value: new_list( + Box::new( + Int256Array::from_slice([i256(0.as_i256())]).to(DataType::Decimal256(39, 0)), + ), + true, + ) + .boxed(), + max_value: new_list( + Box::new( + Int256Array::from_slice([i256(10.as_i256())]).to(DataType::Decimal256(39, 0)), + ), + true, + ) + .boxed(), + }, + "list_decimal256_76" => Statistics { + distinct_count: new_list(UInt64Array::from([None]).boxed(), true).boxed(), + null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), + min_value: new_list( + Box::new( + Int256Array::from_slice([i256(0.as_i256())]).to(DataType::Decimal256(76, 0)), + ), + true, + ) + .boxed(), + max_value: new_list( + Box::new( + Int256Array::from_slice([i256(10.as_i256())]).to(DataType::Decimal256(76, 0)), + ), + true, + ) + .boxed(), + }, "list_int64" => Statistics { distinct_count: 
new_list(UInt64Array::from([None]).boxed(), true).boxed(), null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), diff --git a/tests/it/io/parquet/read.rs b/tests/it/io/parquet/read.rs index 7689f1532f..12512116f4 100644 --- a/tests/it/io/parquet/read.rs +++ b/tests/it/io/parquet/read.rs @@ -62,8 +62,14 @@ fn test_pyarrow_integration( "list_nested_i64", "list_utf8", "list_bool", - "list_decimal", - "list_decimal256", + "list_decimal_9", + "list_decimal_18", + "list_decimal_26", + "list_decimal256_9", + "list_decimal256_18", + "list_decimal256_26", + "list_decimal256_39", + "list_decimal256_76", "list_nested_inner_required_required_i64", "list_nested_inner_required_i64", // pyarrow counts null struct items as nulls @@ -325,13 +331,83 @@ fn v1_nested_large_binary() -> Result<()> { } #[test] -fn v2_nested_decimal_nullable() -> Result<()> { - test_pyarrow_integration("list_decimal", 2, "nested", false, false, None) +fn v1_nested_decimal_9_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal_9", 1, "nested", false, false, None) } #[test] -fn v2_nested_decimal256_nullable() -> Result<()> { - test_pyarrow_integration("list_decimal256", 2, "nested", false, false, None) +fn v1_nested_decimal_18_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal_18", 1, "nested", false, false, None) +} + +#[test] +fn v1_nested_decimal_26_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal_26", 1, "nested", false, false, None) +} + +#[test] +fn v2_nested_decimal_9_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal_9", 2, "nested", false, false, None) +} + +#[test] +fn v2_nested_decimal_18_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal_18", 2, "nested", false, false, None) +} + +#[test] +fn v2_nested_decimal_26_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal_26", 2, "nested", false, false, None) +} + +#[test] +fn v1_nested_decimal256_9_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal256_9", 1, "nested", false, false, None) +} + +#[test] +fn v1_nested_decimal256_18_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal256_18", 1, "nested", false, false, None) +} + +#[test] +fn v1_nested_decimal256_26_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal256_26", 1, "nested", false, false, None) +} + +#[test] +fn v1_nested_decimal256_39_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal256_39", 1, "nested", false, false, None) +} + +#[test] +fn v1_nested_decimal256_76_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal256_76", 1, "nested", false, false, None) +} + +#[test] +fn v2_nested_decimal256_9_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal256_9", 2, "nested", false, false, None) +} + +#[test] +fn v2_nested_decimal256_18_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal256_18", 2, "nested", false, false, None) +} + +#[test] +fn v2_nested_decimal256_26_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal256_26", 2, "nested", false, false, None) +} + +#[test] +fn v2_nested_decimal256_39_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal256_39", 2, "nested", false, false, None) +} + +#[test] +fn v2_nested_decimal256_76_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal256_76", 2, "nested", false, false, None) } #[test] diff --git a/tests/it/io/parquet/write.rs b/tests/it/io/parquet/write.rs index 5fda011374..dee5b8e253 100644 --- 
a/tests/it/io/parquet/write.rs +++ b/tests/it/io/parquet/write.rs @@ -405,9 +405,9 @@ fn list_struct_nullable() -> Result<()> { } #[test] -fn list_decimal_nullable() -> Result<()> { +fn list_decimal_9_nullable_v1() -> Result<()> { round_trip_opt_stats( - "list_decimal", + "list_decimal_9", "nested", Version::V1, CompressionOptions::Uncompressed, @@ -417,9 +417,9 @@ fn list_decimal_nullable() -> Result<()> { } #[test] -fn list_decimal256_nullable() -> Result<()> { +fn list_decimal_18_nullable_v1() -> Result<()> { round_trip_opt_stats( - "list_decimal256", + "list_decimal_18", "nested", Version::V1, CompressionOptions::Uncompressed, @@ -428,6 +428,174 @@ fn list_decimal256_nullable() -> Result<()> { ) } +#[test] +fn list_decimal_26_nullable_v1() -> Result<()> { + round_trip_opt_stats( + "list_decimal_26", + "nested", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal_9_nullable_v2() -> Result<()> { + round_trip_opt_stats( + "list_decimal_9", + "nested", + Version::V2, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal_18_nullable_v2() -> Result<()> { + round_trip_opt_stats( + "list_decimal_18", + "nested", + Version::V2, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal_26_nullable_v2() -> Result<()> { + round_trip_opt_stats( + "list_decimal_26", + "nested", + Version::V2, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal256_9_nullable_v1() -> Result<()> { + round_trip_opt_stats( + "list_decimal256_9", + "nested", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal256_18_nullable_v1() -> Result<()> { + round_trip_opt_stats( + "list_decimal256_18", + "nested", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal256_26_nullable_v1() -> Result<()> { + round_trip_opt_stats( + "list_decimal256_26", + "nested", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal256_39_nullable_v1() -> Result<()> { + round_trip_opt_stats( + "list_decimal256_39", + "nested", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal256_76_nullable_v1() -> Result<()> { + round_trip_opt_stats( + "list_decimal256_76", + "nested", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal256_9_nullable_v2() -> Result<()> { + round_trip_opt_stats( + "list_decimal256_9", + "nested", + Version::V2, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal256_18_nullable_v2() -> Result<()> { + round_trip_opt_stats( + "list_decimal256_18", + "nested", + Version::V2, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal256_26_nullable_v2() -> Result<()> { + round_trip_opt_stats( + "list_decimal256_26", + "nested", + Version::V2, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal256_39_nullable_v2() -> Result<()> { + round_trip_opt_stats( + "list_decimal256_39", + "nested", + Version::V2, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal256_76_nullable_v2() -> Result<()> { + 
round_trip_opt_stats( + "list_decimal256_76", + "nested", + Version::V2, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + #[test] fn v1_nested_struct_list_nullable() -> Result<()> { round_trip_opt_stats( From dd80c891850213104c8c0d11b76b56401cb1ce52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Sat, 7 Oct 2023 04:17:22 +0200 Subject: [PATCH 73/80] Fix the inferred nullability when converting a nested parquet schema to arrow (#1565) --- src/io/parquet/read/schema/convert.rs | 74 +++++++++++++++++++++------ 1 file changed, 58 insertions(+), 16 deletions(-) diff --git a/src/io/parquet/read/schema/convert.rs b/src/io/parquet/read/schema/convert.rs index dea14ca86b..1ec43acd4e 100644 --- a/src/io/parquet/read/schema/convert.rs +++ b/src/io/parquet/read/schema/convert.rs @@ -299,7 +299,7 @@ fn to_group_type( pub(crate) fn is_nullable(field_info: &FieldInfo) -> bool { match field_info.repetition { Repetition::Optional => true, - Repetition::Repeated => true, + Repetition::Repeated => false, Repetition::Required => false, } } @@ -353,12 +353,12 @@ fn to_list( let field = fields.first().unwrap(); ( &field.get_field_info().name, - field.get_field_info().repetition != Repetition::Required, + field.get_field_info().repetition == Repetition::Optional, ) } _ => ( &item.get_field_info().name, - item.get_field_info().repetition != Repetition::Required, + item.get_field_info().repetition == Repetition::Optional, ), }; @@ -611,7 +611,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("element", DataType::Utf8, true))), + DataType::List(Box::new(Field::new("element", DataType::Utf8, false))), true, )); } @@ -623,7 +623,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("element", DataType::Int32, true))), + DataType::List(Box::new(Field::new("element", DataType::Int32, false))), true, )); } @@ -642,7 +642,7 @@ mod tests { ]); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("element", arrow_struct, true))), + DataType::List(Box::new(Field::new("element", arrow_struct, false))), true, )); } @@ -658,7 +658,7 @@ mod tests { let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("array", arrow_struct, true))), + DataType::List(Box::new(Field::new("array", arrow_struct, false))), true, )); } @@ -674,7 +674,7 @@ mod tests { let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("my_list_tuple", arrow_struct, true))), + DataType::List(Box::new(Field::new("my_list_tuple", arrow_struct, false))), true, )); } @@ -684,8 +684,50 @@ mod tests { { arrow_fields.push(Field::new( "name", - DataType::List(Box::new(Field::new("name", DataType::Int32, true))), - true, + DataType::List(Box::new(Field::new("name", DataType::Int32, false))), + false, + )); + } + + let parquet_schema = SchemaDescriptor::try_from_message(message_type)?; + let fields = parquet_to_arrow_schema(parquet_schema.fields()); + + assert_eq!(arrow_fields, fields); + Ok(()) + } + + #[test] + fn test_parquet_list_with_struct() -> Result<()> { + let mut arrow_fields = Vec::new(); + + let message_type = " + message eventlog { + REQUIRED group events (LIST) { + REPEATED group array { + REQUIRED BYTE_ARRAY event_name (STRING); + REQUIRED INT64 event_time (TIMESTAMP(MILLIS,true)); + 
            }
+        }
+    }
+    ";
+
+    {
+        let struct_fields = vec![
+            Field::new("event_name", DataType::Utf8, false),
+            Field::new(
+                "event_time",
+                DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
+                false,
+            ),
+        ];
+        arrow_fields.push(Field::new(
+            "events",
+            DataType::List(Box::new(Field::new(
+                "array",
+                DataType::Struct(struct_fields),
+                false,
+            ))),
+            false,
        ));
    }

    let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;
    let fields = parquet_to_arrow_schema(parquet_schema.fields());

    assert_eq!(arrow_fields, fields);
    Ok(())
}
@@ -812,9 +854,9 @@ mod tests {
            DataType::List(Box::new(Field::new(
                "innerGroup",
                DataType::Struct(vec![Field::new("leaf3", DataType::Int32, true)]),
-                true,
+                false,
            ))),
-            true,
+            false,
        );

        let outer_group_list = Field::new(
            "my_list_outer",
            DataType::List(Box::new(Field::new(
                "my_list_nested",
                DataType::Struct(vec![
                    Field::new("leaf2", DataType::Int32, true),
                    inner_group_list,
                ]),
-                true,
+                false,
            ))),
-            true,
+            false,
        );
        arrow_fields.push(outer_group_list);
    }
@@ -888,8 +930,8 @@ mod tests {
            Field::new("string", DataType::Utf8, true),
            Field::new(
                "bools",
-                DataType::List(Box::new(Field::new("bools", DataType::Boolean, true))),
-                true,
+                DataType::List(Box::new(Field::new("bools", DataType::Boolean, false))),
+                false,
            ),
            Field::new("date", DataType::Date32, true),
            Field::new("time_milli", DataType::Time32(TimeUnit::Millisecond), true),

From 6a4b53169a48cbd234cecde6ab6a98f84146fca2 Mon Sep 17 00:00:00 2001
From: Jk Xu <54522439+Dousir9@users.noreply.github.com>
Date: Fri, 13 Oct 2023 21:40:29 +0800
Subject: [PATCH 74/80] add new_constant for Bitmap (#1579)

---
 src/bitmap/immutable.rs | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/src/bitmap/immutable.rs b/src/bitmap/immutable.rs
index c453a6f31a..57502e911e 100644
--- a/src/bitmap/immutable.rs
+++ b/src/bitmap/immutable.rs
@@ -283,6 +283,15 @@ impl Bitmap {
         }
     }
 
+    /// Initializes a new [`Bitmap`] filled with set/unset values.
+    #[inline]
+    pub fn new_constant(value: bool, length: usize) -> Self {
+        match value {
+            true => Self::new_trued(length),
+            false => Self::new_zeroed(length),
+        }
+    }
+
     /// Initializes a new [`Bitmap`] filled with unset values.
     #[inline]
     pub fn new_zeroed(length: usize) -> Self {
@@ -292,6 +301,15 @@ impl Bitmap {
         unsafe { Bitmap::from_inner_unchecked(Arc::new(bytes.into()), 0, length, length) }
     }
 
+    /// Initializes a new [`Bitmap`] filled with set values.
+    #[inline]
+    pub fn new_trued(length: usize) -> Self {
+        // just set each byte to u8::MAX
+        // we will not access data with index >= length
+        let bytes = vec![0b11111111u8; length.saturating_add(7) / 8];
+        unsafe { Bitmap::from_inner_unchecked(Arc::new(bytes.into()), 0, length, length) }
+    }
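+
+    // e.g. `Bitmap::new_constant(true, 9)` yields the same bits as `Bitmap::from([true; 9])`,
+    // but allocates its (9 + 7) / 8 = 2 backing bytes in one shot instead of pushing bit-by-bit
+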
     /// Counts the nulls (unset bits) starting from `offset` bits and for `length` bits.
     #[inline]
     pub fn null_count_range(&self, offset: usize, length: usize) -> usize {

From 3c61372d7ac76bba149316b1fbad6e981752e502 Mon Sep 17 00:00:00 2001
From: Jk Xu <54522439+Dousir9@users.noreply.github.com>
Date: Wed, 18 Oct 2023 15:26:47 +0800
Subject: [PATCH 75/80] fix bitmap new_trued (#1580)

---
 src/bitmap/immutable.rs      |  2 +-
 tests/it/bitmap/immutable.rs | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/bitmap/immutable.rs b/src/bitmap/immutable.rs
index 57502e911e..6883d3312f 100644
--- a/src/bitmap/immutable.rs
+++ b/src/bitmap/immutable.rs
@@ -307,7 +307,7 @@ impl Bitmap {
         // just set each byte to u8::MAX
         // we will not access data with index >= length
         let bytes = vec![0b11111111u8; length.saturating_add(7) / 8];
-        unsafe { Bitmap::from_inner_unchecked(Arc::new(bytes.into()), 0, length, length) }
+        unsafe { Bitmap::from_inner_unchecked(Arc::new(bytes.into()), 0, length, 0) }
     }
 
     /// Counts the nulls (unset bits) starting from `offset` bits and for `length` bits.
diff --git a/tests/it/bitmap/immutable.rs b/tests/it/bitmap/immutable.rs
index 5e6157413e..cc003009e0 100644
--- a/tests/it/bitmap/immutable.rs
+++ b/tests/it/bitmap/immutable.rs
@@ -32,6 +32,25 @@ fn as_slice_offset_middle() {
     assert_eq!(length, 5);
 }
 
+#[test]
+fn new_constant() {
+    let b = Bitmap::new_constant(true, 9);
+    let (slice, offset, length) = b.as_slice();
+    assert_eq!(slice[0], 0b11111111);
+    assert!((slice[1] & 0b00000001) > 0);
+    assert_eq!(offset, 0);
+    assert_eq!(length, 9);
+    assert_eq!(b.unset_bits(), 0);
+
+    let b = Bitmap::new_constant(false, 9);
+    let (slice, offset, length) = b.as_slice();
+    assert_eq!(slice[0], 0b00000000);
+    assert!((slice[1] & 0b00000001) == 0);
+    assert_eq!(offset, 0);
+    assert_eq!(length, 9);
+    assert_eq!(b.unset_bits(), 9);
+}
+
 #[test]
 fn debug() {
     let b = Bitmap::from([true, true, false, true, true, true, true, true, true]);

From 9a26422d00b83c65245f75e02eb436dedd91b5b8 Mon Sep 17 00:00:00 2001
From: sundyli <543950155@qq.com>
Date: Thu, 19 Oct 2023 17:19:15 -0700
Subject: [PATCH 76/80] chore: add max bytes_estimate to reserve the capacity
 of binary (#1581)

---
 src/io/parquet/read/deserialize/binary/utils.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/io/parquet/read/deserialize/binary/utils.rs b/src/io/parquet/read/deserialize/binary/utils.rs
index ec514766fa..a48063c56e 100644
--- a/src/io/parquet/read/deserialize/binary/utils.rs
+++ b/src/io/parquet/read/deserialize/binary/utils.rs
@@ -48,7 +48,8 @@ impl<O: Offset> Binary<O> {
         if self.offsets.len_proxy() == 100 && self.offsets.capacity() > 100 {
             let bytes_per_row = self.values.len() / 100 + 1;
             let bytes_estimate = bytes_per_row * self.offsets.capacity();
-            if bytes_estimate > self.values.capacity() {
+
+            if bytes_estimate > self.values.capacity() && bytes_estimate < 10 * 1024 * 1024 {
                 self.values.reserve(bytes_estimate - self.values.capacity());
             }
         }

From 346c866c4dbfd9d9517148fd6d18dd2f17b730d1 Mon Sep 17 00:00:00 2001
From: Ryan Marcus
Date: Sat, 21 Oct 2023 20:09:03 -0400
Subject: [PATCH 77/80] Add a "contains" fast-path to `like_utf8_scalar` (#1582)

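The fast-path only kicks in for patterns of the form `%needle%` where the inner
`needle` contains no LIKE metacharacters and the trailing `%` is not escaped;
everything else still goes through the regex path. A condensed sketch of the
predicate and searcher (with `is_like_pattern`, which matches `%`/`_`, inlined
here for illustration):

    let rhs = "%abba%";
    let inner = &rhs[1..rhs.len() - 1];
    if rhs.starts_with('%')
        && rhs.ends_with('%')
        && !rhs.ends_with("\\%")
        && !inner.contains(|c: char| c == '%' || c == '_')
    {
        // one pre-built searcher, reused for every row of the column
        let finder = memchr::memmem::Finder::new(inner);
        assert!(finder.find(b"xxabbayy").is_some());
        assert!(finder.find(b"boat").is_none());
    }
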
---
 Cargo.toml               | 10 +++++++++-
 benches/like_kernels.rs  | 22 ++++++++++++++++++++++
 src/compute/like.rs      | 11 +++++++++++
 tests/it/compute/like.rs |  4 ++++
 4 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 benches/like_kernels.rs

diff --git a/Cargo.toml b/Cargo.toml
index 1bb20a6955..50dcea2e51 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -100,6 +100,9 @@ odbc-api = { version = "0.36", optional = true }
 # Faster hashing
 ahash = "0.8"
+# For `LIKE` matching "contains" fast-path
+memchr = { version = "2.6", optional = true }
+
 # Support conversion to/from arrow-rs
 arrow-buffer = { version = ">=40", optional = true }
 arrow-schema = { version = ">=40", optional = true }
@@ -237,7 +240,7 @@ compute_filter = []
 compute_hash = ["multiversion"]
 compute_if_then_else = []
 compute_length = []
-compute_like = ["regex", "regex-syntax"]
+compute_like = ["regex", "regex-syntax", "dep:memchr"]
 compute_limit = []
 compute_merge_sort = ["itertools", "compute_sort"]
 compute_nullif = ["compute_comparison"]
@@ -394,3 +397,8 @@ harness = false
 [[bench]]
 name = "assign_ops"
 harness = false
+
+[[bench]]
+name = "like_kernels"
+harness = false
+
diff --git a/benches/like_kernels.rs b/benches/like_kernels.rs
new file mode 100644
index 0000000000..24f700244c
--- /dev/null
+++ b/benches/like_kernels.rs
@@ -0,0 +1,22 @@
+use arrow2::util::bench_util::create_string_array;
+use criterion::{criterion_group, criterion_main, Criterion};
+
+use arrow2::array::*;
+use arrow2::compute::like::like_utf8_scalar;
+
+fn bench_like(array: &Utf8Array<i32>, pattern: &str) {
+    criterion::black_box(like_utf8_scalar(array, pattern).unwrap());
+}
+
+fn add_benchmark(c: &mut Criterion) {
+    for size_log2 in 16..21_u32 {
+        let size = 2_usize.pow(size_log2);
+        let array = create_string_array::<i32>(100, size, 0.0, 0);
+        c.bench_function(&format!("LIKE length = 2^{}", size_log2), |b| {
+            b.iter(|| bench_like(&array, "%abba%"))
+        });
+    }
+}
+
+criterion_group!(benches, add_benchmark);
+criterion_main!(benches);
diff --git a/src/compute/like.rs b/src/compute/like.rs
index 98c1ea92f2..d52e9c5e9f 100644
--- a/src/compute/like.rs
+++ b/src/compute/like.rs
@@ -152,6 +152,17 @@ fn a_like_utf8_scalar<O: Offset, F: Fn(bool) -> bool>(
         // fast path, can use ends_with
         let ends_with = &rhs[1..];
         Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x.ends_with(ends_with))))
+    } else if rhs.starts_with('%')
+        && rhs.ends_with('%')
+        && !rhs.ends_with("\\%")
+        && !rhs[1..rhs.len() - 1].contains(is_like_pattern)
+    {
+        let needle = &rhs[1..rhs.len() - 1];
+        let finder = memchr::memmem::Finder::new(needle);
+        Bitmap::from_trusted_len_iter(
+            lhs.values_iter()
+                .map(|x| op(finder.find(x.as_bytes()).is_some())),
+        )
     } else {
         let re_pattern = replace_pattern(rhs);
         let re = Regex::new(&format!("^{re_pattern}$")).map_err(|e| {
diff --git a/tests/it/compute/like.rs b/tests/it/compute/like.rs
index c7026be7ca..8b99beb081 100644
--- a/tests/it/compute/like.rs
+++ b/tests/it/compute/like.rs
@@ -58,6 +58,10 @@ fn test_like_utf8_scalar() -> Result<()> {
     let result = like_utf8_scalar(&array, "A\\_row").unwrap();
     assert_eq!(result, BooleanArray::from_slice([true, false]));
 
+    let array = Utf8Array::<i32>::from_slice(["Arrow", "Arrow", "row your", "boat"]);
+    let result = like_utf8_scalar(&array, "%row%").unwrap();
+    assert_eq!(result, BooleanArray::from_slice([true, true, true, false]));
+
     Ok(())
 }

From 45313f7e1af6e164a7fd45940db2611d81ddeb1d Mon Sep 17 00:00:00 2001
From: sundyli <543950155@qq.com>
Date: Tue, 24 Oct 2023 00:49:14 -0700
Subject: [PATCH 78/80] bump chrono to 0.4.31 (#1584)

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index 50dcea2e51..5deab656f5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,7 +20,7 @@ either = "1.9"
 num-traits = "0.2"
 dyn-clone = "1"
 bytemuck = { version = "1", features = ["derive"] }
-chrono = { version = "0.4", default_features = false, features = ["std"] }
+chrono = { version = "0.4.31", default_features = false, features = ["std"] }
 
 # for decimal i256
 ethnum = "1"

From b0734542c2fef5d2d0c7b6ffce5d094de371168a Mon Sep 17 00:00:00 2001
From: baishen
Date: Tue, 24 Oct 2023 15:54:58 +0800
Subject: [PATCH 79/80] feat: Add `nested_column_iter_to_arrays` to deserialize
 inner columns (#1583)

---
 src/io/parquet/read/deserialize/mod.rs | 19 ++++++
 src/io/parquet/read/mod.rs             |  2 +-
 tests/it/io/parquet/deserialize.rs     | 85 ++++++++++++++++++++++++++
 tests/it/io/parquet/mod.rs             |  1 +
 4 files changed, 106 insertions(+), 1 deletion(-)
 create mode 100644 tests/it/io/parquet/deserialize.rs

diff --git a/src/io/parquet/read/deserialize/mod.rs b/src/io/parquet/read/deserialize/mod.rs
index 8dd55bb877..1079e577a8 100644
--- a/src/io/parquet/read/deserialize/mod.rs
+++ b/src/io/parquet/read/deserialize/mod.rs
@@ -214,3 +214,22 @@ where
         .map(|x| x.map(|x| x.1)),
     ))
 }
+
+/// Basically the same as `column_iter_to_arrays`, with the addition of the `init` parameter
+/// to read the inner columns of the nested type directly, instead of reading the entire nested type.
+pub fn nested_column_iter_to_arrays<'a, I: 'a>(
+    columns: Vec<I>,
+    types: Vec<&PrimitiveType>,
+    field: Field,
+    init: Vec<InitNested>,
+    chunk_size: Option<usize>,
+    num_rows: usize,
+) -> Result<ArrayIter<'a>>
+where
+    I: Pages,
+{
+    Ok(Box::new(
+        nested::columns_to_iter_recursive(columns, types, field, init, num_rows, chunk_size)?
+            .map(|x| x.map(|x| x.1)),
+    ))
+}
diff --git a/src/io/parquet/read/mod.rs b/src/io/parquet/read/mod.rs
index baaffd6d44..ea2b2f46d4 100644
--- a/src/io/parquet/read/mod.rs
+++ b/src/io/parquet/read/mod.rs
@@ -37,7 +37,7 @@ use crate::{array::Array, error::Result};
 use crate::types::{i256, NativeType};
 pub use deserialize::{
     column_iter_to_arrays, create_list, create_map, get_page_iterator, init_nested, n_columns,
-    InitNested, NestedArrayIter, NestedState, StructIterator,
+    nested_column_iter_to_arrays, InitNested, NestedArrayIter, NestedState, StructIterator,
 };
 pub use file::{FileReader, RowGroupReader};
 pub use row_group::*;
diff --git a/tests/it/io/parquet/deserialize.rs b/tests/it/io/parquet/deserialize.rs
new file mode 100644
index 0000000000..3ea1c2846e
--- /dev/null
+++ b/tests/it/io/parquet/deserialize.rs
@@ -0,0 +1,85 @@
+use std::fs::File;
+
+use arrow2::{
+    array::StructArray,
+    datatypes::DataType,
+    error::Result,
+    io::parquet::read::{
+        infer_schema, n_columns, nested_column_iter_to_arrays, read_columns, read_metadata,
+        to_deserializer, BasicDecompressor, InitNested, PageReader,
+    },
+};
+
+#[test]
+fn test_deserialize_nested_column() -> Result<()> {
+    let path = "testing/parquet-testing/data/nested_structs.rust.parquet";
+    let mut reader = File::open(path).unwrap();
+
+    let metadata = read_metadata(&mut reader)?;
+    let schema = infer_schema(&metadata)?;
+
+    let num_rows = metadata.num_rows;
+    let row_group = metadata.row_groups[0].clone();
+
+    let field_columns = schema
+        .fields
+        .iter()
+        .map(|field| read_columns(&mut reader, row_group.columns(), &field.name))
+        .collect::<Result<Vec<_>>>()?;
+
+    let fields = schema.fields.clone();
+    for (mut columns, field) in field_columns.into_iter().zip(fields.iter()) {
+        if let DataType::Struct(inner_fields) = &field.data_type {
+            let mut array_iter =
+                to_deserializer(columns.clone(), field.clone(), num_rows, None, None)?;
+            let array = array_iter.next().transpose()?.unwrap();
+            let expected_array = array
+                .as_any()
+                .downcast_ref::<StructArray>()
+                .unwrap()
+                .clone();
+
+            // deserialize inner values of struct fields.
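+            // each inner field owns `n_columns(...)` parquet leaf columns: drain them off the
+            // front of `columns`, wrap each chunk in a `PageReader` + `BasicDecompressor`, and
+            // hand the pairs to `nested_column_iter_to_arrays` with the parent struct as `init`.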
+            let init = vec![InitNested::Struct(field.is_nullable)];
+            let mut values = Vec::with_capacity(inner_fields.len());
+            for inner_field in inner_fields {
+                let n = n_columns(&inner_field.data_type);
+                let inner_columns: Vec<_> = columns.drain(0..n).collect();
+
+                let (nested_columns, types): (Vec<_>, Vec<_>) = inner_columns
+                    .into_iter()
+                    .map(|(column_meta, chunk)| {
+                        let len = chunk.len();
+                        let pages = PageReader::new(
+                            std::io::Cursor::new(chunk),
+                            column_meta,
+                            std::sync::Arc::new(|_, _| true),
+                            vec![],
+                            len * 2 + 1024,
+                        );
+                        (
+                            BasicDecompressor::new(pages, vec![]),
+                            &column_meta.descriptor().descriptor.primitive_type,
+                        )
+                    })
+                    .unzip();
+
+                let mut inner_array_iter = nested_column_iter_to_arrays(
+                    nested_columns,
+                    types,
+                    inner_field.clone(),
+                    init.clone(),
+                    None,
+                    num_rows,
+                )?;
+                let inner_array = inner_array_iter.next().transpose()?;
+                values.push(inner_array.unwrap());
+            }
+            let struct_array = StructArray::try_new(field.data_type.clone(), values, None)?;
+
+            assert_eq!(expected_array, struct_array);
+        }
+    }
+
+    Ok(())
+}
diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs
index 4803cc9c52..1b38c61c99 100644
--- a/tests/it/io/parquet/mod.rs
+++ b/tests/it/io/parquet/mod.rs
@@ -14,6 +14,7 @@ use arrow2::{
     types::{days_ms, NativeType},
 };
 
+mod deserialize;
 #[cfg(feature = "io_json_integration")]
 mod integration;
 mod read;

From 3ddc6a10c6fbc2d0f85a9f66eeb46112abd07029 Mon Sep 17 00:00:00 2001
From: Ben Levin
Date: Fri, 27 Oct 2023 19:26:38 -0500
Subject: [PATCH 80/80] Move parquet async functionality behind feature flag (#1586)

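With the split, crates that only need blocking parquet IO no longer pull in the
`futures`/`parquet2` async machinery. A hypothetical downstream Cargo.toml
(version number illustrative):

    [dependencies]
    arrow2 = { version = "0.18", default-features = false, features = ["io_parquet"] }

    # the async APIs (`FileSink`, `read_metadata_async`, `read_columns_many_async`, ...)
    # are restored by enabling the new flag instead:
    # arrow2 = { version = "0.18", default-features = false, features = ["io_parquet_async"] }
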
---
 Cargo.toml                       | 10 +++++-----
 src/io/parquet/read/mod.rs       | 13 +++++++++----
 src/io/parquet/read/row_group.rs |  6 ++++++
 src/io/parquet/write/mod.rs      |  3 +++
 4 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 5deab656f5..a8e5933d2f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -100,7 +100,7 @@ odbc-api = { version = "0.36", optional = true }
 # Faster hashing
 ahash = "0.8"
-# For `LIKE` matching "contains" fast-path 
+# For `LIKE` matching "contains" fast-path
 memchr = { version = "2.6", optional = true }
 
 # Support conversion to/from arrow-rs
 arrow-buffer = { version = ">=40", optional = true }
 arrow-schema = { version = ">=40", optional = true }
@@ -117,7 +117,6 @@ getrandom = { version = "0.2", features = ["js"] }
 version = "0.17"
 optional = true
 default_features = false
-features = ["async"]
 
 [dev-dependencies]
 criterion = "0.4"
@@ -160,7 +159,7 @@ full = [
     "io_ipc_compression",
     "io_json_integration",
     "io_print",
-    "io_parquet",
+    "io_parquet_async",
     "io_parquet_compression",
     "io_avro",
     "io_orc",
@@ -189,7 +188,8 @@ io_ipc_compression = ["lz4", "zstd"]
 io_flight = ["io_ipc", "arrow-format/flight-data"]
 
 # base64 + io_ipc because arrow schemas are stored as base64-encoded ipc format.
-io_parquet = ["parquet2", "io_ipc", "base64", "futures", "streaming-iterator", "fallible-streaming-iterator"]
+io_parquet = ["parquet2", "io_ipc", "base64", "streaming-iterator", "fallible-streaming-iterator"]
+io_parquet_async = ["futures", "io_parquet", "parquet2/async"]
 
 io_parquet_compression = [
     "io_parquet_zstd",
@@ -200,7 +200,7 @@ io_parquet_compression = [
 ]
 
 # sample testing of generated arrow data
-io_parquet_sample_test = ["io_parquet"]
+io_parquet_sample_test = ["io_parquet_async"]
 
 # compression backends
 io_parquet_zstd = ["parquet2/zstd"]
diff --git a/src/io/parquet/read/mod.rs b/src/io/parquet/read/mod.rs
index ea2b2f46d4..e856f101af 100644
--- a/src/io/parquet/read/mod.rs
+++ b/src/io/parquet/read/mod.rs
@@ -10,19 +10,22 @@ pub mod statistics;
 
 use std::io::{Read, Seek};
 
+#[cfg(feature = "io_parquet_async")]
 use futures::{AsyncRead, AsyncSeek};
 
 // re-exports of parquet2's relevant APIs
+#[cfg(feature = "io_parquet_async")]
+#[cfg_attr(docsrs, doc(cfg(feature = "io_parquet_async")))]
+pub use parquet2::read::{get_page_stream, read_metadata_async as _read_metadata_async};
 pub use parquet2::{
     error::Error as ParquetError,
     fallible_streaming_iterator,
     metadata::{ColumnChunkMetaData, ColumnDescriptor, RowGroupMetaData},
     page::{CompressedDataPage, DataPageHeader, Page},
     read::{
-        decompress, get_column_iterator, get_page_stream,
-        read_columns_indexes as _read_columns_indexes, read_metadata as _read_metadata,
-        read_metadata_async as _read_metadata_async, read_pages_locations, BasicDecompressor,
-        Decompressor, MutStreamingIterator, PageFilter, PageReader, ReadColumnIterator, State,
+        decompress, get_column_iterator, read_columns_indexes as _read_columns_indexes,
+        read_metadata as _read_metadata, read_pages_locations, BasicDecompressor, Decompressor,
+        MutStreamingIterator, PageFilter, PageReader, ReadColumnIterator, State,
     },
     schema::types::{
         GroupLogicalType, ParquetType, PhysicalType, PrimitiveConvertedType, PrimitiveLogicalType,
@@ -60,6 +63,8 @@ pub fn read_metadata<R: Read + Seek>(reader: &mut R) -> Result<FileMetaData> {
 }
 
 /// Reads parquets' metadata asynchronously.
+#[cfg(feature = "io_parquet_async")]
+#[cfg_attr(docsrs, doc(cfg(feature = "io_parquet_async")))]
 pub async fn read_metadata_async<R: AsyncRead + AsyncSeek + Send + Unpin>(
     reader: &mut R,
 ) -> Result<FileMetaData> {
diff --git a/src/io/parquet/read/row_group.rs b/src/io/parquet/read/row_group.rs
index 176c6e8318..7062df31e4 100644
--- a/src/io/parquet/read/row_group.rs
+++ b/src/io/parquet/read/row_group.rs
@@ -1,5 +1,6 @@
 use std::io::{Read, Seek};
 
+#[cfg(feature = "io_parquet_async")]
 use futures::{
     future::{try_join_all, BoxFuture},
     AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt,
@@ -138,6 +139,7 @@ where
     Ok((meta, chunk))
 }
 
+#[cfg(feature = "io_parquet_async")]
 async fn _read_single_column_async<'b, R, F>(
     reader_factory: F,
     meta: &ColumnChunkMetaData,
@@ -163,6 +165,8 @@ where
 ///
 /// It does so asynchronously via a single `join_all` over all the necessary columns for
 /// `field_name`.
+#[cfg(feature = "io_parquet_async")]
+#[cfg_attr(docsrs, doc(cfg(feature = "io_parquet_async")))]
 pub async fn read_columns_async<
     'a,
     'b,
@@ -303,13 +307,15 @@ pub fn read_columns_many<'a, R: Read + Seek>(
 /// This operation is IO-bounded `O(C)` where C is the number of columns in the row group -
 /// it reads all the columns to memory from the row group associated to the requested fields.
/// It does so asynchronously via `join_all` +#[cfg(feature = "io_parquet_async")] +#[cfg_attr(docsrs, doc(cfg(feature = "io_parquet_async")))] pub async fn read_columns_many_async< 'a, 'b, diff --git a/src/io/parquet/write/mod.rs b/src/io/parquet/write/mod.rs index d4134f27df..6ef1864c6f 100644 --- a/src/io/parquet/write/mod.rs +++ b/src/io/parquet/write/mod.rs @@ -22,6 +22,7 @@ mod pages; mod primitive; mod row_group; mod schema; +#[cfg(feature = "io_parquet_async")] mod sink; mod utf8; mod utils; @@ -68,6 +69,8 @@ use crate::compute::aggregate::estimated_bytes_size; pub use file::FileWriter; pub use row_group::{row_group_iter, RowGroupIterator}; pub use schema::to_parquet_type; +#[cfg(feature = "io_parquet_async")] +#[cfg_attr(docsrs, doc(cfg(feature = "io_parquet_async")))] pub use sink::FileSink; pub use pages::array_to_columns;