From 1491c6e8f4fd100f53c358e4f3ef1536d9e75090 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Mon, 17 Apr 2023 13:42:07 +0400 Subject: [PATCH 01/80] Fixed timestamp to datetime conversion error (#1470) Fixed timestamp to datetime conversion error on pre-epoch (negative) values and improved test coverage --- src/temporal_conversions.rs | 51 +++++++++++++++++++++----------- tests/it/temporal_conversions.rs | 6 ++-- 2 files changed, 36 insertions(+), 21 deletions(-) diff --git a/src/temporal_conversions.rs b/src/temporal_conversions.rs index 65448aa699..6560a7067e 100644 --- a/src/temporal_conversions.rs +++ b/src/temporal_conversions.rs @@ -120,13 +120,18 @@ pub fn timestamp_ms_to_datetime(v: i64) -> NaiveDateTime { (v % MILLISECONDS * MICROSECONDS) as u32, ) } else { - // note: negative values require 'div_floor' rounding behaviour, which isn't - // yet stabilised (see - https://github.com/rust-lang/rust/issues/88581). let secs_rem = (v / MILLISECONDS, v % MILLISECONDS); - NaiveDateTime::from_timestamp_opt( - secs_rem.0 - (secs_rem.1 != 0) as i64, - (v % MILLISECONDS * MICROSECONDS).unsigned_abs() as u32, - ) + if secs_rem.1 == 0 { + // whole/integer seconds; no adjustment required + NaiveDateTime::from_timestamp_opt(secs_rem.0, (v % MILLISECONDS * MICROSECONDS) as u32) + } else { + // negative values with fractional seconds require 'div_floor' rounding behaviour. + // (which isn't yet stabilised: https://github.com/rust-lang/rust/issues/88581) + NaiveDateTime::from_timestamp_opt( + secs_rem.0 - 1, + (NANOSECONDS + (v % MILLISECONDS * MICROSECONDS)) as u32, + ) + } } .expect("invalid or out-of-range datetime") } @@ -142,13 +147,18 @@ pub fn timestamp_us_to_datetime(v: i64) -> NaiveDateTime { (v % MICROSECONDS * MILLISECONDS) as u32, ) } else { - // note: negative values require 'div_floor' rounding behaviour, which isn't - // yet stabilised (see - https://github.com/rust-lang/rust/issues/88581). let secs_rem = (v / MICROSECONDS, v % MICROSECONDS); - NaiveDateTime::from_timestamp_opt( - secs_rem.0 - (secs_rem.1 != 0) as i64, - (v % MICROSECONDS * MILLISECONDS).unsigned_abs() as u32, - ) + if secs_rem.1 == 0 { + // whole/integer seconds; no adjustment required + NaiveDateTime::from_timestamp_opt(secs_rem.0, (v % MICROSECONDS * MILLISECONDS) as u32) + } else { + // negative values with fractional seconds require 'div_floor' rounding behaviour. + // (which isn't yet stabilised: https://github.com/rust-lang/rust/issues/88581) + NaiveDateTime::from_timestamp_opt( + secs_rem.0 - 1, + (NANOSECONDS + (v % MICROSECONDS * MILLISECONDS)) as u32, + ) + } } .expect("invalid or out-of-range datetime") } @@ -164,13 +174,18 @@ pub fn timestamp_ns_to_datetime(v: i64) -> NaiveDateTime { (v % NANOSECONDS) as u32, ) } else { - // note: negative values require 'div_floor' rounding behaviour, which isn't - // yet stabilised (see - https://github.com/rust-lang/rust/issues/88581). let secs_rem = (v / NANOSECONDS, v % NANOSECONDS); - NaiveDateTime::from_timestamp_opt( - secs_rem.0 - (secs_rem.1 != 0) as i64, - (v % NANOSECONDS).unsigned_abs() as u32, - ) + if secs_rem.1 == 0 { + // whole/integer seconds; no adjustment required + NaiveDateTime::from_timestamp_opt(secs_rem.0, (v % NANOSECONDS) as u32) + } else { + // negative values with fractional seconds require 'div_floor' rounding behaviour. 
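For intuition, a standalone sketch (plain Rust, not part of the patch) of the split this fix performs, mirroring the constants in src/temporal_conversions.rs. Note that v = -1_200 ms must land 0.8s past second -2, while the old `unsigned_abs` expression attached a 0.2s fraction to that second:

const MILLISECONDS: i64 = 1_000;
const MICROSECONDS: i64 = 1_000_000;
const NANOSECONDS: i64 = 1_000_000_000;

// the (seconds, nanoseconds) pair handed to NaiveDateTime::from_timestamp_opt
fn ms_to_secs_nanos(v: i64) -> (i64, u32) {
    let (secs, rem) = (v / MILLISECONDS, v % MILLISECONDS);
    if v >= 0 {
        (secs, (rem * MICROSECONDS) as u32)
    } else if rem == 0 {
        // whole/integer seconds; no adjustment required
        (secs, 0)
    } else {
        // round the seconds down (div_floor) and keep a positive fraction
        (secs - 1, (NANOSECONDS + rem * MICROSECONDS) as u32)
    }
}

fn main() {
    assert_eq!(ms_to_secs_nanos(-1_200), (-2, 800_000_000)); // 23:59:58.800
    assert_eq!(ms_to_secs_nanos(-2_000), (-2, 0));
    assert_eq!(ms_to_secs_nanos(1_200), (1, 200_000_000));
}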
+ // (which isn't yet stabilised: https://github.com/rust-lang/rust/issues/88581) + NaiveDateTime::from_timestamp_opt( + secs_rem.0 - 1, + (NANOSECONDS + (v % NANOSECONDS)) as u32, + ) + } } .expect("invalid or out-of-range datetime") } diff --git a/tests/it/temporal_conversions.rs b/tests/it/temporal_conversions.rs index 883d524f95..1bb206de5a 100644 --- a/tests/it/temporal_conversions.rs +++ b/tests/it/temporal_conversions.rs @@ -150,17 +150,17 @@ fn timestamp_to_datetime() { // negative milliseconds assert_eq!( temporal_conversions::timestamp_ms_to_datetime(ts / 1_000_000), - NaiveDateTime::parse_from_str("1969-07-05T01:02:03.987000000", fmt).unwrap() + NaiveDateTime::parse_from_str("1969-07-05T01:02:03.013000000", fmt).unwrap() ); // negative microseconds assert_eq!( temporal_conversions::timestamp_us_to_datetime(ts / 1_000), - NaiveDateTime::parse_from_str("1969-07-05T01:02:03.987654000", fmt).unwrap() + NaiveDateTime::parse_from_str("1969-07-05T01:02:03.012346000", fmt).unwrap() ); // negative nanoseconds assert_eq!( temporal_conversions::timestamp_ns_to_datetime(ts), - NaiveDateTime::parse_from_str("1969-07-05T01:02:03.987654321", fmt).unwrap() + NaiveDateTime::parse_from_str("1969-07-05T01:02:03.012345679", fmt).unwrap() ); let fmt = "%Y-%m-%dT%H:%M:%S"; From 144a8b865f0652b9fcd8a17e88f8e47cd53dca57 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Wed, 19 Apr 2023 19:33:20 +0400 Subject: [PATCH 02/80] Don't calculate nanoseconds in timestamp conversion when remainder implies integer seconds (#1471) --- src/temporal_conversions.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/temporal_conversions.rs b/src/temporal_conversions.rs index 6560a7067e..a76700f444 100644 --- a/src/temporal_conversions.rs +++ b/src/temporal_conversions.rs @@ -123,7 +123,7 @@ pub fn timestamp_ms_to_datetime(v: i64) -> NaiveDateTime { let secs_rem = (v / MILLISECONDS, v % MILLISECONDS); if secs_rem.1 == 0 { // whole/integer seconds; no adjustment required - NaiveDateTime::from_timestamp_opt(secs_rem.0, (v % MILLISECONDS * MICROSECONDS) as u32) + NaiveDateTime::from_timestamp_opt(secs_rem.0, 0) } else { // negative values with fractional seconds require 'div_floor' rounding behaviour. // (which isn't yet stabilised: https://github.com/rust-lang/rust/issues/88581) @@ -150,7 +150,7 @@ pub fn timestamp_us_to_datetime(v: i64) -> NaiveDateTime { let secs_rem = (v / MICROSECONDS, v % MICROSECONDS); if secs_rem.1 == 0 { // whole/integer seconds; no adjustment required - NaiveDateTime::from_timestamp_opt(secs_rem.0, (v % MICROSECONDS * MILLISECONDS) as u32) + NaiveDateTime::from_timestamp_opt(secs_rem.0, 0) } else { // negative values with fractional seconds require 'div_floor' rounding behaviour. // (which isn't yet stabilised: https://github.com/rust-lang/rust/issues/88581) @@ -177,7 +177,7 @@ pub fn timestamp_ns_to_datetime(v: i64) -> NaiveDateTime { let secs_rem = (v / NANOSECONDS, v % NANOSECONDS); if secs_rem.1 == 0 { // whole/integer seconds; no adjustment required - NaiveDateTime::from_timestamp_opt(secs_rem.0, (v % NANOSECONDS) as u32) + NaiveDateTime::from_timestamp_opt(secs_rem.0, 0) } else { // negative values with fractional seconds require 'div_floor' rounding behaviour. 
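The updated test expectations can be cross-checked with chrono, which these conversion functions are built on; a sketch using the same `from_timestamp_opt` constructor the patch calls:

use chrono::NaiveDateTime;

fn main() {
    // -1_200 ms: floor-divided seconds plus a positive 800 ms fraction
    let dt = NaiveDateTime::from_timestamp_opt(-2, 800_000_000).unwrap();
    assert_eq!(dt.to_string(), "1969-12-31 23:59:58.800");
    // whole negative seconds take the new fast path with a zero fraction
    let dt = NaiveDateTime::from_timestamp_opt(-2, 0).unwrap();
    assert_eq!(dt.to_string(), "1969-12-31 23:59:58");
}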
// (which isn't yet stabilised: https://github.com/rust-lang/rust/issues/88581) From 64d8ec203f991468032025a13a4f971f1f2cfc14 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 27 Apr 2023 10:27:46 +0200 Subject: [PATCH 03/80] Added parquet deserialization of nested null types (#1472) --- src/array/mod.rs | 2 +- src/array/null.rs | 65 +++++++++ src/io/parquet/read/deserialize/nested.rs | 12 ++ .../read/deserialize/{null.rs => null/mod.rs} | 3 + .../parquet/read/deserialize/null/nested.rs | 124 ++++++++++++++++++ src/io/parquet/read/statistics/mod.rs | 5 + src/io/parquet/read/statistics/null.rs | 11 ++ 7 files changed, 221 insertions(+), 1 deletion(-) rename src/io/parquet/read/deserialize/{null.rs => null/mod.rs} (98%) create mode 100644 src/io/parquet/read/deserialize/null/nested.rs create mode 100644 src/io/parquet/read/statistics/null.rs diff --git a/src/array/mod.rs b/src/array/mod.rs index 42b528fc27..f9c320a650 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -728,7 +728,7 @@ pub use fixed_size_binary::{FixedSizeBinaryArray, MutableFixedSizeBinaryArray}; pub use fixed_size_list::{FixedSizeListArray, MutableFixedSizeListArray}; pub use list::{ListArray, ListValuesIter, MutableListArray}; pub use map::MapArray; -pub use null::NullArray; +pub use null::{MutableNullArray, NullArray}; pub use primitive::*; pub use struct_::{MutableStructArray, StructArray}; pub use union::UnionArray; diff --git a/src/array/null.rs b/src/array/null.rs index 5a1471efcf..bcd5c0aff7 100644 --- a/src/array/null.rs +++ b/src/array/null.rs @@ -1,5 +1,8 @@ use crate::{bitmap::Bitmap, datatypes::DataType}; +use std::any::Any; +use crate::array::MutableArray; +use crate::bitmap::MutableBitmap; use crate::{ array::{Array, FromFfi, ToFfi}, datatypes::PhysicalType, @@ -88,6 +91,68 @@ impl Array for NullArray { } } +#[derive(Debug)] +/// A distinct type to disambiguate +/// clashing methods +pub struct MutableNullArray { + inner: NullArray, +} + +impl MutableNullArray { + /// Returns a new [`MutableNullArray`]. + /// # Panics + /// This function errors iff: + /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to [`crate::datatypes::PhysicalType::Null`]. 
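+    /// # Example
+    /// A usage sketch (illustrative only; it relies on the `From` impl and
+    /// the `MutableArray` methods introduced further down in this patch):
+    /// ```ignore
+    /// let mut arr = MutableNullArray::new(DataType::Null, 2);
+    /// arr.push_null();
+    /// assert_eq!(arr.len(), 3);
+    /// let array: NullArray = arr.into();
+    /// ```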
+ pub fn new(data_type: DataType, length: usize) -> Self { + let inner = NullArray::try_new(data_type, length).unwrap(); + Self { inner } + } +} + +impl From for NullArray { + fn from(value: MutableNullArray) -> Self { + value.inner + } +} + +impl MutableArray for MutableNullArray { + fn data_type(&self) -> &DataType { + &DataType::Null + } + + fn len(&self) -> usize { + self.inner.length + } + + fn validity(&self) -> Option<&MutableBitmap> { + None + } + + fn as_box(&mut self) -> Box { + self.inner.clone().boxed() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_mut_any(&mut self) -> &mut dyn Any { + self + } + + fn push_null(&mut self) { + self.inner.length += 1; + } + + fn reserve(&mut self, _additional: usize) { + // no-op + } + + fn shrink_to_fit(&mut self) { + // no-op + } +} + impl std::fmt::Debug for NullArray { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "NullArray({})", self.len()) diff --git a/src/io/parquet/read/deserialize/nested.rs b/src/io/parquet/read/deserialize/nested.rs index 9a2c0232f2..0887751438 100644 --- a/src/io/parquet/read/deserialize/nested.rs +++ b/src/io/parquet/read/deserialize/nested.rs @@ -52,6 +52,18 @@ where use crate::datatypes::PrimitiveType::*; Ok(match field.data_type().to_physical_type() { + Null => { + // physical type is i32 + init.push(InitNested::Primitive(field.is_nullable)); + types.pop(); + primitive(null::NestedIter::new( + columns.pop().unwrap(), + init, + field.data_type().clone(), + num_rows, + chunk_size, + )) + } Boolean => { init.push(InitNested::Primitive(field.is_nullable)); types.pop(); diff --git a/src/io/parquet/read/deserialize/null.rs b/src/io/parquet/read/deserialize/null/mod.rs similarity index 98% rename from src/io/parquet/read/deserialize/null.rs rename to src/io/parquet/read/deserialize/null/mod.rs index e897a045b6..18c745191d 100644 --- a/src/io/parquet/read/deserialize/null.rs +++ b/src/io/parquet/read/deserialize/null/mod.rs @@ -1,8 +1,11 @@ +mod nested; + use parquet2::page::Page; use crate::{array::NullArray, datatypes::DataType}; use super::super::{ArrayIter, Pages}; +pub(super) use nested::NestedIter; /// Converts [`Pages`] to an [`ArrayIter`] pub fn iter_to_arrays<'a, I>( diff --git a/src/io/parquet/read/deserialize/null/nested.rs b/src/io/parquet/read/deserialize/null/nested.rs new file mode 100644 index 0000000000..7f0d33d825 --- /dev/null +++ b/src/io/parquet/read/deserialize/null/nested.rs @@ -0,0 +1,124 @@ +use std::collections::VecDeque; + +use parquet2::page::{DataPage, DictPage}; + +use crate::array::NullArray; +use crate::io::parquet::read::deserialize::utils::DecodedState; +use crate::{datatypes::DataType, error::Result}; + +use super::super::nested_utils::*; +use super::super::utils; +use super::super::Pages; + +impl<'a> utils::PageState<'a> for () { + fn len(&self) -> usize { + 0 + } +} + +#[derive(Debug)] +struct NullDecoder {} + +impl DecodedState for usize { + fn len(&self) -> usize { + *self + } +} + +impl<'a> NestedDecoder<'a> for NullDecoder { + type State = (); + type Dictionary = (); + type DecodedState = usize; + + fn build_state( + &self, + _page: &'a DataPage, + _dict: Option<&'a Self::Dictionary>, + ) -> Result { + Ok(()) + } + + /// Initializes a new state + fn with_capacity(&self, _capacity: usize) -> Self::DecodedState { + 0 + } + + fn push_valid(&self, _state: &mut Self::State, decoded: &mut Self::DecodedState) -> Result<()> { + *decoded += 1; + Ok(()) + } + + fn push_null(&self, decoded: &mut Self::DecodedState) { + let length = decoded; + 
*length += 1; + } + + fn deserialize_dict(&self, _page: &DictPage) -> Self::Dictionary { + unreachable!() + } +} + +/// An iterator adapter over [`Pages`] assumed to be encoded as null arrays +#[derive(Debug)] +pub struct NestedIter +where + I: Pages, +{ + iter: I, + init: Vec, + data_type: DataType, + items: VecDeque<(NestedState, usize)>, + remaining: usize, + chunk_size: Option, + decoder: NullDecoder, +} + +impl NestedIter +where + I: Pages, +{ + pub fn new( + iter: I, + init: Vec, + data_type: DataType, + num_rows: usize, + chunk_size: Option, + ) -> Self { + Self { + iter, + init, + data_type, + items: VecDeque::new(), + chunk_size, + remaining: num_rows, + decoder: NullDecoder {}, + } + } +} + +impl Iterator for NestedIter +where + I: Pages, +{ + type Item = Result<(NestedState, NullArray)>; + + fn next(&mut self) -> Option { + let maybe_state = next( + &mut self.iter, + &mut self.items, + &mut None, + &mut self.remaining, + &self.init, + self.chunk_size, + &self.decoder, + ); + match maybe_state { + utils::MaybeNext::Some(Ok((nested, state))) => { + Some(Ok((nested, NullArray::new(self.data_type.clone(), state)))) + } + utils::MaybeNext::Some(Err(e)) => Some(Err(e)), + utils::MaybeNext::None => None, + utils::MaybeNext::More => self.next(), + } + } +} diff --git a/src/io/parquet/read/statistics/mod.rs b/src/io/parquet/read/statistics/mod.rs index f3c1ed9e8d..8514913d56 100644 --- a/src/io/parquet/read/statistics/mod.rs +++ b/src/io/parquet/read/statistics/mod.rs @@ -26,6 +26,7 @@ mod dictionary; mod fixlen; mod list; mod map; +mod null; mod primitive; mod struct_; mod utf8; @@ -194,6 +195,9 @@ fn make_mutable(data_type: &DataType, capacity: usize) -> Result { + Box::new(MutableNullArray::new(DataType::Null, 0)) as Box + } other => { return Err(Error::NotYetImplemented(format!( "Deserializing parquet stats from {other:?} is still not implemented" @@ -538,6 +542,7 @@ fn push( Utf8 => utf8::push::(from, min, max), LargeUtf8 => utf8::push::(from, min, max), FixedSizeBinary(_) => fixlen::push(from, min, max), + Null => null::push(min, max), other => todo!("{:?}", other), } } diff --git a/src/io/parquet/read/statistics/null.rs b/src/io/parquet/read/statistics/null.rs new file mode 100644 index 0000000000..9102720ebc --- /dev/null +++ b/src/io/parquet/read/statistics/null.rs @@ -0,0 +1,11 @@ +use crate::array::*; +use crate::error::Result; + +pub(super) fn push(min: &mut dyn MutableArray, max: &mut dyn MutableArray) -> Result<()> { + let min = min.as_mut_any().downcast_mut::().unwrap(); + let max = max.as_mut_any().downcast_mut::().unwrap(); + min.push_null(); + max.push_null(); + + Ok(()) +} From 07fd3f639cb983fb7496e6adeac2c2dbe0a3cad0 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 29 Apr 2023 11:39:16 +0200 Subject: [PATCH 04/80] Fixed struct FFI for sliced pyarrow (#1474) --- src/array/struct_/ffi.rs | 29 ++++++++++++++++++++++++++++- src/bitmap/immutable.rs | 28 ++++++++++++++++------------ 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/src/array/struct_/ffi.rs b/src/array/struct_/ffi.rs index b29a342d88..847948630f 100644 --- a/src/array/struct_/ffi.rs +++ b/src/array/struct_/ffi.rs @@ -30,11 +30,38 @@ impl FromFfi for StructArray { let data_type = array.data_type().clone(); let fields = Self::get_fields(&data_type); + let arrow_array = array.array(); let validity = unsafe { array.validity() }?; + let len = arrow_array.len(); + let offset = arrow_array.offset(); let values = (0..fields.len()) .map(|index| { let child = array.child(index)?; - 
ffi::try_from(child) + ffi::try_from(child).map(|arr| { + // there is a discrepancy with how arrow2 exports sliced + // struct array and how pyarrow does it. + // # Pyarrow + // ## struct array len 3 + // * slice 1 by with len 2 + // offset on struct array: 1 + // length on struct array: 2 + // offset on value array: 0 + // length on value array: 3 + // # Arrow2 + // ## struct array len 3 + // * slice 1 by with len 2 + // offset on struct array: 0 + // length on struct array: 3 + // offset on value array: 1 + // length on value array: 2 + // + // this branch will ensure both can round trip + if arr.len() >= (len + offset) { + arr.sliced(offset, len) + } else { + arr + } + }) }) .collect::>>>()?; diff --git a/src/bitmap/immutable.rs b/src/bitmap/immutable.rs index 5d85208782..be1e3a662d 100644 --- a/src/bitmap/immutable.rs +++ b/src/bitmap/immutable.rs @@ -182,19 +182,23 @@ impl Bitmap { /// The caller must ensure that `self.offset + offset + length <= self.len()` #[inline] pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) { - // count the smallest chunk - if length < self.length / 2 { - // count the null values in the slice - self.unset_bits = count_zeros(&self.bytes, self.offset + offset, length); - } else { - // subtract the null count of the chunks we slice off - let start_end = self.offset + offset + length; - let head_count = count_zeros(&self.bytes, self.offset, offset); - let tail_count = count_zeros(&self.bytes, start_end, self.length - length - offset); - self.unset_bits -= head_count + tail_count; + // first guard a no-op slice so that we don't do a bitcount + // if there isn't any data sliced + if !(offset == 0 && length == self.length) { + // count the smallest chunk + if length < self.length / 2 { + // count the null values in the slice + self.unset_bits = count_zeros(&self.bytes, self.offset + offset, length); + } else { + // subtract the null count of the chunks we slice off + let start_end = self.offset + offset + length; + let head_count = count_zeros(&self.bytes, self.offset, offset); + let tail_count = count_zeros(&self.bytes, start_end, self.length - length - offset); + self.unset_bits -= head_count + tail_count; + } + self.offset += offset; + self.length = length; } - self.offset += offset; - self.length = length; } /// Slices `self`, offsetting by `offset` and truncating up to `length` bits. 
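The child-reconciliation rule added in the struct FFI fix reduces to a small predicate; a sketch (plain Rust) using the numbers from the comment above:

// decide whether an imported struct child still needs the parent's slice,
// returning the (offset, length) with which to view the child
fn reconcile(child_len: usize, offset: usize, len: usize) -> (usize, usize) {
    if child_len >= len + offset {
        // pyarrow-style export: the child is unsliced; apply the parent slice
        (offset, len)
    } else {
        // arrow2-style export: the child already carries the slice
        (0, child_len)
    }
}

fn main() {
    // struct array of len 3, sliced by 1 with len 2
    assert_eq!(reconcile(3, 1, 2), (1, 2)); // pyarrow round trip
    assert_eq!(reconcile(2, 1, 2), (0, 2)); // arrow2 round trip
}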
From 8de953d76036aa80eb50e5d7cf81e38bf2bce3d6 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 7 May 2023 10:50:44 +0200 Subject: [PATCH 05/80] Correct for offsets in list json serialization (#1475) --- src/io/json/write/mod.rs | 2 +- src/io/json/write/serialize.rs | 324 ++++++++++++++++++++------------- src/io/ndjson/write/mod.rs | 2 +- 3 files changed, 201 insertions(+), 127 deletions(-) diff --git a/src/io/json/write/mod.rs b/src/io/json/write/mod.rs index 53cc3f20e6..c04b4b51a6 100644 --- a/src/io/json/write/mod.rs +++ b/src/io/json/write/mod.rs @@ -85,7 +85,7 @@ impl<'a> RecordSerializer<'a> { let iterators = chunk .arrays() .iter() - .map(|arr| new_serializer(arr.as_ref())) + .map(|arr| new_serializer(arr.as_ref(), 0, usize::MAX)) .collect(); Self { diff --git a/src/io/json/write/serialize.rs b/src/io/json/write/serialize.rs index 8247609406..dfa104d78e 100644 --- a/src/io/json/write/serialize.rs +++ b/src/io/json/write/serialize.rs @@ -16,95 +16,114 @@ use crate::{array::*, datatypes::DataType, types::NativeType}; use super::utf8; +fn materialize_serializer<'a, I, F, T>( + f: F, + iterator: I, + offset: usize, + take: usize, +) -> Box + 'a + Send + Sync> +where + T: 'a, + I: Iterator + Send + Sync + 'a, + F: FnMut(T, &mut Vec) + Send + Sync + 'a, +{ + if offset > 0 || take < usize::MAX { + Box::new(BufStreamingIterator::new( + iterator.skip(offset).take(take), + f, + vec![], + )) + } else { + Box::new(BufStreamingIterator::new(iterator, f, vec![])) + } +} + fn boolean_serializer<'a>( array: &'a BooleanArray, + offset: usize, + take: usize, ) -> Box + 'a + Send + Sync> { - Box::new(BufStreamingIterator::new( - array.iter(), - |x, buf| match x { - Some(true) => buf.extend_from_slice(b"true"), - Some(false) => buf.extend_from_slice(b"false"), - None => buf.extend_from_slice(b"null"), - }, - vec![], - )) + let f = |x: Option, buf: &mut Vec| match x { + Some(true) => buf.extend_from_slice(b"true"), + Some(false) => buf.extend_from_slice(b"false"), + None => buf.extend_from_slice(b"null"), + }; + materialize_serializer(f, array.iter(), offset, take) } fn primitive_serializer<'a, T: NativeType + ToLexical>( array: &'a PrimitiveArray, + offset: usize, + take: usize, ) -> Box + 'a + Send + Sync> { - Box::new(BufStreamingIterator::new( - array.iter(), - |x, buf| { - if let Some(x) = x { - lexical_to_bytes_mut(*x, buf) - } else { - buf.extend(b"null") - } - }, - vec![], - )) + let f = |x: Option<&T>, buf: &mut Vec| { + if let Some(x) = x { + lexical_to_bytes_mut(*x, buf) + } else { + buf.extend(b"null") + } + }; + materialize_serializer(f, array.iter(), offset, take) } fn float_serializer<'a, T>( array: &'a PrimitiveArray, + offset: usize, + take: usize, ) -> Box + 'a + Send + Sync> where T: num_traits::Float + NativeType + ToLexical, { - Box::new(BufStreamingIterator::new( - array.iter(), - |x, buf| { - if let Some(x) = x { - if T::is_nan(*x) || T::is_infinite(*x) { - buf.extend(b"null") - } else { - lexical_to_bytes_mut(*x, buf) - } - } else { + let f = |x: Option<&T>, buf: &mut Vec| { + if let Some(x) = x { + if T::is_nan(*x) || T::is_infinite(*x) { buf.extend(b"null") + } else { + lexical_to_bytes_mut(*x, buf) } - }, - vec![], - )) + } else { + buf.extend(b"null") + } + }; + + materialize_serializer(f, array.iter(), offset, take) } fn dictionary_utf8_serializer<'a, K: DictionaryKey, O: Offset>( array: &'a DictionaryArray, + offset: usize, + take: usize, ) -> Box + 'a + Send + Sync> { - let iter = array.iter_typed::>().unwrap(); - - Box::new(BufStreamingIterator::new( - iter, - |x, 
buf| { - if let Some(x) = x { - utf8::write_str(buf, x).unwrap(); - } else { - buf.extend_from_slice(b"null") - } - }, - vec![], - )) + let iter = array.iter_typed::>().unwrap().skip(offset); + let f = |x: Option<&str>, buf: &mut Vec| { + if let Some(x) = x { + utf8::write_str(buf, x).unwrap(); + } else { + buf.extend_from_slice(b"null") + } + }; + materialize_serializer(f, iter, offset, take) } fn utf8_serializer<'a, O: Offset>( array: &'a Utf8Array, + offset: usize, + take: usize, ) -> Box + 'a + Send + Sync> { - Box::new(BufStreamingIterator::new( - array.iter(), - |x, buf| { - if let Some(x) = x { - utf8::write_str(buf, x).unwrap(); - } else { - buf.extend_from_slice(b"null") - } - }, - vec![], - )) + let f = |x: Option<&str>, buf: &mut Vec| { + if let Some(x) = x { + utf8::write_str(buf, x).unwrap(); + } else { + buf.extend_from_slice(b"null") + } + }; + materialize_serializer(f, array.iter(), offset, take) } fn struct_serializer<'a>( array: &'a StructArray, + offset: usize, + take: usize, ) -> Box + 'a + Send + Sync> { // {"a": [1, 2, 3], "b": [a, b, c], "c": {"a": [1, 2, 3]}} // [ @@ -117,7 +136,7 @@ fn struct_serializer<'a>( .values() .iter() .map(|x| x.as_ref()) - .map(new_serializer) + .map(|arr| new_serializer(arr, offset, take)) .collect::>(); let names = array.fields().iter().map(|f| f.name.as_str()); @@ -149,6 +168,8 @@ fn struct_serializer<'a>( fn list_serializer<'a, O: Offset>( array: &'a ListArray, + offset: usize, + take: usize, ) -> Box + 'a + Send + Sync> { // [[1, 2], [3]] // [ @@ -156,35 +177,40 @@ fn list_serializer<'a, O: Offset>( // [3] // ] // - let mut serializer = new_serializer(array.values().as_ref()); + let offsets = array.offsets().as_slice(); + let start = offsets[0].to_usize(); + let end = offsets.last().unwrap().to_usize(); + let mut serializer = new_serializer(array.values().as_ref(), start, end - start); - Box::new(BufStreamingIterator::new( - ZipValidity::new_with_validity(array.offsets().buffer().windows(2), array.validity()), - move |offset, buf| { - if let Some(offset) = offset { - let length = (offset[1] - offset[0]).to_usize(); - buf.push(b'['); - let mut is_first_row = true; - for _ in 0..length { - if !is_first_row { - buf.push(b','); - } - is_first_row = false; - buf.extend(serializer.next().unwrap()); + let f = move |offset: Option<&[O]>, buf: &mut Vec| { + if let Some(offset) = offset { + let length = (offset[1] - offset[0]).to_usize(); + buf.push(b'['); + let mut is_first_row = true; + for _ in 0..length { + if !is_first_row { + buf.push(b','); } - buf.push(b']'); - } else { - buf.extend(b"null"); + is_first_row = false; + buf.extend(serializer.next().unwrap()); } - }, - vec![], - )) + buf.push(b']'); + } else { + buf.extend(b"null"); + } + }; + + let iter = + ZipValidity::new_with_validity(array.offsets().buffer().windows(2), array.validity()); + materialize_serializer(f, iter, offset, take) } fn fixed_size_list_serializer<'a>( array: &'a FixedSizeListArray, + offset: usize, + take: usize, ) -> Box + 'a + Send + Sync> { - let mut serializer = new_serializer(array.values().as_ref()); + let mut serializer = new_serializer(array.values().as_ref(), offset, take); Box::new(BufStreamingIterator::new( ZipValidity::new(0..array.len(), array.validity().map(|x| x.iter())), @@ -212,83 +238,126 @@ fn fixed_size_list_serializer<'a>( fn date_serializer<'a, T, F>( array: &'a PrimitiveArray, convert: F, + offset: usize, + take: usize, ) -> Box + 'a + Send + Sync> where T: NativeType, F: Fn(T) -> NaiveDate + 'static + Send + Sync, { - 
Box::new(BufStreamingIterator::new( - array.iter(), - move |x, buf| { - if let Some(x) = x { - let nd = convert(*x); - write!(buf, "\"{nd}\"").unwrap(); - } else { - buf.extend_from_slice(b"null") - } - }, - vec![], - )) + let f = move |x: Option<&T>, buf: &mut Vec| { + if let Some(x) = x { + let nd = convert(*x); + write!(buf, "\"{nd}\"").unwrap(); + } else { + buf.extend_from_slice(b"null") + } + }; + + materialize_serializer(f, array.iter(), offset, take) } fn timestamp_serializer<'a, F>( array: &'a PrimitiveArray, convert: F, + offset: usize, + take: usize, ) -> Box + 'a + Send + Sync> where F: Fn(i64) -> NaiveDateTime + 'static + Send + Sync, { - Box::new(BufStreamingIterator::new( - array.iter(), - move |x, buf| { - if let Some(x) = x { - let ndt = convert(*x); - write!(buf, "\"{ndt}\"").unwrap(); - } else { - buf.extend_from_slice(b"null") - } - }, - vec![], - )) + let f = move |x: Option<&i64>, buf: &mut Vec| { + if let Some(x) = x { + let ndt = convert(*x); + write!(buf, "\"{ndt}\"").unwrap(); + } else { + buf.extend_from_slice(b"null") + } + }; + materialize_serializer(f, array.iter(), offset, take) } pub(crate) fn new_serializer<'a>( array: &'a dyn Array, + offset: usize, + take: usize, ) -> Box + 'a + Send + Sync> { match array.data_type().to_logical_type() { - DataType::Boolean => boolean_serializer(array.as_any().downcast_ref().unwrap()), - DataType::Int8 => primitive_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::Int16 => primitive_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::Int32 => primitive_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::Int64 => primitive_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::UInt8 => primitive_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::UInt16 => primitive_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::UInt32 => primitive_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::UInt64 => primitive_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::Float32 => float_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::Float64 => float_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::Utf8 => utf8_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::LargeUtf8 => utf8_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::Struct(_) => struct_serializer(array.as_any().downcast_ref().unwrap()), + DataType::Boolean => { + boolean_serializer(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::Int8 => { + primitive_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::Int16 => { + primitive_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::Int32 => { + primitive_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::Int64 => { + primitive_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::UInt8 => { + primitive_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::UInt16 => { + primitive_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::UInt32 => { + primitive_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::UInt64 => { + primitive_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::Float32 => { + float_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::Float64 => { + 
float_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::Utf8 => { + utf8_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::LargeUtf8 => { + utf8_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::Struct(_) => { + struct_serializer(array.as_any().downcast_ref().unwrap(), offset, take) + } DataType::FixedSizeList(_, _) => { - fixed_size_list_serializer(array.as_any().downcast_ref().unwrap()) + fixed_size_list_serializer(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::List(_) => { + list_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) + } + DataType::LargeList(_) => { + list_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) } - DataType::List(_) => list_serializer::(array.as_any().downcast_ref().unwrap()), - DataType::LargeList(_) => list_serializer::(array.as_any().downcast_ref().unwrap()), other @ DataType::Dictionary(k, v, _) => match (k, &**v) { (IntegerType::UInt32, DataType::LargeUtf8) => { let array = array .as_any() .downcast_ref::>() .unwrap(); - dictionary_utf8_serializer::(array) + dictionary_utf8_serializer::(array, offset, take) } _ => { todo!("Writing {:?} to JSON", other) } }, - DataType::Date32 => date_serializer(array.as_any().downcast_ref().unwrap(), date32_to_date), - DataType::Date64 => date_serializer(array.as_any().downcast_ref().unwrap(), date64_to_date), + DataType::Date32 => date_serializer( + array.as_any().downcast_ref().unwrap(), + date32_to_date, + offset, + take, + ), + DataType::Date64 => date_serializer( + array.as_any().downcast_ref().unwrap(), + date64_to_date, + offset, + take, + ), DataType::Timestamp(tu, tz) => { if tz.is_some() { todo!("still have to implement timezone") @@ -299,7 +368,12 @@ pub(crate) fn new_serializer<'a>( TimeUnit::Millisecond => timestamp_ms_to_datetime, TimeUnit::Second => timestamp_s_to_datetime, }; - timestamp_serializer(array.as_any().downcast_ref().unwrap(), convert) + timestamp_serializer( + array.as_any().downcast_ref().unwrap(), + convert, + offset, + take, + ) } } other => todo!("Writing {:?} to JSON", other), @@ -328,7 +402,7 @@ fn serialize_item(buffer: &mut Vec, record: &[(&str, &[u8])], is_first_row: /// # Implementation /// This operation is CPU-bounded pub(crate) fn serialize(array: &dyn Array, buffer: &mut Vec) { - let mut serializer = new_serializer(array); + let mut serializer = new_serializer(array, 0, usize::MAX); (0..array.len()).for_each(|i| { if i != 0 { diff --git a/src/io/ndjson/write/mod.rs b/src/io/ndjson/write/mod.rs index 0932f7b8ec..45ad52253c 100644 --- a/src/io/ndjson/write/mod.rs +++ b/src/io/ndjson/write/mod.rs @@ -9,7 +9,7 @@ use crate::error::Error; use super::super::json::write::new_serializer; fn serialize(array: &dyn Array, buffer: &mut Vec) { - let mut serializer = new_serializer(array); + let mut serializer = new_serializer(array, 0, usize::MAX); (0..array.len()).for_each(|_| { buffer.extend_from_slice(serializer.next().unwrap()); buffer.push(b'\n'); From c676340939a1301d3530c83e648dd463ab8d8291 Mon Sep 17 00:00:00 2001 From: theadd336 <31495729+theadd336@users.noreply.github.com> Date: Sun, 7 May 2023 04:51:51 -0400 Subject: [PATCH 06/80] Close Underlying Writer when IPC `StreamSink` is Closed (#1463) --- src/io/ipc/write/stream_async.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/io/ipc/write/stream_async.rs b/src/io/ipc/write/stream_async.rs index 8d156ff16a..df651461fe 100644 --- 
a/src/io/ipc/write/stream_async.rs +++ b/src/io/ipc/write/stream_async.rs @@ -2,7 +2,7 @@ use std::{pin::Pin, task::Poll}; -use futures::{future::BoxFuture, AsyncWrite, FutureExt, Sink}; +use futures::{future::BoxFuture, AsyncWrite, AsyncWriteExt, FutureExt, Sink}; use super::super::IpcField; pub use super::common::WriteOptions; @@ -170,6 +170,8 @@ where this.task = Some( async move { write_continuation(&mut writer, 0).await?; + writer.flush().await?; + writer.close().await?; Ok(None) } .boxed(), From bf1a3ceffc4599f5ea31144f121c735d20007b7f Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 8 May 2023 10:31:19 +0200 Subject: [PATCH 07/80] Improved pargquet nested null deserialization (#1477) --- .../parquet/read/deserialize/null/nested.rs | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/io/parquet/read/deserialize/null/nested.rs b/src/io/parquet/read/deserialize/null/nested.rs index 7f0d33d825..705b607398 100644 --- a/src/io/parquet/read/deserialize/null/nested.rs +++ b/src/io/parquet/read/deserialize/null/nested.rs @@ -10,9 +10,9 @@ use super::super::nested_utils::*; use super::super::utils; use super::super::Pages; -impl<'a> utils::PageState<'a> for () { +impl<'a> utils::PageState<'a> for usize { fn len(&self) -> usize { - 0 + *self } } @@ -26,16 +26,19 @@ impl DecodedState for usize { } impl<'a> NestedDecoder<'a> for NullDecoder { - type State = (); - type Dictionary = (); + type State = usize; + type Dictionary = usize; type DecodedState = usize; fn build_state( &self, _page: &'a DataPage, - _dict: Option<&'a Self::Dictionary>, + dict: Option<&'a Self::Dictionary>, ) -> Result { - Ok(()) + if let Some(n) = dict { + return Ok(*n); + } + Ok(1) } /// Initializes a new state @@ -43,8 +46,8 @@ impl<'a> NestedDecoder<'a> for NullDecoder { 0 } - fn push_valid(&self, _state: &mut Self::State, decoded: &mut Self::DecodedState) -> Result<()> { - *decoded += 1; + fn push_valid(&self, state: &mut Self::State, decoded: &mut Self::DecodedState) -> Result<()> { + *decoded += *state; Ok(()) } @@ -53,8 +56,8 @@ impl<'a> NestedDecoder<'a> for NullDecoder { *length += 1; } - fn deserialize_dict(&self, _page: &DictPage) -> Self::Dictionary { - unreachable!() + fn deserialize_dict(&self, page: &DictPage) -> Self::Dictionary { + page.num_values } } From b09e580f075293e9af2879dcc7f6b2d5c8fe520e Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Tue, 9 May 2023 11:19:20 +0200 Subject: [PATCH 08/80] Bumped version to 0.17.1 (#1479) --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 7294dc47df..2025b71777 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "arrow2" -version = "0.17.0" +version = "0.17.1" license = "Apache-2.0" description = "Unofficial implementation of Apache Arrow spec in safe Rust" homepage = "https://github.com/jorgecarleitao/arrow2" From ef2ef5e1135e35e262d105a7ab93813dbbf732b2 Mon Sep 17 00:00:00 2001 From: Hieu Minh Nguyen <38937534+therealhieu@users.noreply.github.com> Date: Tue, 16 May 2023 00:52:41 +0700 Subject: [PATCH 09/80] feat(deps): Bump arrow-rs 39 (#1482) --- Cargo.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 2025b71777..91cba8cd3d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -101,10 +101,10 @@ odbc-api = { version = "0.36", optional = true } ahash = "0.8" # Support conversion to/from arrow-rs -arrow-buffer = { version = "37.0.0", optional = true } -arrow-schema = { version = "37.0.0", 
optional = true } -arrow-data = { version = "37.0.0", optional = true } -arrow-array = { version = "37.0.0", optional = true } +arrow-buffer = { version = "39.0.0", optional = true } +arrow-schema = { version = "39.0.0", optional = true } +arrow-data = { version = "39.0.0", optional = true } +arrow-array = { version = "39.0.0", optional = true } [target.wasm32-unknown-unknown.dependencies] getrandom = { version = "0.2", features = ["js"] } From ce92eed4cd5a303669dfd8c971c6f9af7f62e969 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 15 May 2023 20:06:59 +0200 Subject: [PATCH 10/80] Added `Buffer::is_sliced` (#1480) --- src/buffer/immutable.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/buffer/immutable.rs b/src/buffer/immutable.rs index 98a31cb153..10c7968b0d 100644 --- a/src/buffer/immutable.rs +++ b/src/buffer/immutable.rs @@ -96,6 +96,13 @@ impl Buffer { self.len() == 0 } + /// Returns whether underlying data is sliced. + /// If sliced the [`Buffer`] is backed by + /// more data than the length of `Self`. + pub fn is_sliced(&self) -> bool { + self.data.len() != self.length + } + /// Returns the byte slice stored in this buffer #[inline] pub fn as_slice(&self) -> &[T] { From eed5ebb2b0d18dfbcce363f5d212410f52a49333 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Tue, 16 May 2023 10:17:06 +0200 Subject: [PATCH 11/80] Made `MutableList::try_extend_from_lengths` public (#1486) --- src/array/list/mutable.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/array/list/mutable.rs b/src/array/list/mutable.rs index 881cb620a0..8841ceaf53 100644 --- a/src/array/list/mutable.rs +++ b/src/array/list/mutable.rs @@ -174,8 +174,7 @@ impl MutableListArray { /// - the new offsets are not in monotonic increasing order. /// - any new offset is not in bounds of the backing array. /// - the passed iterator has no upper bound. - #[allow(dead_code)] - pub(crate) fn try_extend_from_lengths(&mut self, iterator: II) -> Result<()> + pub fn try_extend_from_lengths(&mut self, iterator: II) -> Result<()> where II: TrustedLen> + Clone, { From 148ea37467dc8d3375a23deab7c2087c630d4c87 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 20 May 2023 13:57:51 +0200 Subject: [PATCH 12/80] Splitted 'io_json' feature flags in read/write (#1487) --- Cargo.toml | 4 +++- src/io/json/mod.rs | 10 ++-------- src/io/json/read/mod.rs | 8 ++++++++ src/io/mod.rs | 11 ++++++++--- src/io/ndjson/mod.rs | 3 +++ 5 files changed, 24 insertions(+), 12 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 91cba8cd3d..7c03aad27c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -168,7 +168,9 @@ io_csv_async = ["io_csv_read_async"] io_csv_read = ["csv", "lexical-core"] io_csv_read_async = ["csv-async", "lexical-core", "futures"] io_csv_write = ["csv-core", "streaming-iterator", "lexical-core"] -io_json = ["json-deserializer", "streaming-iterator", "fallible-streaming-iterator", "indexmap", "lexical-core"] +io_json = ["io_json_read", "io_json_write"] +io_json_read = ["json-deserializer", "indexmap", "lexical-core"] +io_json_write = ["streaming-iterator", "fallible-streaming-iterator", "lexical-core"] io_ipc = ["arrow-format"] io_ipc_write_async = ["io_ipc", "futures"] io_ipc_read_async = ["io_ipc", "futures", "async-stream"] diff --git a/src/io/json/mod.rs b/src/io/json/mod.rs index ebbdc92b69..42e40a89c7 100644 --- a/src/io/json/mod.rs +++ b/src/io/json/mod.rs @@ -1,12 +1,6 @@ //! Convert data between the Arrow memory format and JSON line-delimited records. 
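The `Buffer::is_sliced` check added above works because slicing keeps the whole backing allocation alive; a usage sketch (assuming arrow2's existing `Buffer::from` and `Buffer::sliced`):

use arrow2::buffer::Buffer;

fn main() {
    let buf = Buffer::<i32>::from(vec![1, 2, 3, 4]);
    assert!(!buf.is_sliced()); // backing length == exposed length
    let sliced = buf.sliced(1, 2); // still backed by all four values
    assert!(sliced.is_sliced());
    assert_eq!(sliced.as_slice(), &[2, 3]);
}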
+#[cfg(feature = "io_json_read")] pub mod read; +#[cfg(feature = "io_json_write")] pub mod write; - -use crate::error::Error; - -impl From for Error { - fn from(error: json_deserializer::Error) -> Self { - Error::ExternalFormat(error.to_string()) - } -} diff --git a/src/io/json/read/mod.rs b/src/io/json/read/mod.rs index 686390df2b..087da38d50 100644 --- a/src/io/json/read/mod.rs +++ b/src/io/json/read/mod.rs @@ -8,3 +8,11 @@ pub(crate) use infer_schema::coerce_data_type; pub use infer_schema::{infer, infer_records_schema}; pub use json_deserializer; + +use crate::error::Error; + +impl From for Error { + fn from(error: json_deserializer::Error) -> Self { + Error::ExternalFormat(error.to_string()) + } +} diff --git a/src/io/mod.rs b/src/io/mod.rs index 69e4657fd7..0dd8e8651d 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -24,10 +24,10 @@ pub mod orc; )] pub mod csv; -#[cfg(feature = "io_json")] +#[cfg(any(feature = "io_json_read", feature = "io_json_write"))] #[cfg_attr(docsrs, doc(cfg(feature = "io_json")))] pub mod json; -#[cfg(feature = "io_json")] +#[cfg(any(feature = "io_json_read", feature = "io_json_write"))] #[cfg_attr(docsrs, doc(cfg(feature = "io_json")))] pub mod ndjson; @@ -55,5 +55,10 @@ pub mod avro; #[cfg_attr(docsrs, doc(cfg(feature = "io_print")))] pub mod print; -#[cfg(any(feature = "io_csv_write", feature = "io_avro", feature = "io_json"))] +#[cfg(any( + feature = "io_csv_write", + feature = "io_avro", + feature = "io_json_write", + feature = "io_json_read" +))] mod iterator; diff --git a/src/io/ndjson/mod.rs b/src/io/ndjson/mod.rs index 1448a5af6e..a77eda1e3d 100644 --- a/src/io/ndjson/mod.rs +++ b/src/io/ndjson/mod.rs @@ -1,3 +1,6 @@ //! APIs to read from and write to NDJSON + +#[cfg(feature = "io_json_read")] pub mod read; +#[cfg(feature = "io_json_write")] pub mod write; From 6f8b5dea4f8d5da637726fb853cc441a2c009b93 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 20 May 2023 14:57:44 +0200 Subject: [PATCH 13/80] Splitted extend_from_slice into extend_from_slice_unchecked (#1488) --- src/array/boolean/mutable.rs | 6 +++++- src/array/growable/boolean.rs | 6 +++++- src/array/growable/utils.rs | 7 +++++-- src/array/physical_binary.rs | 3 ++- src/bitmap/mutable.rs | 27 ++++++++++++++++++++++++--- src/compute/filter.rs | 7 ++++++- 6 files changed, 47 insertions(+), 9 deletions(-) diff --git a/src/array/boolean/mutable.rs b/src/array/boolean/mutable.rs index 4a4ed3f9ce..729ef81d6b 100644 --- a/src/array/boolean/mutable.rs +++ b/src/array/boolean/mutable.rs @@ -559,7 +559,11 @@ impl TryExtendFromSelf for MutableBooleanArray { extend_validity(self.len(), &mut self.validity, &other.validity); let slice = other.values.as_slice(); - self.values.extend_from_slice(slice, 0, other.values.len()); + // safety: invariant offset + length <= slice.len() + unsafe { + self.values + .extend_from_slice_unchecked(slice, 0, other.values.len()); + } Ok(()) } } diff --git a/src/array/growable/boolean.rs b/src/array/growable/boolean.rs index a9fb52ef3e..0cb1213403 100644 --- a/src/array/growable/boolean.rs +++ b/src/array/growable/boolean.rs @@ -63,7 +63,11 @@ impl<'a> Growable<'a> for GrowableBoolean<'a> { let values = array.values(); let (slice, offset, _) = values.as_slice(); - self.values.extend_from_slice(slice, start + offset, len); + // safety: invariant offset + length <= slice.len() + unsafe { + self.values + .extend_from_slice_unchecked(slice, start + offset, len); + } } fn extend_validity(&mut self, additional: usize) { diff --git a/src/array/growable/utils.rs 
b/src/array/growable/utils.rs index 06a85cd9ad..3e0c25a4ee 100644 --- a/src/array/growable/utils.rs +++ b/src/array/growable/utils.rs @@ -7,9 +7,12 @@ pub(super) type ExtendNullBits<'a> = Box ExtendNullBits { if let Some(bitmap) = array.validity() { Box::new(move |validity, start, len| { - assert!(start + len <= bitmap.len()); + debug_assert!(start + len <= bitmap.len()); let (slice, offset, _) = bitmap.as_slice(); - validity.extend_from_slice(slice, start + offset, len); + // safety: invariant offset + length <= slice.len() + unsafe { + validity.extend_from_slice_unchecked(slice, start + offset, len); + } }) } else if use_validity { Box::new(|validity, _, len| { diff --git a/src/array/physical_binary.rs b/src/array/physical_binary.rs index 161371603e..694e61a7ea 100644 --- a/src/array/physical_binary.rs +++ b/src/array/physical_binary.rs @@ -219,7 +219,8 @@ pub(crate) fn extend_validity( if let Some(other) = other { if let Some(validity) = validity { let slice = other.as_slice(); - validity.extend_from_slice(slice, 0, other.len()) + // safety: invariant offset + length <= slice.len() + unsafe { validity.extend_from_slice_unchecked(slice, 0, other.len()) } } else { let mut new_validity = MutableBitmap::from_len_set(length); new_validity.extend_from_slice(other.as_slice(), 0, other.len()); diff --git a/src/bitmap/mutable.rs b/src/bitmap/mutable.rs index 2cfc253b2a..1cc2193917 100644 --- a/src/bitmap/mutable.rs +++ b/src/bitmap/mutable.rs @@ -673,9 +673,15 @@ impl MutableBitmap { /// # Implementation /// When both [`MutableBitmap`]'s length and `offset` are both multiples of 8, /// this function performs a memcopy. Else, it first aligns bit by bit and then performs a memcopy. + /// # Safety + /// Caller must ensure `offset + length <= slice.len() * 8` #[inline] - pub fn extend_from_slice(&mut self, slice: &[u8], offset: usize, length: usize) { - assert!(offset + length <= slice.len() * 8); + pub unsafe fn extend_from_slice_unchecked( + &mut self, + slice: &[u8], + offset: usize, + length: usize, + ) { if length == 0 { return; }; @@ -691,11 +697,26 @@ impl MutableBitmap { debug_assert_eq!(self.length.saturating_add(7) / 8, self.buffer.len()); } + /// Extends the [`MutableBitmap`] from a slice of bytes with optional offset. + /// This is the fastest way to extend a [`MutableBitmap`]. + /// # Implementation + /// When both [`MutableBitmap`]'s length and `offset` are both multiples of 8, + /// this function performs a memcopy. Else, it first aligns bit by bit and then performs a memcopy. + #[inline] + pub fn extend_from_slice(&mut self, slice: &[u8], offset: usize, length: usize) { + assert!(offset + length <= slice.len() * 8); + // safety: invariant is asserted + unsafe { self.extend_from_slice_unchecked(slice, offset, length) } + } + /// Extends the [`MutableBitmap`] from a [`Bitmap`]. #[inline] pub fn extend_from_bitmap(&mut self, bitmap: &Bitmap) { let (slice, offset, length) = bitmap.as_slice(); - self.extend_from_slice(slice, offset, length); + // safety: bitmap.as_slice adheres to the invariant + unsafe { + self.extend_from_slice_unchecked(slice, offset, length); + } } /// Returns the slice of bytes of this [`MutableBitmap`]. 
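The pattern throughout this patch is a single bounds assert at the safe entry point, with callers that already uphold `offset + length <= slice.len() * 8` going through the unchecked variant; a usage sketch:

use arrow2::bitmap::MutableBitmap;

fn main() {
    let mut bitmap = MutableBitmap::new();
    let bytes = [0b0000_1101u8];
    // checked: asserts offset + length <= bytes.len() * 8
    bitmap.extend_from_slice(&bytes, 0, 4);
    // safety: 4 + 4 <= 8, so the invariant holds
    unsafe { bitmap.extend_from_slice_unchecked(&bytes, 4, 4) };
    assert_eq!(bitmap.len(), 8);
}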
diff --git a/src/compute/filter.rs b/src/compute/filter.rs index 0299e5045f..7ba260e702 100644 --- a/src/compute/filter.rs +++ b/src/compute/filter.rs @@ -108,7 +108,12 @@ where std::ptr::copy(chunk.as_ptr(), dst, size); dst = dst.add(size); - new_validity.extend_from_slice(validity_chunk.to_ne_bytes().as_ref(), 0, size); + // safety: invariant offset + length <= slice.len() + new_validity.extend_from_slice_unchecked( + validity_chunk.to_ne_bytes().as_ref(), + 0, + size, + ); } return; } From 6d9290e30d6b4017bef0b2aa25bb0383077e32ad Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 21 May 2023 10:01:02 +0200 Subject: [PATCH 14/80] Added PushUnchecked trait (#1489) --- src/array/fixed_size_list/mutable.rs | 29 ++++++++++++++++++++++++++++ src/array/mod.rs | 9 +++++++++ 2 files changed, 38 insertions(+) diff --git a/src/array/fixed_size_list/mutable.rs b/src/array/fixed_size_list/mutable.rs index d929f75e6e..1e387a2f70 100644 --- a/src/array/fixed_size_list/mutable.rs +++ b/src/array/fixed_size_list/mutable.rs @@ -1,5 +1,6 @@ use std::sync::Arc; +use crate::array::PushUnchecked; use crate::{ array::{ physical_binary::extend_validity, Array, MutableArray, TryExtend, TryExtendFromSelf, @@ -104,6 +105,15 @@ impl MutableFixedSizeListArray { Ok(()) } + #[inline] + /// Needs to be called when a valid value was extended to this array. + /// This is a relatively low level function, prefer `try_push` when you can. + pub fn push_valid(&mut self) { + if let Some(validity) = &mut self.validity { + validity.push(true) + } + } + #[inline] fn push_null(&mut self) { (0..self.size).for_each(|_| self.values.push_null()); @@ -221,6 +231,25 @@ where } } +impl PushUnchecked> for MutableFixedSizeListArray +where + M: MutableArray + Extend>, + I: IntoIterator>, +{ + /// # Safety + /// The caller must ensure that the `I` iterates exactly over `size` + /// items, where `size` is the fixed size width. + #[inline] + unsafe fn push_unchecked(&mut self, item: Option) { + if let Some(items) = item { + self.values.extend(items); + self.push_valid(); + } else { + self.push_null(); + } + } +} + impl TryExtendFromSelf for MutableFixedSizeListArray where M: MutableArray + TryExtendFromSelf, diff --git a/src/array/mod.rs b/src/array/mod.rs index f9c320a650..50eb962b2b 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -751,6 +751,15 @@ pub trait TryPush { fn try_push(&mut self, item: A) -> Result<()>; } +/// A trait describing the ability of a struct to receive new items. +pub trait PushUnchecked { + /// Push a new element that holds the invariants of the struct. + /// # Safety + /// The items must uphold the invariants of the struct + /// Read the specific implementation of the trait to understand what these are. + unsafe fn push_unchecked(&mut self, item: A); +} + /// A trait describing the ability of a struct to extend from a reference of itself. /// Specialization of [`TryExtend`]. 
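The new `PushUnchecked` impl on `MutableFixedSizeListArray` moves the width check to the caller; a sketch (assuming each pushed iterator yields exactly the fixed size):

use arrow2::array::{MutableFixedSizeListArray, MutablePrimitiveArray, PushUnchecked};

fn main() {
    let values = MutablePrimitiveArray::<i32>::new();
    let mut arr = MutableFixedSizeListArray::new(values, 2);
    // safety: each pushed iterator yields exactly 2 items, the fixed width
    unsafe {
        arr.push_unchecked(Some([Some(1), Some(2)]));
        arr.push_unchecked(None::<[Option<i32>; 2]>);
    }
    assert_eq!(arr.len(), 2);
}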
pub trait TryExtendFromSelf { From 38a2e9090554d25706536205fde2fa92fc850667 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Wed, 24 May 2023 07:05:46 +0200 Subject: [PATCH 15/80] Added cast FixedSizeList <-> LargeList (#1490) --- src/compute/cast/mod.rs | 49 +++++++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index e42f769e7e..afbf0fe44e 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -86,9 +86,15 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (FixedSizeList(list_from, _), List(list_to)) => { can_cast_types(&list_from.data_type, &list_to.data_type) } + (FixedSizeList(list_from, _), LargeList(list_to)) => { + can_cast_types(&list_from.data_type, &list_to.data_type) + } (List(list_from), FixedSizeList(list_to, _)) => { can_cast_types(&list_from.data_type, &list_to.data_type) } + (LargeList(list_from), FixedSizeList(list_to, _)) => { + can_cast_types(&list_from.data_type, &list_to.data_type) + } (List(list_from), List(list_to)) => { can_cast_types(&list_from.data_type, &list_to.data_type) } @@ -345,24 +351,24 @@ fn cast_large_to_list(array: &ListArray, to_type: &DataType) -> ListArray( fixed: &FixedSizeListArray, to_type: &DataType, options: CastOptions, -) -> Result> { +) -> Result> { let new_values = cast( fixed.values().as_ref(), - ListArray::::get_child_type(to_type), + ListArray::::get_child_type(to_type), options, )?; let offsets = (0..=fixed.len()) - .map(|ix| (ix * fixed.size()) as i32) + .map(|ix| O::from_as_usize(ix * fixed.size())) .collect::>(); // Safety: offsets _are_ monotonically increasing let offsets = unsafe { Offsets::new_unchecked(offsets) }; - Ok(ListArray::::new( + Ok(ListArray::::new( to_type.clone(), offsets.into(), new_values, @@ -370,20 +376,20 @@ fn cast_fixed_size_list_to_list( )) } -fn cast_list_to_fixed_size_list( - list: &ListArray, +fn cast_list_to_fixed_size_list( + list: &ListArray, inner: &Field, size: usize, options: CastOptions, ) -> Result { let offsets = list.offsets().buffer().iter(); - let expected = (0..list.len()).map(|ix| (ix * size) as i32); + let expected = (0..list.len()).map(|ix| O::from_as_usize(ix * size)); match offsets .zip(expected) .find(|(actual, expected)| *actual != expected) { - Some(_) => Err(Error::NotYetImplemented( + Some(_) => Err(Error::InvalidArgumentError( "incompatible offsets in source list".to_string(), )), None => { @@ -438,17 +444,32 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu (_, Struct(_)) => Err(Error::NotYetImplemented( "Cannot cast to struct from other types".to_string(), )), - (List(_), FixedSizeList(inner, size)) => cast_list_to_fixed_size_list( + (List(_), FixedSizeList(inner, size)) => cast_list_to_fixed_size_list::( array.as_any().downcast_ref().unwrap(), inner.as_ref(), *size, options, ) .map(|x| x.boxed()), - (FixedSizeList(_, _), List(_)) => { - cast_fixed_size_list_to_list(array.as_any().downcast_ref().unwrap(), to_type, options) - .map(|x| x.boxed()) - } + (LargeList(_), FixedSizeList(inner, size)) => cast_list_to_fixed_size_list::( + array.as_any().downcast_ref().unwrap(), + inner.as_ref(), + *size, + options, + ) + .map(|x| x.boxed()), + (FixedSizeList(_, _), List(_)) => cast_fixed_size_list_to_list::( + array.as_any().downcast_ref().unwrap(), + to_type, + options, + ) + .map(|x| x.boxed()), + (FixedSizeList(_, _), LargeList(_)) => cast_fixed_size_list_to_list::( + array.as_any().downcast_ref().unwrap(), + 
to_type, + options, + ) + .map(|x| x.boxed()), (List(_), List(_)) => { cast_list::(array.as_any().downcast_ref().unwrap(), to_type, options) .map(|x| x.boxed()) From a2a9bd7d32f9e107c1fc5ea79508b5e20a4e45b4 Mon Sep 17 00:00:00 2001 From: sundyli <543950155@qq.com> Date: Wed, 24 May 2023 23:02:58 -0700 Subject: [PATCH 16/80] ci: Disable integration-ipc tests (#1492) Disable integration-ipc.yml --- .github/workflows/integration-ipc.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration-ipc.yml b/.github/workflows/integration-ipc.yml index 11aa79bed5..7dcb8b0c7d 100644 --- a/.github/workflows/integration-ipc.yml +++ b/.github/workflows/integration-ipc.yml @@ -1,6 +1,7 @@ name: Integration IPC / Flight -on: [push, pull_request] +# on: [push, pull_request] +on: [] jobs: docker: From f348fd60351092216c611d58e5ae6680bcef2038 Mon Sep 17 00:00:00 2001 From: Howard Zuo Date: Thu, 25 May 2023 07:45:10 -0400 Subject: [PATCH 17/80] Implement `into_inner` for Utf8Array and BinaryArray for reusing Buffer allocations (#1491) --- src/array/binary/mod.rs | 12 ++++++++++++ src/array/utf8/mod.rs | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index b08e78dc6d..7247decb30 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -221,6 +221,18 @@ impl BinaryArray { impl_mut_validity!(); impl_into_array!(); + /// Returns its internal representation + #[must_use] + pub fn into_inner(self) -> (DataType, OffsetsBuffer, Buffer, Option) { + let Self { + data_type, + offsets, + values, + validity, + } = self; + (data_type, offsets, values, validity) + } + /// Try to convert this `BinaryArray` to a `MutableBinaryArray` #[must_use] pub fn into_mut(self) -> Either> { diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index 7392ccdd04..9440ae4330 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -240,6 +240,18 @@ impl Utf8Array { impl_mut_validity!(); impl_into_array!(); + /// Returns its internal representation + #[must_use] + pub fn into_inner(self) -> (DataType, OffsetsBuffer, Buffer, Option) { + let Self { + data_type, + offsets, + values, + validity, + } = self; + (data_type, offsets, values, validity) + } + /// Try to convert this `Utf8Array` to a `MutableUtf8Array` #[must_use] pub fn into_mut(self) -> Either> { From 320a90bf300f388747dba97bf8ddd838af4a41ea Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 25 May 2023 14:18:55 +0200 Subject: [PATCH 18/80] feat: expose `non_null_sum` (#1493) --- src/compute/aggregate/sum.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/compute/aggregate/sum.rs b/src/compute/aggregate/sum.rs index 9156a304c2..614a4cf538 100644 --- a/src/compute/aggregate/sum.rs +++ b/src/compute/aggregate/sum.rs @@ -21,7 +21,8 @@ pub trait Sum { } #[multiversion(targets = "simd")] -fn nonnull_sum(values: &[T]) -> T +/// Compute the sum of a slice +pub fn sum_slice(values: &[T]) -> T where T: NativeType + Simd + Add + std::iter::Sum, T::Simd: Sum + Add, @@ -97,7 +98,7 @@ where } match array.validity() { - None => Some(nonnull_sum(array.values())), + None => Some(sum_slice(array.values())), Some(bitmap) => Some(null_sum(array.values(), bitmap)), } } From 23e6ab937cdbba426bf1b532ab0e84947dafd38a Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 1 Jun 2023 13:34:35 +0200 Subject: [PATCH 19/80] chore: feature gate decimal arithmetic (#1494) --- Cargo.toml | 3 ++- src/compute/arithmetics/mod.rs | 1 + 2 files changed, 3 
insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 7c03aad27c..7e43f68c97 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -213,7 +213,8 @@ io_json_integration = ["hex", "serde", "serde_derive", "serde_json", "io_ipc"] io_print = ["comfy-table"] # the compute kernels. Disabling this significantly reduces compile time. compute_aggregate = ["multiversion"] -compute_arithmetics = ["strength_reduce"] +compute_arithmetics_decimal = ["strength_reduce"] +compute_arithmetics = ["strength_reduce", "compute_arithmetics_decimal"] compute_bitwise = [] compute_boolean = [] compute_boolean_kleene = [] diff --git a/src/compute/arithmetics/mod.rs b/src/compute/arithmetics/mod.rs index 3f09fe4d0b..b1ec2a12bc 100644 --- a/src/compute/arithmetics/mod.rs +++ b/src/compute/arithmetics/mod.rs @@ -13,6 +13,7 @@ //! adjusts the precision and scale to make the resulting value fit. #[forbid(unsafe_code)] pub mod basic; +#[cfg(feature = "compute_arithmetics_decimal")] pub mod decimal; pub mod time; From fbaf35e81d5357ebece1c49ac4e8a93aef26617a Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 1 Jun 2023 14:19:26 +0200 Subject: [PATCH 20/80] chore: update rustc and fix clippy (#1496) --- rust-toolchain.toml | 2 +- src/compute/sort/mod.rs | 56 +++++++++---------- src/ffi/stream.rs | 2 +- .../parquet/read/deserialize/nested_utils.rs | 6 +- tests/it/compute/sort/mod.rs | 14 ++--- 5 files changed, 34 insertions(+), 46 deletions(-) diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 5f7fac67f2..904c6cc5fc 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,2 +1,2 @@ [toolchain] -channel = "nightly-2022-12-05" +channel = "nightly-2023-06-01" diff --git a/src/compute/sort/mod.rs b/src/compute/sort/mod.rs index 92272d057c..be85c9f6cb 100644 --- a/src/compute/sort/mod.rs +++ b/src/compute/sort/mod.rs @@ -6,10 +6,7 @@ use crate::compute::take; use crate::datatypes::*; use crate::error::{Error, Result}; use crate::offset::Offset; -use crate::{ - array::*, - types::{Index, NativeType}, -}; +use crate::{array::*, types::Index}; mod binary; mod boolean; @@ -156,14 +153,14 @@ pub fn sort_to_indices( DataType::List(field) => { let (v, n) = partition_validity(values); match &field.data_type { - DataType::Int8 => Ok(sort_list::(values, v, n, options, limit)), - DataType::Int16 => Ok(sort_list::(values, v, n, options, limit)), - DataType::Int32 => Ok(sort_list::(values, v, n, options, limit)), - DataType::Int64 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt8 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt16 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt32 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt64 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int8 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int16 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int32 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int64 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt8 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt16 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt32 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt64 => Ok(sort_list::(values, v, n, options, limit)), t => Err(Error::NotYetImplemented(format!( "Sort not supported for list type {t:?}" ))), @@ -172,14 +169,14 @@ pub fn sort_to_indices( DataType::LargeList(field) => { let (v, n) = partition_validity(values); match field.data_type() 
{ - DataType::Int8 => Ok(sort_list::(values, v, n, options, limit)), - DataType::Int16 => Ok(sort_list::(values, v, n, options, limit)), - DataType::Int32 => Ok(sort_list::(values, v, n, options, limit)), - DataType::Int64 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt8 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt16 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt32 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt64 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int8 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int16 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int32 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int64 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt8 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt16 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt32 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt64 => Ok(sort_list::(values, v, n, options, limit)), t => Err(Error::NotYetImplemented(format!( "Sort not supported for list type {t:?}" ))), @@ -188,14 +185,14 @@ pub fn sort_to_indices( DataType::FixedSizeList(field, _) => { let (v, n) = partition_validity(values); match field.data_type() { - DataType::Int8 => Ok(sort_list::(values, v, n, options, limit)), - DataType::Int16 => Ok(sort_list::(values, v, n, options, limit)), - DataType::Int32 => Ok(sort_list::(values, v, n, options, limit)), - DataType::Int64 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt8 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt16 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt32 => Ok(sort_list::(values, v, n, options, limit)), - DataType::UInt64 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int8 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int16 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int32 => Ok(sort_list::(values, v, n, options, limit)), + DataType::Int64 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt8 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt16 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt32 => Ok(sort_list::(values, v, n, options, limit)), + DataType::UInt64 => Ok(sort_list::(values, v, n, options, limit)), t => Err(Error::NotYetImplemented(format!( "Sort not supported for list type {t:?}" ))), @@ -305,7 +302,7 @@ impl Default for SortOptions { } } -fn sort_list( +fn sort_list( values: &dyn Array, value_indices: Vec, null_indices: Vec, @@ -315,7 +312,6 @@ fn sort_list( where I: Index, O: Offset, - T: NativeType + std::cmp::PartialOrd, { let mut valids: Vec<(I, Box)> = values .as_any() diff --git a/src/ffi/stream.rs b/src/ffi/stream.rs index 5e0dabe012..2aeb1b47f5 100644 --- a/src/ffi/stream.rs +++ b/src/ffi/stream.rs @@ -132,7 +132,7 @@ unsafe extern "C" fn get_next(iter: *mut ArrowArrayStream, array: *mut ArrowArra if iter.is_null() { return 2001; } - let mut private = &mut *((*iter).private_data as *mut PrivateData); + let private = &mut *((*iter).private_data as *mut PrivateData); match private.iter.next() { Some(Ok(item)) => { diff --git a/src/io/parquet/read/deserialize/nested_utils.rs b/src/io/parquet/read/deserialize/nested_utils.rs index 211531f6c0..86c7f5bdab 100644 --- a/src/io/parquet/read/deserialize/nested_utils.rs +++ b/src/io/parquet/read/deserialize/nested_utils.rs @@ -452,11 +452,7 @@ fn 
extend_offsets2<'a, D: NestedDecoder<'a>>( let is_valid = nest.is_nullable() && def > cum_sum[depth]; nest.push(length, is_valid); - if nest.is_required() && !is_valid { - is_required = true; - } else { - is_required = false - }; + is_required = nest.is_required() && !is_valid; if depth == max_depth - 1 { // the leaf / primitive diff --git a/tests/it/compute/sort/mod.rs b/tests/it/compute/sort/mod.rs index 2ede887364..736cfbadba 100644 --- a/tests/it/compute/sort/mod.rs +++ b/tests/it/compute/sort/mod.rs @@ -41,11 +41,7 @@ fn string_arrays(data: &[Option<&str>], options: SortOptions, expected_data: &[O assert_eq!(expected, output.as_ref()) } -fn string_dict_arrays( - data: &[Option<&str>], - options: SortOptions, - expected_data: &[Option<&str>], -) { +fn string_dict_arrays(data: &[Option<&str>], options: SortOptions, expected_data: &[Option<&str>]) { let mut input = MutableDictionaryArray::>::new(); input.try_extend(data.iter().copied()).unwrap(); let input = input.into_arc(); @@ -351,7 +347,7 @@ fn strings() { #[test] fn string_dicts() { - string_dict_arrays::( + string_dict_arrays( &[ None, Some("bad"), @@ -374,7 +370,7 @@ fn string_dicts() { ], ); - string_dict_arrays::( + string_dict_arrays( &[ None, Some("bad"), @@ -397,7 +393,7 @@ fn string_dicts() { ], ); - string_dict_arrays::( + string_dict_arrays( &[ None, Some("bad"), @@ -420,7 +416,7 @@ fn string_dicts() { ], ); - string_dict_arrays::( + string_dict_arrays( &[ None, Some("bad"), From 1db6afad063555bb891202aaa5d2da6a5aa45f0b Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 1 Jun 2023 14:45:14 +0200 Subject: [PATCH 21/80] Added cast LargeBinary -> LargeList (#1497) --- src/compute/cast/binary_to.rs | 12 +++++++ src/compute/cast/mod.rs | 59 +++++++++++++++++++++-------------- 2 files changed, 48 insertions(+), 23 deletions(-) diff --git a/src/compute/cast/binary_to.rs b/src/compute/cast/binary_to.rs index d84c7dd1bd..82f827e3f6 100644 --- a/src/compute/cast/binary_to.rs +++ b/src/compute/cast/binary_to.rs @@ -144,3 +144,15 @@ pub fn fixed_size_binary_binary( from.validity().cloned(), ) } + +/// Conversion of binary +pub fn binary_to_list(from: &BinaryArray, to_data_type: DataType) -> ListArray { + let values = from.values().clone(); + let values = PrimitiveArray::new(DataType::UInt8, values, None); + ListArray::::new( + to_data_type, + from.offsets().clone(), + values.boxed(), + from.validity().cloned(), + ) +} diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index afbf0fe44e..d97878d497 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -137,7 +137,14 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Binary, to_type) => { is_numeric(to_type) || matches!(to_type, LargeBinary | Utf8 | LargeUtf8) } - (LargeBinary, to_type) => is_numeric(to_type) || matches!(to_type, Binary | LargeUtf8), + (LargeBinary, to_type) => { + is_numeric(to_type) + || match to_type { + Binary | LargeUtf8 => true, + LargeList(field) => matches!(field.data_type, UInt8), + _ => false, + } + } (FixedSizeBinary(_), to_type) => matches!(to_type, Binary | LargeBinary), (Timestamp(_, _), Utf8) => true, (Timestamp(_, _), LargeUtf8) => true, @@ -684,29 +691,35 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu ))), }, - (LargeBinary, _) => match to_type { - UInt8 => binary_to_primitive_dyn::(array, to_type, options), - UInt16 => binary_to_primitive_dyn::(array, to_type, options), - UInt32 => binary_to_primitive_dyn::(array, to_type, options), - UInt64 => 
binary_to_primitive_dyn::(array, to_type, options), - Int8 => binary_to_primitive_dyn::(array, to_type, options), - Int16 => binary_to_primitive_dyn::(array, to_type, options), - Int32 => binary_to_primitive_dyn::(array, to_type, options), - Int64 => binary_to_primitive_dyn::(array, to_type, options), - Float32 => binary_to_primitive_dyn::(array, to_type, options), - Float64 => binary_to_primitive_dyn::(array, to_type, options), - Binary => { - binary_large_to_binary(array.as_any().downcast_ref().unwrap(), to_type.clone()) - .map(|x| x.boxed()) - } - LargeUtf8 => { - binary_to_utf8::(array.as_any().downcast_ref().unwrap(), to_type.clone()) - .map(|x| x.boxed()) + (LargeBinary, _) => { + match to_type { + UInt8 => binary_to_primitive_dyn::(array, to_type, options), + UInt16 => binary_to_primitive_dyn::(array, to_type, options), + UInt32 => binary_to_primitive_dyn::(array, to_type, options), + UInt64 => binary_to_primitive_dyn::(array, to_type, options), + Int8 => binary_to_primitive_dyn::(array, to_type, options), + Int16 => binary_to_primitive_dyn::(array, to_type, options), + Int32 => binary_to_primitive_dyn::(array, to_type, options), + Int64 => binary_to_primitive_dyn::(array, to_type, options), + Float32 => binary_to_primitive_dyn::(array, to_type, options), + Float64 => binary_to_primitive_dyn::(array, to_type, options), + Binary => { + binary_large_to_binary(array.as_any().downcast_ref().unwrap(), to_type.clone()) + .map(|x| x.boxed()) + } + LargeUtf8 => { + binary_to_utf8::(array.as_any().downcast_ref().unwrap(), to_type.clone()) + .map(|x| x.boxed()) + } + LargeList(inner) if matches!(inner.data_type, DataType::UInt8) => Ok( + binary_to_list::(array.as_any().downcast_ref().unwrap(), to_type.clone()) + .boxed(), + ), + _ => Err(Error::NotYetImplemented(format!( + "Casting from {from_type:?} to {to_type:?} not supported", + ))), } - _ => Err(Error::NotYetImplemented(format!( - "Casting from {from_type:?} to {to_type:?} not supported", - ))), - }, + } (FixedSizeBinary(_), _) => match to_type { Binary => Ok(fixed_size_binary_binary::( array.as_any().downcast_ref().unwrap(), From 1ebb7dbe7ca548d0628a028396bfc7c8eaa684c0 Mon Sep 17 00:00:00 2001 From: Reece Kibble Date: Thu, 1 Jun 2023 21:07:24 +0800 Subject: [PATCH 22/80] Fix missing `UInt64` case for page filtering (#1498) --- src/io/parquet/read/indexes/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/io/parquet/read/indexes/mod.rs b/src/io/parquet/read/indexes/mod.rs index 08a8dde5d8..de7b69b3a4 100644 --- a/src/io/parquet/read/indexes/mod.rs +++ b/src/io/parquet/read/indexes/mod.rs @@ -147,7 +147,8 @@ fn deserialize( .unwrap(); Ok(primitive::deserialize_i32(&index.indexes, data_type).into()) } - PhysicalType::Primitive(PrimitiveType::Int64) => { + PhysicalType::Primitive(PrimitiveType::UInt64) + | PhysicalType::Primitive(PrimitiveType::Int64) => { let index = indexes.pop_front().unwrap(); match index.physical_type() { ParquetPhysicalType::Int64 => { From fb5e4d591c7149df590a330365fae55d2370962f Mon Sep 17 00:00:00 2001 From: sundyli <543950155@qq.com> Date: Thu, 1 Jun 2023 06:07:41 -0700 Subject: [PATCH 23/80] Bump arrow-rs to 40.0.0 (#1495) --- Cargo.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7e43f68c97..79651e5ab1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -101,10 +101,10 @@ odbc-api = { version = "0.36", optional = true } ahash = "0.8" # Support conversion to/from arrow-rs -arrow-buffer = { version = "39.0.0", optional = true } 
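The LargeBinary -> LargeList cast added in #1497 above is zero-copy: `binary_to_list` re-labels the raw byte buffer as UInt8 values and reuses the binary array's offsets and validity unchanged. A minimal sketch of that construction, mirroring the new kernel (the inner field name "item" is an arbitrary choice for illustration):

use arrow2::array::{BinaryArray, ListArray, PrimitiveArray};
use arrow2::datatypes::{DataType, Field};

fn bytes_as_list(from: &BinaryArray<i64>) -> ListArray<i64> {
    let item = Box::new(Field::new("item", DataType::UInt8, true));
    // reinterpret the raw bytes as a UInt8 array (no copy of the buffer)
    let values = PrimitiveArray::new(DataType::UInt8, from.values().clone(), None);
    // reuse the existing offsets and validity to frame the byte lists
    ListArray::new(
        DataType::LargeList(item),
        from.offsets().clone(),
        values.boxed(),
        from.validity().cloned(),
    )
}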
-arrow-schema = { version = "39.0.0", optional = true } -arrow-data = { version = "39.0.0", optional = true } -arrow-array = { version = "39.0.0", optional = true } +arrow-buffer = { version = "^40.0.0", optional = true } +arrow-schema = { version = "^40.0.0", optional = true } +arrow-data = { version = "^40.0.0", optional = true } +arrow-array = { version = "^40.0.0", optional = true } [target.wasm32-unknown-unknown.dependencies] getrandom = { version = "0.2", features = ["js"] } From 82c959a88bae848fa34848707a2e1e632f8e4054 Mon Sep 17 00:00:00 2001 From: Qqwy / Marten Date: Mon, 5 Jun 2023 05:59:05 +0200 Subject: [PATCH 24/80] Relax the checks for the presence of ArrowSchema.name as this field is optional (#1499) --- src/ffi/schema.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/ffi/schema.rs b/src/ffi/schema.rs index 2751583ef1..e41de33e43 100644 --- a/src/ffi/schema.rs +++ b/src/ffi/schema.rs @@ -154,15 +154,18 @@ impl ArrowSchema { } /// returns the name of this schema. + /// + /// Since this field is optional, `""` is returned if it is not set (as per the spec). pub(crate) fn name(&self) -> &str { - assert!(!self.name.is_null()); + if self.name.is_null() { + return ""; + } // safe because the lifetime of `self.name` equals `self` unsafe { CStr::from_ptr(self.name) }.to_str().unwrap() } pub(crate) fn child(&self, index: usize) -> &'static Self { assert!(index < self.n_children as usize); - assert!(!self.name.is_null()); unsafe { self.children.add(index).as_ref().unwrap().as_ref().unwrap() } } From 99e30d3549219934adfee00a60f15f6c0ffb06e4 Mon Sep 17 00:00:00 2001 From: Qqwy / Marten Date: Mon, 5 Jun 2023 05:59:21 +0200 Subject: [PATCH 25/80] Allows to use wrappers other than Box to build ArrowArrayStreamReader. (#1500) --- src/ffi/stream.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ffi/stream.rs b/src/ffi/stream.rs index 2aeb1b47f5..9611935821 100644 --- a/src/ffi/stream.rs +++ b/src/ffi/stream.rs @@ -1,4 +1,5 @@ use std::ffi::{CStr, CString}; +use std::ops::DerefMut; use crate::{array::Array, datatypes::Field, error::Error}; @@ -45,12 +46,12 @@ unsafe fn handle_error(iter: &mut ArrowArrayStream) -> Error { } /// Implements an iterator of [`Array`] consumed from the [C stream interface](https://arrow.apache.org/docs/format/CStreamInterface.html). 
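The relaxed check in #1499 above follows the C data interface, where `ArrowSchema.name` is optional and a null pointer simply means "no name". A standalone sketch of the pattern (`optional_c_str` is a hypothetical helper, not a crate API):

use std::ffi::CStr;
use std::os::raw::c_char;

// SAFETY contract: a non-null `name` must point to a valid, UTF-8,
// NUL-terminated string that outlives the returned borrow.
unsafe fn optional_c_str<'a>(name: *const c_char) -> &'a str {
    if name.is_null() {
        // optional field: absent per the spec, not out-of-spec
        return "";
    }
    CStr::from_ptr(name).to_str().unwrap()
}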
-pub struct ArrowArrayStreamReader { - iter: Box, +pub struct ArrowArrayStreamReader> { + iter: Iter, field: Field, } -impl ArrowArrayStreamReader { +impl> ArrowArrayStreamReader { /// Returns a new [`ArrowArrayStreamReader`] /// # Error /// Errors iff the [`ArrowArrayStream`] is out of specification @@ -60,7 +61,7 @@ impl ArrowArrayStreamReader { /// In particular: /// * The `ArrowArrayStream` fulfills the invariants of the C stream interface /// * The schema `get_schema` produces fulfills the C data interface - pub unsafe fn try_new(mut iter: Box) -> Result { + pub unsafe fn try_new(mut iter: Iter) -> Result { if iter.get_next.is_none() { return Err(Error::OutOfSpec( "The C stream MUST contain a non-null get_next".to_string(), From 0bfc3c05287c500ff42120dd17d5cf800484590b Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 9 Jun 2023 08:55:32 +0200 Subject: [PATCH 26/80] inline MutablePrimitiveArray::push (#1505) --- src/array/primitive/mutable.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/array/primitive/mutable.rs b/src/array/primitive/mutable.rs index ce6c8fa3c9..4432ab2e33 100644 --- a/src/array/primitive/mutable.rs +++ b/src/array/primitive/mutable.rs @@ -132,6 +132,7 @@ impl MutablePrimitiveArray { } /// Adds a new value to the array. + #[inline] pub fn push(&mut self, value: Option) { match value { Some(value) => { From f46e26cfef09f6355e22af2e13c82fb07859d29f Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 12 Jun 2023 10:30:47 +0200 Subject: [PATCH 27/80] feat: add from_inner (#1506) --- src/array/boolean/mod.rs | 17 ++++++++ src/array/primitive/mod.rs | 28 +++++++++++++ src/bitmap/immutable.rs | 85 ++++++++++++++++++++++++++++---------- src/buffer/immutable.rs | 24 +++++++++++ src/buffer/mod.rs | 52 ++++++++++++++++++++++- src/ffi/array.rs | 3 +- 6 files changed, 185 insertions(+), 24 deletions(-) diff --git a/src/array/boolean/mod.rs b/src/array/boolean/mod.rs index 78555572bb..0b634ee90e 100644 --- a/src/array/boolean/mod.rs +++ b/src/array/boolean/mod.rs @@ -354,6 +354,23 @@ impl BooleanArray { } = self; (data_type, values, validity) } + + /// Creates a `[BooleanArray]` from its internal representation. + /// This is the inverted from `[BooleanArray::into_inner]` + /// + /// # Safety + /// Callers must ensure all invariants of this struct are upheld. + pub unsafe fn from_inner_unchecked( + data_type: DataType, + values: Bitmap, + validity: Option, + ) -> Self { + Self { + data_type, + values, + validity, + } + } } impl Array for BooleanArray { diff --git a/src/array/primitive/mod.rs b/src/array/primitive/mod.rs index 9e4ae03881..04b74a3529 100644 --- a/src/array/primitive/mod.rs +++ b/src/array/primitive/mod.rs @@ -289,6 +289,34 @@ impl PrimitiveArray { (data_type, values, validity) } + /// Creates a `[PrimitiveArray]` from its internal representation. + /// This is the inverted from `[PrimitiveArray::into_inner]` + pub fn from_inner( + data_type: DataType, + values: Buffer, + validity: Option, + ) -> Result { + check(&data_type, &values, validity.as_ref().map(|v| v.len()))?; + Ok(unsafe { Self::from_inner_unchecked(data_type, values, validity) }) + } + + /// Creates a `[PrimitiveArray]` from its internal representation. + /// This is the inverted from `[PrimitiveArray::into_inner]` + /// + /// # Safety + /// Callers must ensure all invariants of this struct are upheld. 
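A sketch of the round trip these `into_inner`/`from_inner_unchecked` pairs enable, using the `BooleanArray` variant shown just below; the `unsafe` is justified here because the parts come straight from a valid array:

use arrow2::array::BooleanArray;

fn round_trip(array: BooleanArray) -> BooleanArray {
    // decompose into (DataType, Bitmap, Option<Bitmap>)
    let (data_type, values, validity) = array.into_inner();
    // SAFETY: the parts were produced by `into_inner`, so invariants hold
    unsafe { BooleanArray::from_inner_unchecked(data_type, values, validity) }
}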
+ pub unsafe fn from_inner_unchecked( + data_type: DataType, + values: Buffer, + validity: Option, + ) -> Self { + Self { + data_type, + values, + validity, + } + } + /// Try to convert this [`PrimitiveArray`] to a [`MutablePrimitiveArray`] via copy-on-write semantics. /// /// A [`PrimitiveArray`] is backed by a [`Buffer`] and [`Bitmap`] which are essentially `Arc>`. diff --git a/src/bitmap/immutable.rs b/src/bitmap/immutable.rs index be1e3a662d..2a5e14dc5b 100644 --- a/src/bitmap/immutable.rs +++ b/src/bitmap/immutable.rs @@ -65,6 +65,17 @@ impl Default for Bitmap { } } +pub(super) fn check(bytes: &[u8], offset: usize, length: usize) -> Result<(), Error> { + if offset + length > bytes.len().saturating_mul(8) { + return Err(Error::InvalidArgumentError(format!( + "The offset + length of the bitmap ({}) must be `<=` to the number of bytes times 8 ({})", + offset + length, + bytes.len().saturating_mul(8) + ))); + } + Ok(()) +} + impl Bitmap { /// Initializes an empty [`Bitmap`]. #[inline] @@ -77,13 +88,7 @@ impl Bitmap { /// This function errors iff `length > bytes.len() * 8` #[inline] pub fn try_new(bytes: Vec, length: usize) -> Result { - if length > bytes.len().saturating_mul(8) { - return Err(Error::InvalidArgumentError(format!( - "The length of the bitmap ({}) must be `<=` to the number of bytes times 8 ({})", - length, - bytes.len().saturating_mul(8) - ))); - } + check(&bytes, 0, length)?; let unset_bits = count_zeros(&bytes, 0, length); Ok(Self { length, @@ -117,21 +122,6 @@ impl Bitmap { BitChunks::new(&self.bytes, self.offset, self.length) } - /// Creates a new [`Bitmap`] from [`Bytes`] and a length. - /// # Panic - /// Panics iff `length <= bytes.len() * 8` - #[inline] - pub(crate) fn from_bytes(bytes: Bytes, length: usize) -> Self { - assert!(length <= bytes.len() * 8); - let unset_bits = count_zeros(&bytes, 0, length); - Self { - length, - offset: 0, - bytes: Arc::new(bytes), - unset_bits, - } - } - /// Returns the byte slice of this [`Bitmap`]. /// /// The returned tuple contains: @@ -327,6 +317,57 @@ impl Bitmap { None } } + + /// Returns its internal representation + #[must_use] + pub fn into_inner(self) -> (Arc>, usize, usize, usize) { + let Self { + bytes, + offset, + length, + unset_bits, + } = self; + (bytes, offset, length, unset_bits) + } + + /// Creates a `[Bitmap]` from its internal representation. + /// This is the inverted from `[Bitmap::into_inner]` + /// + /// # Safety + /// The invariants of this struct must be upheld + pub unsafe fn from_inner( + bytes: Arc>, + offset: usize, + length: usize, + unset_bits: usize, + ) -> Result { + check(&bytes, offset, length)?; + Ok(Self { + bytes, + offset, + length, + unset_bits, + }) + } + + /// Creates a `[Bitmap]` from its internal representation. + /// This is the inverted from `[Bitmap::into_inner]` + /// + /// # Safety + /// Callers must ensure all invariants of this struct are upheld. + pub unsafe fn from_inner_unchecked( + bytes: Arc>, + offset: usize, + length: usize, + unset_bits: usize, + ) -> Self { + Self { + bytes, + offset, + length, + unset_bits, + } + } } impl> From
for Bitmap { diff --git a/src/buffer/immutable.rs b/src/buffer/immutable.rs index 10c7968b0d..0da4a41ace 100644 --- a/src/buffer/immutable.rs +++ b/src/buffer/immutable.rs @@ -244,6 +244,30 @@ impl Buffer { pub fn shared_count_weak(&self) -> usize { Arc::weak_count(&self.data) } + + /// Returns its internal representation + #[must_use] + pub fn into_inner(self) -> (Arc>, usize, usize) { + let Self { + data, + offset, + length, + } = self; + (data, offset, length) + } + + /// Creates a `[Bitmap]` from its internal representation. + /// This is the inverted from `[Bitmap::into_inner]` + /// + /// # Safety + /// Callers must ensure all invariants of this struct are upheld. + pub unsafe fn from_inner_unchecked(data: Arc>, offset: usize, length: usize) -> Self { + Self { + data, + offset, + length, + } + } } impl From> for Buffer { diff --git a/src/buffer/mod.rs b/src/buffer/mod.rs index 7ce2677532..46c0a4d64a 100644 --- a/src/buffer/mod.rs +++ b/src/buffer/mod.rs @@ -4,6 +4,7 @@ mod immutable; mod iterator; use crate::ffi::InternalArrowArray; +use std::ops::Deref; pub(crate) enum BytesAllocator { InternalArrowArray(InternalArrowArray), @@ -11,8 +12,57 @@ pub(crate) enum BytesAllocator { #[cfg(feature = "arrow")] Arrow(arrow_buffer::Buffer), } +pub(crate) type BytesInner = foreign_vec::ForeignVec; -pub(crate) type Bytes = foreign_vec::ForeignVec; +/// Bytes representation. +#[repr(transparent)] +pub struct Bytes(BytesInner); + +impl Bytes { + /// Takes ownership of an allocated memory region. + /// # Panics + /// This function panics if and only if pointer is not null + /// # Safety + /// This function is safe if and only if `ptr` is valid for `length` + /// # Implementation + /// This function leaks if and only if `owner` does not deallocate + /// the region `[ptr, ptr+length[` when dropped. + #[inline] + pub(crate) unsafe fn from_foreign(ptr: *const T, length: usize, owner: BytesAllocator) -> Self { + Self(BytesInner::from_foreign(ptr, length, owner)) + } + + /// Returns a `Some` mutable reference of [`Vec`] iff this was initialized + /// from a [`Vec`] and `None` otherwise. 
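The `Bytes` wrapper above is a classic newtype: `repr(transparent)` keeps its layout identical to the inner `ForeignVec`, and `Deref` exposes the slice API without leaking the internals. A self-contained illustration of the pattern (`Wrapper` and its `Vec<u8>` inner are stand-ins, not crate types):

use std::ops::Deref;

#[repr(transparent)]
struct Wrapper(Vec<u8>); // `Vec<u8>` stands in for the crate's inner type

impl Deref for Wrapper {
    type Target = [u8];

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

fn main() {
    let w = Wrapper(vec![1, 2, 3]);
    assert_eq!(w.len(), 3); // slice methods resolve through `Deref`
}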
+ #[inline] + pub(crate) fn get_vec(&mut self) -> Option<&mut Vec> { + self.0.get_vec() + } +} + +impl Deref for Bytes { + type Target = [T]; + + #[inline] + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl From> for Bytes { + #[inline] + fn from(data: Vec) -> Self { + let inner: BytesInner = data.into(); + Bytes(inner) + } +} + +impl From> for Bytes { + #[inline] + fn from(value: BytesInner) -> Self { + Self(value) + } +} #[cfg(feature = "arrow")] pub(crate) fn to_buffer( diff --git a/src/ffi/array.rs b/src/ffi/array.rs index ad1b0568a7..1fa68eabbe 100644 --- a/src/ffi/array.rs +++ b/src/ffi/array.rs @@ -258,10 +258,11 @@ unsafe fn create_bitmap( let len: usize = array.length.try_into().expect("length to fit in `usize`"); let offset: usize = array.offset.try_into().expect("Offset to fit in `usize`"); + let null_count: usize = array.null_count(); let bytes_len = bytes_for(offset + len); let bytes = Bytes::from_foreign(ptr, bytes_len, BytesAllocator::InternalArrowArray(owner)); - Ok(Bitmap::from_bytes(bytes, offset + len).sliced(offset, len)) + Bitmap::from_inner(Arc::new(bytes), offset, len, null_count) } fn buffer_offset(array: &ArrowArray, data_type: &DataType, i: usize) -> usize { From 2d2e7053f9a50810bfe9cecff25ab39089aef98e Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 12 Jun 2023 14:31:15 +0200 Subject: [PATCH 28/80] defer null_count compute (#1507) * defer null_count compute * rogue import --- src/ffi/array.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/ffi/array.rs b/src/ffi/array.rs index 1fa68eabbe..b1c77d7366 100644 --- a/src/ffi/array.rs +++ b/src/ffi/array.rs @@ -1,6 +1,7 @@ //! Contains functionality to load an ArrayData from the C Data Interface use std::sync::Arc; +use crate::bitmap::utils::count_zeros; use crate::buffer::BytesAllocator; use crate::{ array::*, @@ -253,15 +254,22 @@ unsafe fn create_bitmap( data_type: &DataType, owner: InternalArrowArray, index: usize, + // if this is the validity bitmap + // we can use the null count directly + is_validity: bool, ) -> Result { let ptr = get_buffer_ptr(array, data_type, index)?; let len: usize = array.length.try_into().expect("length to fit in `usize`"); - let offset: usize = array.offset.try_into().expect("Offset to fit in `usize`"); - let null_count: usize = array.null_count(); + let offset: usize = array.offset.try_into().expect("offset to fit in `usize`"); let bytes_len = bytes_for(offset + len); let bytes = Bytes::from_foreign(ptr, bytes_len, BytesAllocator::InternalArrowArray(owner)); + let null_count: usize = if is_validity { + array.null_count() + } else { + count_zeros(bytes.as_ref(), offset, len) + }; Bitmap::from_inner(Arc::new(bytes), offset, len, null_count) } @@ -420,7 +428,7 @@ pub trait ArrowArrayRef: std::fmt::Debug { if self.array().null_count() == 0 { Ok(None) } else { - create_bitmap(self.array(), self.data_type(), self.owner(), 0).map(Some) + create_bitmap(self.array(), self.data_type(), self.owner(), 0, true).map(Some) } } @@ -436,7 +444,7 @@ pub trait ArrowArrayRef: std::fmt::Debug { /// * the buffer at position `index` is valid for the declared length /// * the buffers' pointer is not mutable for the lifetime of `owner` unsafe fn bitmap(&self, index: usize) -> Result { - create_bitmap(self.array(), self.data_type(), self.owner(), index) + create_bitmap(self.array(), self.data_type(), self.owner(), index, false) } /// # Safety From 0d568a38850176d710523771a3a8c14b3d93b9ba Mon Sep 17 00:00:00 2001 From: Colin Jermain Date: Mon, 19 Jun 2023 
05:25:10 -0400 Subject: [PATCH 29/80] Adding new constructor for MutableListArray (#1503) --- src/array/list/mutable.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/array/list/mutable.rs b/src/array/list/mutable.rs index 8841ceaf53..2bb39dea87 100644 --- a/src/array/list/mutable.rs +++ b/src/array/list/mutable.rs @@ -142,6 +142,23 @@ impl MutableListArray { Self::new_from(values, data_type, capacity) } + /// Creates a new [`MutableListArray`] from a [`MutableArray`], [`Offsets`] and + /// [`MutableBitmap`]. + pub fn new_from_mutable( + values: M, + offsets: Offsets, + validity: Option, + ) -> Self { + assert_eq!(values.len(), offsets.last().to_usize()); + let data_type = values.data_type().clone(); + Self { + data_type, + offsets, + values, + validity, + } + } + #[inline] /// Needs to be called when a valid value was extended to this array. /// This is a relatively low level function, prefer `try_push` when you can. From 3ab9b61f0ec6656d4613ad077c87bad6fac4682d Mon Sep 17 00:00:00 2001 From: Colin Jermain Date: Mon, 19 Jun 2023 22:19:13 -0400 Subject: [PATCH 30/80] Fixing dtype for MutableListArray (#1509) --- src/array/list/mutable.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/array/list/mutable.rs b/src/array/list/mutable.rs index 2bb39dea87..d24475e86d 100644 --- a/src/array/list/mutable.rs +++ b/src/array/list/mutable.rs @@ -150,7 +150,7 @@ impl MutableListArray { validity: Option, ) -> Self { assert_eq!(values.len(), offsets.last().to_usize()); - let data_type = values.data_type().clone(); + let data_type = ListArray::::default_datatype(values.data_type().clone()); Self { data_type, offsets, From 98287faf826d74982be2c1734b514a4e86ff14d1 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 26 Jun 2023 09:06:58 +0200 Subject: [PATCH 31/80] feat: add json null serialization (#1512) --- src/io/json/write/serialize.rs | 10 ++++++++++ src/util/mod.rs | 2 ++ tests/it/io/json/write.rs | 9 +++++++++ 3 files changed, 21 insertions(+) diff --git a/src/io/json/write/serialize.rs b/src/io/json/write/serialize.rs index dfa104d78e..7d46b82e0f 100644 --- a/src/io/json/write/serialize.rs +++ b/src/io/json/write/serialize.rs @@ -51,6 +51,15 @@ fn boolean_serializer<'a>( materialize_serializer(f, array.iter(), offset, take) } +fn null_serializer( + len: usize, + offset: usize, + take: usize, +) -> Box + Send + Sync> { + let f = |_x: (), buf: &mut Vec| buf.extend_from_slice(b"null"); + materialize_serializer(f, std::iter::repeat(()).take(len), offset, take) +} + fn primitive_serializer<'a, T: NativeType + ToLexical>( array: &'a PrimitiveArray, offset: usize, @@ -376,6 +385,7 @@ pub(crate) fn new_serializer<'a>( ) } } + DataType::Null => null_serializer(array.len(), offset, take), other => todo!("Writing {:?} to JSON", other), } } diff --git a/src/util/mod.rs b/src/util/mod.rs index c88c76e6da..90642b151a 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -5,6 +5,7 @@ feature = "io_csv_write", feature = "io_csv_read", feature = "io_json", + feature = "io_json_write", feature = "compute_cast" ))] mod lexical; @@ -13,6 +14,7 @@ mod lexical; feature = "io_csv_write", feature = "io_csv_read", feature = "io_json", + feature = "io_json_write", feature = "compute_cast" ))] pub use lexical::*; diff --git a/tests/it/io/json/write.rs b/tests/it/io/json/write.rs index 9c8d1313f2..44cc7ef125 100644 --- a/tests/it/io/json/write.rs +++ b/tests/it/io/json/write.rs @@ -26,6 +26,15 @@ fn int32() -> Result<()> { test!(array, expected) } +#[test] +fn null() -> 
Result<()> { + let array = NullArray::new(DataType::Null, 3); + + let expected = r#"[null,null,null]"#; + + test!(array, expected) +} + #[test] fn f32() -> Result<()> { let array = Float32Array::from([ From e923e03d38ed84c1d2ff37867517d31b3ed8aca9 Mon Sep 17 00:00:00 2001 From: Frank Murphy Date: Mon, 26 Jun 2023 03:08:31 -0400 Subject: [PATCH 32/80] Fix list array parsing in pandas record json (#1511) --- src/io/json/read/deserialize.rs | 13 +++++-------- src/io/json/read/infer_schema.rs | 7 +------ tests/it/io/json/read.rs | 29 +++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 14 deletions(-) diff --git a/src/io/json/read/deserialize.rs b/src/io/json/read/deserialize.rs index d4147cd033..d889ce4320 100644 --- a/src/io/json/read/deserialize.rs +++ b/src/io/json/read/deserialize.rs @@ -625,14 +625,11 @@ fn allocate_array(f: &Field) -> Box { f.data_type().clone(), *size, )), - DataType::List(inner) => match inner.data_type() { - DataType::List(_) => Box::new(MutableListArray::::new_from( - allocate_array(inner), - inner.data_type().clone(), - 0, - )), - _ => allocate_array(inner), - }, + DataType::List(inner) => Box::new(MutableListArray::::new_from( + allocate_array(inner), + f.data_type().clone(), + 0, + )), _ => todo!(), } } diff --git a/src/io/json/read/infer_schema.rs b/src/io/json/read/infer_schema.rs index f098bf80d3..c4ba23f56f 100644 --- a/src/io/json/read/infer_schema.rs +++ b/src/io/json/read/infer_schema.rs @@ -38,12 +38,7 @@ pub fn infer_records_schema(json: &Value) -> Result { Ok(Field { name: name.clone(), - data_type: DataType::List(Box::new(Field { - name: format!("{name}-records"), - data_type, - is_nullable: true, - metadata: Metadata::default(), - })), + data_type, is_nullable: true, metadata: Metadata::default(), }) diff --git a/tests/it/io/json/read.rs b/tests/it/io/json/read.rs index 2f78069396..fe91fb378e 100644 --- a/tests/it/io/json/read.rs +++ b/tests/it/io/json/read.rs @@ -164,6 +164,35 @@ fn read_json_fixed_size_records() -> Result<()> { Ok(()) } +#[test] +fn read_json_records_with_schema() -> Result<()> { + let raw = b"[{\"matrix\":[0.0,2.0]},{\"matrix\":[0.0,0.0,2.1,3.0]}]"; + let schema = Schema { + fields: vec![Field::new( + "matrix", + DataType::List(Box::new(Field::new("inner", DataType::Float32, false))), + false, + )], + metadata: Metadata::default(), + }; + + let json = json_deserializer::parse(raw)?; + let actual = read::deserialize_records(&json, &schema)?; + assert_eq!( + format!("{:?}", actual.arrays()[0]), + "ListArray[[0, 2], [0, 0, 2.1, 3]]" + ); + + let schema = read::infer_records_schema(&json)?; + let actual = read::deserialize_records(&json, &schema)?; + assert_eq!( + format!("{:?}", actual.arrays()[0]), + "ListArray[[0, 2], [0, 0, 2.1, 3]]" + ); + + Ok(()) +} + #[test] fn deserialize_timestamp_string_ns() -> Result<()> { let data = br#"["2023-04-07T12:23:34.000000001Z"]"#; From d1240b68ab9cb3d3cb3aaee514a2b376f6e0dfb6 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 26 Jun 2023 12:17:33 +0200 Subject: [PATCH 33/80] perf: don't needlessly trigger bitcount (#1513) --- src/bitmap/immutable.rs | 5 ++++- src/bitmap/mutable.rs | 16 ++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/bitmap/immutable.rs b/src/bitmap/immutable.rs index 2a5e14dc5b..41799e0adb 100644 --- a/src/bitmap/immutable.rs +++ b/src/bitmap/immutable.rs @@ -282,7 +282,10 @@ impl Bitmap { /// Initializes an new [`Bitmap`] filled with unset values. 
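The `new_zeroed` change that follows can skip the popcount entirely: a freshly zeroed bitmap has `unset_bits == length` by construction, so only the byte count needs computing. A sketch of that arithmetic (`zeroed_bitmap_parts` is a hypothetical free function, not a crate API):

fn zeroed_bitmap_parts(length: usize) -> (Vec<u8>, usize) {
    // ceil(length / 8) bytes, guarded against overflow near usize::MAX
    let bytes = vec![0u8; length.saturating_add(7) / 8];
    // every bit starts unset, so the unset count is simply `length`
    (bytes, length)
}

fn main() {
    assert_eq!(zeroed_bitmap_parts(10), (vec![0u8, 0u8], 10));
}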
#[inline] pub fn new_zeroed(length: usize) -> Self { - MutableBitmap::from_len_zeroed(length).into() + // don't use `MutableBitmap::from_len_zeroed().into()` + // it triggers a bitcount + let bytes = vec![0; length.saturating_add(7) / 8]; + unsafe { Bitmap::from_inner_unchecked(Arc::new(bytes.into()), 0, length, length) } } /// Counts the nulls (unset bits) starting from `offset` bits and for `length` bits. diff --git a/src/bitmap/mutable.rs b/src/bitmap/mutable.rs index 1cc2193917..31834f2165 100644 --- a/src/bitmap/mutable.rs +++ b/src/bitmap/mutable.rs @@ -1,5 +1,6 @@ use std::hint::unreachable_unchecked; use std::iter::FromIterator; +use std::sync::Arc; use crate::bitmap::utils::{merge_reversed, set_bit_unchecked}; use crate::error::Error; @@ -336,8 +337,19 @@ impl From for Bitmap { impl From for Option { #[inline] fn from(buffer: MutableBitmap) -> Self { - if buffer.unset_bits() > 0 { - Some(Bitmap::try_new(buffer.buffer, buffer.length).unwrap()) + let unset_bits = buffer.unset_bits(); + if unset_bits > 0 { + // safety: + // invariants of the `MutableBitmap` equal that of `Bitmap` + let bitmap = unsafe { + Bitmap::from_inner_unchecked( + Arc::new(buffer.buffer.into()), + 0, + buffer.length, + unset_bits, + ) + }; + Some(bitmap) } else { None } From 8ee5ad8c774e7355218376caf73d94bce2a769a6 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 3 Jul 2023 14:25:33 +0200 Subject: [PATCH 34/80] tag unsafe function as such (#1516) --- src/array/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/array/mod.rs b/src/array/mod.rs index 50eb962b2b..bbbbedc359 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -528,8 +528,8 @@ macro_rules! impl_sliced { /// The caller must ensure that `offset + length <= self.len()`. #[inline] #[must_use] - pub fn sliced_unchecked(mut self, offset: usize, length: usize) -> Self { - unsafe { self.slice_unchecked(offset, length) }; + pub unsafe fn sliced_unchecked(mut self, offset: usize, length: usize) -> Self { + self.slice_unchecked(offset, length); self } }; From e9386ffa36159abe8f6d82fa4ae2e40d2dd28e95 Mon Sep 17 00:00:00 2001 From: Frank Murphy Date: Sat, 8 Jul 2023 08:18:50 -0400 Subject: [PATCH 35/80] Pandas record support for utf8 and bool arrays (#1517) Co-authored-by: JONBRWN --- src/io/json/read/deserialize.rs | 3 +++ tests/it/io/json/read.rs | 36 +++++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/src/io/json/read/deserialize.rs b/src/io/json/read/deserialize.rs index d889ce4320..048e29a300 100644 --- a/src/io/json/read/deserialize.rs +++ b/src/io/json/read/deserialize.rs @@ -620,6 +620,9 @@ fn allocate_array(f: &Field) -> Box { DataType::Float16 => Box::new(MutablePrimitiveArray::::new()), DataType::Float32 => Box::new(MutablePrimitiveArray::::new()), DataType::Float64 => Box::new(MutablePrimitiveArray::::new()), + DataType::Boolean => Box::new(MutableBooleanArray::new()), + DataType::Utf8 => Box::new(MutableUtf8Array::::new()), + DataType::LargeUtf8 => Box::new(MutableUtf8Array::::new()), DataType::FixedSizeList(inner, size) => Box::new(MutableFixedSizeListArray::<_>::new_from( allocate_array(inner), f.data_type().clone(), diff --git a/tests/it/io/json/read.rs b/tests/it/io/json/read.rs index fe91fb378e..411564190b 100644 --- a/tests/it/io/json/read.rs +++ b/tests/it/io/json/read.rs @@ -43,7 +43,9 @@ fn read_json_records() -> Result<()> { [2, 3], [4, 5, 6] ], - "b": [1, 2, 3] + "b": [1, 2, 3], + "c": ["test"], + "d": [true] }, { "a": [ @@ -53,7 +55,9 @@ fn 
read_json_records() -> Result<()> { ] }, { - "b": [7, 8, 9] + "b": [7, 8, 9], + "c": ["string"], + "d": [false] } ]"#; @@ -96,6 +100,30 @@ fn read_json_records() -> Result<()> { b.try_extend(b_iter).unwrap(); let b_expected: ListArray = b.into(); + let c_iter = vec![vec![Some("test")], vec![Some("string")]]; + + let c_iter = c_iter.into_iter().map(Some); + let mut c = MutableListArray::>::new_with_field( + MutableUtf8Array::::new(), + "item", + true, + ); + + c.try_extend(c_iter).unwrap(); + let c_expected: ListArray = c.into(); + + let d_iter = vec![vec![Some(true)], vec![Some(false)]]; + + let d_iter = d_iter.into_iter().map(Some); + let mut d = MutableListArray::::new_with_field( + MutableBooleanArray::new(), + "item", + true, + ); + + d.try_extend(d_iter).unwrap(); + let d_expected: ListArray = d.into(); + let json = json_deserializer::parse(data)?; let schema = read::infer_records_schema(&json)?; @@ -106,6 +134,10 @@ fn read_json_records() -> Result<()> { (&a_expected, arr.as_ref()) } else if f.name == "b" { (&b_expected, arr.as_ref()) + } else if f.name == "c" { + (&c_expected, arr.as_ref()) + } else if f.name == "d" { + (&d_expected, arr.as_ref()) } else { panic!("unexpected field found: {}", f.name); }; From 589b3f842bb2092cbddc41bea05039898a0db74d Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 15 Jul 2023 11:28:35 +0200 Subject: [PATCH 36/80] arrow2 0.17.3 release (#1520) --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 79651e5ab1..24e4ec9173 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "arrow2" -version = "0.17.1" +version = "0.17.3" license = "Apache-2.0" description = "Unofficial implementation of Apache Arrow spec in safe Rust" homepage = "https://github.com/jorgecarleitao/arrow2" From 031bc7bc9ff720073b553a23696e7288db269229 Mon Sep 17 00:00:00 2001 From: Yang Xiufeng Date: Tue, 18 Jul 2023 15:12:04 +0800 Subject: [PATCH 37/80] Bump arrow-rs version to 43. 
(#1521) --- Cargo.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 24e4ec9173..05d7851811 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -101,10 +101,10 @@ odbc-api = { version = "0.36", optional = true } ahash = "0.8" # Support conversion to/from arrow-rs -arrow-buffer = { version = "^40.0.0", optional = true } -arrow-schema = { version = "^40.0.0", optional = true } -arrow-data = { version = "^40.0.0", optional = true } -arrow-array = { version = "^40.0.0", optional = true } +arrow-buffer = { version = ">=40, <44", optional = true } +arrow-schema = { version = ">=40, <44", optional = true } +arrow-data = { version = ">=40, <44", optional = true } +arrow-array = { version = ">=40, <44", optional = true } [target.wasm32-unknown-unknown.dependencies] getrandom = { version = "0.2", features = ["js"] } From f175c1cf2070735d747d6b355c7e9286caae2c19 Mon Sep 17 00:00:00 2001 From: Frank Murphy Date: Thu, 27 Jul 2023 04:30:31 -0400 Subject: [PATCH 38/80] Sampling tests for parquet round trips (#1519) --- Cargo.toml | 11 +++ tests/it/io/parquet/mod.rs | 3 + tests/it/io/parquet/sample_tests.rs | 119 ++++++++++++++++++++++++++++ 3 files changed, 133 insertions(+) create mode 100644 tests/it/io/parquet/sample_tests.rs diff --git a/Cargo.toml b/Cargo.toml index 05d7851811..ed5882cfd0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -129,6 +129,14 @@ proptest = { version = "1", default_features = false, features = ["std"] } avro-rs = { version = "0.13", features = ["snappy"] } # use for flaky testing rand = "0.8" +# use for generating and testing random data samples +sample-arrow2 = "0.1" +sample-std = "0.1" +sample-test = "0.1" + +# ugly hack needed to match this library in sample_arrow2 +[patch.crates-io] +arrow2 = { path = "." 
} [package.metadata.docs.rs] features = ["full"] @@ -188,6 +196,9 @@ io_parquet_compression = [ "io_parquet_brotli" ] +# sample testing of generated arrow data +io_parquet_sample_test = ["io_parquet"] + # compression backends io_parquet_zstd = ["parquet2/zstd"] io_parquet_snappy = ["parquet2/snappy"] diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs index cdf5b41573..1ad218e0fe 100644 --- a/tests/it/io/parquet/mod.rs +++ b/tests/it/io/parquet/mod.rs @@ -21,6 +21,9 @@ mod read_indexes; mod write; mod write_async; +#[cfg(feature = "io_parquet_sample_test")] +mod sample_tests; + type ArrayStats = (Box, Statistics); fn new_struct( diff --git a/tests/it/io/parquet/sample_tests.rs b/tests/it/io/parquet/sample_tests.rs new file mode 100644 index 0000000000..959f120128 --- /dev/null +++ b/tests/it/io/parquet/sample_tests.rs @@ -0,0 +1,119 @@ +use arrow2::io::parquet::write::*; +use arrow2::{ + chunk::Chunk, + datatypes::{Field, Metadata, Schema}, + error::Result, + io::parquet::read as p_read, +}; +use std::borrow::Borrow; +use std::io::Cursor; + +use sample_arrow2::{ + array::ArbitraryArray, + chunk::{ArbitraryChunk, ChainedChunk}, + datatypes::{sample_flat, ArbitraryDataType}, +}; +use sample_std::{Chance, Random, Regex, Sample}; +use sample_test::sample_test; + +fn deep_chunk(depth: usize, len: usize) -> ArbitraryChunk { + let names = Regex::new("[a-z]{4,8}"); + let data_type = ArbitraryDataType { + struct_branch: 1..3, + names: names.clone(), + // TODO: this breaks the test + // nullable: Chance(0.5), + nullable: Chance(0.0), + flat: sample_flat, + } + .sample_depth(depth); + + let array = ArbitraryArray { + names, + branch: 0..10, + len: len..(len + 1), + null: Chance(0.1), + // TODO: this breaks the test + // is_nullable: true, + is_nullable: false, + }; + + ArbitraryChunk { + // TODO: shrinking appears to be an issue with chunks this large. issues + // currently reproduce on the smaller sizes anyway. 
+ // chunk_len: 10..1000, + chunk_len: 1..10, + array_count: 1..2, + data_type, + array, + } +} + +#[sample_test] +fn round_trip_sample( + #[sample(deep_chunk(5, 100).sample_one())] chained: ChainedChunk, +) -> Result<()> { + sample_test::env_logger_init(); + let chunks = vec![chained.value]; + let name = Regex::new("[a-z]{4, 8}"); + let mut g = Random::new(); + + // TODO: this probably belongs in a helper in sample-arrow2 + let schema = Schema { + fields: chunks + .first() + .unwrap() + .iter() + .map(|arr| { + Field::new( + name.generate(&mut g), + arr.data_type().clone(), + arr.validity().is_some(), + ) + }) + .collect(), + metadata: Metadata::default(), + }; + + let options = WriteOptions { + write_statistics: true, + compression: CompressionOptions::Uncompressed, + version: Version::V2, + data_pagesize_limit: None, + }; + + let encodings: Vec<_> = schema + .borrow() + .fields + .iter() + .map(|field| transverse(field.data_type(), |_| Encoding::Plain)) + .collect(); + + let row_groups = RowGroupIterator::try_new( + chunks.clone().into_iter().map(Ok), + &schema, + options, + encodings, + )?; + + let buffer = Cursor::new(vec![]); + let mut writer = FileWriter::try_new(buffer, schema, options)?; + + for group in row_groups { + writer.write(group?)?; + } + writer.end(None)?; + + let mut buffer = writer.into_inner(); + + let metadata = p_read::read_metadata(&mut buffer)?; + let schema = p_read::infer_schema(&metadata)?; + + let mut reader = p_read::FileReader::new(buffer, metadata.row_groups, schema, None, None, None); + + let result: Vec<_> = reader.collect::>()?; + + assert_eq!(result, chunks); + + Ok(()) +} From 15c5ec1ee8ffd123c305e0072bb959baae54d353 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 27 Jul 2023 13:45:29 +0200 Subject: [PATCH 39/80] feat: add duration type to json writer (#1522) --- src/io/json/write/serialize.rs | 43 +++++++++++++++++++++++++++++++--- src/temporal_conversions.rs | 26 +++++++++++++++++++- 2 files changed, 65 insertions(+), 4 deletions(-) diff --git a/src/io/json/write/serialize.rs b/src/io/json/write/serialize.rs index 7d46b82e0f..abf845714c 100644 --- a/src/io/json/write/serialize.rs +++ b/src/io/json/write/serialize.rs @@ -1,4 +1,4 @@ -use chrono::{NaiveDate, NaiveDateTime}; +use chrono::{Duration, NaiveDate, NaiveDateTime}; use lexical_core::ToLexical; use std::io::Write; use streaming_iterator::StreamingIterator; @@ -8,8 +8,9 @@ use crate::datatypes::{IntegerType, TimeUnit}; use crate::io::iterator::BufStreamingIterator; use crate::offset::Offset; use crate::temporal_conversions::{ - date32_to_date, date64_to_date, timestamp_ms_to_datetime, timestamp_ns_to_datetime, - timestamp_s_to_datetime, timestamp_us_to_datetime, + date32_to_date, date64_to_date, duration_ms_to_duration, duration_ns_to_duration, + duration_s_to_duration, duration_us_to_duration, timestamp_ms_to_datetime, + timestamp_ns_to_datetime, timestamp_s_to_datetime, timestamp_us_to_datetime, }; use crate::util::lexical_to_bytes_mut; use crate::{array::*, datatypes::DataType, types::NativeType}; @@ -266,6 +267,28 @@ where materialize_serializer(f, array.iter(), offset, take) } +fn duration_serializer<'a, T, F>( + array: &'a PrimitiveArray, + convert: F, + offset: usize, + take: usize, +) -> Box + 'a + Send + Sync> +where + T: NativeType, + F: Fn(T) -> Duration + 'static + Send + Sync, +{ + let f = move |x: Option<&T>, buf: &mut Vec| { + if let Some(x) = x { + let duration = convert(*x); + write!(buf, "\"{duration}\"").unwrap(); + } else { + buf.extend_from_slice(b"null") + } + }; + + 
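// The `duration_serializer` above leans on chrono's `Display` impl for
// `Duration` (exercised through `write!`). A minimal standalone sketch of
// one serialized cell, assuming a Millisecond time unit; `duration_json_cell`
// is hypothetical, not part of this crate:
fn duration_json_cell(v: i64) -> String {
    let duration = chrono::Duration::milliseconds(v);
    format!("\"{duration}\"") // same quoted framing the serializer writes
}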
materialize_serializer(f, array.iter(), offset, take) +} + fn timestamp_serializer<'a, F>( array: &'a PrimitiveArray, convert: F, @@ -385,6 +408,20 @@ pub(crate) fn new_serializer<'a>( ) } } + DataType::Duration(tu) => { + let convert = match tu { + TimeUnit::Nanosecond => duration_ns_to_duration, + TimeUnit::Microsecond => duration_us_to_duration, + TimeUnit::Millisecond => duration_ms_to_duration, + TimeUnit::Second => duration_s_to_duration, + }; + duration_serializer( + array.as_any().downcast_ref().unwrap(), + convert, + offset, + take, + ) + } DataType::Null => null_serializer(array.len(), offset, take), other => todo!("Writing {:?} to JSON", other), } diff --git a/src/temporal_conversions.rs b/src/temporal_conversions.rs index a76700f444..48f2078a2a 100644 --- a/src/temporal_conversions.rs +++ b/src/temporal_conversions.rs @@ -2,7 +2,7 @@ use chrono::{ format::{parse, Parsed, StrftimeItems}, - Datelike, FixedOffset, NaiveDate, NaiveDateTime, NaiveTime, + Datelike, Duration, FixedOffset, NaiveDate, NaiveDateTime, NaiveTime, }; use crate::error::Result; @@ -66,6 +66,30 @@ pub fn time32s_to_time(v: i32) -> NaiveTime { NaiveTime::from_num_seconds_from_midnight_opt(v as u32, 0).expect("invalid time") } +/// converts a `i64` representing a `duration(s)` to [`Duration`] +#[inline] +pub fn duration_s_to_duration(v: i64) -> Duration { + Duration::seconds(v) +} + +/// converts a `i64` representing a `duration(ms)` to [`Duration`] +#[inline] +pub fn duration_ms_to_duration(v: i64) -> Duration { + Duration::milliseconds(v) +} + +/// converts a `i64` representing a `duration(us)` to [`Duration`] +#[inline] +pub fn duration_us_to_duration(v: i64) -> Duration { + Duration::microseconds(v) +} + +/// converts a `i64` representing a `duration(ns)` to [`Duration`] +#[inline] +pub fn duration_ns_to_duration(v: i64) -> Duration { + Duration::nanoseconds(v) +} + /// converts a `i32` representing a `time32(ms)` to [`NaiveTime`] #[inline] pub fn time32ms_to_time(v: i32) -> NaiveTime { From 393c3f8c58309388acc6a5d08575b360671e0a0f Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 27 Jul 2023 14:51:10 +0200 Subject: [PATCH 40/80] feat: add temporal conversions that don't panic (#1523) --- src/temporal_conversions.rs | 58 ++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/src/temporal_conversions.rs b/src/temporal_conversions.rs index 48f2078a2a..b706a45b29 100644 --- a/src/temporal_conversions.rs +++ b/src/temporal_conversions.rs @@ -32,14 +32,25 @@ pub const EPOCH_DAYS_FROM_CE: i32 = 719_163; /// converts a `i32` representing a `date32` to [`NaiveDateTime`] #[inline] pub fn date32_to_datetime(v: i32) -> NaiveDateTime { + date32_to_datetime_opt(v).expect("invalid or out-of-range datetime") +} + +/// converts a `i32` representing a `date32` to [`NaiveDateTime`] +#[inline] +pub fn date32_to_datetime_opt(v: i32) -> Option { NaiveDateTime::from_timestamp_opt(v as i64 * SECONDS_IN_DAY, 0) - .expect("invalid or out-of-range datetime") } /// converts a `i32` representing a `date32` to [`NaiveDate`] #[inline] pub fn date32_to_date(days: i32) -> NaiveDate { - NaiveDate::from_num_days_from_ce_opt(EPOCH_DAYS_FROM_CE + days).expect("out-of-range date") + date32_to_date_opt(days).expect("out-of-range date") +} + +/// converts a `i32` representing a `date32` to [`NaiveDate`] +#[inline] +pub fn date32_to_date_opt(days: i32) -> Option { + NaiveDate::from_num_days_from_ce_opt(EPOCH_DAYS_FROM_CE + days) } /// converts a `i64` representing a `date64` to [`NaiveDateTime`] @@ 
-105,6 +116,12 @@ pub fn time32ms_to_time(v: i32) -> NaiveTime { /// converts a `i64` representing a `time64(us)` to [`NaiveTime`] #[inline] pub fn time64us_to_time(v: i64) -> NaiveTime { + time64us_to_time_opt(v).expect("invalid time") +} + +/// converts a `i64` representing a `time64(us)` to [`NaiveTime`] +#[inline] +pub fn time64us_to_time_opt(v: i64) -> Option { NaiveTime::from_num_seconds_from_midnight_opt( // extract seconds from microseconds (v / MICROSECONDS) as u32, @@ -112,30 +129,46 @@ pub fn time64us_to_time(v: i64) -> NaiveTime { // nanoseconds (v % MICROSECONDS * MILLISECONDS) as u32, ) - .expect("invalid time") } /// converts a `i64` representing a `time64(ns)` to [`NaiveTime`] #[inline] pub fn time64ns_to_time(v: i64) -> NaiveTime { + time64ns_to_time_opt(v).expect("invalid time") +} + +/// converts a `i64` representing a `time64(ns)` to [`NaiveTime`] +#[inline] +pub fn time64ns_to_time_opt(v: i64) -> Option { NaiveTime::from_num_seconds_from_midnight_opt( // extract seconds from nanoseconds (v / NANOSECONDS) as u32, // discard extracted seconds (v % NANOSECONDS) as u32, ) - .expect("invalid time") } /// converts a `i64` representing a `timestamp(s)` to [`NaiveDateTime`] #[inline] pub fn timestamp_s_to_datetime(seconds: i64) -> NaiveDateTime { - NaiveDateTime::from_timestamp_opt(seconds, 0).expect("invalid or out-of-range datetime") + timestamp_s_to_datetime_opt(seconds).expect("invalid or out-of-range datetime") +} + +/// converts a `i64` representing a `timestamp(s)` to [`NaiveDateTime`] +#[inline] +pub fn timestamp_s_to_datetime_opt(seconds: i64) -> Option { + NaiveDateTime::from_timestamp_opt(seconds, 0) } /// converts a `i64` representing a `timestamp(ms)` to [`NaiveDateTime`] #[inline] pub fn timestamp_ms_to_datetime(v: i64) -> NaiveDateTime { + timestamp_ms_to_datetime_opt(v).expect("invalid or out-of-range datetime") +} + +/// converts a `i64` representing a `timestamp(ms)` to [`NaiveDateTime`] +#[inline] +pub fn timestamp_ms_to_datetime_opt(v: i64) -> Option { if v >= 0 { NaiveDateTime::from_timestamp_opt( // extract seconds from milliseconds @@ -157,12 +190,17 @@ pub fn timestamp_ms_to_datetime(v: i64) -> NaiveDateTime { ) } } - .expect("invalid or out-of-range datetime") } /// converts a `i64` representing a `timestamp(us)` to [`NaiveDateTime`] #[inline] pub fn timestamp_us_to_datetime(v: i64) -> NaiveDateTime { + timestamp_us_to_datetime_opt(v).expect("invalid or out-of-range datetime") +} + +/// converts a `i64` representing a `timestamp(us)` to [`NaiveDateTime`] +#[inline] +pub fn timestamp_us_to_datetime_opt(v: i64) -> Option { if v >= 0 { NaiveDateTime::from_timestamp_opt( // extract seconds from microseconds @@ -184,12 +222,17 @@ pub fn timestamp_us_to_datetime(v: i64) -> NaiveDateTime { ) } } - .expect("invalid or out-of-range datetime") } /// converts a `i64` representing a `timestamp(ns)` to [`NaiveDateTime`] #[inline] pub fn timestamp_ns_to_datetime(v: i64) -> NaiveDateTime { + timestamp_ns_to_datetime_opt(v).expect("invalid or out-of-range datetime") +} + +/// converts a `i64` representing a `timestamp(ns)` to [`NaiveDateTime`] +#[inline] +pub fn timestamp_ns_to_datetime_opt(v: i64) -> Option { if v >= 0 { NaiveDateTime::from_timestamp_opt( // extract seconds from nanoseconds @@ -211,7 +254,6 @@ pub fn timestamp_ns_to_datetime(v: i64) -> NaiveDateTime { ) } } - .expect("invalid or out-of-range datetime") } /// Converts a timestamp in `time_unit` and `timezone` into [`chrono::DateTime`]. 
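These fallible `_opt` variants make out-of-range values observable instead of aborting the process. A quick illustration using the functions introduced above (`1_500` is just an arbitrary in-range sample):

use arrow2::temporal_conversions::{timestamp_ms_to_datetime, timestamp_ms_to_datetime_opt};

fn main() {
    // in-range values convert as before
    assert!(timestamp_ms_to_datetime_opt(1_500).is_some());
    // far out-of-range values now surface as `None` rather than panicking
    assert!(timestamp_ms_to_datetime_opt(i64::MAX).is_none());
    // the panicking variant is kept for callers relying on the old behaviour
    let _dt = timestamp_ms_to_datetime(1_500);
}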
From de20f2dd18604d1ecefc5fd6385952120014259a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nils=20O=2E=20Sel=C3=A5sdal?= Date: Thu, 27 Jul 2023 15:08:37 +0200 Subject: [PATCH 41/80] Add datetime with timezone support to Json serializer (#1510) --- src/io/json/write/serialize.rs | 89 ++++++++++++++++++++++++++-------- tests/it/io/json/write.rs | 55 +++++++++++++++++++++ 2 files changed, 125 insertions(+), 19 deletions(-) diff --git a/src/io/json/write/serialize.rs b/src/io/json/write/serialize.rs index abf845714c..1a6fc2e4a5 100644 --- a/src/io/json/write/serialize.rs +++ b/src/io/json/write/serialize.rs @@ -7,10 +7,13 @@ use crate::bitmap::utils::ZipValidity; use crate::datatypes::{IntegerType, TimeUnit}; use crate::io::iterator::BufStreamingIterator; use crate::offset::Offset; +#[cfg(feature = "chrono-tz")] +use crate::temporal_conversions::parse_offset_tz; use crate::temporal_conversions::{ date32_to_date, date64_to_date, duration_ms_to_duration, duration_ns_to_duration, - duration_s_to_duration, duration_us_to_duration, timestamp_ms_to_datetime, - timestamp_ns_to_datetime, timestamp_s_to_datetime, timestamp_us_to_datetime, + duration_s_to_duration, duration_us_to_duration, parse_offset, timestamp_ms_to_datetime, + timestamp_ns_to_datetime, timestamp_s_to_datetime, timestamp_to_datetime, + timestamp_us_to_datetime, }; use crate::util::lexical_to_bytes_mut; use crate::{array::*, datatypes::DataType, types::NativeType}; @@ -309,6 +312,51 @@ where materialize_serializer(f, array.iter(), offset, take) } +fn timestamp_tz_serializer<'a>( + array: &'a PrimitiveArray, + time_unit: TimeUnit, + tz: &str, + offset: usize, + take: usize, +) -> Box + 'a + Send + Sync> { + match parse_offset(tz) { + Ok(parsed_tz) => { + let f = move |x: Option<&i64>, buf: &mut Vec| { + if let Some(x) = x { + let dt_str = timestamp_to_datetime(*x, time_unit, &parsed_tz).to_rfc3339(); + write!(buf, "\"{dt_str}\"").unwrap(); + } else { + buf.extend_from_slice(b"null") + } + }; + + materialize_serializer(f, array.iter(), offset, take) + } + #[cfg(feature = "chrono-tz")] + _ => match parse_offset_tz(tz) { + Ok(parsed_tz) => { + let f = move |x: Option<&i64>, buf: &mut Vec| { + if let Some(x) = x { + let dt_str = timestamp_to_datetime(*x, time_unit, &parsed_tz).to_rfc3339(); + write!(buf, "\"{dt_str}\"").unwrap(); + } else { + buf.extend_from_slice(b"null") + } + }; + + materialize_serializer(f, array.iter(), offset, take) + } + _ => { + panic!("Timezone {} is invalid or not supported", tz); + } + }, + #[cfg(not(feature = "chrono-tz"))] + _ => { + panic!("Invalid Offset format (must be [-]00:00) or chrono-tz feature not active"); + } + } +} + pub(crate) fn new_serializer<'a>( array: &'a dyn Array, offset: usize, @@ -390,24 +438,27 @@ pub(crate) fn new_serializer<'a>( offset, take, ), - DataType::Timestamp(tu, tz) => { - if tz.is_some() { - todo!("still have to implement timezone") - } else { - let convert = match tu { - TimeUnit::Nanosecond => timestamp_ns_to_datetime, - TimeUnit::Microsecond => timestamp_us_to_datetime, - TimeUnit::Millisecond => timestamp_ms_to_datetime, - TimeUnit::Second => timestamp_s_to_datetime, - }; - timestamp_serializer( - array.as_any().downcast_ref().unwrap(), - convert, - offset, - take, - ) - } + DataType::Timestamp(tu, None) => { + let convert = match tu { + TimeUnit::Nanosecond => timestamp_ns_to_datetime, + TimeUnit::Microsecond => timestamp_us_to_datetime, + TimeUnit::Millisecond => timestamp_ms_to_datetime, + TimeUnit::Second => timestamp_s_to_datetime, + }; + timestamp_serializer( + 
array.as_any().downcast_ref().unwrap(), + convert, + offset, + take, + ) } + DataType::Timestamp(time_unit, Some(tz)) => timestamp_tz_serializer( + array.as_any().downcast_ref().unwrap(), + *time_unit, + tz, + offset, + take, + ), DataType::Duration(tu) => { let convert = match tu { TimeUnit::Nanosecond => duration_ns_to_duration, diff --git a/tests/it/io/json/write.rs b/tests/it/io/json/write.rs index 44cc7ef125..1e6ede9fec 100644 --- a/tests/it/io/json/write.rs +++ b/tests/it/io/json/write.rs @@ -422,3 +422,58 @@ fn write_timestamp() -> Result<()> { test!(array, expected) } + +#[test] +fn write_timestamp_with_tz_secs() -> Result<()> { + let array = PrimitiveArray::new( + DataType::Timestamp(TimeUnit::Second, Some("UTC".to_owned())), + vec![10i64, 1 << 32, 1 << 33].into(), + None, + ); + + let expected = + r#"["1970-01-01T00:00:10+00:00","2106-02-07T06:28:16+00:00","2242-03-16T12:56:32+00:00"]"#; + test!(array, expected) +} + +#[test] +fn write_timestamp_with_tz_micros() -> Result<()> { + let array = PrimitiveArray::new( + DataType::Timestamp(TimeUnit::Microsecond, Some("+02:00".to_owned())), + vec![ + 10i64 * 1_000_000, + (1 << 32) * 1_000_000, + (1 << 33) * 1_000_000, + 1_234_567_890_123_450, + 1_234_567_890_120_000, + ] + .into(), + None, + ); + // Note, default chrono DateTime string conversion strips off milli/micro/nanoseconds parts + // if they are zero + let expected = r#"["1970-01-01T02:00:10+02:00","2106-02-07T08:28:16+02:00","2242-03-16T14:56:32+02:00","2009-02-14T01:31:30.123450+02:00","2009-02-14T01:31:30.120+02:00"]"#; + + test!(array, expected) +} +#[cfg(feature = "chrono-tz")] +#[test] +fn write_timestamp_with_chrono_tz_millis() -> Result<()> { + let array = PrimitiveArray::new( + DataType::Timestamp(TimeUnit::Millisecond, Some("Europe/Oslo".to_owned())), + vec![ + 10i64 * 1_000, + (1 << 32) * 1_000, + (1 << 33) * 1_000, + 1_234_567_890_123, + 1_239_874_560_120, + ] + .into(), + None, + ); + // Note, default chrono DateTime string conversion strips off milli/micro/nanoseconds parts + // if they are zero + let expected = r#"["1970-01-01T01:00:10+01:00","2106-02-07T07:28:16+01:00","2242-03-16T13:56:32+01:00","2009-02-14T00:31:30.123+01:00","2009-04-16T11:36:00.120+02:00"]"#; + + test!(array, expected) +} From d5c78e7ba45fcebfbafd55a82ba2601ee3ea9617 Mon Sep 17 00:00:00 2001 From: Qqwy / Marten Date: Thu, 27 Jul 2023 15:10:00 +0200 Subject: [PATCH 42/80] ArrowArrayStreamReader::try_new(): Safeguard against released streams (#1501) --- src/ffi/stream.rs | 9 ++++++++- tests/it/ffi/stream.rs | 13 ++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/ffi/stream.rs b/src/ffi/stream.rs index 9611935821..08fcaf0f43 100644 --- a/src/ffi/stream.rs +++ b/src/ffi/stream.rs @@ -54,7 +54,8 @@ pub struct ArrowArrayStreamReader> { impl> ArrowArrayStreamReader { /// Returns a new [`ArrowArrayStreamReader`] /// # Error - /// Errors iff the [`ArrowArrayStream`] is out of specification + /// Errors iff the [`ArrowArrayStream`] is out of specification, + /// or was already released prior to calling this function. /// # Safety /// This method is intrinsically `unsafe` since it assumes that the `ArrowArrayStream` /// contains a valid Arrow C stream interface. 
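Per the C stream interface, a struct whose `release` callback is null has already been released, so construction must fail fast rather than erroring on first use. A minimal sketch of that rule, with `CStream` as a stand-in for the real `ArrowArrayStream`:

struct CStream {
    // a null (`None`) release callback marks an already-released stream
    release: Option<unsafe extern "C" fn()>,
}

fn ensure_live(stream: &CStream) -> Result<(), String> {
    if stream.release.is_none() {
        return Err("The C stream was already released".to_string());
    }
    Ok(())
}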
@@ -62,6 +63,12 @@ impl> ArrowArrayStreamReader { /// * The `ArrowArrayStream` fulfills the invariants of the C stream interface /// * The schema `get_schema` produces fulfills the C data interface pub unsafe fn try_new(mut iter: Iter) -> Result { + if iter.release.is_none() { + return Err(Error::InvalidArgumentError( + "The C stream was already released".to_string(), + )); + }; + if iter.get_next.is_none() { return Err(Error::OutOfSpec( "The C stream MUST contain a non-null get_next".to_string(), diff --git a/tests/it/ffi/stream.rs b/tests/it/ffi/stream.rs index 44d0e1e7cc..53887d4362 100644 --- a/tests/it/ffi/stream.rs +++ b/tests/it/ffi/stream.rs @@ -1,6 +1,6 @@ use arrow2::array::*; use arrow2::datatypes::Field; -use arrow2::{error::Result, ffi}; +use arrow2::{error::Error, error::Result, ffi}; fn _test_round_trip(arrays: Vec>) -> Result<()> { let field = Field::new("a", arrays[0].data_type().clone(), true); @@ -30,3 +30,14 @@ fn round_trip() -> Result<()> { _test_round_trip(vec![array.clone(), array.clone(), array]) } + +#[test] +fn stream_reader_try_new_invalid_argument_error_on_released_stream() { + let released_stream = Box::new(ffi::ArrowArrayStream::empty()); + let reader = unsafe { ffi::ArrowArrayStreamReader::try_new(released_stream) }; + // poor man's assert_matches: + match reader { + Err(Error::InvalidArgumentError(_)) => {} + _ => panic!("ArrowArrayStreamReader::try_new did not return an InvalidArgumentError"), + } +} From 92050ec64877fe1348116e0f5dc6e06b949c0519 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 6 Aug 2023 09:59:37 +0200 Subject: [PATCH 43/80] feat: better error message when reader feather v1 (#1528) --- src/ffi/array.rs | 14 +++++++++++--- src/io/ipc/mod.rs | 3 ++- src/io/ipc/read/file.rs | 9 ++++++--- src/io/ipc/read/file_async.rs | 4 ++-- src/io/ipc/write/file_async.rs | 6 +++--- src/io/ipc/write/writer.rs | 6 +++--- 6 files changed, 27 insertions(+), 15 deletions(-) diff --git a/src/ffi/array.rs b/src/ffi/array.rs index b1c77d7366..4057d0be8f 100644 --- a/src/ffi/array.rs +++ b/src/ffi/array.rs @@ -209,7 +209,7 @@ unsafe fn get_buffer_ptr( let ptr = *buffers.add(index); if ptr.is_null() { return Err(Error::oos(format!( - "An array of type {data_type:?} + "An array of type {data_type:?} must have a non-null buffer {index}" ))); } @@ -235,9 +235,14 @@ unsafe fn create_buffer( owner: InternalArrowArray, index: usize, ) -> Result> { + let len = buffer_len(array, data_type, index)?; + + if len == 0 { + return Ok(Buffer::new()); + } + let ptr = get_buffer_ptr(array, data_type, index)?; - let len = buffer_len(array, data_type, index)?; let offset = buffer_offset(array, data_type, index); let bytes = Bytes::from_foreign(ptr, len, BytesAllocator::InternalArrowArray(owner)); @@ -258,9 +263,12 @@ unsafe fn create_bitmap( // we can use the null count directly is_validity: bool, ) -> Result { + let len: usize = array.length.try_into().expect("length to fit in `usize`"); + if len == 0 { + return Ok(Bitmap::new()); + } let ptr = get_buffer_ptr(array, data_type, index)?; - let len: usize = array.length.try_into().expect("length to fit in `usize`"); let offset: usize = array.offset.try_into().expect("offset to fit in `usize`"); let bytes_len = bytes_for(offset + len); let bytes = Bytes::from_foreign(ptr, bytes_len, BytesAllocator::InternalArrowArray(owner)); diff --git a/src/io/ipc/mod.rs b/src/io/ipc/mod.rs index e618a92687..2bb233a147 100644 --- a/src/io/ipc/mod.rs +++ b/src/io/ipc/mod.rs @@ -80,7 +80,8 @@ pub mod append; pub mod read; pub mod write; -const 
ARROW_MAGIC: [u8; 6] = [b'A', b'R', b'R', b'O', b'W', b'1']; +const ARROW_MAGIC_V1: [u8; 4] = [b'F', b'E', b'A', b'1']; +const ARROW_MAGIC_V2: [u8; 6] = [b'A', b'R', b'R', b'O', b'W', b'1']; pub(crate) const CONTINUATION_MARKER: [u8; 4] = [0xff; 4]; /// Struct containing `dictionary_id` and nested `IpcField`, allowing users diff --git a/src/io/ipc/read/file.rs b/src/io/ipc/read/file.rs index 341cdfeba6..dd4a5852c7 100644 --- a/src/io/ipc/read/file.rs +++ b/src/io/ipc/read/file.rs @@ -8,7 +8,7 @@ use crate::datatypes::Schema; use crate::error::{Error, Result}; use crate::io::ipc::IpcSchema; -use super::super::{ARROW_MAGIC, CONTINUATION_MARKER}; +use super::super::{ARROW_MAGIC_V1, ARROW_MAGIC_V2, CONTINUATION_MARKER}; use super::common::*; use super::schema::fb_to_schema; use super::Dictionaries; @@ -151,7 +151,7 @@ fn read_footer_len(reader: &mut R) -> Result<(u64, usize)> { reader.read_exact(&mut footer)?; let footer_len = i32::from_le_bytes(footer[..4].try_into().unwrap()); - if footer[4..] != ARROW_MAGIC { + if footer[4..] != ARROW_MAGIC_V2 { return Err(Error::from(OutOfSpecKind::InvalidFooter)); } let footer_len = footer_len @@ -215,7 +215,10 @@ pub fn read_file_metadata(reader: &mut R) -> Result(reader: &mut R) -> Re reader.read_exact(&mut footer).await?; let footer_len = i32::from_le_bytes(footer[..4].try_into().unwrap()); - if footer[4..] != ARROW_MAGIC { + if footer[4..] != ARROW_MAGIC_V2 { return Err(Error::from(OutOfSpecKind::InvalidFooter)); } footer_len diff --git a/src/io/ipc/write/file_async.rs b/src/io/ipc/write/file_async.rs index 02dd5a4c7f..6bf7753664 100644 --- a/src/io/ipc/write/file_async.rs +++ b/src/io/ipc/write/file_async.rs @@ -11,7 +11,7 @@ use super::schema::serialize_schema; use super::{default_ipc_fields, schema_to_bytes, Record}; use crate::datatypes::*; use crate::error::{Error, Result}; -use crate::io::ipc::{IpcField, ARROW_MAGIC}; +use crate::io::ipc::{IpcField, ARROW_MAGIC_V2}; type WriteOutput = (usize, Option, Vec, Option); @@ -105,7 +105,7 @@ where } async fn start(mut writer: W, encoded: EncodedData) -> Result> { - writer.write_all(&ARROW_MAGIC[..]).await?; + writer.write_all(&ARROW_MAGIC_V2[..]).await?; writer.write_all(&[0, 0]).await?; let (meta, data) = write_message(&mut writer, encoded).await?; @@ -149,7 +149,7 @@ where writer .write_all(&(footer.len() as i32).to_le_bytes()) .await?; - writer.write_all(&ARROW_MAGIC).await?; + writer.write_all(&ARROW_MAGIC_V2).await?; writer.close().await?; Ok((0, None, vec![], None)) diff --git a/src/io/ipc/write/writer.rs b/src/io/ipc/write/writer.rs index 1637f8ea7c..b92f1b2ba8 100644 --- a/src/io/ipc/write/writer.rs +++ b/src/io/ipc/write/writer.rs @@ -4,7 +4,7 @@ use arrow_format::ipc::planus::Builder; use super::{ super::IpcField, - super::ARROW_MAGIC, + super::ARROW_MAGIC_V2, common::{DictionaryTracker, EncodedData, WriteOptions}, common_sync::{write_continuation, write_message}, default_ipc_fields, schema, schema_to_bytes, @@ -114,7 +114,7 @@ impl FileWriter { return Err(Error::oos("The IPC file can only be started once")); } // write magic to header - self.writer.write_all(&ARROW_MAGIC[..])?; + self.writer.write_all(&ARROW_MAGIC_V2[..])?; // create an 8-byte boundary after the header self.writer.write_all(&[0, 0])?; // write the schema, set the written bytes to the schema @@ -205,7 +205,7 @@ impl FileWriter { self.writer.write_all(footer_data)?; self.writer .write_all(&(footer_data.len() as i32).to_le_bytes())?; - self.writer.write_all(&ARROW_MAGIC)?; + self.writer.write_all(&ARROW_MAGIC_V2)?; 
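+        // Trailer layout: footer flatbuffer, then its length as a
+        // little-endian i32, then the 6-byte `ARROW1` magic.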
self.writer.flush()?; self.state = State::Finished; From 2ecd3e823f63884ca77b146a8cd8fcdea9f328fd Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 7 Aug 2023 08:55:11 +0200 Subject: [PATCH 44/80] fix oob if in .get (#1529) --- src/array/mod.rs | 11 ++++++++++- src/io/ipc/read/file.rs | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/array/mod.rs b/src/array/mod.rs index bbbbedc359..04b7b2c8e3 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -73,9 +73,18 @@ pub trait Array: Send + Sync + dyn_clone::DynClone + 'static { /// Panics iff `i >= self.len()`. #[inline] fn is_null(&self, i: usize) -> bool { + assert!(i < self.len()); + unsafe { self.is_null_unchecked(i) } + } + + /// Returns whether slot `i` is null. + /// # Safety + /// The caller must ensure `i < self.len()` + #[inline] + unsafe fn is_null_unchecked(&self, i: usize) -> bool { self.validity() .as_ref() - .map(|x| !x.get_bit(i)) + .map(|x| !x.get_bit_unchecked(i)) .unwrap_or(false) } diff --git a/src/io/ipc/read/file.rs b/src/io/ipc/read/file.rs index dd4a5852c7..e95b37e44d 100644 --- a/src/io/ipc/read/file.rs +++ b/src/io/ipc/read/file.rs @@ -216,7 +216,7 @@ pub fn read_file_metadata(reader: &mut R) -> Result Date: Thu, 10 Aug 2023 11:09:09 +0200 Subject: [PATCH 45/80] feat: handle unaligned pointers in FFI (#1535) --- src/ffi/array.rs | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/ffi/array.rs b/src/ffi/array.rs index 4057d0be8f..1a25b98510 100644 --- a/src/ffi/array.rs +++ b/src/ffi/array.rs @@ -213,13 +213,7 @@ unsafe fn get_buffer_ptr( must have a non-null buffer {index}" ))); } - if ptr.align_offset(std::mem::align_of::()) != 0 { - return Err(Error::oos(format!( - "An ArrowArray of type {data_type:?} - must have buffer {index} aligned to type {}", - std::any::type_name::() - ))); - } + // note: we can't prove that this pointer is not mutably shared - part of the safety invariant Ok(ptr as *mut T) } @@ -241,12 +235,21 @@ unsafe fn create_buffer( return Ok(Buffer::new()); } - let ptr = get_buffer_ptr(array, data_type, index)?; - let offset = buffer_offset(array, data_type, index); - let bytes = Bytes::from_foreign(ptr, len, BytesAllocator::InternalArrowArray(owner)); + let ptr: *mut T = get_buffer_ptr(array, data_type, index)?; - Ok(Buffer::from_bytes(bytes).sliced(offset, len - offset)) + // We have to check alignment. + // This is the zero-copy path. + if ptr.align_offset(std::mem::align_of::()) == 0 { + let bytes = Bytes::from_foreign(ptr, len, BytesAllocator::InternalArrowArray(owner)); + Ok(Buffer::from_bytes(bytes).sliced(offset, len - offset)) + } + // This is the path where alignment isn't correct. + // We copy the data to a new vec + else { + let buf = std::slice::from_raw_parts(ptr, len - offset).to_vec(); + Ok(Buffer::from(buf)) + } } /// returns the buffer `i` of `array` interpreted as a [`Bitmap`]. @@ -269,6 +272,8 @@ unsafe fn create_bitmap( } let ptr = get_buffer_ptr(array, data_type, index)?; + // Pointer of u8 has alignment 1, so we don't have to check alignment. 
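+        // (This is why `create_bitmap` needs no copying fallback: the
+        // unaligned path added to `create_buffer` above only matters
+        // for wider native types.)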
+ let offset: usize = array.offset.try_into().expect("offset to fit in `usize`"); let bytes_len = bytes_for(offset + len); let bytes = Bytes::from_foreign(ptr, bytes_len, BytesAllocator::InternalArrowArray(owner)); From a760a4ce847fc0fec781f26908b05efb2cafc1c8 Mon Sep 17 00:00:00 2001 From: Sebastian Holmin Date: Mon, 14 Aug 2023 10:57:03 +0200 Subject: [PATCH 46/80] Update multiversion to 0.7.3 (#1536) Fixes https://github.com/calebzulawski/multiversion/issues/37. --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index ed5882cfd0..cafbe8360e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -92,7 +92,7 @@ serde_json = { version = "^1.0", features = ["preserve_order"], optional = true strength_reduce = { version = "0.2", optional = true } # For instruction multiversioning -multiversion = { version = "0.7.1", optional = true } +multiversion = { version = "0.7.3", optional = true } # For support for odbc odbc-api = { version = "0.36", optional = true } From 36e905d9e23e595693f33e44e514215074b6199b Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 14 Aug 2023 14:10:28 +0200 Subject: [PATCH 47/80] arrow2 0.17.4 release (#1537) --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index cafbe8360e..911e8e3960 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "arrow2" -version = "0.17.3" +version = "0.17.4" license = "Apache-2.0" description = "Unofficial implementation of Apache Arrow spec in safe Rust" homepage = "https://github.com/jorgecarleitao/arrow2" From 076ceee9dec15834943c574cb4343f0237a51447 Mon Sep 17 00:00:00 2001 From: RinChanNOW Date: Tue, 15 Aug 2023 18:59:46 +0800 Subject: [PATCH 48/80] Bump version of arrow-rs. (#1540) * Bump version of arrow-rs. * Remove version limit. --- Cargo.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 911e8e3960..0b8a4ff7e6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -101,10 +101,10 @@ odbc-api = { version = "0.36", optional = true } ahash = "0.8" # Support conversion to/from arrow-rs -arrow-buffer = { version = ">=40, <44", optional = true } -arrow-schema = { version = ">=40, <44", optional = true } -arrow-data = { version = ">=40, <44", optional = true } -arrow-array = { version = ">=40, <44", optional = true } +arrow-buffer = { version = ">=40", optional = true } +arrow-schema = { version = ">=40", optional = true } +arrow-data = { version = ">=40", optional = true } +arrow-array = { version = ">=40", optional = true } [target.wasm32-unknown-unknown.dependencies] getrandom = { version = "0.2", features = ["js"] } From 86b8a5b34a743f042095c25e77b02ac39f25b866 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Wed, 16 Aug 2023 08:58:49 +0200 Subject: [PATCH 49/80] feat: expose (utf8|binary)_substring (#1539) --- src/compute/substring.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/compute/substring.rs b/src/compute/substring.rs index 7a20114ed1..2919b3037b 100644 --- a/src/compute/substring.rs +++ b/src/compute/substring.rs @@ -24,7 +24,13 @@ use crate::{ offset::{Offset, Offsets}, }; -fn utf8_substring(array: &Utf8Array, start: O, length: &Option) -> Utf8Array { +/// Returns a Utf8Array with a substring starting from `start` and with optional length `length` of each of the elements in `array`. +/// `start` can be negative, in which case the start counts from the end of the string. 
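+///
+/// # Example
+///
+/// A rough sketch of the expected behaviour (doctest marked `ignore`,
+/// since the re-export path and the `compute_substring` feature flag
+/// are assumptions here):
+///
+/// ```ignore
+/// use arrow2::array::Utf8Array;
+/// use arrow2::compute::substring::utf8_substring;
+///
+/// let array = Utf8Array::<i32>::from_slice(["hello", "world"]);
+/// // a negative `start` counts from the end of each string
+/// let out = utf8_substring(&array, -3, &Some(2));
+/// assert_eq!(out, Utf8Array::<i32>::from_slice(["ll", "rl"]));
+/// ```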
+pub fn utf8_substring( + array: &Utf8Array, + start: O, + length: &Option, +) -> Utf8Array { let length = length.map(|v| v.to_usize()); let iter = array.values_iter().map(|str_val| { @@ -68,7 +74,9 @@ fn utf8_substring(array: &Utf8Array, start: O, length: &Option) new.with_validity(array.validity().cloned()) } -fn binary_substring( +/// Returns a BinaryArray with a substring starting from `start` and with optional length `length` of each of the elements in `array`. +/// `start` can be negative, in which case the start counts from the end of the string. +pub fn binary_substring( array: &BinaryArray, start: O, length: &Option, From c1446fb17229f1e5e703a8623b0946c6c8f5c3df Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Wed, 16 Aug 2023 08:58:59 +0200 Subject: [PATCH 50/80] chore: update dependencies (#1542) --- Cargo.toml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 0b8a4ff7e6..0f3f9ec27b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,7 @@ bench = false [dependencies] foreign_vec = "0.1.0" -either = "1.6" +either = "1.9" num-traits = "0.2" dyn-clone = "1" bytemuck = { version = "1", features = ["derive"] } @@ -29,10 +29,10 @@ ethnum = "1" # crate provides HashMap that assumes pre-hashed values. hash_hasher = "^2.0.3" # For SIMD utf8 validation -simdutf8 = "0.1.3" +simdutf8 = "0.1.4" # A Rust port of SwissTable -hashbrown = { version = "0.13", default-features = false, optional = true } +hashbrown = { version = "0.14", default-features = false, optional = true } # for timezone support chrono-tz = { version = "0.8", optional = true } @@ -46,8 +46,8 @@ csv-core = { version = "0.1", optional = true } # for csv async io csv-async = { version = "^1.1", optional = true } -regex = { version = "^1.3", optional = true } -regex-syntax = { version = "^0.6", optional = true } +regex = { version = "1.9", optional = true } +regex-syntax = { version = "0.7", optional = true } streaming-iterator = { version = "0.1", optional = true } fallible-streaming-iterator = { version = "0.1", optional = true } @@ -62,7 +62,7 @@ arrow-format = { version = "0.8", optional = true, features = ["ipc"] } hex = { version = "^0.4", optional = true } # for IPC compression -lz4 = { version = "1.23.1", optional = true } +lz4 = { version = "1.24", optional = true } zstd = { version = "0.12", optional = true } rand = { version = "0.8", optional = true } From 5720d9a2450823b68f2174807af74a44c058b0b8 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sat, 19 Aug 2023 10:05:20 +0200 Subject: [PATCH 51/80] Fix `any`/`all` for Kleene logic (#1545) --- src/compute/boolean.rs | 42 +++++++++++++++++-- src/compute/boolean_kleene.rs | 66 +++++++++++++++++++++++++----- tests/it/compute/boolean.rs | 9 ++-- tests/it/compute/boolean_kleene.rs | 4 +- 4 files changed, 101 insertions(+), 20 deletions(-) diff --git a/src/compute/boolean.rs b/src/compute/boolean.rs index b8200ef5a9..e34b90c637 100644 --- a/src/compute/boolean.rs +++ b/src/compute/boolean.rs @@ -230,11 +230,28 @@ pub fn or_scalar(array: &BooleanArray, scalar: &BooleanScalar) -> BooleanArray { } } -/// Returns whether any of the values in the array is `true` +/// Returns whether any of the values in the array are `true`. +/// +/// Null values are ignored. 
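+/// A null entry therefore never makes the result `true`; use the
+/// Kleene variant in `boolean_kleene::any` for null-aware semantics.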
+/// +/// # Example +/// +/// ``` +/// use arrow2::array::BooleanArray; +/// use arrow2::compute::boolean::any; +/// +/// let a = BooleanArray::from(&[Some(true), Some(false)]); +/// let b = BooleanArray::from(&[Some(false), Some(false)]); +/// let c = BooleanArray::from(&[None, Some(false)]); +/// +/// assert_eq!(any(&a), true); +/// assert_eq!(any(&b), false); +/// assert_eq!(any(&c), false); +/// ``` pub fn any(array: &BooleanArray) -> bool { if array.is_empty() { false - } else if array.validity().is_some() { + } else if array.null_count() > 0 { array.into_iter().any(|v| v == Some(true)) } else { let vals = array.values(); @@ -242,12 +259,29 @@ pub fn any(array: &BooleanArray) -> bool { } } -/// Check if all of the values in the array are `true` +/// Returns whether all values in the array are `true`. +/// +/// Null values are ignored. +/// +/// # Example +/// +/// ``` +/// use arrow2::array::BooleanArray; +/// use arrow2::compute::boolean::all; +/// +/// let a = BooleanArray::from(&[Some(true), Some(true)]); +/// let b = BooleanArray::from(&[Some(false), Some(true)]); +/// let c = BooleanArray::from(&[None, Some(true)]); +/// +/// assert_eq!(all(&a), true); +/// assert_eq!(all(&b), false); +/// assert_eq!(all(&c), true); +/// ``` pub fn all(array: &BooleanArray) -> bool { if array.is_empty() { true } else if array.null_count() > 0 { - false + !array.into_iter().any(|v| v == Some(false)) } else { let vals = array.values(); vals.unset_bits() == 0 diff --git a/src/compute/boolean_kleene.rs b/src/compute/boolean_kleene.rs index fc6b717543..b19efeaa78 100644 --- a/src/compute/boolean_kleene.rs +++ b/src/compute/boolean_kleene.rs @@ -234,26 +234,70 @@ pub fn and_scalar(array: &BooleanArray, scalar: &BooleanScalar) -> BooleanArray } } -/// Returns whether any of the values in the array is `true` -pub fn any(array: &BooleanArray) -> bool { +/// Returns whether any of the values in the array are `true`. +/// +/// The output is unknown (`None`) if the array contains any null values and +/// no `true` values. +/// +/// # Example +/// +/// ``` +/// use arrow2::array::BooleanArray; +/// use arrow2::compute::boolean_kleene::any; +/// +/// let a = BooleanArray::from(&[Some(true), Some(false)]); +/// let b = BooleanArray::from(&[Some(false), Some(false)]); +/// let c = BooleanArray::from(&[None, Some(false)]); +/// +/// assert_eq!(any(&a), Some(true)); +/// assert_eq!(any(&b), Some(false)); +/// assert_eq!(any(&c), None); +/// ``` +pub fn any(array: &BooleanArray) -> Option { if array.is_empty() { - false - } else if array.validity().is_some() { - array.into_iter().any(|v| v == Some(true)) + Some(false) + } else if array.null_count() > 0 { + if array.into_iter().any(|v| v == Some(true)) { + Some(true) + } else { + None + } } else { let vals = array.values(); - vals.unset_bits() != vals.len() + Some(vals.unset_bits() != vals.len()) } } -/// Returns whether all values in the array are `true` -pub fn all(array: &BooleanArray) -> bool { +/// Returns whether all values in the array are `true`. +/// +/// The output is unknown (`None`) if the array contains any null values and +/// no `false` values. 
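+/// A single `false` value is decisive, so the result is `Some(false)`
+/// even when nulls are also present.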
+/// +/// # Example +/// +/// ``` +/// use arrow2::array::BooleanArray; +/// use arrow2::compute::boolean_kleene::all; +/// +/// let a = BooleanArray::from(&[Some(true), Some(true)]); +/// let b = BooleanArray::from(&[Some(false), Some(true)]); +/// let c = BooleanArray::from(&[None, Some(true)]); +/// +/// assert_eq!(all(&a), Some(true)); +/// assert_eq!(all(&b), Some(false)); +/// assert_eq!(all(&c), None); +/// ``` +pub fn all(array: &BooleanArray) -> Option { if array.is_empty() { - true + Some(true) } else if array.null_count() > 0 { - false + if array.into_iter().any(|v| v == Some(false)) { + Some(false) + } else { + None + } } else { let vals = array.values(); - vals.unset_bits() == 0 + Some(vals.unset_bits() == 0) } } diff --git a/tests/it/compute/boolean.rs b/tests/it/compute/boolean.rs index 8c505a164f..ae4c0fde85 100644 --- a/tests/it/compute/boolean.rs +++ b/tests/it/compute/boolean.rs @@ -429,21 +429,24 @@ fn test_any_all() { assert!(!any(&array)); assert!(!all(&array)); let array = BooleanArray::from(&[None, Some(true), Some(true)]); - assert!(!all(&array)); assert!(any(&array)); + assert!(all(&array)); let array = BooleanArray::from_iter(std::iter::repeat(false).take(10).map(Some)); assert!(!any(&array)); assert!(!all(&array)); let array = BooleanArray::from_iter(std::iter::repeat(true).take(10).map(Some)); - assert!(all(&array)); assert!(any(&array)); + assert!(all(&array)); let array = BooleanArray::from_iter([true, false, true, true].map(Some)); - assert!(!all(&array)); assert!(any(&array)); + assert!(!all(&array)); let array = BooleanArray::from(&[Some(true)]); assert!(any(&array)); assert!(all(&array)); let array = BooleanArray::from(&[Some(false)]); assert!(!any(&array)); assert!(!all(&array)); + let array = BooleanArray::from(&[]); + assert!(!any(&array)); + assert!(all(&array)); } diff --git a/tests/it/compute/boolean_kleene.rs b/tests/it/compute/boolean_kleene.rs index 8dac6e63c4..902e5b425a 100644 --- a/tests/it/compute/boolean_kleene.rs +++ b/tests/it/compute/boolean_kleene.rs @@ -218,6 +218,6 @@ fn array_or_none() { #[test] fn array_empty() { let array = BooleanArray::from(&[]); - assert!(!any(&array)); - assert!(all(&array)); + assert_eq!(any(&array), Some(false)); + assert_eq!(all(&array), Some(true)); } From 7edf5f9e359e0ed02e9d0c6b9318b06964d805f0 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Sat, 19 Aug 2023 10:05:45 +0200 Subject: [PATCH 52/80] fix: slice values in list to fixed-size list cast (#1544) --- src/compute/cast/mod.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index d97878d497..14622f9b03 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -400,7 +400,11 @@ fn cast_list_to_fixed_size_list( "incompatible offsets in source list".to_string(), )), None => { - let new_values = cast(list.values().as_ref(), inner.data_type(), options)?; + let sliced_values = list.values().sliced( + list.offsets().first().to_usize(), + list.offsets().range().to_usize(), + ); + let new_values = cast(sliced_values.as_ref(), inner.data_type(), options)?; Ok(FixedSizeListArray::new( DataType::FixedSizeList(Box::new(inner.clone()), size), new_values, From 2b3e2a9e83725a557d78b90cd39298c5bef0ca4a Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 20 Aug 2023 11:04:58 +0200 Subject: [PATCH 53/80] feat: add fallible extend to mutable arrays (#1546) --- src/array/binary/mutable.rs | 12 ++++++++++++ src/array/binary/mutable_values.rs | 12 ++++++++++++ src/array/utf8/mutable.rs | 
12 ++++++++++++ src/array/utf8/mutable_values.rs | 12 ++++++++++++ 4 files changed, 48 insertions(+) diff --git a/src/array/binary/mutable.rs b/src/array/binary/mutable.rs index 25bbe286c2..db0312fb54 100644 --- a/src/array/binary/mutable.rs +++ b/src/array/binary/mutable.rs @@ -404,6 +404,18 @@ impl MutableBinaryArray { let (offsets, values) = values_iter(iterator); Self::try_new(Self::default_data_type(), offsets, values, None).unwrap() } + + /// Extend with a fallible iterator + pub fn extend_fallible(&mut self, iter: I) -> std::result::Result<(), E> + where + E: std::error::Error, + I: IntoIterator, E>>, + T: AsRef<[u8]>, + { + let mut iter = iter.into_iter(); + self.reserve(iter.size_hint().0, 0); + iter.try_for_each(|x| Ok(self.push(x?))) + } } impl> Extend> for MutableBinaryArray { diff --git a/src/array/binary/mutable_values.rs b/src/array/binary/mutable_values.rs index 59f42b238f..53b2d93a9a 100644 --- a/src/array/binary/mutable_values.rs +++ b/src/array/binary/mutable_values.rs @@ -314,6 +314,18 @@ impl MutableBinaryValuesArray { } Ok(array) } + + /// Extend with a fallible iterator + pub fn extend_fallible(&mut self, iter: I) -> std::result::Result<(), E> + where + E: std::error::Error, + I: IntoIterator>, + T: AsRef<[u8]>, + { + let mut iter = iter.into_iter(); + self.reserve(iter.size_hint().0, 0); + iter.try_for_each(|x| Ok(self.push(x?))) + } } impl> Extend for MutableBinaryValuesArray { diff --git a/src/array/utf8/mutable.rs b/src/array/utf8/mutable.rs index ca2013eac7..6f5b7973ed 100644 --- a/src/array/utf8/mutable.rs +++ b/src/array/utf8/mutable.rs @@ -483,6 +483,18 @@ impl MutableUtf8Array { pub fn from_iter_values, I: Iterator>(iterator: I) -> Self { MutableUtf8ValuesArray::from_iter(iterator).into() } + + /// Extend with a fallible iterator + pub fn extend_fallible(&mut self, iter: I) -> std::result::Result<(), E> + where + E: std::error::Error, + I: IntoIterator, E>>, + T: AsRef, + { + let mut iter = iter.into_iter(); + self.reserve(iter.size_hint().0, 0); + iter.try_for_each(|x| Ok(self.push(x?))) + } } impl> Extend> for MutableUtf8Array { diff --git a/src/array/utf8/mutable_values.rs b/src/array/utf8/mutable_values.rs index fc8708667d..c70c870388 100644 --- a/src/array/utf8/mutable_values.rs +++ b/src/array/utf8/mutable_values.rs @@ -359,6 +359,18 @@ impl MutableUtf8ValuesArray { } Ok(array) } + + /// Extend with a fallible iterator + pub fn extend_fallible(&mut self, iter: I) -> std::result::Result<(), E> + where + E: std::error::Error, + I: IntoIterator>, + T: AsRef, + { + let mut iter = iter.into_iter(); + self.reserve(iter.size_hint().0, 0); + iter.try_for_each(|x| Ok(self.push(x?))) + } } impl> Extend for MutableUtf8ValuesArray { From 3d7d9acdda34f36d904e362de411792714794331 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Sun, 20 Aug 2023 19:15:58 +0800 Subject: [PATCH 54/80] feat: Support cast to large list. 
(#1547) --- src/compute/cast/mod.rs | 14 ++++++++++++++ tests/it/compute/cast.rs | 23 +++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index 14622f9b03..a9d27aa12d 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -104,6 +104,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (List(list_from), LargeList(list_to)) if list_from == list_to => true, (LargeList(list_from), List(list_to)) if list_from == list_to => true, (_, List(list_to)) => can_cast_types(from_type, &list_to.data_type), + (_, LargeList(list_to)) => can_cast_types(from_type, &list_to.data_type), (Dictionary(_, from_value_type, _), Dictionary(_, to_value_type, _)) => { can_cast_types(from_value_type, to_value_type) } @@ -509,6 +510,19 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu Ok(Box::new(list_array)) } + (_, LargeList(to)) => { + // cast primitive to list's primitive + let values = cast(array, &to.data_type, options)?; + // create offsets, where if array.len() = 2, we have [0,1,2] + let offsets = (0..=array.len() as i64).collect::>(); + // Safety: offsets _are_ monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }; + + let list_array = ListArray::::new(to_type.clone(), offsets.into(), values, None); + + Ok(Box::new(list_array)) + } + (Dictionary(index_type, ..), _) => match_integer_type!(index_type, |$T| { dictionary_cast_dyn::<$T>(array, to_type, options) }), diff --git a/tests/it/compute/cast.rs b/tests/it/compute/cast.rs index 01cb31d2f2..d8a9ecfce1 100644 --- a/tests/it/compute/cast.rs +++ b/tests/it/compute/cast.rs @@ -1,5 +1,6 @@ use arrow2::array::*; use arrow2::compute::cast::{can_cast_types, cast, CastOptions}; +use arrow2::datatypes::DataType::LargeList; use arrow2::datatypes::*; use arrow2::types::{days_ms, months_days_ns, NativeType}; @@ -120,6 +121,28 @@ fn i32_to_i32() { assert_eq!(c, &expected); } +#[test] +fn i32_to_large_list_i32() { + let array = Int32Array::from_slice([5, 6, 7, 8, 9]); + let b = cast( + &array, + &LargeList(Box::new(Field::new("item", DataType::Int32, true))), + CastOptions::default(), + ) + .unwrap(); + + let arr = b.as_any().downcast_ref::>().unwrap(); + assert_eq!(&[0, 1, 2, 3, 4, 5], arr.offsets().as_slice()); + let values = arr.values(); + let c = values + .as_any() + .downcast_ref::>() + .unwrap(); + + let expected = Int32Array::from_slice([5, 6, 7, 8, 9]); + assert_eq!(c, &expected); +} + #[test] fn i32_to_list_i32() { let array = Int32Array::from_slice([5, 6, 7, 8, 9]); From 697f7fb2bda471fb67ea9ae145975c477345d84b Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 20 Aug 2023 14:35:59 +0200 Subject: [PATCH 55/80] fix clippy (#1548) --- src/array/binary/mutable.rs | 5 ++++- src/array/binary/mutable_values.rs | 5 ++++- src/array/utf8/mutable.rs | 5 ++++- src/array/utf8/mutable_values.rs | 5 ++++- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/array/binary/mutable.rs b/src/array/binary/mutable.rs index db0312fb54..32a6f17acb 100644 --- a/src/array/binary/mutable.rs +++ b/src/array/binary/mutable.rs @@ -414,7 +414,10 @@ impl MutableBinaryArray { { let mut iter = iter.into_iter(); self.reserve(iter.size_hint().0, 0); - iter.try_for_each(|x| Ok(self.push(x?))) + iter.try_for_each(|x| { + self.push(x?); + Ok(()) + }) } } diff --git a/src/array/binary/mutable_values.rs b/src/array/binary/mutable_values.rs index 53b2d93a9a..3e14d9c578 100644 --- a/src/array/binary/mutable_values.rs 
+++ b/src/array/binary/mutable_values.rs @@ -324,7 +324,10 @@ impl MutableBinaryValuesArray { { let mut iter = iter.into_iter(); self.reserve(iter.size_hint().0, 0); - iter.try_for_each(|x| Ok(self.push(x?))) + iter.try_for_each(|x| { + self.push(x?); + Ok(()) + }) } } diff --git a/src/array/utf8/mutable.rs b/src/array/utf8/mutable.rs index 6f5b7973ed..108fe8e474 100644 --- a/src/array/utf8/mutable.rs +++ b/src/array/utf8/mutable.rs @@ -493,7 +493,10 @@ impl MutableUtf8Array { { let mut iter = iter.into_iter(); self.reserve(iter.size_hint().0, 0); - iter.try_for_each(|x| Ok(self.push(x?))) + iter.try_for_each(|x| { + self.push(x?); + Ok(()) + }) } } diff --git a/src/array/utf8/mutable_values.rs b/src/array/utf8/mutable_values.rs index c70c870388..dce8b09e4c 100644 --- a/src/array/utf8/mutable_values.rs +++ b/src/array/utf8/mutable_values.rs @@ -369,7 +369,10 @@ impl MutableUtf8ValuesArray { { let mut iter = iter.into_iter(); self.reserve(iter.size_hint().0, 0); - iter.try_for_each(|x| Ok(self.push(x?))) + iter.try_for_each(|x| { + self.push(x?); + Ok(()) + }) } } From ba6a882bc1542b0b899774b696ebea77482b5c31 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Mon, 21 Aug 2023 15:36:01 +0800 Subject: [PATCH 56/80] fix: LargeBinary to LargeList should be taken as a special (#1550) --- src/compute/cast/mod.rs | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index a9d27aa12d..8f89151a06 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -104,7 +104,6 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (List(list_from), LargeList(list_to)) if list_from == list_to => true, (LargeList(list_from), List(list_to)) if list_from == list_to => true, (_, List(list_to)) => can_cast_types(from_type, &list_to.data_type), - (_, LargeList(list_to)) => can_cast_types(from_type, &list_to.data_type), (Dictionary(_, from_value_type, _), Dictionary(_, to_value_type, _)) => { can_cast_types(from_value_type, to_value_type) } @@ -151,7 +150,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Timestamp(_, _), LargeUtf8) => true, (_, Utf8) => is_numeric(from_type) || from_type == &Binary, (_, LargeUtf8) => is_numeric(from_type) || from_type == &LargeBinary, - + (_, LargeList(list_to)) => can_cast_types(from_type, &list_to.data_type), (_, Binary) => is_numeric(from_type), (_, LargeBinary) => is_numeric(from_type), @@ -510,19 +509,6 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu Ok(Box::new(list_array)) } - (_, LargeList(to)) => { - // cast primitive to list's primitive - let values = cast(array, &to.data_type, options)?; - // create offsets, where if array.len() = 2, we have [0,1,2] - let offsets = (0..=array.len() as i64).collect::>(); - // Safety: offsets _are_ monotonically increasing - let offsets = unsafe { Offsets::new_unchecked(offsets) }; - - let list_array = ListArray::::new(to_type.clone(), offsets.into(), values, None); - - Ok(Box::new(list_array)) - } - (Dictionary(index_type, ..), _) => match_integer_type!(index_type, |$T| { dictionary_cast_dyn::<$T>(array, to_type, options) }), @@ -754,6 +740,19 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu ))), }, + (_, LargeList(to)) => { + // cast primitive to list's primitive + let values = cast(array, &to.data_type, options)?; + // create offsets, where if array.len() = 2, we have [0,1,2] + let offsets = (0..=array.len() as 
i64).collect::>(); + // Safety: offsets _are_ monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }; + + let list_array = ListArray::::new(to_type.clone(), offsets.into(), values, None); + + Ok(Box::new(list_array)) + } + (_, Binary) => match from_type { UInt8 => primitive_to_binary_dyn::(array), UInt16 => primitive_to_binary_dyn::(array), From f609d0c0cc138f00f297f05e8fe23f6bf195938c Mon Sep 17 00:00:00 2001 From: Yijun Zhao Date: Tue, 29 Aug 2023 20:24:57 +0800 Subject: [PATCH 57/80] Support nested decimal read write (#1553) * add decimal and decimal256 supports to array_to_page_nested * add decimal and decimal256 supports to array_to_page_nested * support nested decimal 256 * fix reviewer comments * add tests * fix tests --- parquet_integration/write_parquet.py | 6 + .../deserialize/fixed_size_binary/basic.rs | 34 ++-- .../read/deserialize/fixed_size_binary/mod.rs | 2 + .../deserialize/fixed_size_binary/nested.rs | 189 ++++++++++++++++++ src/io/parquet/read/deserialize/nested.rs | 159 +++++++++++++++ src/io/parquet/write/mod.rs | 133 ++++++++++++ tests/it/io/parquet/mod.rs | 86 ++++++-- tests/it/io/parquet/read.rs | 12 ++ tests/it/io/parquet/write.rs | 24 +++ 9 files changed, 609 insertions(+), 36 deletions(-) create mode 100644 src/io/parquet/read/deserialize/fixed_size_binary/nested.rs diff --git a/parquet_integration/write_parquet.py b/parquet_integration/write_parquet.py index acfd819d57..a7f7560fc5 100644 --- a/parquet_integration/write_parquet.py +++ b/parquet_integration/write_parquet.py @@ -178,6 +178,8 @@ def case_nested() -> Tuple[dict, pa.Schema, str]: [""], ] + decimal_nullable = [[Decimal(n) if n is not None else None for n in sublist] if sublist is not None else None for sublist in items_nullable] + list_struct_nullable = [ [{"a": "a"}, {"a": "b"}], None, @@ -222,6 +224,8 @@ def case_nested() -> Tuple[dict, pa.Schema, str]: pa.field("list_bool", pa.list_(pa.bool_())), pa.field("list_utf8", pa.list_(pa.utf8())), pa.field("list_large_binary", pa.list_(pa.large_binary())), + pa.field("list_decimal", pa.list_(pa.decimal128(9, 0))), + pa.field("list_decimal256", pa.list_(pa.decimal256(9, 0))), pa.field("list_nested_i64", pa.list_(pa.list_(pa.int64()))), pa.field("list_nested_inner_required_i64", pa.list_(pa.list_(pa.int64()))), pa.field( @@ -251,6 +255,8 @@ def case_nested() -> Tuple[dict, pa.Schema, str]: "list_bool": boolean, "list_utf8": string, "list_large_binary": string, + "list_decimal": decimal_nullable, + "list_decimal256": decimal_nullable, "list_nested_i64": items_nested, "list_nested_inner_required_i64": items_required_nested, "list_nested_inner_required_required_i64": items_required_nested_2, diff --git a/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs b/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs index ab47aa98cf..c77ff5f027 100644 --- a/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs +++ b/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs @@ -19,16 +19,16 @@ use super::super::utils::{ use super::super::Pages; use super::utils::FixedSizeBinary; -type Dict = Vec; +pub(super) type Dict = Vec; #[derive(Debug)] -struct Optional<'a> { - values: std::slice::ChunksExact<'a, u8>, - validity: OptionalPageValidity<'a>, +pub(super) struct Optional<'a> { + pub(super) values: std::slice::ChunksExact<'a, u8>, + pub(super) validity: OptionalPageValidity<'a>, } impl<'a> Optional<'a> { - fn try_new(page: &'a DataPage, size: usize) -> Result { + pub(super) fn try_new(page: &'a DataPage, size: usize) -> 
Result { let (_, _, values) = split_buffer(page)?; let values = values.chunks_exact(size); @@ -41,12 +41,12 @@ impl<'a> Optional<'a> { } #[derive(Debug)] -struct Required<'a> { +pub(super) struct Required<'a> { pub values: std::slice::ChunksExact<'a, u8>, } impl<'a> Required<'a> { - fn new(page: &'a DataPage, size: usize) -> Self { + pub(super) fn new(page: &'a DataPage, size: usize) -> Self { let values = page.buffer(); assert_eq!(values.len() % size, 0); let values = values.chunks_exact(size); @@ -60,7 +60,7 @@ impl<'a> Required<'a> { } #[derive(Debug)] -struct FilteredRequired<'a> { +pub(super) struct FilteredRequired<'a> { pub values: SliceFilteredIter>, } @@ -83,13 +83,13 @@ impl<'a> FilteredRequired<'a> { } #[derive(Debug)] -struct RequiredDictionary<'a> { +pub(super) struct RequiredDictionary<'a> { pub values: hybrid_rle::HybridRleDecoder<'a>, - dict: &'a Dict, + pub dict: &'a Dict, } impl<'a> RequiredDictionary<'a> { - fn try_new(page: &'a DataPage, dict: &'a Dict) -> Result { + pub(super) fn try_new(page: &'a DataPage, dict: &'a Dict) -> Result { let values = dict_indices_decoder(page)?; Ok(Self { dict, values }) @@ -102,14 +102,14 @@ impl<'a> RequiredDictionary<'a> { } #[derive(Debug)] -struct OptionalDictionary<'a> { - values: hybrid_rle::HybridRleDecoder<'a>, - validity: OptionalPageValidity<'a>, - dict: &'a Dict, +pub(super) struct OptionalDictionary<'a> { + pub(super) values: hybrid_rle::HybridRleDecoder<'a>, + pub(super) validity: OptionalPageValidity<'a>, + pub(super) dict: &'a Dict, } impl<'a> OptionalDictionary<'a> { - fn try_new(page: &'a DataPage, dict: &'a Dict) -> Result { + pub(super) fn try_new(page: &'a DataPage, dict: &'a Dict) -> Result { let values = dict_indices_decoder(page)?; Ok(Self { @@ -267,7 +267,7 @@ impl<'a> Decoder<'a> for BinaryDecoder { } } -fn finish( +pub fn finish( data_type: &DataType, values: FixedSizeBinary, validity: MutableBitmap, diff --git a/src/io/parquet/read/deserialize/fixed_size_binary/mod.rs b/src/io/parquet/read/deserialize/fixed_size_binary/mod.rs index 0ed9e60eac..c48bfe276b 100644 --- a/src/io/parquet/read/deserialize/fixed_size_binary/mod.rs +++ b/src/io/parquet/read/deserialize/fixed_size_binary/mod.rs @@ -1,6 +1,8 @@ mod basic; mod dictionary; +mod nested; mod utils; pub use basic::Iter; pub use dictionary::{DictIter, NestedDictIter}; +pub use nested::NestedIter; diff --git a/src/io/parquet/read/deserialize/fixed_size_binary/nested.rs b/src/io/parquet/read/deserialize/fixed_size_binary/nested.rs new file mode 100644 index 0000000000..5cef9eabfc --- /dev/null +++ b/src/io/parquet/read/deserialize/fixed_size_binary/nested.rs @@ -0,0 +1,189 @@ +use std::collections::VecDeque; + +use parquet2::{ + encoding::Encoding, + page::{DataPage, DictPage}, + schema::Repetition, +}; + +use super::super::utils::{not_implemented, MaybeNext, PageState}; +use super::utils::FixedSizeBinary; +use crate::array::FixedSizeBinaryArray; +use crate::io::parquet::read::deserialize::fixed_size_binary::basic::{ + finish, Dict, Optional, OptionalDictionary, Required, RequiredDictionary, +}; +use crate::io::parquet::read::deserialize::nested_utils::{next, NestedDecoder}; +use crate::io::parquet::read::deserialize::utils::Pushable; +use crate::io::parquet::read::{InitNested, NestedState}; +use crate::{bitmap::MutableBitmap, datatypes::DataType, error::Result, io::parquet::read::Pages}; + +#[derive(Debug)] +enum State<'a> { + Optional(Optional<'a>), + Required(Required<'a>), + RequiredDictionary(RequiredDictionary<'a>), + 
OptionalDictionary(OptionalDictionary<'a>), +} + +impl<'a> PageState<'a> for State<'a> { + fn len(&self) -> usize { + match self { + State::Optional(state) => state.validity.len(), + State::Required(state) => state.len(), + State::RequiredDictionary(state) => state.len(), + State::OptionalDictionary(state) => state.validity.len(), + } + } +} + +#[derive(Debug, Default)] +struct BinaryDecoder { + size: usize, +} + +impl<'a> NestedDecoder<'a> for BinaryDecoder { + type State = State<'a>; + type Dictionary = Dict; + type DecodedState = (FixedSizeBinary, MutableBitmap); + + fn build_state( + &self, + page: &'a DataPage, + dict: Option<&'a Self::Dictionary>, + ) -> Result { + let is_optional = + page.descriptor.primitive_type.field_info.repetition == Repetition::Optional; + let is_filtered = page.selected_rows().is_some(); + + match (page.encoding(), dict, is_optional, is_filtered) { + (Encoding::Plain, _, true, false) => { + Ok(State::Optional(Optional::try_new(page, self.size)?)) + } + (Encoding::Plain, _, false, false) => { + Ok(State::Required(Required::new(page, self.size))) + } + (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false, false) => { + RequiredDictionary::try_new(page, dict).map(State::RequiredDictionary) + } + (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), true, false) => { + OptionalDictionary::try_new(page, dict).map(State::OptionalDictionary) + } + _ => Err(not_implemented(page)), + } + } + + fn with_capacity(&self, capacity: usize) -> Self::DecodedState { + ( + FixedSizeBinary::with_capacity(capacity, self.size), + MutableBitmap::with_capacity(capacity), + ) + } + + fn push_valid(&self, state: &mut Self::State, decoded: &mut Self::DecodedState) -> Result<()> { + let (values, validity) = decoded; + match state { + State::Optional(page) => { + let value = page.values.by_ref().next().unwrap_or_default(); + values.push(value); + validity.push(true); + } + State::Required(page) => { + let value = page.values.by_ref().next().unwrap_or_default(); + values.push(value); + } + State::RequiredDictionary(page) => { + let item = page + .values + .by_ref() + .next() + .map(|index| { + let index = index.unwrap() as usize; + &page.dict[index * self.size..(index + 1) * self.size] + }) + .unwrap_or_default(); + values.push(item); + } + State::OptionalDictionary(page) => { + let item = page + .values + .by_ref() + .next() + .map(|index| { + let index = index.unwrap() as usize; + &page.dict[index * self.size..(index + 1) * self.size] + }) + .unwrap_or_default(); + values.push(item); + validity.push(true); + } + } + Ok(()) + } + + fn push_null(&self, decoded: &mut Self::DecodedState) { + let (values, validity) = decoded; + values.push_null(); + validity.push(false); + } + + fn deserialize_dict(&self, page: &DictPage) -> Self::Dictionary { + page.buffer.clone() + } +} + +pub struct NestedIter { + iter: I, + data_type: DataType, + size: usize, + init: Vec, + items: VecDeque<(NestedState, (FixedSizeBinary, MutableBitmap))>, + dict: Option, + chunk_size: Option, + remaining: usize, +} + +impl NestedIter { + pub fn new( + iter: I, + init: Vec, + data_type: DataType, + num_rows: usize, + chunk_size: Option, + ) -> Self { + let size = FixedSizeBinaryArray::get_size(&data_type); + Self { + iter, + data_type, + size, + init, + items: VecDeque::new(), + dict: None, + chunk_size, + remaining: num_rows, + } + } +} + +impl Iterator for NestedIter { + type Item = Result<(NestedState, FixedSizeBinaryArray)>; + + fn next(&mut self) -> Option { + let maybe_state = next( + 
&mut self.iter, + &mut self.items, + &mut self.dict, + &mut self.remaining, + &self.init, + self.chunk_size, + &BinaryDecoder { size: self.size }, + ); + match maybe_state { + MaybeNext::Some(Ok((nested, decoded))) => { + Some(Ok((nested, finish(&self.data_type, decoded.0, decoded.1)))) + } + MaybeNext::Some(Err(e)) => Some(Err(e)), + MaybeNext::None => None, + MaybeNext::More => self.next(), + } + } +} diff --git a/src/io/parquet/read/deserialize/nested.rs b/src/io/parquet/read/deserialize/nested.rs index 0887751438..ff27ca007b 100644 --- a/src/io/parquet/read/deserialize/nested.rs +++ b/src/io/parquet/read/deserialize/nested.rs @@ -1,5 +1,7 @@ +use ethnum::I256; use parquet2::schema::types::PrimitiveType; +use crate::array::PrimitiveArray; use crate::{ datatypes::{DataType, Field}, error::{Error, Result}, @@ -261,6 +263,163 @@ where }); Box::new(iter) as _ } + DataType::Decimal(_, _) => { + init.push(InitNested::Primitive(field.is_nullable)); + let type_ = types.pop().unwrap(); + match type_.physical_type { + PhysicalType::Int32 => primitive(primitive::NestedIter::new( + columns.pop().unwrap(), + init, + field.data_type.clone(), + num_rows, + chunk_size, + |x: i32| x as i128, + )), + PhysicalType::Int64 => primitive(primitive::NestedIter::new( + columns.pop().unwrap(), + init, + field.data_type.clone(), + num_rows, + chunk_size, + |x: i64| x as i128, + )), + PhysicalType::FixedLenByteArray(n) if n > 16 => { + return Err(Error::InvalidArgumentError(format!( + "Can't decode Decimal128 type from `FixedLenByteArray` of len {n}" + ))) + } + PhysicalType::FixedLenByteArray(n) => { + let iter = fixed_size_binary::NestedIter::new( + columns.pop().unwrap(), + init, + DataType::FixedSizeBinary(n), + num_rows, + chunk_size, + ); + // Convert the fixed length byte array to Decimal. + let iter = iter.map(move |x| { + let (mut nested, array) = x?; + let values = array + .values() + .chunks_exact(n) + .map(|value: &[u8]| super::super::convert_i128(value, n)) + .collect::>(); + let validity = array.validity().cloned(); + + let array: Box = Box::new(PrimitiveArray::::try_new( + field.data_type.clone(), + values.into(), + validity, + )?); + + let _ = nested.nested.pop().unwrap(); // the primitive + + Ok((nested, array)) + }); + Box::new(iter) + } + _ => { + return Err(Error::nyi(format!( + "Deserializing type for Decimal {:?} from parquet", + type_.physical_type + ))) + } + } + } + DataType::Decimal256(_, _) => { + init.push(InitNested::Primitive(field.is_nullable)); + let type_ = types.pop().unwrap(); + match type_.physical_type { + PhysicalType::Int32 => primitive(primitive::NestedIter::new( + columns.pop().unwrap(), + init, + field.data_type.clone(), + num_rows, + chunk_size, + |x: i32| i256(I256::new(x as i128)), + )), + PhysicalType::Int64 => primitive(primitive::NestedIter::new( + columns.pop().unwrap(), + init, + field.data_type.clone(), + num_rows, + chunk_size, + |x: i64| i256(I256::new(x as i128)), + )), + PhysicalType::FixedLenByteArray(n) if n <= 16 => { + let iter = fixed_size_binary::NestedIter::new( + columns.pop().unwrap(), + init, + DataType::FixedSizeBinary(n), + num_rows, + chunk_size, + ); + // Convert the fixed length byte array to Decimal. 
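+                    // Parquet stores these as `n` big-endian,
+                    // two's-complement bytes; `convert_i128`
+                    // sign-extends them into an i128 (the simple,
+                    // non-nested path does the same conversion).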
+ let iter = iter.map(move |x| { + let (mut nested, array) = x?; + let values = array + .values() + .chunks_exact(n) + .map(|value| i256(I256::new(super::super::convert_i128(value, n)))) + .collect::>(); + let validity = array.validity().cloned(); + + let array: Box = Box::new(PrimitiveArray::::try_new( + field.data_type.clone(), + values.into(), + validity, + )?); + + let _ = nested.nested.pop().unwrap(); // the primitive + + Ok((nested, array)) + }); + Box::new(iter) as _ + } + + PhysicalType::FixedLenByteArray(n) if n <= 32 => { + let iter = fixed_size_binary::NestedIter::new( + columns.pop().unwrap(), + init, + DataType::FixedSizeBinary(n), + num_rows, + chunk_size, + ); + // Convert the fixed length byte array to Decimal. + let iter = iter.map(move |x| { + let (mut nested, array) = x?; + let values = array + .values() + .chunks_exact(n) + .map(super::super::convert_i256) + .collect::>(); + let validity = array.validity().cloned(); + + let array: Box = Box::new(PrimitiveArray::::try_new( + field.data_type.clone(), + values.into(), + validity, + )?); + + let _ = nested.nested.pop().unwrap(); // the primitive + + Ok((nested, array)) + }); + Box::new(iter) as _ + } + PhysicalType::FixedLenByteArray(n) => { + return Err(Error::InvalidArgumentError(format!( + "Can't decode Decimal256 type from from `FixedLenByteArray` of len {n}" + ))) + } + _ => { + return Err(Error::nyi(format!( + "Deserializing type for Decimal {:?} from parquet", + type_.physical_type + ))) + } + } + } DataType::Struct(fields) => { let columns = fields .iter() diff --git a/src/io/parquet/write/mod.rs b/src/io/parquet/write/mod.rs index a0040a9a0d..7889ea04fa 100644 --- a/src/io/parquet/write/mod.rs +++ b/src/io/parquet/write/mod.rs @@ -677,6 +677,139 @@ fn array_to_page_nested( let array = array.as_any().downcast_ref().unwrap(); primitive::nested_array_to_page::(array, options, type_, nested) } + Decimal(precision, _) => { + let type_ = type_; + let precision = *precision; + let array = array + .as_any() + .downcast_ref::>() + .unwrap(); + if precision <= 9 { + let values = array + .values() + .iter() + .map(|x| *x as i32) + .collect::>() + .into(); + + let array = + PrimitiveArray::::new(DataType::Int32, values, array.validity().cloned()); + primitive::nested_array_to_page::(&array, options, type_, nested) + } else if precision <= 18 { + let values = array + .values() + .iter() + .map(|x| *x as i64) + .collect::>() + .into(); + + let array = + PrimitiveArray::::new(DataType::Int64, values, array.validity().cloned()); + primitive::nested_array_to_page::(&array, options, type_, nested) + } else { + let size = decimal_length_from_precision(precision); + + let statistics = if options.write_statistics { + let stats = + fixed_len_bytes::build_statistics_decimal(array, type_.clone(), size); + Some(stats) + } else { + None + }; + + let mut values = Vec::::with_capacity(size * array.len()); + array.values().iter().for_each(|x| { + let bytes = &x.to_be_bytes()[16 - size..]; + values.extend_from_slice(bytes) + }); + let array = FixedSizeBinaryArray::new( + DataType::FixedSizeBinary(size), + values.into(), + array.validity().cloned(), + ); + fixed_len_bytes::array_to_page(&array, options, type_, statistics) + } + } + Decimal256(precision, _) => { + let type_ = type_; + let precision = *precision; + let array = array + .as_any() + .downcast_ref::>() + .unwrap(); + if precision <= 9 { + let values = array + .values() + .iter() + .map(|x| x.0.as_i32()) + .collect::>() + .into(); + + let array = + 
PrimitiveArray::::new(DataType::Int32, values, array.validity().cloned()); + primitive::nested_array_to_page::(&array, options, type_, nested) + } else if precision <= 18 { + let values = array + .values() + .iter() + .map(|x| x.0.as_i64()) + .collect::>() + .into(); + + let array = + PrimitiveArray::::new(DataType::Int64, values, array.validity().cloned()); + primitive::nested_array_to_page::(&array, options, type_, nested) + } else if precision <= 38 { + let size = decimal_length_from_precision(precision); + let statistics = if options.write_statistics { + let stats = fixed_len_bytes::build_statistics_decimal256_with_i128( + array, + type_.clone(), + size, + ); + Some(stats) + } else { + None + }; + + let mut values = Vec::::with_capacity(size * array.len()); + array.values().iter().for_each(|x| { + let bytes = &x.0.low().to_be_bytes()[16 - size..]; + values.extend_from_slice(bytes) + }); + let array = FixedSizeBinaryArray::new( + DataType::FixedSizeBinary(size), + values.into(), + array.validity().cloned(), + ); + fixed_len_bytes::array_to_page(&array, options, type_, statistics) + } else { + let size = 32; + let array = array + .as_any() + .downcast_ref::>() + .unwrap(); + let statistics = if options.write_statistics { + let stats = + fixed_len_bytes::build_statistics_decimal256(array, type_.clone(), size); + Some(stats) + } else { + None + }; + let mut values = Vec::::with_capacity(size * array.len()); + array.values().iter().for_each(|x| { + let bytes = &x.to_be_bytes(); + values.extend_from_slice(bytes) + }); + let array = FixedSizeBinaryArray::new( + DataType::FixedSizeBinary(size), + values.into(), + array.validity().cloned(), + ); + + fixed_len_bytes::array_to_page(&array, options, type_, statistics) + } + } other => Err(Error::NotYetImplemented(format!( "Writing nested parquet pages for data type {other:?}" ))), diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs index 1ad218e0fe..4539d21a33 100644 --- a/tests/it/io/parquet/mod.rs +++ b/tests/it/io/parquet/mod.rs @@ -63,10 +63,9 @@ pub fn read_column(mut reader: R, column: &str) -> Result Box { @@ -131,26 +130,26 @@ pub fn pyarrow_nested_edge(column: &str) -> Box { } pub fn pyarrow_nested_nullable(column: &str) -> Box { + let i64_values = &[ + Some(0), + Some(1), + Some(2), + None, + Some(3), + Some(4), + Some(5), + Some(6), + Some(7), + Some(8), + Some(9), + Some(10), + ]; let offsets = vec![0, 2, 2, 5, 8, 8, 11, 11, 12].try_into().unwrap(); let values = match column { "list_int64" => { // [[0, 1], None, [2, None, 3], [4, 5, 6], [], [7, 8, 9], None, [10]] - PrimitiveArray::::from(&[ - Some(0), - Some(1), - Some(2), - None, - Some(3), - Some(4), - Some(5), - Some(6), - Some(7), - Some(8), - Some(9), - Some(10), - ]) - .boxed() + PrimitiveArray::::from(i64_values).boxed() } "list_int64_required" | "list_int64_optional_required" | "list_int64_required_required" => { // [[0, 1], None, [2, 0, 3], [4, 5, 6], [], [7, 8, 9], None, [10]] @@ -241,6 +240,21 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { Some(b"bbb".to_vec()), Some(b"".to_vec()), ])), + "list_decimal" => { + let values = i64_values + .iter() + .map(|x| x.map(|x| x as i128)) + .collect::>(); + Box::new(PrimitiveArray::::from(values).to(DataType::Decimal(9, 0))) + } + "list_decimal256" => { + let values = i64_values + .iter() + .map(|x| x.map(|x| i256(x.as_i256()))) + .collect::>(); + let array = PrimitiveArray::::from(values).to(DataType::Decimal256(9, 0)); + Box::new(array) + } "list_nested_i64" | "list_nested_inner_required_i64" | 
"list_nested_inner_required_required_i64" => Box::new(NullArray::new(DataType::Null, 1)), @@ -422,6 +436,8 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { "list_bool" => Field::new("item", DataType::Boolean, true), "list_utf8" => Field::new("item", DataType::Utf8, true), "list_large_binary" => Field::new("item", DataType::LargeBinary, true), + "list_decimal" => Field::new("item", DataType::Decimal(9, 0), true), + "list_decimal256" => Field::new("item", DataType::Decimal256(9, 0), true), "list_struct_nullable" => Field::new("item", values.data_type().clone(), true), "list_struct_list_nullable" => Field::new("item", values.data_type().clone(), true), other => unreachable!("{}", other), @@ -868,6 +884,38 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { min_value: new_list(Box::new(BinaryArray::::from_slice([b""])), true).boxed(), max_value: new_list(Box::new(BinaryArray::::from_slice([b"ccc"])), true).boxed(), }, + "list_decimal" => Statistics { + distinct_count: new_list(UInt64Array::from([None]).boxed(), true).boxed(), + null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), + min_value: new_list( + Box::new(Int128Array::from_slice([0]).to(DataType::Decimal(9, 0))), + true, + ) + .boxed(), + max_value: new_list( + Box::new(Int128Array::from_slice([10]).to(DataType::Decimal(9, 0))), + true, + ) + .boxed(), + }, + "list_decimal256" => Statistics { + distinct_count: new_list(UInt64Array::from([None]).boxed(), true).boxed(), + null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), + min_value: new_list( + Box::new( + Int256Array::from_slice([i256(0.as_i256())]).to(DataType::Decimal256(9, 0)), + ), + true, + ) + .boxed(), + max_value: new_list( + Box::new( + Int256Array::from_slice([i256(10.as_i256())]).to(DataType::Decimal256(9, 0)), + ), + true, + ) + .boxed(), + }, "list_int64" => Statistics { distinct_count: new_list(UInt64Array::from([None]).boxed(), true).boxed(), null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), diff --git a/tests/it/io/parquet/read.rs b/tests/it/io/parquet/read.rs index 786bdf6f96..a2237b4926 100644 --- a/tests/it/io/parquet/read.rs +++ b/tests/it/io/parquet/read.rs @@ -62,6 +62,8 @@ fn test_pyarrow_integration( "list_nested_i64", "list_utf8", "list_bool", + "list_decimal", + "list_decimal256", "list_nested_inner_required_required_i64", "list_nested_inner_required_i64", // pyarrow counts null struct items as nulls @@ -322,6 +324,16 @@ fn v1_nested_large_binary() -> Result<()> { test_pyarrow_integration("list_large_binary", 1, "nested", false, false, None) } +#[test] +fn v2_nested_decimal_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal", 2, "nested", false, false, None) +} + +#[test] +fn v2_nested_decimal256_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal256", 2, "nested", false, false, None) +} + #[test] fn v2_nested_nested() -> Result<()> { test_pyarrow_integration("list_nested_i64", 2, "nested", false, false, None) diff --git a/tests/it/io/parquet/write.rs b/tests/it/io/parquet/write.rs index 439710eb24..5fda011374 100644 --- a/tests/it/io/parquet/write.rs +++ b/tests/it/io/parquet/write.rs @@ -404,6 +404,30 @@ fn list_struct_nullable() -> Result<()> { ) } +#[test] +fn list_decimal_nullable() -> Result<()> { + round_trip_opt_stats( + "list_decimal", + "nested", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal256_nullable() -> Result<()> { + round_trip_opt_stats( + 
"list_decimal256", + "nested", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + #[test] fn v1_nested_struct_list_nullable() -> Result<()> { round_trip_opt_stats( From a9139196f0f1498959a804d533fe2397cd36fb2a Mon Sep 17 00:00:00 2001 From: Jay Chia <17691182+jaychia@users.noreply.github.com> Date: Sat, 2 Sep 2023 17:10:03 -0700 Subject: [PATCH 58/80] Correctly coerce Parquet Int96 timestamps into requested TimeUnits (#1532) * Add correct coercion logic when reading int96 timestamps into specified timeunits * Refactor to better inline nested function call in Iter::new * Fix static check issues - immediately wrap and return an ArrayIter object * Fix lints --------- Co-authored-by: Jay Chia --- src/io/parquet/read/deserialize/simple.rs | 77 ++++++++++++++++++++--- tests/it/io/parquet/read.rs | 58 +++++++++++++++++ 2 files changed, 125 insertions(+), 10 deletions(-) diff --git a/src/io/parquet/read/deserialize/simple.rs b/src/io/parquet/read/deserialize/simple.rs index b4b614980e..d19296a4b7 100644 --- a/src/io/parquet/read/deserialize/simple.rs +++ b/src/io/parquet/read/deserialize/simple.rs @@ -391,6 +391,44 @@ fn unifiy_timestmap_unit( } } +#[inline] +pub fn int96_to_i64_us(value: [u32; 3]) -> i64 { + const JULIAN_DAY_OF_EPOCH: i64 = 2_440_588; + const SECONDS_PER_DAY: i64 = 86_400; + const MICROS_PER_SECOND: i64 = 1_000_000; + + let day = value[2] as i64; + let microseconds = (((value[1] as i64) << 32) + value[0] as i64) / 1_000; + let seconds = (day - JULIAN_DAY_OF_EPOCH) * SECONDS_PER_DAY; + + seconds * MICROS_PER_SECOND + microseconds +} + +#[inline] +pub fn int96_to_i64_ms(value: [u32; 3]) -> i64 { + const JULIAN_DAY_OF_EPOCH: i64 = 2_440_588; + const SECONDS_PER_DAY: i64 = 86_400; + const MILLIS_PER_SECOND: i64 = 1_000; + + let day = value[2] as i64; + let milliseconds = (((value[1] as i64) << 32) + value[0] as i64) / 1_000_000; + let seconds = (day - JULIAN_DAY_OF_EPOCH) * SECONDS_PER_DAY; + + seconds * MILLIS_PER_SECOND + milliseconds +} + +#[inline] +pub fn int96_to_i64_s(value: [u32; 3]) -> i64 { + const JULIAN_DAY_OF_EPOCH: i64 = 2_440_588; + const SECONDS_PER_DAY: i64 = 86_400; + + let day = value[2] as i64; + let seconds = (((value[1] as i64) << 32) + value[0] as i64) / 1_000_000_000; + let day_seconds = (day - JULIAN_DAY_OF_EPOCH) * SECONDS_PER_DAY; + + day_seconds + seconds +} + fn timestamp<'a, I: Pages + 'a>( pages: I, physical_type: &PhysicalType, @@ -401,16 +439,35 @@ fn timestamp<'a, I: Pages + 'a>( time_unit: TimeUnit, ) -> Result> { if physical_type == &PhysicalType::Int96 { - let iter = primitive::Iter::new(pages, data_type, num_rows, chunk_size, int96_to_i64_ns); - let logical_type = PrimitiveLogicalType::Timestamp { - unit: ParquetTimeUnit::Nanoseconds, - is_adjusted_to_utc: false, - }; - let (factor, is_multiplier) = unifiy_timestmap_unit(&Some(logical_type), time_unit); - return match (factor, is_multiplier) { - (1, _) => Ok(dyn_iter(iden(iter))), - (a, true) => Ok(dyn_iter(op(iter, move |x| x * a))), - (a, false) => Ok(dyn_iter(op(iter, move |x| x / a))), + return match time_unit { + TimeUnit::Nanosecond => Ok(dyn_iter(iden(primitive::Iter::new( + pages, + data_type, + num_rows, + chunk_size, + int96_to_i64_ns, + )))), + TimeUnit::Microsecond => Ok(dyn_iter(iden(primitive::Iter::new( + pages, + data_type, + num_rows, + chunk_size, + int96_to_i64_us, + )))), + TimeUnit::Millisecond => Ok(dyn_iter(iden(primitive::Iter::new( + pages, + data_type, + num_rows, + chunk_size, + int96_to_i64_ms, + )))), + TimeUnit::Second => 
Ok(dyn_iter(iden(primitive::Iter::new( + pages, + data_type, + num_rows, + chunk_size, + int96_to_i64_s, + )))), }; }; diff --git a/tests/it/io/parquet/read.rs b/tests/it/io/parquet/read.rs index a2237b4926..8f45eb874d 100644 --- a/tests/it/io/parquet/read.rs +++ b/tests/it/io/parquet/read.rs @@ -782,3 +782,61 @@ fn invalid_utf8() -> Result<()> { ); Ok(()) } + +#[test] +fn read_int96_timestamps() -> Result<()> { + use std::collections::BTreeMap; + + let timestamp_data = &[ + 0x50, 0x41, 0x52, 0x31, 0x15, 0x04, 0x15, 0x48, 0x15, 0x3c, 0x4c, 0x15, 0x06, 0x15, 0x00, + 0x12, 0x00, 0x00, 0x24, 0x00, 0x00, 0x0d, 0x01, 0x08, 0x9f, 0xd5, 0x1f, 0x0d, 0x0a, 0x44, + 0x00, 0x00, 0x59, 0x68, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x14, + 0xfb, 0x2a, 0x00, 0x15, 0x00, 0x15, 0x14, 0x15, 0x18, 0x2c, 0x15, 0x06, 0x15, 0x10, 0x15, + 0x06, 0x15, 0x06, 0x1c, 0x00, 0x00, 0x00, 0x0a, 0x24, 0x02, 0x00, 0x00, 0x00, 0x06, 0x01, + 0x02, 0x03, 0x24, 0x00, 0x26, 0x9e, 0x01, 0x1c, 0x15, 0x06, 0x19, 0x35, 0x10, 0x00, 0x06, + 0x19, 0x18, 0x0a, 0x74, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x73, 0x15, 0x02, + 0x16, 0x06, 0x16, 0x9e, 0x01, 0x16, 0x96, 0x01, 0x26, 0x60, 0x26, 0x08, 0x29, 0x2c, 0x15, + 0x04, 0x15, 0x00, 0x15, 0x02, 0x00, 0x15, 0x00, 0x15, 0x10, 0x15, 0x02, 0x00, 0x00, 0x00, + 0x15, 0x04, 0x19, 0x2c, 0x35, 0x00, 0x18, 0x06, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x15, + 0x02, 0x00, 0x15, 0x06, 0x25, 0x02, 0x18, 0x0a, 0x74, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, + 0x6d, 0x70, 0x73, 0x00, 0x16, 0x06, 0x19, 0x1c, 0x19, 0x1c, 0x26, 0x9e, 0x01, 0x1c, 0x15, + 0x06, 0x19, 0x35, 0x10, 0x00, 0x06, 0x19, 0x18, 0x0a, 0x74, 0x69, 0x6d, 0x65, 0x73, 0x74, + 0x61, 0x6d, 0x70, 0x73, 0x15, 0x02, 0x16, 0x06, 0x16, 0x9e, 0x01, 0x16, 0x96, 0x01, 0x26, + 0x60, 0x26, 0x08, 0x29, 0x2c, 0x15, 0x04, 0x15, 0x00, 0x15, 0x02, 0x00, 0x15, 0x00, 0x15, + 0x10, 0x15, 0x02, 0x00, 0x00, 0x00, 0x16, 0x9e, 0x01, 0x16, 0x06, 0x26, 0x08, 0x16, 0x96, + 0x01, 0x14, 0x00, 0x00, 0x28, 0x20, 0x70, 0x61, 0x72, 0x71, 0x75, 0x65, 0x74, 0x2d, 0x63, + 0x70, 0x70, 0x2d, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x20, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, + 0x6e, 0x20, 0x31, 0x32, 0x2e, 0x30, 0x2e, 0x30, 0x19, 0x1c, 0x1c, 0x00, 0x00, 0x00, 0x95, + 0x00, 0x00, 0x00, 0x50, 0x41, 0x52, 0x31, + ]; + + let parse = |time_unit: TimeUnit| { + let mut reader = Cursor::new(timestamp_data); + let metadata = read_metadata(&mut reader)?; + let schema = arrow2::datatypes::Schema { + fields: vec![arrow2::datatypes::Field::new( + "timestamps", + arrow2::datatypes::DataType::Timestamp(time_unit, None), + false, + )], + metadata: BTreeMap::new(), + }; + let reader = FileReader::new(reader, metadata.row_groups, schema, Some(5), None, None); + reader.collect::>>() + }; + + // This data contains int96 timestamps in the year 1000 and 3000, which are out of range for + // Timestamp(TimeUnit::Nanoseconds) and will cause a panic in dev builds/overflow in release builds + // However, the code should work for the Microsecond/Millisecond time units + for time_unit in [ + arrow2::datatypes::TimeUnit::Microsecond, + arrow2::datatypes::TimeUnit::Millisecond, + arrow2::datatypes::TimeUnit::Second, + ] { + parse(time_unit).expect("Should not error"); + } + std::panic::catch_unwind(|| parse(arrow2::datatypes::TimeUnit::Nanosecond)) + .expect_err("Should be a panic error"); + + Ok(()) +} From f6135f57bf08cceb1be9c2f7d104e297bdedba48 Mon Sep 17 00:00:00 2001 From: sundyli <543950155@qq.com> Date: Sat, 2 Sep 2023 19:19:26 -0700 Subject: [PATCH 59/80] chore: fix clippy (#1558) --- 
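The new `int96_to_i64_*` helpers all split the Int96 the same way: words `[0]` and `[1]` form a little-endian 64-bit count of nanoseconds within the day, word `[2]` is the Julian day number. The unit-specific variants divide the in-day nanoseconds before scaling, which is what keeps far-past and far-future dates (like the year-1000 and year-3000 values in the test above) inside `i64` range, whereas the nanosecond path overflows outside roughly 1677-2262. A compact sketch of the shared arithmetic, with constants copied from the patch:

```rust
// Sketch of the Int96 decomposition used by the helpers above; only the
// final scaling differs per time unit.
const JULIAN_DAY_OF_EPOCH: i64 = 2_440_588; // Julian day of 1970-01-01
const SECONDS_PER_DAY: i64 = 86_400;

fn int96_split(value: [u32; 3]) -> (i64, i64) {
    let day_seconds = (value[2] as i64 - JULIAN_DAY_OF_EPOCH) * SECONDS_PER_DAY;
    let nanos_in_day = ((value[1] as i64) << 32) + value[0] as i64;
    (day_seconds, nanos_in_day)
}

fn main() {
    // midnight at the Unix epoch
    let (secs, nanos) = int96_split([0, 0, 2_440_588]);
    assert_eq!((secs, nanos), (0, 0));

    // microsecond path: divide the nanos first, then scale the seconds, so
    // the multiplication never sees the full nanosecond magnitude
    let micros = secs * 1_000_000 + nanos / 1_000;
    assert_eq!(micros, 0);
}
```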
src/compute/cast/primitive_to.rs | 8 ++++---- src/compute/temporal.rs | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/compute/cast/primitive_to.rs b/src/compute/cast/primitive_to.rs index 585e826cdd..30b265e2d5 100644 --- a/src/compute/cast/primitive_to.rs +++ b/src/compute/cast/primitive_to.rs @@ -433,7 +433,7 @@ where x.map(|x| { let datetime = timestamp_ns_to_datetime(*x); let offset = timezone.offset_from_utc_datetime(&datetime); - chrono::DateTime::::from_utc(datetime, offset).to_rfc3339() + chrono::DateTime::::from_naive_utc_and_offset(datetime, offset).to_rfc3339() }) }); Utf8Array::from_trusted_len_iter(iter) @@ -443,7 +443,7 @@ where x.map(|x| { let datetime = timestamp_us_to_datetime(*x); let offset = timezone.offset_from_utc_datetime(&datetime); - chrono::DateTime::::from_utc(datetime, offset).to_rfc3339() + chrono::DateTime::::from_naive_utc_and_offset(datetime, offset).to_rfc3339() }) }); Utf8Array::from_trusted_len_iter(iter) @@ -453,7 +453,7 @@ where x.map(|x| { let datetime = timestamp_ms_to_datetime(*x); let offset = timezone.offset_from_utc_datetime(&datetime); - chrono::DateTime::::from_utc(datetime, offset).to_rfc3339() + chrono::DateTime::::from_naive_utc_and_offset(datetime, offset).to_rfc3339() }) }); Utf8Array::from_trusted_len_iter(iter) @@ -463,7 +463,7 @@ where x.map(|x| { let datetime = timestamp_s_to_datetime(*x); let offset = timezone.offset_from_utc_datetime(&datetime); - chrono::DateTime::::from_utc(datetime, offset).to_rfc3339() + chrono::DateTime::::from_naive_utc_and_offset(datetime, offset).to_rfc3339() }) }); Utf8Array::from_trusted_len_iter(iter) diff --git a/src/compute/temporal.rs b/src/compute/temporal.rs index d1fd6cc735..60e573da4b 100644 --- a/src/compute/temporal.rs +++ b/src/compute/temporal.rs @@ -288,7 +288,9 @@ where let op = |x| { let datetime = timestamp_s_to_datetime(x); let offset = timezone.offset_from_utc_datetime(&datetime); - extract(chrono::DateTime::::from_utc(datetime, offset)) + extract(chrono::DateTime::::from_naive_utc_and_offset( + datetime, offset, + )) }; unary(array, op, A::PRIMITIVE.into()) } @@ -296,7 +298,9 @@ where let op = |x| { let datetime = timestamp_ms_to_datetime(x); let offset = timezone.offset_from_utc_datetime(&datetime); - extract(chrono::DateTime::::from_utc(datetime, offset)) + extract(chrono::DateTime::::from_naive_utc_and_offset( + datetime, offset, + )) }; unary(array, op, A::PRIMITIVE.into()) } @@ -304,7 +308,9 @@ where let op = |x| { let datetime = timestamp_us_to_datetime(x); let offset = timezone.offset_from_utc_datetime(&datetime); - extract(chrono::DateTime::::from_utc(datetime, offset)) + extract(chrono::DateTime::::from_naive_utc_and_offset( + datetime, offset, + )) }; unary(array, op, A::PRIMITIVE.into()) } @@ -312,7 +318,9 @@ where let op = |x| { let datetime = timestamp_ns_to_datetime(x); let offset = timezone.offset_from_utc_datetime(&datetime); - extract(chrono::DateTime::::from_utc(datetime, offset)) + extract(chrono::DateTime::::from_naive_utc_and_offset( + datetime, offset, + )) }; unary(array, op, A::PRIMITIVE.into()) } From cf9ec83318513928a10fce1a7c02033cec99fdf2 Mon Sep 17 00:00:00 2001 From: Ivan Smirnov Date: Sun, 3 Sep 2023 03:21:23 +0100 Subject: [PATCH 60/80] MutableDictionaryArray rewrite: use values stored in the array instead of the hash->hash map (#1555) --- Cargo.toml | 4 +- src/array/dictionary/mod.rs | 1 + src/array/dictionary/mutable.rs | 135 ++++++++--------- src/array/dictionary/value_map.rs | 207 +++++++++++++++++++++++++++ 
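The clippy fix above is purely mechanical: chrono deprecated `DateTime::from_utc` in favour of `from_naive_utc_and_offset`, which takes the naive UTC instant plus the offset to render it in. A minimal sketch of the replacement call, assuming chrono 0.4.31 or later:

```rust
use chrono::{DateTime, NaiveDateTime, Utc};

// Equivalent to the old `DateTime::<Utc>::from_utc(naive, Utc)`.
fn to_rfc3339(naive_utc: NaiveDateTime) -> String {
    DateTime::<Utc>::from_naive_utc_and_offset(naive_utc, Utc).to_rfc3339()
}

fn main() {
    let epoch = NaiveDateTime::from_timestamp_opt(0, 0).unwrap();
    assert_eq!(to_rfc3339(epoch), "1970-01-01T00:00:00+00:00");
}
```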
src/array/indexable.rs | 197 +++++++++++++++++++++++++ src/array/mod.rs | 4 +- src/compute/cast/primitive_to.rs | 4 +- tests/it/array/dictionary/mutable.rs | 15 -- 8 files changed, 469 insertions(+), 98 deletions(-) create mode 100644 src/array/dictionary/value_map.rs create mode 100644 src/array/indexable.rs diff --git a/Cargo.toml b/Cargo.toml index 0f3f9ec27b..1bb20a6955 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ hash_hasher = "^2.0.3" simdutf8 = "0.1.4" # A Rust port of SwissTable -hashbrown = { version = "0.14", default-features = false, optional = true } +hashbrown = { version = "0.14", default-features = false, features = ["ahash"] } # for timezone support chrono-tz = { version = "0.8", optional = true } @@ -243,7 +243,7 @@ compute_merge_sort = ["itertools", "compute_sort"] compute_nullif = ["compute_comparison"] compute_partition = ["compute_sort"] compute_regex_match = ["regex"] -compute_sort = ["compute_take", "hashbrown"] +compute_sort = ["compute_take"] compute_substring = [] compute_take = [] compute_temporal = [] diff --git a/src/array/dictionary/mod.rs b/src/array/dictionary/mod.rs index f7d4a0f43d..a4be1ea210 100644 --- a/src/array/dictionary/mod.rs +++ b/src/array/dictionary/mod.rs @@ -20,6 +20,7 @@ mod iterator; mod mutable; use crate::array::specification::check_indexes_unchecked; mod typed_iterator; +mod value_map; use crate::array::dictionary::typed_iterator::{DictValue, DictionaryValuesIterTyped}; pub use iterator::*; diff --git a/src/array/dictionary/mutable.rs b/src/array/dictionary/mutable.rs index 444de34bcc..16a00a1357 100644 --- a/src/array/dictionary/mutable.rs +++ b/src/array/dictionary/mutable.rs @@ -1,15 +1,15 @@ -use std::hash::{Hash, Hasher}; -use std::{collections::hash_map::DefaultHasher, sync::Arc}; - -use hash_hasher::HashedMap; +use std::hash::Hash; +use std::sync::Arc; +use crate::array::indexable::{AsIndexed, Indexable}; use crate::{ array::{primitive::MutablePrimitiveArray, Array, MutableArray, TryExtend, TryPush}, bitmap::MutableBitmap, datatypes::DataType, - error::{Error, Result}, + error::Result, }; +use super::value_map::ValueMap; use super::{DictionaryArray, DictionaryKey}; /// A mutable, strong-typed version of [`DictionaryArray`]. @@ -30,55 +30,29 @@ use super::{DictionaryArray, DictionaryKey}; #[derive(Debug)] pub struct MutableDictionaryArray { data_type: DataType, + map: ValueMap, + // invariant: `max(keys) < map.values().len()` keys: MutablePrimitiveArray, - map: HashedMap, - // invariant: `keys.len() <= values.len()` - values: M, } impl From> for DictionaryArray { - fn from(mut other: MutableDictionaryArray) -> Self { + fn from(other: MutableDictionaryArray) -> Self { // Safety - the invariant of this struct ensures that this is up-held unsafe { DictionaryArray::::try_new_unchecked( other.data_type, other.keys.into(), - other.values.as_box(), + other.map.into_boxed().as_box(), ) .unwrap() } } } -impl From for MutableDictionaryArray { - fn from(values: M) -> Self { - Self { - data_type: DataType::Dictionary( - K::KEY_TYPE, - Box::new(values.data_type().clone()), - false, - ), - keys: MutablePrimitiveArray::::new(), - map: HashedMap::default(), - values, - } - } -} - impl MutableDictionaryArray { /// Creates an empty [`MutableDictionaryArray`]. 
pub fn new() -> Self { - let values = M::default(); - Self { - data_type: DataType::Dictionary( - K::KEY_TYPE, - Box::new(values.data_type().clone()), - false, - ), - keys: MutablePrimitiveArray::::new(), - map: HashedMap::default(), - values, - } + Self::try_empty(M::default()).unwrap() } } @@ -89,22 +63,34 @@ impl Default for MutableDictionaryA } impl MutableDictionaryArray { - /// Returns whether the value should be pushed to the values or not - fn try_push_valid(&mut self, value: &T) -> Result { - let mut hasher = DefaultHasher::new(); - value.hash(&mut hasher); - let hash = hasher.finish(); - match self.map.get(&hash) { - Some(key) => { - self.keys.push(Some(*key)); - Ok(false) - } - None => { - let key = K::try_from(self.map.len()).map_err(|_| Error::Overflow)?; - self.map.insert(hash, key); - self.keys.push(Some(key)); - Ok(true) - } + /// Creates an empty [`MutableDictionaryArray`] from a given empty values array. + /// # Errors + /// Errors if the array is non-empty. + pub fn try_empty(values: M) -> Result { + Ok(Self::from_value_map(ValueMap::::try_empty(values)?)) + } + + /// Creates an empty [`MutableDictionaryArray`] preloaded with a given dictionary of values. + /// Indices associated with those values are automatically assigned based on the order of + /// the values. + /// # Errors + /// Errors if there's more values than the maximum value of `K`. + pub fn from_values(values: M) -> Result + where + M: Indexable, + M::Type: Eq + Hash, + { + Ok(Self::from_value_map(ValueMap::::from_values(values)?)) + } + + fn from_value_map(value_map: ValueMap) -> Self { + let keys = MutablePrimitiveArray::::new(); + let data_type = + DataType::Dictionary(K::KEY_TYPE, Box::new(value_map.data_type().clone()), false); + Self { + data_type, + map: value_map, + keys, } } @@ -113,14 +99,9 @@ impl MutableDictionaryArray { self.keys.push(None) } - /// returns a mutable reference to the inner values. - fn mut_values(&mut self) -> &mut M { - &mut self.values - } - /// returns a reference to the inner values. pub fn values(&self) -> &M { - &self.values + self.map.values() } /// converts itself into [`Arc`] @@ -142,15 +123,10 @@ impl MutableDictionaryArray { /// Shrinks the capacity of the [`MutableDictionaryArray`] to fit its current length. pub fn shrink_to_fit(&mut self) { - self.values.shrink_to_fit(); + self.map.shrink_to_fit(); self.keys.shrink_to_fit(); } - /// Returns the dictionary map - pub fn map(&self) -> &HashedMap { - &self.map - } - /// Returns the dictionary keys pub fn keys(&self) -> &MutablePrimitiveArray { &self.keys @@ -160,7 +136,7 @@ impl MutableDictionaryArray { DictionaryArray::::try_new( self.data_type.clone(), std::mem::take(&mut self.keys).into(), - self.values.as_box(), + self.map.take_into(), ) .unwrap() } @@ -208,17 +184,20 @@ impl MutableArray for MutableDictio } } -impl TryExtend> for MutableDictionaryArray +impl TryExtend> for MutableDictionaryArray where K: DictionaryKey, - M: MutableArray + TryExtend>, + M: MutableArray + Indexable + TryExtend>, + T: AsIndexed, + M::Type: Eq + Hash, { fn try_extend>>(&mut self, iter: II) -> Result<()> { for value in iter { if let Some(value) = value { - if self.try_push_valid(&value)? 
{ - self.mut_values().try_extend(std::iter::once(Some(value)))?; - } + let key = self + .map + .try_push_valid(value, |arr, v| arr.try_extend(std::iter::once(Some(v))))?; + self.keys.try_push(Some(key))?; } else { self.push_null(); } @@ -230,19 +209,19 @@ where impl TryPush> for MutableDictionaryArray where K: DictionaryKey, - M: MutableArray + TryPush>, - T: Hash, + M: MutableArray + Indexable + TryPush>, + T: AsIndexed, + M::Type: Eq + Hash, { fn try_push(&mut self, item: Option) -> Result<()> { if let Some(value) = item { - if self.try_push_valid(&value)? { - self.values.try_push(Some(value)) - } else { - Ok(()) - } + let key = self + .map + .try_push_valid(value, |arr, v| arr.try_push(Some(v)))?; + self.keys.try_push(Some(key))?; } else { self.push_null(); - Ok(()) } + Ok(()) } } diff --git a/src/array/dictionary/value_map.rs b/src/array/dictionary/value_map.rs new file mode 100644 index 0000000000..35603de4cf --- /dev/null +++ b/src/array/dictionary/value_map.rs @@ -0,0 +1,207 @@ +use std::borrow::Borrow; +use std::fmt::{self, Debug}; +use std::hash::{Hash, Hasher}; +use std::pin::Pin; +use std::ptr::NonNull; + +use hashbrown::{Equivalent, HashMap}; + +use crate::array::Array; +use crate::{ + array::indexable::{AsIndexed, Indexable}, + array::MutableArray, + datatypes::DataType, + error::{Error, Result}, +}; + +use super::DictionaryKey; + +struct NonNullSend(NonNull); + +// safety: these pointers are for internal self-referential purposes to pinned array only +unsafe impl Send for NonNullSend {} +unsafe impl Sync for NonNullSend {} + +impl From<&M> for NonNullSend { + #[inline] + fn from(reference: &M) -> Self { + Self(NonNull::from(reference)) + } +} + +struct ValueRef { + array: NonNullSend, + index: usize, +} + +impl ValueRef { + #[inline] + pub fn new(array: &Pin>, index: usize) -> Self { + Self { + array: NonNullSend::from(Pin::get_ref(array.as_ref())), + index, + } + } + + #[inline] + pub fn get_array(&self) -> &M { + // safety: the invariant of the struct + unsafe { self.array.0.as_ref() } + } + + #[inline] + pub unsafe fn get_unchecked(&self) -> M::Value<'_> + where + M: Indexable, + { + self.get_array().value_unchecked_at(self.index) + } + + #[inline] + pub unsafe fn equals(&self, other: &M::Type) -> bool + where + M: Indexable, + M::Type: Eq, + { + self.get_unchecked().borrow() == other + } +} + +impl PartialEq for ValueRef +where + M::Type: PartialEq, +{ + #[inline] + fn eq(&self, other: &Self) -> bool { + // safety: the way these value refs are constructed, they are always within bounds + unsafe { + self.get_unchecked() + .borrow() + .eq(other.get_unchecked().borrow()) + } + } +} + +impl Eq for ValueRef where for<'a> M::Type: Eq {} + +impl Hash for ValueRef +where + M::Type: Hash, +{ + #[inline] + fn hash(&self, state: &mut H) { + // safety: the way these value refs are constructed, they are always within bounds + unsafe { self.get_unchecked().borrow().hash(state) } + } +} + +// To avoid blanket implementation issues with `Equivalent` trait (we only use hashbrown +// instead of the default HashMap to avoid blanket implementation problems with Borrow). 
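The motivation for replacing `HashedMap<u64, K>` is visible in the removed `try_push_valid` above: it mapped a value's 64-bit hash straight to a key and never compared the value itself, so two distinct values with colliding hashes would silently share one dictionary entry. The rewrite keys deduplication on the values already stored in the array. A toy illustration of the two lookup disciplines (illustrative only, not the crate's code):

```rust
use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};

fn hash64<T: Hash + ?Sized>(value: &T) -> u64 {
    let mut h = DefaultHasher::new();
    value.hash(&mut h);
    h.finish()
}

fn main() {
    // old discipline: hash -> key; on a genuine 64-bit collision a different
    // value would be handed this key and its bytes never stored at all
    let mut by_hash: HashMap<u64, usize> = HashMap::new();
    by_hash.insert(hash64("A"), 0);
    assert_eq!(by_hash.get(&hash64("A")), Some(&0));

    // new discipline: the hash only narrows the search; the verdict is an
    // equality check against the value kept in the values array
    let values = vec!["A".to_string()];
    let key = values.iter().position(|v| v == "A");
    assert_eq!(key, Some(0));
}
```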
+#[derive(Hash)] +struct Wrapped<'a, T: ?Sized>(&'a T); + +impl<'a, M: Indexable> Equivalent> for Wrapped<'a, M::Type> +where + M::Type: Eq, +{ + #[inline] + fn equivalent(&self, key: &ValueRef) -> bool { + // safety: invariant of the struct + unsafe { key.equals(self.0) } + } +} + +pub struct ValueMap { + values: Pin>, + map: HashMap, K>, +} + +impl ValueMap { + pub fn try_empty(values: M) -> Result { + if !values.is_empty() { + return Err(Error::InvalidArgumentError( + "initializing value map with non-empty values array".into(), + )); + } + Ok(Self { + values: Box::pin(values), + map: HashMap::default(), + }) + } + + pub fn from_values(values: M) -> Result + where + M: Indexable, + M::Type: Eq + Hash, + { + let values = Box::pin(values); + let map = (0..values.len()) + .map(|i| { + let key = K::try_from(i).map_err(|_| Error::Overflow)?; + Ok((ValueRef::new(&values, i), key)) + }) + .collect::>()?; + Ok(Self { values, map }) + } + + pub fn data_type(&self) -> &DataType { + Pin::get_ref(self.values.as_ref()).data_type() + } + + pub fn into_boxed(self) -> Box { + // safety: we unpin the pointer but the value map is dropped along with all + // the value references that might refer to the pinned array + unsafe { Pin::into_inner_unchecked(self.values) } + } + + pub fn take_into(&mut self) -> Box { + // safety: we unpin the pointer but the value map is manually cleared + let arr = unsafe { self.values.as_mut().get_unchecked_mut().as_box() }; + self.map.clear(); + arr + } + + #[inline] + pub fn values(&self) -> &M { + &self.values + } + + /// Try to insert a value and return its index (it may or may not get inserted). + pub fn try_push_valid( + &mut self, + value: V, + mut push: impl FnMut(&mut M, V) -> Result<()>, + ) -> Result + where + M: Indexable, + V: AsIndexed, + M::Type: Eq + Hash, + { + if let Some(&key) = self.map.get(&Wrapped(value.as_indexed())) { + return Ok(key); + } + let index = self.values.len(); + let key = K::try_from(index).map_err(|_| Error::Overflow)?; + // safety: we don't move the data out of the mutable pinned reference + unsafe { + push(self.values.as_mut().get_unchecked_mut(), value)?; + } + debug_assert_eq!(self.values.len(), index + 1); + self.map.insert(ValueRef::new(&self.values, index), key); + debug_assert_eq!(self.values.len(), self.map.len()); + Ok(key) + } + + pub fn shrink_to_fit(&mut self) { + // safety: we don't move the data out of the mutable pinned reference + unsafe { + self.values.as_mut().get_unchecked_mut().shrink_to_fit(); + } + } +} + +impl Debug for ValueMap { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + Pin::get_ref(self.values.as_ref()).fmt(f) + } +} diff --git a/src/array/indexable.rs b/src/array/indexable.rs new file mode 100644 index 0000000000..76001bfcf5 --- /dev/null +++ b/src/array/indexable.rs @@ -0,0 +1,197 @@ +use std::borrow::Borrow; + +use crate::{ + array::{ + MutableArray, MutableBinaryArray, MutableBinaryValuesArray, MutableBooleanArray, + MutableFixedSizeBinaryArray, MutablePrimitiveArray, MutableUtf8Array, + MutableUtf8ValuesArray, + }, + offset::Offset, + types::NativeType, +}; + +/// Trait for arrays that can be indexed directly to extract a value. +pub trait Indexable { + /// The type of the element at index `i`; may be a reference type or a value type. + type Value<'a>: Borrow + where + Self: 'a; + + type Type: ?Sized; + + /// Returns the element at index `i`. + /// # Panic + /// May panic if `i >= self.len()`. + fn value_at(&self, index: usize) -> Self::Value<'_>; + + /// Returns the element at index `i`. 
+ /// # Safety + /// Assumes that the `i < self.len`. + #[inline] + unsafe fn value_unchecked_at(&self, index: usize) -> Self::Value<'_> { + self.value_at(index) + } +} + +pub trait AsIndexed { + fn as_indexed(&self) -> &M::Type; +} + +impl Indexable for MutableBooleanArray { + type Value<'a> = bool; + type Type = bool; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.values().get(i) + } +} + +impl AsIndexed for bool { + #[inline] + fn as_indexed(&self) -> &bool { + self + } +} + +impl Indexable for MutableBinaryArray { + type Value<'a> = &'a [u8]; + type Type = [u8]; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + // TODO: add .value() / .value_unchecked() to MutableBinaryArray? + assert!(i < self.len()); + unsafe { self.value_unchecked_at(i) } + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + // TODO: add .value() / .value_unchecked() to MutableBinaryArray? + // soundness: the invariant of the function + let (start, end) = self.offsets().start_end_unchecked(i); + // soundness: the invariant of the struct + self.values().get_unchecked(start..end) + } +} + +impl AsIndexed> for &[u8] { + #[inline] + fn as_indexed(&self) -> &[u8] { + self + } +} + +impl Indexable for MutableBinaryValuesArray { + type Value<'a> = &'a [u8]; + type Type = [u8]; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.value(i) + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + self.value_unchecked(i) + } +} + +impl AsIndexed> for &[u8] { + #[inline] + fn as_indexed(&self) -> &[u8] { + self + } +} + +impl Indexable for MutableFixedSizeBinaryArray { + type Value<'a> = &'a [u8]; + type Type = [u8]; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.value(i) + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + // soundness: the invariant of the struct + self.value_unchecked(i) + } +} + +impl AsIndexed for &[u8] { + #[inline] + fn as_indexed(&self) -> &[u8] { + self + } +} + +// TODO: should NativeType derive from Hash? +impl Indexable for MutablePrimitiveArray { + type Value<'a> = T; + type Type = T; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + assert!(i < self.len()); + // TODO: add Length trait? 
(for both Array and MutableArray) + unsafe { self.value_unchecked_at(i) } + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + *self.values().get_unchecked(i) + } +} + +impl AsIndexed> for T { + #[inline] + fn as_indexed(&self) -> &T { + self + } +} + +impl Indexable for MutableUtf8Array { + type Value<'a> = &'a str; + type Type = str; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.value(i) + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + self.value_unchecked(i) + } +} + +impl> AsIndexed> for V { + #[inline] + fn as_indexed(&self) -> &str { + self.as_ref() + } +} + +impl Indexable for MutableUtf8ValuesArray { + type Value<'a> = &'a str; + type Type = str; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.value(i) + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + self.value_unchecked(i) + } +} + +impl> AsIndexed> for V { + #[inline] + fn as_indexed(&self) -> &str { + self.as_ref() + } +} diff --git a/src/array/mod.rs b/src/array/mod.rs index 04b7b2c8e3..1575130989 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -720,8 +720,10 @@ mod utf8; mod equal; mod ffi; mod fmt; -pub mod growable; +mod indexable; mod iterator; + +pub mod growable; pub mod ord; pub(crate) use iterator::ArrayAccessor; diff --git a/src/compute/cast/primitive_to.rs b/src/compute/cast/primitive_to.rs index 30b265e2d5..110288817a 100644 --- a/src/compute/cast/primitive_to.rs +++ b/src/compute/cast/primitive_to.rs @@ -306,9 +306,9 @@ pub fn primitive_to_dictionary( from: &PrimitiveArray, ) -> Result> { let iter = from.iter().map(|x| x.copied()); - let mut array = MutableDictionaryArray::::from(MutablePrimitiveArray::::from( + let mut array = MutableDictionaryArray::::try_empty(MutablePrimitiveArray::::from( from.data_type().clone(), - )); + ))?; array.try_extend(iter)?; Ok(array.into()) diff --git a/tests/it/array/dictionary/mutable.rs b/tests/it/array/dictionary/mutable.rs index b6103dcccf..1b54a92647 100644 --- a/tests/it/array/dictionary/mutable.rs +++ b/tests/it/array/dictionary/mutable.rs @@ -1,8 +1,5 @@ use arrow2::array::*; use arrow2::error::Result; -use hash_hasher::HashedMap; -use std::collections::hash_map::DefaultHasher; -use std::hash::{Hash, Hasher}; #[test] fn primitive() -> Result<()> { @@ -61,16 +58,4 @@ fn push_utf8() { expected_keys.push(Some(0)); expected_keys.push(Some(1)); assert_eq!(*new.keys(), expected_keys); - - let expected_map = ["A", "B", "C"] - .iter() - .enumerate() - .map(|(index, value)| { - let mut hasher = DefaultHasher::new(); - value.hash(&mut hasher); - let hash = hasher.finish(); - (hash, index as i32) - }) - .collect::>(); - assert_eq!(*new.map(), expected_map); } From b2017d7cc3611cd9d578bc675ebc3fe176d3a907 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 3 Sep 2023 14:30:49 +0200 Subject: [PATCH 61/80] Revert "MutableDictionaryArray rewrite: use values stored in the array instead of the hash->hash map" (#1559) --- Cargo.toml | 4 +- src/array/dictionary/mod.rs | 1 - src/array/dictionary/mutable.rs | 135 +++++++++-------- src/array/dictionary/value_map.rs | 207 --------------------------- src/array/indexable.rs | 197 ------------------------- src/array/mod.rs | 4 +- src/compute/cast/primitive_to.rs | 4 +- tests/it/array/dictionary/mutable.rs | 15 ++ 8 files changed, 98 insertions(+), 469 deletions(-) delete mode 100644 src/array/dictionary/value_map.rs delete mode 100644 src/array/indexable.rs diff --git 
a/Cargo.toml b/Cargo.toml index 1bb20a6955..0f3f9ec27b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ hash_hasher = "^2.0.3" simdutf8 = "0.1.4" # A Rust port of SwissTable -hashbrown = { version = "0.14", default-features = false, features = ["ahash"] } +hashbrown = { version = "0.14", default-features = false, optional = true } # for timezone support chrono-tz = { version = "0.8", optional = true } @@ -243,7 +243,7 @@ compute_merge_sort = ["itertools", "compute_sort"] compute_nullif = ["compute_comparison"] compute_partition = ["compute_sort"] compute_regex_match = ["regex"] -compute_sort = ["compute_take"] +compute_sort = ["compute_take", "hashbrown"] compute_substring = [] compute_take = [] compute_temporal = [] diff --git a/src/array/dictionary/mod.rs b/src/array/dictionary/mod.rs index a4be1ea210..f7d4a0f43d 100644 --- a/src/array/dictionary/mod.rs +++ b/src/array/dictionary/mod.rs @@ -20,7 +20,6 @@ mod iterator; mod mutable; use crate::array::specification::check_indexes_unchecked; mod typed_iterator; -mod value_map; use crate::array::dictionary::typed_iterator::{DictValue, DictionaryValuesIterTyped}; pub use iterator::*; diff --git a/src/array/dictionary/mutable.rs b/src/array/dictionary/mutable.rs index 16a00a1357..444de34bcc 100644 --- a/src/array/dictionary/mutable.rs +++ b/src/array/dictionary/mutable.rs @@ -1,15 +1,15 @@ -use std::hash::Hash; -use std::sync::Arc; +use std::hash::{Hash, Hasher}; +use std::{collections::hash_map::DefaultHasher, sync::Arc}; + +use hash_hasher::HashedMap; -use crate::array::indexable::{AsIndexed, Indexable}; use crate::{ array::{primitive::MutablePrimitiveArray, Array, MutableArray, TryExtend, TryPush}, bitmap::MutableBitmap, datatypes::DataType, - error::Result, + error::{Error, Result}, }; -use super::value_map::ValueMap; use super::{DictionaryArray, DictionaryKey}; /// A mutable, strong-typed version of [`DictionaryArray`]. @@ -30,29 +30,55 @@ use super::{DictionaryArray, DictionaryKey}; #[derive(Debug)] pub struct MutableDictionaryArray { data_type: DataType, - map: ValueMap, - // invariant: `max(keys) < map.values().len()` keys: MutablePrimitiveArray, + map: HashedMap, + // invariant: `keys.len() <= values.len()` + values: M, } impl From> for DictionaryArray { - fn from(other: MutableDictionaryArray) -> Self { + fn from(mut other: MutableDictionaryArray) -> Self { // Safety - the invariant of this struct ensures that this is up-held unsafe { DictionaryArray::::try_new_unchecked( other.data_type, other.keys.into(), - other.map.into_boxed().as_box(), + other.values.as_box(), ) .unwrap() } } } +impl From for MutableDictionaryArray { + fn from(values: M) -> Self { + Self { + data_type: DataType::Dictionary( + K::KEY_TYPE, + Box::new(values.data_type().clone()), + false, + ), + keys: MutablePrimitiveArray::::new(), + map: HashedMap::default(), + values, + } + } +} + impl MutableDictionaryArray { /// Creates an empty [`MutableDictionaryArray`]. pub fn new() -> Self { - Self::try_empty(M::default()).unwrap() + let values = M::default(); + Self { + data_type: DataType::Dictionary( + K::KEY_TYPE, + Box::new(values.data_type().clone()), + false, + ), + keys: MutablePrimitiveArray::::new(), + map: HashedMap::default(), + values, + } } } @@ -63,34 +89,22 @@ impl Default for MutableDictionaryA } impl MutableDictionaryArray { - /// Creates an empty [`MutableDictionaryArray`] from a given empty values array. - /// # Errors - /// Errors if the array is non-empty. 
- pub fn try_empty(values: M) -> Result { - Ok(Self::from_value_map(ValueMap::::try_empty(values)?)) - } - - /// Creates an empty [`MutableDictionaryArray`] preloaded with a given dictionary of values. - /// Indices associated with those values are automatically assigned based on the order of - /// the values. - /// # Errors - /// Errors if there's more values than the maximum value of `K`. - pub fn from_values(values: M) -> Result - where - M: Indexable, - M::Type: Eq + Hash, - { - Ok(Self::from_value_map(ValueMap::::from_values(values)?)) - } - - fn from_value_map(value_map: ValueMap) -> Self { - let keys = MutablePrimitiveArray::::new(); - let data_type = - DataType::Dictionary(K::KEY_TYPE, Box::new(value_map.data_type().clone()), false); - Self { - data_type, - map: value_map, - keys, + /// Returns whether the value should be pushed to the values or not + fn try_push_valid(&mut self, value: &T) -> Result { + let mut hasher = DefaultHasher::new(); + value.hash(&mut hasher); + let hash = hasher.finish(); + match self.map.get(&hash) { + Some(key) => { + self.keys.push(Some(*key)); + Ok(false) + } + None => { + let key = K::try_from(self.map.len()).map_err(|_| Error::Overflow)?; + self.map.insert(hash, key); + self.keys.push(Some(key)); + Ok(true) + } } } @@ -99,9 +113,14 @@ impl MutableDictionaryArray { self.keys.push(None) } + /// returns a mutable reference to the inner values. + fn mut_values(&mut self) -> &mut M { + &mut self.values + } + /// returns a reference to the inner values. pub fn values(&self) -> &M { - self.map.values() + &self.values } /// converts itself into [`Arc`] @@ -123,10 +142,15 @@ impl MutableDictionaryArray { /// Shrinks the capacity of the [`MutableDictionaryArray`] to fit its current length. pub fn shrink_to_fit(&mut self) { - self.map.shrink_to_fit(); + self.values.shrink_to_fit(); self.keys.shrink_to_fit(); } + /// Returns the dictionary map + pub fn map(&self) -> &HashedMap { + &self.map + } + /// Returns the dictionary keys pub fn keys(&self) -> &MutablePrimitiveArray { &self.keys @@ -136,7 +160,7 @@ impl MutableDictionaryArray { DictionaryArray::::try_new( self.data_type.clone(), std::mem::take(&mut self.keys).into(), - self.map.take_into(), + self.values.as_box(), ) .unwrap() } @@ -184,20 +208,17 @@ impl MutableArray for MutableDictio } } -impl TryExtend> for MutableDictionaryArray +impl TryExtend> for MutableDictionaryArray where K: DictionaryKey, - M: MutableArray + Indexable + TryExtend>, - T: AsIndexed, - M::Type: Eq + Hash, + M: MutableArray + TryExtend>, { fn try_extend>>(&mut self, iter: II) -> Result<()> { for value in iter { if let Some(value) = value { - let key = self - .map - .try_push_valid(value, |arr, v| arr.try_extend(std::iter::once(Some(v))))?; - self.keys.try_push(Some(key))?; + if self.try_push_valid(&value)? { + self.mut_values().try_extend(std::iter::once(Some(value)))?; + } } else { self.push_null(); } @@ -209,19 +230,19 @@ where impl TryPush> for MutableDictionaryArray where K: DictionaryKey, - M: MutableArray + Indexable + TryPush>, - T: AsIndexed, - M::Type: Eq + Hash, + M: MutableArray + TryPush>, + T: Hash, { fn try_push(&mut self, item: Option) -> Result<()> { if let Some(value) = item { - let key = self - .map - .try_push_valid(value, |arr, v| arr.try_push(Some(v)))?; - self.keys.try_push(Some(key))?; + if self.try_push_valid(&value)? 
{ + self.values.try_push(Some(value)) + } else { + Ok(()) + } } else { self.push_null(); + Ok(()) } - Ok(()) } } diff --git a/src/array/dictionary/value_map.rs b/src/array/dictionary/value_map.rs deleted file mode 100644 index 35603de4cf..0000000000 --- a/src/array/dictionary/value_map.rs +++ /dev/null @@ -1,207 +0,0 @@ -use std::borrow::Borrow; -use std::fmt::{self, Debug}; -use std::hash::{Hash, Hasher}; -use std::pin::Pin; -use std::ptr::NonNull; - -use hashbrown::{Equivalent, HashMap}; - -use crate::array::Array; -use crate::{ - array::indexable::{AsIndexed, Indexable}, - array::MutableArray, - datatypes::DataType, - error::{Error, Result}, -}; - -use super::DictionaryKey; - -struct NonNullSend(NonNull); - -// safety: these pointers are for internal self-referential purposes to pinned array only -unsafe impl Send for NonNullSend {} -unsafe impl Sync for NonNullSend {} - -impl From<&M> for NonNullSend { - #[inline] - fn from(reference: &M) -> Self { - Self(NonNull::from(reference)) - } -} - -struct ValueRef { - array: NonNullSend, - index: usize, -} - -impl ValueRef { - #[inline] - pub fn new(array: &Pin>, index: usize) -> Self { - Self { - array: NonNullSend::from(Pin::get_ref(array.as_ref())), - index, - } - } - - #[inline] - pub fn get_array(&self) -> &M { - // safety: the invariant of the struct - unsafe { self.array.0.as_ref() } - } - - #[inline] - pub unsafe fn get_unchecked(&self) -> M::Value<'_> - where - M: Indexable, - { - self.get_array().value_unchecked_at(self.index) - } - - #[inline] - pub unsafe fn equals(&self, other: &M::Type) -> bool - where - M: Indexable, - M::Type: Eq, - { - self.get_unchecked().borrow() == other - } -} - -impl PartialEq for ValueRef -where - M::Type: PartialEq, -{ - #[inline] - fn eq(&self, other: &Self) -> bool { - // safety: the way these value refs are constructed, they are always within bounds - unsafe { - self.get_unchecked() - .borrow() - .eq(other.get_unchecked().borrow()) - } - } -} - -impl Eq for ValueRef where for<'a> M::Type: Eq {} - -impl Hash for ValueRef -where - M::Type: Hash, -{ - #[inline] - fn hash(&self, state: &mut H) { - // safety: the way these value refs are constructed, they are always within bounds - unsafe { self.get_unchecked().borrow().hash(state) } - } -} - -// To avoid blanket implementation issues with `Equivalent` trait (we only use hashbrown -// instead of the default HashMap to avoid blanket implementation problems with Borrow). 
-#[derive(Hash)] -struct Wrapped<'a, T: ?Sized>(&'a T); - -impl<'a, M: Indexable> Equivalent> for Wrapped<'a, M::Type> -where - M::Type: Eq, -{ - #[inline] - fn equivalent(&self, key: &ValueRef) -> bool { - // safety: invariant of the struct - unsafe { key.equals(self.0) } - } -} - -pub struct ValueMap { - values: Pin>, - map: HashMap, K>, -} - -impl ValueMap { - pub fn try_empty(values: M) -> Result { - if !values.is_empty() { - return Err(Error::InvalidArgumentError( - "initializing value map with non-empty values array".into(), - )); - } - Ok(Self { - values: Box::pin(values), - map: HashMap::default(), - }) - } - - pub fn from_values(values: M) -> Result - where - M: Indexable, - M::Type: Eq + Hash, - { - let values = Box::pin(values); - let map = (0..values.len()) - .map(|i| { - let key = K::try_from(i).map_err(|_| Error::Overflow)?; - Ok((ValueRef::new(&values, i), key)) - }) - .collect::>()?; - Ok(Self { values, map }) - } - - pub fn data_type(&self) -> &DataType { - Pin::get_ref(self.values.as_ref()).data_type() - } - - pub fn into_boxed(self) -> Box { - // safety: we unpin the pointer but the value map is dropped along with all - // the value references that might refer to the pinned array - unsafe { Pin::into_inner_unchecked(self.values) } - } - - pub fn take_into(&mut self) -> Box { - // safety: we unpin the pointer but the value map is manually cleared - let arr = unsafe { self.values.as_mut().get_unchecked_mut().as_box() }; - self.map.clear(); - arr - } - - #[inline] - pub fn values(&self) -> &M { - &self.values - } - - /// Try to insert a value and return its index (it may or may not get inserted). - pub fn try_push_valid( - &mut self, - value: V, - mut push: impl FnMut(&mut M, V) -> Result<()>, - ) -> Result - where - M: Indexable, - V: AsIndexed, - M::Type: Eq + Hash, - { - if let Some(&key) = self.map.get(&Wrapped(value.as_indexed())) { - return Ok(key); - } - let index = self.values.len(); - let key = K::try_from(index).map_err(|_| Error::Overflow)?; - // safety: we don't move the data out of the mutable pinned reference - unsafe { - push(self.values.as_mut().get_unchecked_mut(), value)?; - } - debug_assert_eq!(self.values.len(), index + 1); - self.map.insert(ValueRef::new(&self.values, index), key); - debug_assert_eq!(self.values.len(), self.map.len()); - Ok(key) - } - - pub fn shrink_to_fit(&mut self) { - // safety: we don't move the data out of the mutable pinned reference - unsafe { - self.values.as_mut().get_unchecked_mut().shrink_to_fit(); - } - } -} - -impl Debug for ValueMap { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - Pin::get_ref(self.values.as_ref()).fmt(f) - } -} diff --git a/src/array/indexable.rs b/src/array/indexable.rs deleted file mode 100644 index 76001bfcf5..0000000000 --- a/src/array/indexable.rs +++ /dev/null @@ -1,197 +0,0 @@ -use std::borrow::Borrow; - -use crate::{ - array::{ - MutableArray, MutableBinaryArray, MutableBinaryValuesArray, MutableBooleanArray, - MutableFixedSizeBinaryArray, MutablePrimitiveArray, MutableUtf8Array, - MutableUtf8ValuesArray, - }, - offset::Offset, - types::NativeType, -}; - -/// Trait for arrays that can be indexed directly to extract a value. -pub trait Indexable { - /// The type of the element at index `i`; may be a reference type or a value type. - type Value<'a>: Borrow - where - Self: 'a; - - type Type: ?Sized; - - /// Returns the element at index `i`. - /// # Panic - /// May panic if `i >= self.len()`. 
- fn value_at(&self, index: usize) -> Self::Value<'_>; - - /// Returns the element at index `i`. - /// # Safety - /// Assumes that the `i < self.len`. - #[inline] - unsafe fn value_unchecked_at(&self, index: usize) -> Self::Value<'_> { - self.value_at(index) - } -} - -pub trait AsIndexed { - fn as_indexed(&self) -> &M::Type; -} - -impl Indexable for MutableBooleanArray { - type Value<'a> = bool; - type Type = bool; - - #[inline] - fn value_at(&self, i: usize) -> Self::Value<'_> { - self.values().get(i) - } -} - -impl AsIndexed for bool { - #[inline] - fn as_indexed(&self) -> &bool { - self - } -} - -impl Indexable for MutableBinaryArray { - type Value<'a> = &'a [u8]; - type Type = [u8]; - - #[inline] - fn value_at(&self, i: usize) -> Self::Value<'_> { - // TODO: add .value() / .value_unchecked() to MutableBinaryArray? - assert!(i < self.len()); - unsafe { self.value_unchecked_at(i) } - } - - #[inline] - unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { - // TODO: add .value() / .value_unchecked() to MutableBinaryArray? - // soundness: the invariant of the function - let (start, end) = self.offsets().start_end_unchecked(i); - // soundness: the invariant of the struct - self.values().get_unchecked(start..end) - } -} - -impl AsIndexed> for &[u8] { - #[inline] - fn as_indexed(&self) -> &[u8] { - self - } -} - -impl Indexable for MutableBinaryValuesArray { - type Value<'a> = &'a [u8]; - type Type = [u8]; - - #[inline] - fn value_at(&self, i: usize) -> Self::Value<'_> { - self.value(i) - } - - #[inline] - unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { - self.value_unchecked(i) - } -} - -impl AsIndexed> for &[u8] { - #[inline] - fn as_indexed(&self) -> &[u8] { - self - } -} - -impl Indexable for MutableFixedSizeBinaryArray { - type Value<'a> = &'a [u8]; - type Type = [u8]; - - #[inline] - fn value_at(&self, i: usize) -> Self::Value<'_> { - self.value(i) - } - - #[inline] - unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { - // soundness: the invariant of the struct - self.value_unchecked(i) - } -} - -impl AsIndexed for &[u8] { - #[inline] - fn as_indexed(&self) -> &[u8] { - self - } -} - -// TODO: should NativeType derive from Hash? -impl Indexable for MutablePrimitiveArray { - type Value<'a> = T; - type Type = T; - - #[inline] - fn value_at(&self, i: usize) -> Self::Value<'_> { - assert!(i < self.len()); - // TODO: add Length trait? 
(for both Array and MutableArray) - unsafe { self.value_unchecked_at(i) } - } - - #[inline] - unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { - *self.values().get_unchecked(i) - } -} - -impl AsIndexed> for T { - #[inline] - fn as_indexed(&self) -> &T { - self - } -} - -impl Indexable for MutableUtf8Array { - type Value<'a> = &'a str; - type Type = str; - - #[inline] - fn value_at(&self, i: usize) -> Self::Value<'_> { - self.value(i) - } - - #[inline] - unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { - self.value_unchecked(i) - } -} - -impl> AsIndexed> for V { - #[inline] - fn as_indexed(&self) -> &str { - self.as_ref() - } -} - -impl Indexable for MutableUtf8ValuesArray { - type Value<'a> = &'a str; - type Type = str; - - #[inline] - fn value_at(&self, i: usize) -> Self::Value<'_> { - self.value(i) - } - - #[inline] - unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { - self.value_unchecked(i) - } -} - -impl> AsIndexed> for V { - #[inline] - fn as_indexed(&self) -> &str { - self.as_ref() - } -} diff --git a/src/array/mod.rs b/src/array/mod.rs index 1575130989..04b7b2c8e3 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -720,10 +720,8 @@ mod utf8; mod equal; mod ffi; mod fmt; -mod indexable; -mod iterator; - pub mod growable; +mod iterator; pub mod ord; pub(crate) use iterator::ArrayAccessor; diff --git a/src/compute/cast/primitive_to.rs b/src/compute/cast/primitive_to.rs index 110288817a..30b265e2d5 100644 --- a/src/compute/cast/primitive_to.rs +++ b/src/compute/cast/primitive_to.rs @@ -306,9 +306,9 @@ pub fn primitive_to_dictionary( from: &PrimitiveArray, ) -> Result> { let iter = from.iter().map(|x| x.copied()); - let mut array = MutableDictionaryArray::::try_empty(MutablePrimitiveArray::::from( + let mut array = MutableDictionaryArray::::from(MutablePrimitiveArray::::from( from.data_type().clone(), - ))?; + )); array.try_extend(iter)?; Ok(array.into()) diff --git a/tests/it/array/dictionary/mutable.rs b/tests/it/array/dictionary/mutable.rs index 1b54a92647..b6103dcccf 100644 --- a/tests/it/array/dictionary/mutable.rs +++ b/tests/it/array/dictionary/mutable.rs @@ -1,5 +1,8 @@ use arrow2::array::*; use arrow2::error::Result; +use hash_hasher::HashedMap; +use std::collections::hash_map::DefaultHasher; +use std::hash::{Hash, Hasher}; #[test] fn primitive() -> Result<()> { @@ -58,4 +61,16 @@ fn push_utf8() { expected_keys.push(Some(0)); expected_keys.push(Some(1)); assert_eq!(*new.keys(), expected_keys); + + let expected_map = ["A", "B", "C"] + .iter() + .enumerate() + .map(|(index, value)| { + let mut hasher = DefaultHasher::new(); + value.hash(&mut hasher); + let hash = hasher.finish(); + (hash, index as i32) + }) + .collect::>(); + assert_eq!(*new.map(), expected_map); } From 87ab84460d86d5b729cfcf148406542ba40df48f Mon Sep 17 00:00:00 2001 From: Ivan Smirnov Date: Mon, 4 Sep 2023 19:23:56 +0100 Subject: [PATCH 62/80] 2nd (safe) rewrite of MutableDictionaryArray (#1561) --- Cargo.toml | 4 +- src/array/dictionary/mod.rs | 4 +- src/array/dictionary/mutable.rs | 151 ++++++++++---------- src/array/dictionary/value_map.rs | 127 +++++++++++++++++ src/array/indexable.rs | 197 +++++++++++++++++++++++++++ src/array/mod.rs | 5 +- src/compute/cast/primitive_to.rs | 4 +- tests/it/array/dictionary/mutable.rs | 104 ++++++++++++-- 8 files changed, 498 insertions(+), 98 deletions(-) create mode 100644 src/array/dictionary/value_map.rs create mode 100644 src/array/indexable.rs diff --git a/Cargo.toml b/Cargo.toml index 0f3f9ec27b..1bb20a6955 
100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ hash_hasher = "^2.0.3" simdutf8 = "0.1.4" # A Rust port of SwissTable -hashbrown = { version = "0.14", default-features = false, optional = true } +hashbrown = { version = "0.14", default-features = false, features = ["ahash"] } # for timezone support chrono-tz = { version = "0.8", optional = true } @@ -243,7 +243,7 @@ compute_merge_sort = ["itertools", "compute_sort"] compute_nullif = ["compute_comparison"] compute_partition = ["compute_sort"] compute_regex_match = ["regex"] -compute_sort = ["compute_take", "hashbrown"] +compute_sort = ["compute_take"] compute_substring = [] compute_take = [] compute_temporal = [] diff --git a/src/array/dictionary/mod.rs b/src/array/dictionary/mod.rs index f7d4a0f43d..3a23e670a1 100644 --- a/src/array/dictionary/mod.rs +++ b/src/array/dictionary/mod.rs @@ -1,3 +1,4 @@ +use std::hash::Hash; use std::hint::unreachable_unchecked; use crate::{ @@ -20,6 +21,7 @@ mod iterator; mod mutable; use crate::array::specification::check_indexes_unchecked; mod typed_iterator; +mod value_map; use crate::array::dictionary::typed_iterator::{DictValue, DictionaryValuesIterTyped}; pub use iterator::*; @@ -33,7 +35,7 @@ use super::{new_null_array, specification::check_indexes}; /// /// Any implementation of this trait must ensure that `always_fits_usize` only /// returns `true` if all values succeeds on `value::try_into::().unwrap()`. -pub unsafe trait DictionaryKey: NativeType + TryInto + TryFrom { +pub unsafe trait DictionaryKey: NativeType + TryInto + TryFrom + Hash { /// The corresponding [`IntegerType`] of this key const KEY_TYPE: IntegerType; diff --git a/src/array/dictionary/mutable.rs b/src/array/dictionary/mutable.rs index 444de34bcc..b48a57a945 100644 --- a/src/array/dictionary/mutable.rs +++ b/src/array/dictionary/mutable.rs @@ -1,15 +1,15 @@ -use std::hash::{Hash, Hasher}; -use std::{collections::hash_map::DefaultHasher, sync::Arc}; - -use hash_hasher::HashedMap; +use std::hash::Hash; +use std::sync::Arc; +use crate::array::indexable::{AsIndexed, Indexable}; use crate::{ array::{primitive::MutablePrimitiveArray, Array, MutableArray, TryExtend, TryPush}, bitmap::MutableBitmap, datatypes::DataType, - error::{Error, Result}, + error::Result, }; +use super::value_map::ValueMap; use super::{DictionaryArray, DictionaryKey}; /// A mutable, strong-typed version of [`DictionaryArray`]. @@ -30,55 +30,29 @@ use super::{DictionaryArray, DictionaryKey}; #[derive(Debug)] pub struct MutableDictionaryArray { data_type: DataType, + map: ValueMap, + // invariant: `max(keys) < map.values().len()` keys: MutablePrimitiveArray, - map: HashedMap, - // invariant: `keys.len() <= values.len()` - values: M, } impl From> for DictionaryArray { - fn from(mut other: MutableDictionaryArray) -> Self { + fn from(other: MutableDictionaryArray) -> Self { // Safety - the invariant of this struct ensures that this is up-held unsafe { DictionaryArray::::try_new_unchecked( other.data_type, other.keys.into(), - other.values.as_box(), + other.map.into_values().as_box(), ) .unwrap() } } } -impl From for MutableDictionaryArray { - fn from(values: M) -> Self { - Self { - data_type: DataType::Dictionary( - K::KEY_TYPE, - Box::new(values.data_type().clone()), - false, - ), - keys: MutablePrimitiveArray::::new(), - map: HashedMap::default(), - values, - } - } -} - impl MutableDictionaryArray { /// Creates an empty [`MutableDictionaryArray`]. 
pub fn new() -> Self { - let values = M::default(); - Self { - data_type: DataType::Dictionary( - K::KEY_TYPE, - Box::new(values.data_type().clone()), - false, - ), - keys: MutablePrimitiveArray::::new(), - map: HashedMap::default(), - values, - } + Self::try_empty(M::default()).unwrap() } } @@ -89,38 +63,61 @@ impl Default for MutableDictionaryA } impl MutableDictionaryArray { - /// Returns whether the value should be pushed to the values or not - fn try_push_valid(&mut self, value: &T) -> Result { - let mut hasher = DefaultHasher::new(); - value.hash(&mut hasher); - let hash = hasher.finish(); - match self.map.get(&hash) { - Some(key) => { - self.keys.push(Some(*key)); - Ok(false) - } - None => { - let key = K::try_from(self.map.len()).map_err(|_| Error::Overflow)?; - self.map.insert(hash, key); - self.keys.push(Some(key)); - Ok(true) - } + /// Creates an empty [`MutableDictionaryArray`] from a given empty values array. + /// # Errors + /// Errors if the array is non-empty. + pub fn try_empty(values: M) -> Result { + Ok(Self::from_value_map(ValueMap::::try_empty(values)?)) + } + + /// Creates an empty [`MutableDictionaryArray`] preloaded with a given dictionary of values. + /// Indices associated with those values are automatically assigned based on the order of + /// the values. + /// # Errors + /// Errors if there's more values than the maximum value of `K` or if values are not unique. + pub fn from_values(values: M) -> Result + where + M: Indexable, + M::Type: Eq + Hash, + { + Ok(Self::from_value_map(ValueMap::::from_values(values)?)) + } + + fn from_value_map(value_map: ValueMap) -> Self { + let keys = MutablePrimitiveArray::::new(); + let data_type = + DataType::Dictionary(K::KEY_TYPE, Box::new(value_map.data_type().clone()), false); + Self { + data_type, + map: value_map, + keys, } } + /// Creates an empty [`MutableDictionaryArray`] retaining the same dictionary as the current + /// mutable dictionary array, but with no data. This may come useful when serializing the + /// array into multiple chunks, where there's a requirement that the dictionary is the same. + /// No copying is performed, the value map is moved over to the new array. + pub fn into_empty(self) -> Self { + Self::from_value_map(self.map) + } + + /// Same as `into_empty` but clones the inner value map instead of taking full ownership. + pub fn to_empty(&self) -> Self + where + M: Clone, + { + Self::from_value_map(self.map.clone()) + } + /// pushes a null value pub fn push_null(&mut self) { self.keys.push(None) } - /// returns a mutable reference to the inner values. - fn mut_values(&mut self) -> &mut M { - &mut self.values - } - /// returns a reference to the inner values. pub fn values(&self) -> &M { - &self.values + self.map.values() } /// converts itself into [`Arc`] @@ -142,15 +139,10 @@ impl MutableDictionaryArray { /// Shrinks the capacity of the [`MutableDictionaryArray`] to fit its current length. 
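The new `into_empty`/`to_empty` constructors above target chunked serialization, where every chunk must be written against the same dictionary: finish one chunk, keep the value map, start the next with no keys. A usage sketch (API names from this patch; the chunking flow itself is illustrative):

```rust
use arrow2::array::{
    Array, DictionaryArray, MutableArray, MutableDictionaryArray, MutableUtf8Array, TryPush,
};

fn main() -> arrow2::error::Result<()> {
    let mut chunk = MutableDictionaryArray::<i32, MutableUtf8Array<i32>>::new();
    chunk.try_push(Some("x"))?;
    chunk.try_push(Some("y"))?;

    // same dictionary ("x", "y"), zero keys -- ready for the next chunk
    let mut next = chunk.to_empty();
    let first: DictionaryArray<i32> = chunk.into();
    assert_eq!(first.len(), 2);

    next.try_push(Some("y"))?; // reuses the existing key, adds no value
    assert_eq!(next.values().len(), 2);
    Ok(())
}
```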
pub fn shrink_to_fit(&mut self) { - self.values.shrink_to_fit(); + self.map.shrink_to_fit(); self.keys.shrink_to_fit(); } - /// Returns the dictionary map - pub fn map(&self) -> &HashedMap { - &self.map - } - /// Returns the dictionary keys pub fn keys(&self) -> &MutablePrimitiveArray { &self.keys @@ -160,7 +152,7 @@ impl MutableDictionaryArray { DictionaryArray::::try_new( self.data_type.clone(), std::mem::take(&mut self.keys).into(), - self.values.as_box(), + self.map.take_into(), ) .unwrap() } @@ -208,17 +200,20 @@ impl MutableArray for MutableDictio } } -impl TryExtend> for MutableDictionaryArray +impl TryExtend> for MutableDictionaryArray where K: DictionaryKey, - M: MutableArray + TryExtend>, + M: MutableArray + Indexable + TryExtend>, + T: AsIndexed, + M::Type: Eq + Hash, { fn try_extend>>(&mut self, iter: II) -> Result<()> { for value in iter { if let Some(value) = value { - if self.try_push_valid(&value)? { - self.mut_values().try_extend(std::iter::once(Some(value)))?; - } + let key = self + .map + .try_push_valid(value, |arr, v| arr.try_extend(std::iter::once(Some(v))))?; + self.keys.try_push(Some(key))?; } else { self.push_null(); } @@ -230,19 +225,19 @@ where impl TryPush> for MutableDictionaryArray where K: DictionaryKey, - M: MutableArray + TryPush>, - T: Hash, + M: MutableArray + Indexable + TryPush>, + T: AsIndexed, + M::Type: Eq + Hash, { fn try_push(&mut self, item: Option) -> Result<()> { if let Some(value) = item { - if self.try_push_valid(&value)? { - self.values.try_push(Some(value)) - } else { - Ok(()) - } + let key = self + .map + .try_push_valid(value, |arr, v| arr.try_push(Some(v)))?; + self.keys.try_push(Some(key))?; } else { self.push_null(); - Ok(()) } + Ok(()) } } diff --git a/src/array/dictionary/value_map.rs b/src/array/dictionary/value_map.rs new file mode 100644 index 0000000000..35b59aaa2a --- /dev/null +++ b/src/array/dictionary/value_map.rs @@ -0,0 +1,127 @@ +use std::borrow::Borrow; +use std::fmt::{self, Debug}; +use std::hash::{BuildHasher, Hash, Hasher}; + +use hashbrown::hash_map::RawEntryMut; +use hashbrown::HashMap; + +use crate::array::Array; +use crate::{ + array::indexable::{AsIndexed, Indexable}, + array::MutableArray, + datatypes::DataType, + error::{Error, Result}, +}; + +use super::DictionaryKey; + +#[derive(Clone)] +pub struct ValueMap { + values: M, + map: HashMap, // NB: *only* use insert_hashed_nocheck() and no other hashmap API +} + +impl ValueMap { + pub fn try_empty(values: M) -> Result { + if !values.is_empty() { + return Err(Error::InvalidArgumentError( + "initializing value map with non-empty values array".into(), + )); + } + Ok(Self { + values, + map: HashMap::default(), + }) + } + + pub fn from_values(values: M) -> Result + where + M: Indexable, + M::Type: Eq + Hash, + { + let mut map = HashMap::with_capacity(values.len()); + for index in 0..values.len() { + let key = K::try_from(index).map_err(|_| Error::Overflow)?; + // safety: we only iterate within bounds + let value = unsafe { values.value_unchecked_at(index) }; + let mut hasher = map.hasher().build_hasher(); + value.borrow().hash(&mut hasher); + let hash = hasher.finish(); + match map.raw_entry_mut().from_hash(hash, |_| true) { + RawEntryMut::Occupied(_) => { + return Err(Error::InvalidArgumentError( + "duplicate value in dictionary values array".into(), + )) + } + RawEntryMut::Vacant(entry) => { + entry.insert_hashed_nocheck(hash, key, ()); // NB: don't use .insert() here! 
+                }
+            }
+        }
+        Ok(Self { values, map })
+    }
+
+    pub fn data_type(&self) -> &DataType {
+        self.values.data_type()
+    }
+
+    pub fn into_values(self) -> M {
+        self.values
+    }
+
+    pub fn take_into(&mut self) -> Box<dyn Array> {
+        let arr = self.values.as_box();
+        self.map.clear();
+        arr
+    }
+
+    #[inline]
+    pub fn values(&self) -> &M {
+        &self.values
+    }
+
+    /// Try to insert a value and return its index (it may or may not get inserted).
+    pub fn try_push_valid<V>(
+        &mut self,
+        value: V,
+        mut push: impl FnMut(&mut M, V) -> Result<()>,
+    ) -> Result<K>
+    where
+        M: Indexable,
+        V: AsIndexed<M>,
+        M::Type: Eq + Hash,
+    {
+        let mut hasher = self.map.hasher().build_hasher();
+        value.as_indexed().hash(&mut hasher);
+        let hash = hasher.finish();
+
+        Ok(
+            match self.map.raw_entry_mut().from_hash(hash, |key| {
+                // safety: `K::try_from` succeeded when this key was inserted, so it is a valid index
+                let index = unsafe { key.as_usize() };
+                // safety: invariant of the struct, it's always in bounds since we maintain it
+                let stored_value = unsafe { self.values.value_unchecked_at(index) };
+                stored_value.borrow() == value.as_indexed()
+            }) {
+                RawEntryMut::Occupied(entry) => *entry.key(),
+                RawEntryMut::Vacant(entry) => {
+                    let index = self.values.len();
+                    let key = K::try_from(index).map_err(|_| Error::Overflow)?;
+                    entry.insert_hashed_nocheck(hash, key, ()); // NB: don't use .insert() here!
+                    push(&mut self.values, value)?;
+                    key
+                }
+            },
+        )
+    }
+
+    pub fn shrink_to_fit(&mut self) {
+        self.values.shrink_to_fit();
+    }
+}
+
+impl<K: DictionaryKey, M: MutableArray> Debug for ValueMap<K, M> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        self.values.fmt(f)
+    }
+}
diff --git a/src/array/indexable.rs b/src/array/indexable.rs
new file mode 100644
index 0000000000..76001bfcf5
--- /dev/null
+++ b/src/array/indexable.rs
@@ -0,0 +1,197 @@
+use std::borrow::Borrow;
+
+use crate::{
+    array::{
+        MutableArray, MutableBinaryArray, MutableBinaryValuesArray, MutableBooleanArray,
+        MutableFixedSizeBinaryArray, MutablePrimitiveArray, MutableUtf8Array,
+        MutableUtf8ValuesArray,
+    },
+    offset::Offset,
+    types::NativeType,
+};
+
+/// Trait for arrays that can be indexed directly to extract a value.
+pub trait Indexable {
+    /// The type of the element at index `i`; may be a reference type or a value type.
+    type Value<'a>: Borrow<Self::Type>
+    where
+        Self: 'a;
+
+    type Type: ?Sized;
+
+    /// Returns the element at index `i`.
+    /// # Panics
+    /// May panic if `i >= self.len()`.
+    fn value_at(&self, index: usize) -> Self::Value<'_>;
+
+    /// Returns the element at index `i`.
+    /// # Safety
+    /// Assumes that `i < self.len()`.
+    #[inline]
+    unsafe fn value_unchecked_at(&self, index: usize) -> Self::Value<'_> {
+        self.value_at(index)
+    }
+}
+
+pub trait AsIndexed<M: Indexable> {
+    fn as_indexed(&self) -> &M::Type;
+}
+
+impl Indexable for MutableBooleanArray {
+    type Value<'a> = bool;
+    type Type = bool;
+
+    #[inline]
+    fn value_at(&self, i: usize) -> Self::Value<'_> {
+        self.values().get(i)
+    }
+}
+
+impl AsIndexed<MutableBooleanArray> for bool {
+    #[inline]
+    fn as_indexed(&self) -> &bool {
+        self
+    }
+}
+
+impl<O: Offset> Indexable for MutableBinaryArray<O> {
+    type Value<'a> = &'a [u8];
+    type Type = [u8];
+
+    #[inline]
+    fn value_at(&self, i: usize) -> Self::Value<'_> {
+        // TODO: add .value() / .value_unchecked() to MutableBinaryArray?
+        assert!(i < self.len());
+        unsafe { self.value_unchecked_at(i) }
+    }
+
+    #[inline]
+    unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> {
+        // TODO: add .value() / .value_unchecked() to MutableBinaryArray?
+ // soundness: the invariant of the function + let (start, end) = self.offsets().start_end_unchecked(i); + // soundness: the invariant of the struct + self.values().get_unchecked(start..end) + } +} + +impl AsIndexed> for &[u8] { + #[inline] + fn as_indexed(&self) -> &[u8] { + self + } +} + +impl Indexable for MutableBinaryValuesArray { + type Value<'a> = &'a [u8]; + type Type = [u8]; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.value(i) + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + self.value_unchecked(i) + } +} + +impl AsIndexed> for &[u8] { + #[inline] + fn as_indexed(&self) -> &[u8] { + self + } +} + +impl Indexable for MutableFixedSizeBinaryArray { + type Value<'a> = &'a [u8]; + type Type = [u8]; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.value(i) + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + // soundness: the invariant of the struct + self.value_unchecked(i) + } +} + +impl AsIndexed for &[u8] { + #[inline] + fn as_indexed(&self) -> &[u8] { + self + } +} + +// TODO: should NativeType derive from Hash? +impl Indexable for MutablePrimitiveArray { + type Value<'a> = T; + type Type = T; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + assert!(i < self.len()); + // TODO: add Length trait? (for both Array and MutableArray) + unsafe { self.value_unchecked_at(i) } + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + *self.values().get_unchecked(i) + } +} + +impl AsIndexed> for T { + #[inline] + fn as_indexed(&self) -> &T { + self + } +} + +impl Indexable for MutableUtf8Array { + type Value<'a> = &'a str; + type Type = str; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.value(i) + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + self.value_unchecked(i) + } +} + +impl> AsIndexed> for V { + #[inline] + fn as_indexed(&self) -> &str { + self.as_ref() + } +} + +impl Indexable for MutableUtf8ValuesArray { + type Value<'a> = &'a str; + type Type = str; + + #[inline] + fn value_at(&self, i: usize) -> Self::Value<'_> { + self.value(i) + } + + #[inline] + unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { + self.value_unchecked(i) + } +} + +impl> AsIndexed> for V { + #[inline] + fn as_indexed(&self) -> &str { + self.as_ref() + } +} diff --git a/src/array/mod.rs b/src/array/mod.rs index 04b7b2c8e3..02735c3d0b 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -720,8 +720,11 @@ mod utf8; mod equal; mod ffi; mod fmt; -pub mod growable; +#[doc(hidden)] +pub mod indexable; mod iterator; + +pub mod growable; pub mod ord; pub(crate) use iterator::ArrayAccessor; diff --git a/src/compute/cast/primitive_to.rs b/src/compute/cast/primitive_to.rs index 30b265e2d5..110288817a 100644 --- a/src/compute/cast/primitive_to.rs +++ b/src/compute/cast/primitive_to.rs @@ -306,9 +306,9 @@ pub fn primitive_to_dictionary( from: &PrimitiveArray, ) -> Result> { let iter = from.iter().map(|x| x.copied()); - let mut array = MutableDictionaryArray::::from(MutablePrimitiveArray::::from( + let mut array = MutableDictionaryArray::::try_empty(MutablePrimitiveArray::::from( from.data_type().clone(), - )); + ))?; array.try_extend(iter)?; Ok(array.into()) diff --git a/tests/it/array/dictionary/mutable.rs b/tests/it/array/dictionary/mutable.rs index b6103dcccf..9570339893 100644 --- a/tests/it/array/dictionary/mutable.rs +++ b/tests/it/array/dictionary/mutable.rs 
@@ -1,8 +1,11 @@ +use std::borrow::Borrow; +use std::collections::HashSet; +use std::fmt::Debug; +use std::hash::Hash; + +use arrow2::array::indexable::{AsIndexed, Indexable}; use arrow2::array::*; use arrow2::error::Result; -use hash_hasher::HashedMap; -use std::collections::hash_map::DefaultHasher; -use std::hash::{Hash, Hasher}; #[test] fn primitive() -> Result<()> { @@ -61,16 +64,89 @@ fn push_utf8() { expected_keys.push(Some(0)); expected_keys.push(Some(1)); assert_eq!(*new.keys(), expected_keys); +} + +#[test] +fn into_empty() { + let mut new: MutableDictionaryArray> = MutableDictionaryArray::new(); + for value in [Some("A"), Some("B"), None, Some("C"), Some("A"), Some("B")] { + new.try_push(value).unwrap(); + } + let values = new.values().clone(); + let empty = new.into_empty(); + assert_eq!(empty.values(), &values); + assert!(empty.is_empty()); +} + +#[test] +fn from_values() { + let mut new: MutableDictionaryArray> = MutableDictionaryArray::new(); + for value in [Some("A"), Some("B"), None, Some("C"), Some("A"), Some("B")] { + new.try_push(value).unwrap(); + } + let mut values = new.values().clone(); + let empty = MutableDictionaryArray::::from_values(values.clone()).unwrap(); + assert_eq!(empty.values(), &values); + assert!(empty.is_empty()); + values.push(Some("A")); + assert!(MutableDictionaryArray::::from_values(values).is_err()); +} - let expected_map = ["A", "B", "C"] - .iter() - .enumerate() - .map(|(index, value)| { - let mut hasher = DefaultHasher::new(); - value.hash(&mut hasher); - let hash = hasher.finish(); - (hash, index as i32) - }) - .collect::>(); - assert_eq!(*new.map(), expected_map); +#[test] +fn try_empty() { + let mut values = MutableUtf8Array::::new(); + MutableDictionaryArray::::try_empty(values.clone()).unwrap(); + values.push(Some("A")); + assert!(MutableDictionaryArray::::try_empty(values.clone()).is_err()); +} + +fn test_push_ex(values: Vec, gen: impl Fn(usize) -> T) +where + M: MutableArray + Indexable + TryPush> + TryExtend> + Default + 'static, + M::Type: Eq + Hash + Debug, + T: AsIndexed + Default + Clone + Eq + Hash, +{ + for is_extend in [false, true] { + let mut set = HashSet::new(); + let mut arr = MutableDictionaryArray::::new(); + macro_rules! 
push { + ($v:expr) => { + if is_extend { + arr.try_extend(std::iter::once($v)) + } else { + arr.try_push($v) + } + }; + } + arr.push_null(); + push!(None).unwrap(); + assert_eq!(arr.len(), 2); + assert_eq!(arr.values().len(), 0); + for (i, v) in values.iter().cloned().enumerate() { + push!(Some(v.clone())).unwrap(); + let is_dup = !set.insert(v.clone()); + if !is_dup { + assert_eq!(arr.values().value_at(i).borrow(), v.as_indexed()); + assert_eq!(arr.keys().value_at(arr.keys().len() - 1), i as u8); + } + assert_eq!(arr.values().len(), set.len()); + assert_eq!(arr.len(), 3 + i); + } + for i in 0..256 - set.len() { + push!(Some(gen(i))).unwrap(); + } + assert!(push!(Some(gen(256))).is_err()); + } +} + +#[test] +fn test_push_utf8_ex() { + test_push_ex::, _>(vec!["a".into(), "b".into(), "a".into()], |i| { + i.to_string() + }) +} + +#[test] +fn test_push_i64_ex() { + test_push_ex::, _>(vec![10, 20, 30, 20], |i| 1000 + i as i64); } From 767834e7c54e39f5b540e5c7341a5b8007a2345f Mon Sep 17 00:00:00 2001 From: Ivan Smirnov Date: Wed, 6 Sep 2023 14:07:24 +0100 Subject: [PATCH 63/80] Fix: native hashed-hash for MutableDictionaryArray (#1564) --- src/array/dictionary/value_map.rs | 82 ++++++++++++++++++++++------ tests/it/array/dictionary/mutable.rs | 17 ++++++ 2 files changed, 82 insertions(+), 17 deletions(-) diff --git a/src/array/dictionary/value_map.rs b/src/array/dictionary/value_map.rs index 35b59aaa2a..eb0f8790ca 100644 --- a/src/array/dictionary/value_map.rs +++ b/src/array/dictionary/value_map.rs @@ -1,6 +1,6 @@ use std::borrow::Borrow; use std::fmt::{self, Debug}; -use std::hash::{BuildHasher, Hash, Hasher}; +use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher}; use hashbrown::hash_map::RawEntryMut; use hashbrown::HashMap; @@ -15,10 +15,54 @@ use crate::{ use super::DictionaryKey; +/// Hasher for pre-hashed values; similar to `hash_hasher` but with native endianness. +/// +/// We know that we'll only use it for `u64` values, so we can avoid endian conversion. +/// +/// Invariant: hash of a u64 value is always equal to itself. 
+#[derive(Copy, Clone, Default)]
+pub struct PassthroughHasher(u64);
+
+impl Hasher for PassthroughHasher {
+    #[inline]
+    fn write_u64(&mut self, value: u64) {
+        self.0 = value;
+    }
+
+    fn write(&mut self, _: &[u8]) {
+        unreachable!();
+    }
+
+    #[inline]
+    fn finish(&self) -> u64 {
+        self.0
+    }
+}
+
+#[derive(Clone)]
+pub struct Hashed<K> {
+    hash: u64,
+    key: K,
+}
+
+#[inline]
+fn ahash_hash<T: Hash + ?Sized>(value: &T) -> u64 {
+    let mut hasher = BuildHasherDefault::<ahash::AHasher>::default().build_hasher();
+    value.hash(&mut hasher);
+    hasher.finish()
+}
+
+impl<K> Hash for Hashed<K> {
+    #[inline]
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.hash.hash(state)
+    }
+}
+
 #[derive(Clone)]
 pub struct ValueMap<K: DictionaryKey, M: MutableArray> {
-    values: M,
-    map: HashMap<K, ()>, // NB: *only* use insert_hashed_nocheck() and no other hashmap API
+    pub values: M,
+    pub map: HashMap<Hashed<K>, (), BuildHasherDefault<PassthroughHasher>>, // NB: *only* use insert_hashed_nocheck() and no other hashmap API
 }
 
 impl<K: DictionaryKey, M: MutableArray> ValueMap<K, M> {
@@ -39,22 +83,28 @@ impl ValueMap {
         M: Indexable,
         M::Type: Eq + Hash,
     {
-        let mut map = HashMap::with_capacity(values.len());
+        let mut map = HashMap::<Hashed<K>, _, _>::with_capacity_and_hasher(
+            values.len(),
+            BuildHasherDefault::<PassthroughHasher>::default(),
+        );
         for index in 0..values.len() {
            let key = K::try_from(index).map_err(|_| Error::Overflow)?;
             // safety: we only iterate within bounds
             let value = unsafe { values.value_unchecked_at(index) };
-            let mut hasher = map.hasher().build_hasher();
-            value.borrow().hash(&mut hasher);
-            let hash = hasher.finish();
-            match map.raw_entry_mut().from_hash(hash, |_| true) {
+            let hash = ahash_hash(value.borrow());
+            match map.raw_entry_mut().from_hash(hash, |item| {
+                // safety: invariant of the struct, it's always in bounds since we maintain it
+                let stored_value = unsafe { values.value_unchecked_at(item.key.as_usize()) };
+                stored_value.borrow() == value.borrow()
+            }) {
                 RawEntryMut::Occupied(_) => {
                     return Err(Error::InvalidArgumentError(
                         "duplicate value in dictionary values array".into(),
                     ))
                 }
                 RawEntryMut::Vacant(entry) => {
-                    entry.insert_hashed_nocheck(hash, key, ()); // NB: don't use .insert() here!
+                    // NB: don't use .insert() here!
+                    entry.insert_hashed_nocheck(hash, Hashed { hash, key }, ());
                 }
             }
         }
@@ -91,24 +141,22 @@ impl ValueMap {
         V: AsIndexed<M>,
         M::Type: Eq + Hash,
     {
-        let mut hasher = self.map.hasher().build_hasher();
-        value.as_indexed().hash(&mut hasher);
-        let hash = hasher.finish();
-
+        let hash = ahash_hash(value.as_indexed());
         Ok(
-            match self.map.raw_entry_mut().from_hash(hash, |key| {
+            match self.map.raw_entry_mut().from_hash(hash, |item| {
                 // safety: `K::try_from` succeeded when this key was inserted, so it is a valid index
-                let index = unsafe { key.as_usize() };
+                let index = unsafe { item.key.as_usize() };
                 // safety: invariant of the struct, it's always in bounds since we maintain it
                 let stored_value = unsafe { self.values.value_unchecked_at(index) };
                 stored_value.borrow() == value.as_indexed()
             }) {
-                RawEntryMut::Occupied(entry) => *entry.key(),
+                RawEntryMut::Occupied(entry) => entry.key().key,
                 RawEntryMut::Vacant(entry) => {
                     let index = self.values.len();
                     let key = K::try_from(index).map_err(|_| Error::Overflow)?;
-                    entry.insert_hashed_nocheck(hash, key, ()); // NB: don't use .insert() here!
+                    entry.insert_hashed_nocheck(hash, Hashed { hash, key }, ()); // NB: don't use .insert() here!
push(&mut self.values, value)?; + debug_assert_eq!(self.values.len(), index + 1); key } }, diff --git a/tests/it/array/dictionary/mutable.rs b/tests/it/array/dictionary/mutable.rs index 9570339893..a7845114d9 100644 --- a/tests/it/array/dictionary/mutable.rs +++ b/tests/it/array/dictionary/mutable.rs @@ -150,3 +150,20 @@ fn test_push_utf8_ex() { fn test_push_i64_ex() { test_push_ex::, _>(vec![10, 20, 30, 20], |i| 1000 + i as i64); } + +#[test] +fn test_big_dict() { + let n = 10; + let strings = (0..10).map(|i| i.to_string()).collect::>(); + let mut arr = MutableDictionaryArray::>::new(); + for s in &strings { + arr.try_push(Some(s)).unwrap(); + } + assert_eq!(arr.values().len(), n); + for _ in 0..10_000 { + for s in &strings { + arr.try_push(Some(s)).unwrap(); + } + } + assert_eq!(arr.values().len(), n); +} From fb7b5fe3f61764da41a37124eee3d808a9409fb6 Mon Sep 17 00:00:00 2001 From: Jay Chia <17691182+jaychia@users.noreply.github.com> Date: Wed, 6 Sep 2023 18:36:11 -0700 Subject: [PATCH 64/80] Add SchemaInferenceOptions options to infer_schema and option to configure int96 inference (#1533) Co-authored-by: Jay Chia --- src/io/parquet/read/schema/convert.rs | 126 +++++++++++++++++++++----- src/io/parquet/read/schema/mod.rs | 34 ++++++- 2 files changed, 132 insertions(+), 28 deletions(-) diff --git a/src/io/parquet/read/schema/convert.rs b/src/io/parquet/read/schema/convert.rs index 821d510764..007797bd9d 100644 --- a/src/io/parquet/read/schema/convert.rs +++ b/src/io/parquet/read/schema/convert.rs @@ -1,4 +1,4 @@ -//! This module has a single entry point, [`parquet_to_arrow_schema`]. +//! This module has entry points, [`parquet_to_arrow_schema`] and the more configurable [`parquet_to_arrow_schema_with_options`]. use parquet2::schema::{ types::{ FieldInfo, GroupConvertedType, GroupLogicalType, IntegerType, ParquetType, PhysicalType, @@ -8,11 +8,23 @@ use parquet2::schema::{ }; use crate::datatypes::{DataType, Field, IntervalUnit, TimeUnit}; +use crate::io::parquet::read::schema::SchemaInferenceOptions; /// Converts [`ParquetType`]s to a [`Field`], ignoring parquet fields that do not contain /// any physical column. 
pub fn parquet_to_arrow_schema(fields: &[ParquetType]) -> Vec { - fields.iter().filter_map(to_field).collect::>() + parquet_to_arrow_schema_with_options(fields, &None) +} + +/// Like [`parquet_to_arrow_schema`] but with configurable options which affect the behavior of schema inference +pub fn parquet_to_arrow_schema_with_options( + fields: &[ParquetType], + options: &Option, +) -> Vec { + fields + .iter() + .filter_map(|f| to_field(f, options.as_ref().unwrap_or(&Default::default()))) + .collect::>() } fn from_int32( @@ -169,7 +181,10 @@ fn from_fixed_len_byte_array( } /// Maps a [`PhysicalType`] with optional metadata to a [`DataType`] -fn to_primitive_type_inner(primitive_type: &PrimitiveType) -> DataType { +fn to_primitive_type_inner( + primitive_type: &PrimitiveType, + options: &SchemaInferenceOptions, +) -> DataType { match primitive_type.physical_type { PhysicalType::Boolean => DataType::Boolean, PhysicalType::Int32 => { @@ -178,7 +193,7 @@ fn to_primitive_type_inner(primitive_type: &PrimitiveType) -> DataType { PhysicalType::Int64 => { from_int64(primitive_type.logical_type, primitive_type.converted_type) } - PhysicalType::Int96 => DataType::Timestamp(TimeUnit::Nanosecond, None), + PhysicalType::Int96 => DataType::Timestamp(options.int96_coerce_to_timeunit, None), PhysicalType::Float => DataType::Float32, PhysicalType::Double => DataType::Float64, PhysicalType::ByteArray => { @@ -195,8 +210,8 @@ fn to_primitive_type_inner(primitive_type: &PrimitiveType) -> DataType { /// Entry point for converting parquet primitive type to arrow type. /// /// This function takes care of repetition. -fn to_primitive_type(primitive_type: &PrimitiveType) -> DataType { - let base_type = to_primitive_type_inner(primitive_type); +fn to_primitive_type(primitive_type: &PrimitiveType, options: &SchemaInferenceOptions) -> DataType { + let base_type = to_primitive_type_inner(primitive_type, options); if primitive_type.field_info.repetition == Repetition::Repeated { DataType::List(Box::new(Field::new( @@ -214,23 +229,27 @@ fn non_repeated_group( converted_type: &Option, fields: &[ParquetType], parent_name: &str, + options: &SchemaInferenceOptions, ) -> Option { debug_assert!(!fields.is_empty()); match (logical_type, converted_type) { - (Some(GroupLogicalType::List), _) => to_list(fields, parent_name), - (None, Some(GroupConvertedType::List)) => to_list(fields, parent_name), - (Some(GroupLogicalType::Map), _) => to_list(fields, parent_name), + (Some(GroupLogicalType::List), _) => to_list(fields, parent_name, options), + (None, Some(GroupConvertedType::List)) => to_list(fields, parent_name, options), + (Some(GroupLogicalType::Map), _) => to_list(fields, parent_name, options), (None, Some(GroupConvertedType::Map) | Some(GroupConvertedType::MapKeyValue)) => { - to_map(fields) + to_map(fields, options) } - _ => to_struct(fields), + _ => to_struct(fields, options), } } /// Converts a parquet group type to an arrow [`DataType::Struct`]. /// Returns [`None`] if all its fields are empty -fn to_struct(fields: &[ParquetType]) -> Option { - let fields = fields.iter().filter_map(to_field).collect::>(); +fn to_struct(fields: &[ParquetType], options: &SchemaInferenceOptions) -> Option { + let fields = fields + .iter() + .filter_map(|f| to_field(f, options)) + .collect::>(); if fields.is_empty() { None } else { @@ -240,8 +259,8 @@ fn to_struct(fields: &[ParquetType]) -> Option { /// Converts a parquet group type to an arrow [`DataType::Struct`]. 
/// Returns [`None`] if all its fields are empty -fn to_map(fields: &[ParquetType]) -> Option { - let inner = to_field(&fields[0])?; +fn to_map(fields: &[ParquetType], options: &SchemaInferenceOptions) -> Option { + let inner = to_field(&fields[0], options)?; Some(DataType::Map(Box::new(inner), false)) } @@ -254,16 +273,17 @@ fn to_group_type( converted_type: &Option, fields: &[ParquetType], parent_name: &str, + options: &SchemaInferenceOptions, ) -> Option { debug_assert!(!fields.is_empty()); if field_info.repetition == Repetition::Repeated { Some(DataType::List(Box::new(Field::new( &field_info.name, - to_struct(fields)?, + to_struct(fields, options)?, is_nullable(field_info), )))) } else { - non_repeated_group(logical_type, converted_type, fields, parent_name) + non_repeated_group(logical_type, converted_type, fields, parent_name, options) } } @@ -279,10 +299,10 @@ pub(crate) fn is_nullable(field_info: &FieldInfo) -> bool { /// Converts parquet schema to arrow field. /// Returns `None` iff the parquet type has no associated primitive types, /// i.e. if it is a column-less group type. -fn to_field(type_: &ParquetType) -> Option { +fn to_field(type_: &ParquetType, options: &SchemaInferenceOptions) -> Option { Some(Field::new( &type_.get_field_info().name, - to_data_type(type_)?, + to_data_type(type_, options)?, is_nullable(type_.get_field_info()), )) } @@ -291,11 +311,15 @@ fn to_field(type_: &ParquetType) -> Option { /// /// To fully understand this algorithm, please refer to /// [parquet doc](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md). -fn to_list(fields: &[ParquetType], parent_name: &str) -> Option { +fn to_list( + fields: &[ParquetType], + parent_name: &str, + options: &SchemaInferenceOptions, +) -> Option { let item = fields.first().unwrap(); let item_type = match item { - ParquetType::PrimitiveType(primitive) => Some(to_primitive_type_inner(primitive)), + ParquetType::PrimitiveType(primitive) => Some(to_primitive_type_inner(primitive, options)), ParquetType::GroupType { fields, .. } => { if fields.len() == 1 && item.name() != "array" @@ -303,9 +327,9 @@ fn to_list(fields: &[ParquetType], parent_name: &str) -> Option { { // extract the repetition field let nested_item = fields.first().unwrap(); - to_data_type(nested_item) + to_data_type(nested_item, options) } else { - to_struct(fields) + to_struct(fields, options) } } }?; @@ -346,9 +370,12 @@ fn to_list(fields: &[ParquetType], parent_name: &str) -> Option { /// /// If this schema is a group type and none of its children is reserved in the /// conversion, the result is Ok(None). 
-pub(crate) fn to_data_type(type_: &ParquetType) -> Option { +pub(crate) fn to_data_type( + type_: &ParquetType, + options: &SchemaInferenceOptions, +) -> Option { match type_ { - ParquetType::PrimitiveType(primitive) => Some(to_primitive_type(primitive)), + ParquetType::PrimitiveType(primitive) => Some(to_primitive_type(primitive, options)), ParquetType::GroupType { field_info, logical_type, @@ -364,6 +391,7 @@ pub(crate) fn to_data_type(type_: &ParquetType) -> Option { converted_type, fields, &field_info.name, + options, ) } } @@ -973,4 +1001,52 @@ mod tests { assert_eq!(arrow_fields, fields); Ok(()) } + + #[test] + fn test_int96_options() -> Result<()> { + for tu in [ + TimeUnit::Second, + TimeUnit::Microsecond, + TimeUnit::Millisecond, + TimeUnit::Nanosecond, + ] { + let message_type = " + message arrow_schema { + REQUIRED INT96 int96_field; + OPTIONAL GROUP int96_list (LIST) { + REPEATED GROUP list { + OPTIONAL INT96 element; + } + } + REQUIRED GROUP int96_struct { + REQUIRED INT96 int96_field; + } + } + "; + let coerced_to = DataType::Timestamp(tu, None); + let arrow_fields = vec![ + Field::new("int96_field", coerced_to.clone(), false), + Field::new( + "int96_list", + DataType::List(Box::new(Field::new("element", coerced_to.clone(), true))), + true, + ), + Field::new( + "int96_struct", + DataType::Struct(vec![Field::new("int96_field", coerced_to.clone(), false)]), + false, + ), + ]; + + let parquet_schema = SchemaDescriptor::try_from_message(message_type)?; + let fields = parquet_to_arrow_schema_with_options( + parquet_schema.fields(), + &Some(SchemaInferenceOptions { + int96_coerce_to_timeunit: tu, + }), + ); + assert_eq!(arrow_fields, fields); + } + Ok(()) + } } diff --git a/src/io/parquet/read/schema/mod.rs b/src/io/parquet/read/schema/mod.rs index d47055ef6a..6a1e49ae7e 100644 --- a/src/io/parquet/read/schema/mod.rs +++ b/src/io/parquet/read/schema/mod.rs @@ -1,11 +1,11 @@ //! APIs to handle Parquet <-> Arrow schemas. -use crate::datatypes::Schema; +use crate::datatypes::{Schema, TimeUnit}; use crate::error::Result; mod convert; mod metadata; -pub use convert::parquet_to_arrow_schema; +pub use convert::{parquet_to_arrow_schema, parquet_to_arrow_schema_with_options}; pub use metadata::read_schema_from_metadata; pub use parquet2::metadata::{FileMetaData, KeyValue, SchemaDescriptor}; pub use parquet2::schema::types::ParquetType; @@ -14,6 +14,26 @@ pub(crate) use convert::*; use self::metadata::parse_key_value_metadata; +/// Options when inferring schemas from Parquet +pub struct SchemaInferenceOptions { + /// When inferring schemas from the Parquet INT96 timestamp type, this is the corresponding TimeUnit + /// in the inferred Arrow Timestamp type. + /// + /// This defaults to `TimeUnit::Nanosecond`, but INT96 timestamps outside of the range of years 1678-2262, + /// will overflow when parsed as `Timestamp(TimeUnit::Nanosecond)`. Setting this to a lower resolution + /// (e.g. TimeUnit::Milliseconds) will result in loss of precision, but support a larger range of dates + /// without overflowing when parsing the data. + pub int96_coerce_to_timeunit: TimeUnit, +} + +impl Default for SchemaInferenceOptions { + fn default() -> Self { + SchemaInferenceOptions { + int96_coerce_to_timeunit: TimeUnit::Nanosecond, + } + } +} + /// Infers a [`Schema`] from parquet's [`FileMetaData`]. This first looks for the metadata key /// `"ARROW:schema"`; if it does not exist, it converts the parquet types declared in the /// file's parquet schema to Arrow's equivalent. 
@@ -21,11 +41,19 @@ use self::metadata::parse_key_value_metadata;
 /// This function errors iff the key `"ARROW:schema"` exists but is not correctly encoded,
 /// indicating that the file's arrow metadata was incorrectly written.
 pub fn infer_schema(file_metadata: &FileMetaData) -> Result<Schema> {
+    infer_schema_with_options(file_metadata, &None)
+}
+
+/// Like [`infer_schema`] but with configurable options which affect the behavior of inference
+pub fn infer_schema_with_options(
+    file_metadata: &FileMetaData,
+    options: &Option<SchemaInferenceOptions>,
+) -> Result<Schema> {
     let mut metadata = parse_key_value_metadata(file_metadata.key_value_metadata());
     let schema = read_schema_from_metadata(&mut metadata)?;
     Ok(schema.unwrap_or_else(|| {
-        let fields = parquet_to_arrow_schema(file_metadata.schema().fields());
+        let fields = parquet_to_arrow_schema_with_options(file_metadata.schema().fields(), options);
         Schema { fields, metadata }
     }))
 }

From 7c93e358fc400bf3c0c0219c22eefc6b38fc2d12 Mon Sep 17 00:00:00 2001
From: Weijie Guo
Date: Mon, 11 Sep 2023 18:03:34 +0800
Subject: [PATCH 65/80] fix: More types supports cast to LargeList (#1567)

---
 src/compute/cast/mod.rs | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs
index 8f89151a06..688291dd12 100644
--- a/src/compute/cast/mod.rs
+++ b/src/compute/cast/mod.rs
@@ -104,6 +104,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
         (List(list_from), LargeList(list_to)) if list_from == list_to => true,
         (LargeList(list_from), List(list_to)) if list_from == list_to => true,
         (_, List(list_to)) => can_cast_types(from_type, &list_to.data_type),
+        (_, LargeList(list_to)) if from_type != &LargeBinary => {
+            can_cast_types(from_type, &list_to.data_type)
+        }
         (Dictionary(_, from_value_type, _), Dictionary(_, to_value_type, _)) => {
             can_cast_types(from_value_type, to_value_type)
         }
@@ -150,7 +153,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
         (Timestamp(_, _), LargeUtf8) => true,
         (_, Utf8) => is_numeric(from_type) || from_type == &Binary,
         (_, LargeUtf8) => is_numeric(from_type) || from_type == &LargeBinary,
-        (_, LargeList(list_to)) => can_cast_types(from_type, &list_to.data_type),
+
         (_, Binary) => is_numeric(from_type),
 
         (_, LargeBinary) => is_numeric(from_type),
@@ -509,6 +512,19 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu
             Ok(Box::new(list_array))
         }
 
+        (_, LargeList(to)) if from_type != &LargeBinary => {
+            // cast primitive to list's primitive
+            let values = cast(array, &to.data_type, options)?;
+            // create offsets, where if array.len() = 2, we have [0,1,2]
+            let offsets = (0..=array.len() as i64).collect::<Vec<_>>();
+            // Safety: offsets _are_ monotonically increasing
+            let offsets = unsafe { Offsets::new_unchecked(offsets) };
+
+            let list_array = ListArray::<i64>::new(to_type.clone(), offsets.into(), values, None);
+
+            Ok(Box::new(list_array))
+        }
+
         (Dictionary(index_type, ..), _) => match_integer_type!(index_type, |$T| {
             dictionary_cast_dyn::<$T>(array, to_type, options)
         }),
@@ -740,19 +756,6 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu
             ))),
         },
 
-        (_, LargeList(to)) => {
-            // cast primitive to list's primitive
-            let values = cast(array, &to.data_type, options)?;
-            // create offsets, where if array.len() = 2, we have [0,1,2]
-            let offsets = (0..=array.len() as i64).collect::<Vec<_>>();
-            // Safety: offsets _are_ monotonically increasing
-            let offsets = unsafe {
Offsets::new_unchecked(offsets) }; - - let list_array = ListArray::::new(to_type.clone(), offsets.into(), values, None); - - Ok(Box::new(list_array)) - } - (_, Binary) => match from_type { UInt8 => primitive_to_binary_dyn::(array), UInt16 => primitive_to_binary_dyn::(array), From 231a6fa61c3aad9d766165557501e28d73cf6b9a Mon Sep 17 00:00:00 2001 From: sundyli <543950155@qq.com> Date: Thu, 21 Sep 2023 19:51:56 -0700 Subject: [PATCH 66/80] fix parquet datatype conversion from arrow (#1570) --- src/io/csv/read_utils.rs | 4 ++-- src/io/odbc/read/deserialize.rs | 4 ++-- src/io/parquet/read/schema/convert.rs | 16 ++++++++++++++-- src/io/parquet/write/schema.rs | 4 ++-- src/temporal_conversions.rs | 12 ++++++------ 5 files changed, 26 insertions(+), 14 deletions(-) diff --git a/src/io/csv/read_utils.rs b/src/io/csv/read_utils.rs index d23f6c1197..cb91417ed1 100644 --- a/src/io/csv/read_utils.rs +++ b/src/io/csv/read_utils.rs @@ -136,7 +136,7 @@ fn deserialize_datetime(string: &str, tz: &T) -> Option( Timestamp(time_unit, None) => deserialize_primitive(rows, column, datatype, |bytes| { to_utf8(bytes) .and_then(|x| x.parse::().ok()) - .map(|x| x.timestamp_nanos()) + .map(|x| x.timestamp_nanos_opt().unwrap()) .map(|x| match time_unit { TimeUnit::Second => x / 1_000_000_000, TimeUnit::Millisecond => x / 1_000_000, diff --git a/src/io/odbc/read/deserialize.rs b/src/io/odbc/read/deserialize.rs index 3e18fa279b..be0a548e1a 100644 --- a/src/io/odbc/read/deserialize.rs +++ b/src/io/odbc/read/deserialize.rs @@ -264,12 +264,12 @@ fn timestamp_ms(timestamp: &odbc_api::sys::Timestamp) -> i64 { fn timestamp_us(timestamp: &odbc_api::sys::Timestamp) -> i64 { timestamp_to_naive(timestamp) - .map(|x| x.timestamp_nanos() / 1000) + .map(|x| x.timestamp_nanos_opt().unwrap() / 1000) .unwrap_or(0) } fn timestamp_ns(timestamp: &odbc_api::sys::Timestamp) -> i64 { timestamp_to_naive(timestamp) - .map(|x| x.timestamp_nanos()) + .map(|x| x.timestamp_nanos_opt().unwrap()) .unwrap_or(0) } diff --git a/src/io/parquet/read/schema/convert.rs b/src/io/parquet/read/schema/convert.rs index 007797bd9d..dea14ca86b 100644 --- a/src/io/parquet/read/schema/convert.rs +++ b/src/io/parquet/read/schema/convert.rs @@ -165,10 +165,18 @@ fn from_fixed_len_byte_array( ) -> DataType { match (logical_type, converted_type) { (Some(PrimitiveLogicalType::Decimal(precision, scale)), _) => { - DataType::Decimal(precision, scale) + if length < 32 { + DataType::Decimal(precision, scale) + } else { + DataType::Decimal256(precision, scale) + } } (None, Some(PrimitiveConvertedType::Decimal(precision, scale))) => { - DataType::Decimal(precision, scale) + if length < 32 { + DataType::Decimal(precision, scale) + } else { + DataType::Decimal256(precision, scale) + } } (None, Some(PrimitiveConvertedType::Interval)) => { // There is currently no reliable way of determining which IntervalUnit @@ -451,11 +459,15 @@ mod tests { message test_schema { REQUIRED BYTE_ARRAY binary; REQUIRED FIXED_LEN_BYTE_ARRAY (20) fixed_binary; + REQUIRED FIXED_LEN_BYTE_ARRAY (7) decimal_128 (Decimal(16, 2)) ; + REQUIRED FIXED_LEN_BYTE_ARRAY (32) decimal_256 (Decimal(44, 2)) ; } "; let expected = vec![ Field::new("binary", DataType::Binary, false), Field::new("fixed_binary", DataType::FixedSizeBinary(20), false), + Field::new("decimal_128", DataType::Decimal(16, 2), false), + Field::new("decimal_256", DataType::Decimal256(44, 2), false), ]; let parquet_schema = SchemaDescriptor::try_from_message(message)?; diff --git a/src/io/parquet/write/schema.rs b/src/io/parquet/write/schema.rs 
index 48dd853ea4..69af988d82 100644 --- a/src/io/parquet/write/schema.rs +++ b/src/io/parquet/write/schema.rs @@ -333,8 +333,8 @@ pub fn to_parquet_type(field: &Field) -> Result { name, PhysicalType::FixedLenByteArray(32), repetition, - None, - None, + Some(PrimitiveConvertedType::Decimal(precision, scale)), + logical_type, None, )?) } diff --git a/src/temporal_conversions.rs b/src/temporal_conversions.rs index b706a45b29..f2864c3417 100644 --- a/src/temporal_conversions.rs +++ b/src/temporal_conversions.rs @@ -363,7 +363,7 @@ pub fn utf8_to_timestamp_scalar( TimeUnit::Second => x.timestamp(), TimeUnit::Millisecond => x.timestamp_millis(), TimeUnit::Microsecond => x.timestamp_micros(), - TimeUnit::Nanosecond => x.timestamp_nanos(), + TimeUnit::Nanosecond => x.timestamp_nanos_opt().unwrap(), }) .ok() } else { @@ -390,7 +390,7 @@ pub fn utf8_to_naive_timestamp_scalar(value: &str, fmt: &str, tu: &TimeUnit) -> TimeUnit::Second => x.timestamp(), TimeUnit::Millisecond => x.timestamp_millis(), TimeUnit::Microsecond => x.timestamp_micros(), - TimeUnit::Nanosecond => x.timestamp_nanos(), + TimeUnit::Nanosecond => x.timestamp_nanos_opt().unwrap(), }) .ok() } @@ -515,8 +515,8 @@ pub fn add_naive_interval(timestamp: i64, time_unit: TimeUnit, interval: months_ match time_unit { TimeUnit::Second => new_datetime_tz.timestamp_millis() / 1000, TimeUnit::Millisecond => new_datetime_tz.timestamp_millis(), - TimeUnit::Microsecond => new_datetime_tz.timestamp_nanos() / 1000, - TimeUnit::Nanosecond => new_datetime_tz.timestamp_nanos(), + TimeUnit::Microsecond => new_datetime_tz.timestamp_nanos_opt().unwrap() / 1000, + TimeUnit::Nanosecond => new_datetime_tz.timestamp_nanos_opt().unwrap(), } } @@ -544,7 +544,7 @@ pub fn add_interval( match time_unit { TimeUnit::Second => new_datetime_tz.timestamp_millis() / 1000, TimeUnit::Millisecond => new_datetime_tz.timestamp_millis(), - TimeUnit::Microsecond => new_datetime_tz.timestamp_nanos() / 1000, - TimeUnit::Nanosecond => new_datetime_tz.timestamp_nanos(), + TimeUnit::Microsecond => new_datetime_tz.timestamp_nanos_opt().unwrap() / 1000, + TimeUnit::Nanosecond => new_datetime_tz.timestamp_nanos_opt().unwrap(), } } From 8880501b07405bc9d3b75210ca883eecacffb8e1 Mon Sep 17 00:00:00 2001 From: zhyass Date: Sat, 23 Sep 2023 19:00:34 +0800 Subject: [PATCH 67/80] fix typo in merge_sort comment (#1571) --- src/compute/merge_sort/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compute/merge_sort/mod.rs b/src/compute/merge_sort/mod.rs index 598ba66d31..f57b09bb4a 100644 --- a/src/compute/merge_sort/mod.rs +++ b/src/compute/merge_sort/mod.rs @@ -74,7 +74,7 @@ use crate::error::Result; /// This is used to keep track of contiguous blocks of slots. /// An array of MergeSlice, `[MergeSlice]`, represents inter-leaved array slices. /// For example, `[(0, 0, 2), (1, 0, 1), (0, 2, 3)]` represents 2 arrays (a0 and a1) arranged as follows: -/// `[a0[0..2], a1[0..1], a0[2..3]]` +/// `[a0[0..2], a1[0..1], a0[2..5]]` /// This representation is useful when building arrays in memory as it allows to memcopy slices of arrays. /// This is particularly useful in merge-sort because sorted arrays (passed to the merge-sort) are more likely /// to have contiguous blocks of sorted elements (than by random). 
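A note on the comment fixed in the patch above: a `MergeSlice` is evidently an `(array_index, start, len)` triple, so `(0, 2, 3)` denotes a slice of length 3 starting at offset 2 of array 0, i.e. `a0[2..5]` — hence the correction from `a0[2..3]`. A minimal sketch of how such interleaved slices are materialized; the `gather` helper and the `i32` element type are illustrative assumptions, not the crate's actual merge-sort code:

fn gather(sources: &[&[i32]], slices: &[(usize, usize, usize)]) -> Vec<i32> {
    let mut out = Vec::new();
    for &(array, start, len) in slices {
        // each (array, start, len) triple copies sources[array][start..start + len]
        out.extend_from_slice(&sources[array][start..start + len]);
    }
    out
}

fn main() {
    let a0: &[i32] = &[0, 1, 2, 3, 4];
    let a1: &[i32] = &[10];
    // [(0, 0, 2), (1, 0, 1), (0, 2, 3)] == [a0[0..2], a1[0..1], a0[2..5]]
    assert_eq!(
        gather(&[a0, a1], &[(0, 0, 2), (1, 0, 1), (0, 2, 3)]),
        vec![0, 1, 10, 2, 3, 4]
    );
}

Because sorted inputs tend to produce long runs from the same source array, representing the result as a few such triples lets the merge copy whole blocks instead of individual elements.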
From 6271f48e4d8d1bf2b43ef1da81f6d9f681e38d63 Mon Sep 17 00:00:00 2001 From: Yijun Zhao Date: Thu, 28 Sep 2023 11:05:46 +0800 Subject: [PATCH 68/80] Add test for list_nested_decimal (#1572) --- parquet_integration/write_parquet.py | 12 +++++ tests/it/io/parquet/mod.rs | 73 ++++++++++++++++++++++++++++ tests/it/io/parquet/read.rs | 5 ++ 3 files changed, 90 insertions(+) diff --git a/parquet_integration/write_parquet.py b/parquet_integration/write_parquet.py index a7f7560fc5..072b59c775 100644 --- a/parquet_integration/write_parquet.py +++ b/parquet_integration/write_parquet.py @@ -179,6 +179,16 @@ def case_nested() -> Tuple[dict, pa.Schema, str]: ] decimal_nullable = [[Decimal(n) if n is not None else None for n in sublist] if sublist is not None else None for sublist in items_nullable] + decimal_nested = [ + [[Decimal(0), Decimal(1)]], + None, + [[Decimal(2), None], [Decimal(3)]], + [[Decimal(4), Decimal(5)], [Decimal(6)]], + [], + [[Decimal(7)], None, [Decimal(9)]], + [[], [None], None], + [[Decimal(10)]], + ] list_struct_nullable = [ [{"a": "a"}, {"a": "b"}], @@ -227,6 +237,7 @@ def case_nested() -> Tuple[dict, pa.Schema, str]: pa.field("list_decimal", pa.list_(pa.decimal128(9, 0))), pa.field("list_decimal256", pa.list_(pa.decimal256(9, 0))), pa.field("list_nested_i64", pa.list_(pa.list_(pa.int64()))), + pa.field("list_nested_decimal", pa.list_(pa.list_(pa.decimal128(9, 0)))), pa.field("list_nested_inner_required_i64", pa.list_(pa.list_(pa.int64()))), pa.field( "list_nested_inner_required_required_i64", pa.list_(pa.list_(pa.int64())) @@ -258,6 +269,7 @@ def case_nested() -> Tuple[dict, pa.Schema, str]: "list_decimal": decimal_nullable, "list_decimal256": decimal_nullable, "list_nested_i64": items_nested, + "list_nested_decimal": decimal_nested, "list_nested_inner_required_i64": items_required_nested, "list_nested_inner_required_required_i64": items_required_nested_2, "list_struct_nullable": list_struct_nullable, diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs index 4539d21a33..94d6cdf77e 100644 --- a/tests/it/io/parquet/mod.rs +++ b/tests/it/io/parquet/mod.rs @@ -256,6 +256,7 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { Box::new(array) } "list_nested_i64" + | "list_nested_decimal" | "list_nested_inner_required_i64" | "list_nested_inner_required_required_i64" => Box::new(NullArray::new(DataType::Null, 1)), "struct_list_nullable" => pyarrow_nested_nullable("list_utf8"), @@ -389,6 +390,48 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { let array: ListArray = a.into(); Box::new(array) } + "list_nested_decimal" => { + // [ + // [[Decimal(0), Decimal(1)]], + // None, + // [[Decimal(2), None], [Decimal(3)]], + // [[Decimal(4), Decimal(5)], [Decimal(6)]], + // [], + // [[Decimal(7)], None, [Decimal(9)]], + // [[], [None], None], + // [[Decimal(10)]], + // ] + + let data = [ + Some(vec![Some(vec![Some(0), Some(1)])]), + None, + Some(vec![Some(vec![Some(2), None]), Some(vec![Some(3)])]), + Some(vec![Some(vec![Some(4), Some(5)]), Some(vec![Some(6)])]), + Some(vec![]), + Some(vec![Some(vec![Some(7)]), None, Some(vec![Some(9)])]), + Some(vec![Some(vec![]), Some(vec![None]), None]), + Some(vec![Some(vec![Some(10)])]), + ]; + + let inner_array = MutablePrimitiveArray::::from(DataType::Decimal(9, 0)); + let middle_array = MutableListArray::>::new_from( + inner_array.clone(), + ListArray::::default_datatype(inner_array.data_type().clone()), + 0, + ); + let mut outer_array = MutableListArray::< + i32, + MutableListArray>, + >::new_from( + middle_array.clone(), + 
ListArray::::default_datatype(middle_array.data_type().clone()), + 0, + ); + + outer_array.try_extend(data).unwrap(); + let array: ListArray = outer_array.into(); + Box::new(array) + } "list_nested_inner_required_i64" => { let data = [ Some(vec![Some(vec![Some(0), Some(1)])]), @@ -948,6 +991,36 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { ) .boxed(), }, + "list_nested_decimal" => Statistics { + distinct_count: new_list( + new_list(UInt64Array::from([None]).boxed(), true).boxed(), + true, + ) + .boxed(), + null_count: new_list( + new_list(Box::new(UInt64Array::from_slice([7])), true).boxed(), + true, + ) + .boxed(), + min_value: new_list( + new_list( + Box::new(Int128Array::from_slice([0]).to(DataType::Decimal(9, 0))), + true, + ) + .boxed(), + true, + ) + .boxed(), + max_value: new_list( + new_list( + Box::new(Int128Array::from_slice([10]).to(DataType::Decimal(9, 0))), + true, + ) + .boxed(), + true, + ) + .boxed(), + }, "list_nested_inner_required_required_i64" => Statistics { distinct_count: UInt64Array::from([None]).boxed(), null_count: UInt64Array::from([Some(0)]).boxed(), diff --git a/tests/it/io/parquet/read.rs b/tests/it/io/parquet/read.rs index 8f45eb874d..7689f1532f 100644 --- a/tests/it/io/parquet/read.rs +++ b/tests/it/io/parquet/read.rs @@ -339,6 +339,11 @@ fn v2_nested_nested() -> Result<()> { test_pyarrow_integration("list_nested_i64", 2, "nested", false, false, None) } +#[test] +fn v2_nested_nested_decimal() -> Result<()> { + test_pyarrow_integration("list_nested_decimal", 2, "nested", false, false, None) +} + #[test] fn v2_nested_nested_required() -> Result<()> { test_pyarrow_integration( From 63e99ad2828669134fe4ca6f8685c75f986a9732 Mon Sep 17 00:00:00 2001 From: Jk Xu <54522439+Dousir9@users.noreply.github.com> Date: Wed, 4 Oct 2023 09:39:32 +0800 Subject: [PATCH 69/80] Improve bitmap slice unchecked (#1574) --- src/bitmap/immutable.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/bitmap/immutable.rs b/src/bitmap/immutable.rs index 41799e0adb..c453a6f31a 100644 --- a/src/bitmap/immutable.rs +++ b/src/bitmap/immutable.rs @@ -172,11 +172,15 @@ impl Bitmap { /// The caller must ensure that `self.offset + offset + length <= self.len()` #[inline] pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) { - // first guard a no-op slice so that we don't do a bitcount - // if there isn't any data sliced - if !(offset == 0 && length == self.length) { - // count the smallest chunk - if length < self.length / 2 { + // we don't do a bitcount in the following cases: + // 1. if there isn't any data sliced. + // 2. if this [`Bitmap`] is all true or all false. + if !(offset == 0 && length == self.length || self.unset_bits == 0) { + // if `self.unset_bits == self.length` is false, we count the smallest chunk + // and do a bitcount. + if self.unset_bits == self.length { + self.unset_bits = length; + } else if length < self.length / 2 { // count the null values in the slice self.unset_bits = count_zeros(&self.bytes, self.offset + offset, length); } else { @@ -186,9 +190,9 @@ impl Bitmap { let tail_count = count_zeros(&self.bytes, start_end, self.length - length - offset); self.unset_bits -= head_count + tail_count; } - self.offset += offset; - self.length = length; } + self.offset += offset; + self.length = length; } /// Slices `self`, offsetting by `offset` and truncating up to `length` bits. 
From ced09386227974e178ad0deeb57c433229e640c3 Mon Sep 17 00:00:00 2001
From: Paul C
Date: Fri, 6 Oct 2023 19:22:05 -0500
Subject: [PATCH 70/80] fix: fix deserialization of parquets with large string list columns causing stack overflow (#1575)

---
 .../parquet/read/deserialize/binary/nested.rs | 34 +++++++++++--------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/src/io/parquet/read/deserialize/binary/nested.rs b/src/io/parquet/read/deserialize/binary/nested.rs
index 76d58f9c49..7aa4244163 100644
--- a/src/io/parquet/read/deserialize/binary/nested.rs
+++ b/src/io/parquet/read/deserialize/binary/nested.rs
@@ -170,22 +170,26 @@ impl Iterator for NestedIter {
     type Item = Result<(NestedState, Box<dyn Array>)>;
 
     fn next(&mut self) -> Option<Self::Item> {
-        let maybe_state = next(
-            &mut self.iter,
-            &mut self.items,
-            &mut self.dict,
-            &mut self.remaining,
-            &self.init,
-            self.chunk_size,
-            &BinaryDecoder::<O>::default(),
-        );
-        match maybe_state {
-            MaybeNext::Some(Ok((nested, decoded))) => {
-                Some(finish(&self.data_type, decoded.0, decoded.1).map(|array| (nested, array)))
+        loop {
+            let maybe_state = next(
+                &mut self.iter,
+                &mut self.items,
+                &mut self.dict,
+                &mut self.remaining,
+                &self.init,
+                self.chunk_size,
+                &BinaryDecoder::<O>::default(),
+            );
+            match maybe_state {
+                MaybeNext::Some(Ok((nested, decoded))) => {
+                    return Some(
+                        finish(&self.data_type, decoded.0, decoded.1).map(|array| (nested, array)),
+                    )
+                }
+                MaybeNext::Some(Err(e)) => return Some(Err(e)),
+                MaybeNext::None => return None,
+                MaybeNext::More => continue, // Using continue in a loop instead of calling next helps prevent stack overflow.
             }
-            MaybeNext::Some(Err(e)) => Some(Err(e)),
-            MaybeNext::None => None,
-            MaybeNext::More => self.next(),
         }
     }
 }

From 420936ed69205fe34d33babbd0ab04817e623649 Mon Sep 17 00:00:00 2001
From: Ryan Marcus
Date: Fri, 6 Oct 2023 20:24:18 -0400
Subject: [PATCH 71/80] Fixed typo (#1576)

---
 src/scalar/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/scalar/mod.rs b/src/scalar/mod.rs
index e3404e4eaa..aab5ed929f 100644
--- a/src/scalar/mod.rs
+++ b/src/scalar/mod.rs
@@ -31,7 +31,7 @@ mod union;
 pub use union::UnionScalar;
 
 /// Trait object declaring an optional value with a [`DataType`].
-/// This strait is often used in APIs that accept multiple scalar types.
+/// This trait is often used in APIs that accept multiple scalar types.
pub trait Scalar: std::fmt::Debug + Send + Sync + dyn_clone::DynClone + 'static { /// convert itself to fn as_any(&self) -> &dyn Any; From 710d6b3d76ebd968651fb9541815210147221091 Mon Sep 17 00:00:00 2001 From: Yijun Zhao Date: Sat, 7 Oct 2023 09:59:49 +0800 Subject: [PATCH 72/80] fix: fix nested decimal read and write (#1573) --- parquet_integration/write_parquet.py | 20 +- .../deserialize/fixed_size_binary/basic.rs | 8 +- .../deserialize/fixed_size_binary/nested.rs | 2 +- src/io/parquet/write/fixed_len_bytes.rs | 32 +++- src/io/parquet/write/mod.rs | 6 +- tests/it/io/parquet/mod.rs | 164 +++++++++++++++- tests/it/io/parquet/read.rs | 88 ++++++++- tests/it/io/parquet/write.rs | 176 +++++++++++++++++- 8 files changed, 467 insertions(+), 29 deletions(-) diff --git a/parquet_integration/write_parquet.py b/parquet_integration/write_parquet.py index 072b59c775..2e0e4b332b 100644 --- a/parquet_integration/write_parquet.py +++ b/parquet_integration/write_parquet.py @@ -234,8 +234,14 @@ def case_nested() -> Tuple[dict, pa.Schema, str]: pa.field("list_bool", pa.list_(pa.bool_())), pa.field("list_utf8", pa.list_(pa.utf8())), pa.field("list_large_binary", pa.list_(pa.large_binary())), - pa.field("list_decimal", pa.list_(pa.decimal128(9, 0))), - pa.field("list_decimal256", pa.list_(pa.decimal256(9, 0))), + pa.field("list_decimal_9", pa.list_(pa.decimal128(9, 0))), + pa.field("list_decimal_18", pa.list_(pa.decimal128(18, 0))), + pa.field("list_decimal_26", pa.list_(pa.decimal128(26, 0))), + pa.field("list_decimal256_9", pa.list_(pa.decimal256(9, 0))), + pa.field("list_decimal256_18", pa.list_(pa.decimal256(18, 0))), + pa.field("list_decimal256_26", pa.list_(pa.decimal256(26, 0))), + pa.field("list_decimal256_39", pa.list_(pa.decimal256(39, 0))), + pa.field("list_decimal256_76", pa.list_(pa.decimal256(76, 0))), pa.field("list_nested_i64", pa.list_(pa.list_(pa.int64()))), pa.field("list_nested_decimal", pa.list_(pa.list_(pa.decimal128(9, 0)))), pa.field("list_nested_inner_required_i64", pa.list_(pa.list_(pa.int64()))), @@ -266,8 +272,14 @@ def case_nested() -> Tuple[dict, pa.Schema, str]: "list_bool": boolean, "list_utf8": string, "list_large_binary": string, - "list_decimal": decimal_nullable, - "list_decimal256": decimal_nullable, + "list_decimal_9": decimal_nullable, + "list_decimal_18": decimal_nullable, + "list_decimal_26": decimal_nullable, + "list_decimal256_9": decimal_nullable, + "list_decimal256_18": decimal_nullable, + "list_decimal256_26": decimal_nullable, + "list_decimal256_39": decimal_nullable, + "list_decimal256_76": decimal_nullable, "list_nested_i64": items_nested, "list_nested_decimal": decimal_nested, "list_nested_inner_required_i64": items_required_nested, diff --git a/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs b/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs index c77ff5f027..913d1e6be4 100644 --- a/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs +++ b/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs @@ -46,11 +46,11 @@ pub(super) struct Required<'a> { } impl<'a> Required<'a> { - pub(super) fn new(page: &'a DataPage, size: usize) -> Self { - let values = page.buffer(); + pub(super) fn try_new(page: &'a DataPage, size: usize) -> Result { + let (_, _, values) = split_buffer(page)?; assert_eq!(values.len() % size, 0); let values = values.chunks_exact(size); - Self { values } + Ok(Self { values }) } #[inline] @@ -171,7 +171,7 @@ impl<'a> Decoder<'a> for BinaryDecoder { Ok(State::Optional(Optional::try_new(page, self.size)?)) } 
(Encoding::Plain, _, false, false) => { - Ok(State::Required(Required::new(page, self.size))) + Ok(State::Required(Required::try_new(page, self.size)?)) } (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false, false) => { RequiredDictionary::try_new(page, dict).map(State::RequiredDictionary) diff --git a/src/io/parquet/read/deserialize/fixed_size_binary/nested.rs b/src/io/parquet/read/deserialize/fixed_size_binary/nested.rs index 5cef9eabfc..19552447f9 100644 --- a/src/io/parquet/read/deserialize/fixed_size_binary/nested.rs +++ b/src/io/parquet/read/deserialize/fixed_size_binary/nested.rs @@ -60,7 +60,7 @@ impl<'a> NestedDecoder<'a> for BinaryDecoder { Ok(State::Optional(Optional::try_new(page, self.size)?)) } (Encoding::Plain, _, false, false) => { - Ok(State::Required(Required::new(page, self.size))) + Ok(State::Required(Required::try_new(page, self.size)?)) } (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false, false) => { RequiredDictionary::try_new(page, dict).map(State::RequiredDictionary) diff --git a/src/io/parquet/write/fixed_len_bytes.rs b/src/io/parquet/write/fixed_len_bytes.rs index 91b641da17..789b822a73 100644 --- a/src/io/parquet/write/fixed_len_bytes.rs +++ b/src/io/parquet/write/fixed_len_bytes.rs @@ -6,6 +6,7 @@ use parquet2::{ }; use super::{binary::ord_binary, utils, WriteOptions}; +use crate::io::parquet::write::{nested, Nested}; use crate::types::i256; use crate::{ array::{Array, FixedSizeBinaryArray, PrimitiveArray}, @@ -62,7 +63,36 @@ pub fn array_to_page( ) } -pub(super) fn build_statistics( +pub fn nested_array_to_page( + array: &FixedSizeBinaryArray, + options: WriteOptions, + type_: PrimitiveType, + statistics: Option, + nested: &[Nested], +) -> Result { + let is_optional = is_nullable(&type_.field_info); + + let mut buffer = vec![]; + let (repetition_levels_byte_length, definition_levels_byte_length) = + nested::write_rep_and_def(options.version, nested, &mut buffer)?; + + encode_plain(array, is_optional, &mut buffer); + + utils::build_plain_page( + buffer, + nested::num_values(nested), + nested[0].len(), + array.null_count(), + repetition_levels_byte_length, + definition_levels_byte_length, + statistics.map(|x| serialize_statistics(&x)), + type_, + options, + Encoding::Plain, + ) +} + +pub fn build_statistics( array: &FixedSizeBinaryArray, primitive_type: PrimitiveType, ) -> FixedLenStatistics { diff --git a/src/io/parquet/write/mod.rs b/src/io/parquet/write/mod.rs index 7889ea04fa..d4134f27df 100644 --- a/src/io/parquet/write/mod.rs +++ b/src/io/parquet/write/mod.rs @@ -727,7 +727,7 @@ fn array_to_page_nested( values.into(), array.validity().cloned(), ); - fixed_len_bytes::array_to_page(&array, options, type_, statistics) + fixed_len_bytes::nested_array_to_page(&array, options, type_, statistics, nested) } } Decimal256(precision, _) => { @@ -782,7 +782,7 @@ fn array_to_page_nested( values.into(), array.validity().cloned(), ); - fixed_len_bytes::array_to_page(&array, options, type_, statistics) + fixed_len_bytes::nested_array_to_page(&array, options, type_, statistics, nested) } else { let size = 32; let array = array @@ -807,7 +807,7 @@ fn array_to_page_nested( array.validity().cloned(), ); - fixed_len_bytes::array_to_page(&array, options, type_, statistics) + fixed_len_bytes::nested_array_to_page(&array, options, type_, statistics, nested) } } other => Err(Error::NotYetImplemented(format!( diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs index 94d6cdf77e..4803cc9c52 100644 --- 
a/tests/it/io/parquet/mod.rs +++ b/tests/it/io/parquet/mod.rs @@ -240,14 +240,28 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { Some(b"bbb".to_vec()), Some(b"".to_vec()), ])), - "list_decimal" => { + "list_decimal_9" => { let values = i64_values .iter() .map(|x| x.map(|x| x as i128)) .collect::>(); Box::new(PrimitiveArray::::from(values).to(DataType::Decimal(9, 0))) } - "list_decimal256" => { + "list_decimal_18" => { + let values = i64_values + .iter() + .map(|x| x.map(|x| x as i128)) + .collect::>(); + Box::new(PrimitiveArray::::from(values).to(DataType::Decimal(18, 0))) + } + "list_decimal_26" => { + let values = i64_values + .iter() + .map(|x| x.map(|x| x as i128)) + .collect::>(); + Box::new(PrimitiveArray::::from(values).to(DataType::Decimal(26, 0))) + } + "list_decimal256_9" => { let values = i64_values .iter() .map(|x| x.map(|x| i256(x.as_i256()))) @@ -255,6 +269,38 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { let array = PrimitiveArray::::from(values).to(DataType::Decimal256(9, 0)); Box::new(array) } + "list_decimal256_18" => { + let values = i64_values + .iter() + .map(|x| x.map(|x| i256(x.as_i256()))) + .collect::>(); + let array = PrimitiveArray::::from(values).to(DataType::Decimal256(18, 0)); + Box::new(array) + } + "list_decimal256_26" => { + let values = i64_values + .iter() + .map(|x| x.map(|x| i256(x.as_i256()))) + .collect::>(); + let array = PrimitiveArray::::from(values).to(DataType::Decimal256(26, 0)); + Box::new(array) + } + "list_decimal256_39" => { + let values = i64_values + .iter() + .map(|x| x.map(|x| i256(x.as_i256()))) + .collect::>(); + let array = PrimitiveArray::::from(values).to(DataType::Decimal256(39, 0)); + Box::new(array) + } + "list_decimal256_76" => { + let values = i64_values + .iter() + .map(|x| x.map(|x| i256(x.as_i256()))) + .collect::>(); + let array = PrimitiveArray::::from(values).to(DataType::Decimal256(76, 0)); + Box::new(array) + } "list_nested_i64" | "list_nested_decimal" | "list_nested_inner_required_i64" @@ -479,8 +525,14 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { "list_bool" => Field::new("item", DataType::Boolean, true), "list_utf8" => Field::new("item", DataType::Utf8, true), "list_large_binary" => Field::new("item", DataType::LargeBinary, true), - "list_decimal" => Field::new("item", DataType::Decimal(9, 0), true), - "list_decimal256" => Field::new("item", DataType::Decimal256(9, 0), true), + "list_decimal_9" => Field::new("item", DataType::Decimal(9, 0), true), + "list_decimal_18" => Field::new("item", DataType::Decimal(18, 0), true), + "list_decimal_26" => Field::new("item", DataType::Decimal(26, 0), true), + "list_decimal256_9" => Field::new("item", DataType::Decimal256(9, 0), true), + "list_decimal256_18" => Field::new("item", DataType::Decimal256(18, 0), true), + "list_decimal256_26" => Field::new("item", DataType::Decimal256(26, 0), true), + "list_decimal256_39" => Field::new("item", DataType::Decimal256(39, 0), true), + "list_decimal256_76" => Field::new("item", DataType::Decimal256(76, 0), true), "list_struct_nullable" => Field::new("item", values.data_type().clone(), true), "list_struct_list_nullable" => Field::new("item", values.data_type().clone(), true), other => unreachable!("{}", other), @@ -927,7 +979,7 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { min_value: new_list(Box::new(BinaryArray::::from_slice([b""])), true).boxed(), max_value: new_list(Box::new(BinaryArray::::from_slice([b"ccc"])), true).boxed(), }, - "list_decimal" => Statistics { + 
"list_decimal_9" => Statistics { distinct_count: new_list(UInt64Array::from([None]).boxed(), true).boxed(), null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), min_value: new_list( @@ -941,7 +993,35 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { ) .boxed(), }, - "list_decimal256" => Statistics { + "list_decimal_18" => Statistics { + distinct_count: new_list(UInt64Array::from([None]).boxed(), true).boxed(), + null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), + min_value: new_list( + Box::new(Int128Array::from_slice([0]).to(DataType::Decimal(18, 0))), + true, + ) + .boxed(), + max_value: new_list( + Box::new(Int128Array::from_slice([10]).to(DataType::Decimal(18, 0))), + true, + ) + .boxed(), + }, + "list_decimal_26" => Statistics { + distinct_count: new_list(UInt64Array::from([None]).boxed(), true).boxed(), + null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), + min_value: new_list( + Box::new(Int128Array::from_slice([0]).to(DataType::Decimal(26, 0))), + true, + ) + .boxed(), + max_value: new_list( + Box::new(Int128Array::from_slice([10]).to(DataType::Decimal(26, 0))), + true, + ) + .boxed(), + }, + "list_decimal256_9" => Statistics { distinct_count: new_list(UInt64Array::from([None]).boxed(), true).boxed(), null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), min_value: new_list( @@ -959,6 +1039,78 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { ) .boxed(), }, + "list_decimal256_18" => Statistics { + distinct_count: new_list(UInt64Array::from([None]).boxed(), true).boxed(), + null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), + min_value: new_list( + Box::new( + Int256Array::from_slice([i256(0.as_i256())]).to(DataType::Decimal256(18, 0)), + ), + true, + ) + .boxed(), + max_value: new_list( + Box::new( + Int256Array::from_slice([i256(10.as_i256())]).to(DataType::Decimal256(18, 0)), + ), + true, + ) + .boxed(), + }, + "list_decimal256_26" => Statistics { + distinct_count: new_list(UInt64Array::from([None]).boxed(), true).boxed(), + null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), + min_value: new_list( + Box::new( + Int256Array::from_slice([i256(0.as_i256())]).to(DataType::Decimal256(26, 0)), + ), + true, + ) + .boxed(), + max_value: new_list( + Box::new( + Int256Array::from_slice([i256(10.as_i256())]).to(DataType::Decimal256(26, 0)), + ), + true, + ) + .boxed(), + }, + "list_decimal256_39" => Statistics { + distinct_count: new_list(UInt64Array::from([None]).boxed(), true).boxed(), + null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), + min_value: new_list( + Box::new( + Int256Array::from_slice([i256(0.as_i256())]).to(DataType::Decimal256(39, 0)), + ), + true, + ) + .boxed(), + max_value: new_list( + Box::new( + Int256Array::from_slice([i256(10.as_i256())]).to(DataType::Decimal256(39, 0)), + ), + true, + ) + .boxed(), + }, + "list_decimal256_76" => Statistics { + distinct_count: new_list(UInt64Array::from([None]).boxed(), true).boxed(), + null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), + min_value: new_list( + Box::new( + Int256Array::from_slice([i256(0.as_i256())]).to(DataType::Decimal256(76, 0)), + ), + true, + ) + .boxed(), + max_value: new_list( + Box::new( + Int256Array::from_slice([i256(10.as_i256())]).to(DataType::Decimal256(76, 0)), + ), + true, + ) + .boxed(), + }, "list_int64" => Statistics { distinct_count: 
new_list(UInt64Array::from([None]).boxed(), true).boxed(), null_count: new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed(), diff --git a/tests/it/io/parquet/read.rs b/tests/it/io/parquet/read.rs index 7689f1532f..12512116f4 100644 --- a/tests/it/io/parquet/read.rs +++ b/tests/it/io/parquet/read.rs @@ -62,8 +62,14 @@ fn test_pyarrow_integration( "list_nested_i64", "list_utf8", "list_bool", - "list_decimal", - "list_decimal256", + "list_decimal_9", + "list_decimal_18", + "list_decimal_26", + "list_decimal256_9", + "list_decimal256_18", + "list_decimal256_26", + "list_decimal256_39", + "list_decimal256_76", "list_nested_inner_required_required_i64", "list_nested_inner_required_i64", // pyarrow counts null struct items as nulls @@ -325,13 +331,83 @@ fn v1_nested_large_binary() -> Result<()> { } #[test] -fn v2_nested_decimal_nullable() -> Result<()> { - test_pyarrow_integration("list_decimal", 2, "nested", false, false, None) +fn v1_nested_decimal_9_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal_9", 1, "nested", false, false, None) } #[test] -fn v2_nested_decimal256_nullable() -> Result<()> { - test_pyarrow_integration("list_decimal256", 2, "nested", false, false, None) +fn v1_nested_decimal_18_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal_18", 1, "nested", false, false, None) +} + +#[test] +fn v1_nested_decimal_26_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal_26", 1, "nested", false, false, None) +} + +#[test] +fn v2_nested_decimal_9_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal_9", 2, "nested", false, false, None) +} + +#[test] +fn v2_nested_decimal_18_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal_18", 2, "nested", false, false, None) +} + +#[test] +fn v2_nested_decimal_26_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal_26", 2, "nested", false, false, None) +} + +#[test] +fn v1_nested_decimal256_9_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal256_9", 1, "nested", false, false, None) +} + +#[test] +fn v1_nested_decimal256_18_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal256_18", 1, "nested", false, false, None) +} + +#[test] +fn v1_nested_decimal256_26_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal256_26", 1, "nested", false, false, None) +} + +#[test] +fn v1_nested_decimal256_39_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal256_39", 1, "nested", false, false, None) +} + +#[test] +fn v1_nested_decimal256_76_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal256_76", 1, "nested", false, false, None) +} + +#[test] +fn v2_nested_decimal256_9_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal256_9", 2, "nested", false, false, None) +} + +#[test] +fn v2_nested_decimal256_18_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal256_18", 2, "nested", false, false, None) +} + +#[test] +fn v2_nested_decimal256_26_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal256_26", 2, "nested", false, false, None) +} + +#[test] +fn v2_nested_decimal256_39_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal256_39", 2, "nested", false, false, None) +} + +#[test] +fn v2_nested_decimal256_76_nullable() -> Result<()> { + test_pyarrow_integration("list_decimal256_76", 2, "nested", false, false, None) } #[test] diff --git a/tests/it/io/parquet/write.rs b/tests/it/io/parquet/write.rs index 5fda011374..dee5b8e253 100644 --- 
a/tests/it/io/parquet/write.rs +++ b/tests/it/io/parquet/write.rs @@ -405,9 +405,9 @@ fn list_struct_nullable() -> Result<()> { } #[test] -fn list_decimal_nullable() -> Result<()> { +fn list_decimal_9_nullable_v1() -> Result<()> { round_trip_opt_stats( - "list_decimal", + "list_decimal_9", "nested", Version::V1, CompressionOptions::Uncompressed, @@ -417,9 +417,9 @@ fn list_decimal_nullable() -> Result<()> { } #[test] -fn list_decimal256_nullable() -> Result<()> { +fn list_decimal_18_nullable_v1() -> Result<()> { round_trip_opt_stats( - "list_decimal256", + "list_decimal_18", "nested", Version::V1, CompressionOptions::Uncompressed, @@ -428,6 +428,174 @@ fn list_decimal256_nullable() -> Result<()> { ) } +#[test] +fn list_decimal_26_nullable_v1() -> Result<()> { + round_trip_opt_stats( + "list_decimal_26", + "nested", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal_9_nullable_v2() -> Result<()> { + round_trip_opt_stats( + "list_decimal_9", + "nested", + Version::V2, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal_18_nullable_v2() -> Result<()> { + round_trip_opt_stats( + "list_decimal_18", + "nested", + Version::V2, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal_26_nullable_v2() -> Result<()> { + round_trip_opt_stats( + "list_decimal_26", + "nested", + Version::V2, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal256_9_nullable_v1() -> Result<()> { + round_trip_opt_stats( + "list_decimal256_9", + "nested", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal256_18_nullable_v1() -> Result<()> { + round_trip_opt_stats( + "list_decimal256_18", + "nested", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal256_26_nullable_v1() -> Result<()> { + round_trip_opt_stats( + "list_decimal256_26", + "nested", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal256_39_nullable_v1() -> Result<()> { + round_trip_opt_stats( + "list_decimal256_39", + "nested", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal256_76_nullable_v1() -> Result<()> { + round_trip_opt_stats( + "list_decimal256_76", + "nested", + Version::V1, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal256_9_nullable_v2() -> Result<()> { + round_trip_opt_stats( + "list_decimal256_9", + "nested", + Version::V2, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal256_18_nullable_v2() -> Result<()> { + round_trip_opt_stats( + "list_decimal256_18", + "nested", + Version::V2, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal256_26_nullable_v2() -> Result<()> { + round_trip_opt_stats( + "list_decimal256_26", + "nested", + Version::V2, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal256_39_nullable_v2() -> Result<()> { + round_trip_opt_stats( + "list_decimal256_39", + "nested", + Version::V2, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + +#[test] +fn list_decimal256_76_nullable_v2() -> Result<()> { + 
round_trip_opt_stats( + "list_decimal256_76", + "nested", + Version::V2, + CompressionOptions::Uncompressed, + vec![Encoding::Plain], + true, + ) +} + #[test] fn v1_nested_struct_list_nullable() -> Result<()> { round_trip_opt_stats( From dd80c891850213104c8c0d11b76b56401cb1ce52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Sat, 7 Oct 2023 04:17:22 +0200 Subject: [PATCH 73/80] Fix the inferred nullability when converting a nested parquet schema to arrow (#1565) --- src/io/parquet/read/schema/convert.rs | 74 +++++++++++++++++++++------ 1 file changed, 58 insertions(+), 16 deletions(-) diff --git a/src/io/parquet/read/schema/convert.rs b/src/io/parquet/read/schema/convert.rs index dea14ca86b..1ec43acd4e 100644 --- a/src/io/parquet/read/schema/convert.rs +++ b/src/io/parquet/read/schema/convert.rs @@ -299,7 +299,7 @@ fn to_group_type( pub(crate) fn is_nullable(field_info: &FieldInfo) -> bool { match field_info.repetition { Repetition::Optional => true, - Repetition::Repeated => true, + Repetition::Repeated => false, Repetition::Required => false, } } @@ -353,12 +353,12 @@ fn to_list( let field = fields.first().unwrap(); ( &field.get_field_info().name, - field.get_field_info().repetition != Repetition::Required, + field.get_field_info().repetition == Repetition::Optional, ) } _ => ( &item.get_field_info().name, - item.get_field_info().repetition != Repetition::Required, + item.get_field_info().repetition == Repetition::Optional, ), }; @@ -611,7 +611,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("element", DataType::Utf8, true))), + DataType::List(Box::new(Field::new("element", DataType::Utf8, false))), true, )); } @@ -623,7 +623,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("element", DataType::Int32, true))), + DataType::List(Box::new(Field::new("element", DataType::Int32, false))), true, )); } @@ -642,7 +642,7 @@ mod tests { ]); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("element", arrow_struct, true))), + DataType::List(Box::new(Field::new("element", arrow_struct, false))), true, )); } @@ -658,7 +658,7 @@ mod tests { let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("array", arrow_struct, true))), + DataType::List(Box::new(Field::new("array", arrow_struct, false))), true, )); } @@ -674,7 +674,7 @@ mod tests { let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("my_list_tuple", arrow_struct, true))), + DataType::List(Box::new(Field::new("my_list_tuple", arrow_struct, false))), true, )); } @@ -684,8 +684,50 @@ mod tests { { arrow_fields.push(Field::new( "name", - DataType::List(Box::new(Field::new("name", DataType::Int32, true))), - true, + DataType::List(Box::new(Field::new("name", DataType::Int32, false))), + false, + )); + } + + let parquet_schema = SchemaDescriptor::try_from_message(message_type)?; + let fields = parquet_to_arrow_schema(parquet_schema.fields()); + + assert_eq!(arrow_fields, fields); + Ok(()) + } + + #[test] + fn test_parquet_list_with_struct() -> Result<()> { + let mut arrow_fields = Vec::new(); + + let message_type = " + message eventlog { + REQUIRED group events (LIST) { + REPEATED group array { + REQUIRED BYTE_ARRAY event_name (STRING); + REQUIRED INT64 event_time (TIMESTAMP(MILLIS,true)); + 
            }
+        }
+    }
+    ";
+
+    {
+        let struct_fields = vec![
+            Field::new("event_name", DataType::Utf8, false),
+            Field::new(
+                "event_time",
+                DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
+                false,
+            ),
+        ];
+        arrow_fields.push(Field::new(
+            "events",
+            DataType::List(Box::new(Field::new(
+                "array",
+                DataType::Struct(struct_fields),
+                false,
+            ))),
+            false,
        ));
    }

    let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;
    let fields = parquet_to_arrow_schema(parquet_schema.fields());

    assert_eq!(arrow_fields, fields);
    Ok(())
}
@@ -812,9 +854,9 @@ mod tests {
            DataType::List(Box::new(Field::new(
                "innerGroup",
                DataType::Struct(vec![Field::new("leaf3", DataType::Int32, true)]),
-                true,
+                false,
            ))),
-            true,
+            false,
        );

        let outer_group_list = Field::new(
            "my_list_outer",
            DataType::List(Box::new(Field::new(
                "my_list_nested",
                DataType::Struct(vec![
                    Field::new("leaf2", DataType::Int32, true),
                    inner_group_list,
                ]),
-                true,
+                false,
            ))),
-            true,
+            false,
        );
        arrow_fields.push(outer_group_list);
    }
@@ -888,8 +930,8 @@ mod tests {
            Field::new("string", DataType::Utf8, true),
            Field::new(
                "bools",
-                DataType::List(Box::new(Field::new("bools", DataType::Boolean, true))),
-                true,
+                DataType::List(Box::new(Field::new("bools", DataType::Boolean, false))),
+                false,
            ),
            Field::new("date", DataType::Date32, true),
            Field::new("time_milli", DataType::Time32(TimeUnit::Millisecond), true),

From 6a4b53169a48cbd234cecde6ab6a98f84146fca2 Mon Sep 17 00:00:00 2001
From: Jk Xu <54522439+Dousir9@users.noreply.github.com>
Date: Fri, 13 Oct 2023 21:40:29 +0800
Subject: [PATCH 74/80] add new_constant for Bitmap (#1579)

---
 src/bitmap/immutable.rs | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/src/bitmap/immutable.rs b/src/bitmap/immutable.rs
index c453a6f31a..57502e911e 100644
--- a/src/bitmap/immutable.rs
+++ b/src/bitmap/immutable.rs
@@ -283,6 +283,15 @@ impl Bitmap {
         }
     }
 
+    /// Initializes a new [`Bitmap`] filled with set/unset values.
+    #[inline]
+    pub fn new_constant(value: bool, length: usize) -> Self {
+        match value {
+            true => Self::new_trued(length),
+            false => Self::new_zeroed(length),
+        }
+    }
+
     /// Initializes a new [`Bitmap`] filled with unset values.
     #[inline]
     pub fn new_zeroed(length: usize) -> Self {
@@ -292,6 +301,15 @@ impl Bitmap {
         unsafe { Bitmap::from_inner_unchecked(Arc::new(bytes.into()), 0, length, length) }
     }
 
+    /// Initializes a new [`Bitmap`] filled with set values.
+    #[inline]
+    pub fn new_trued(length: usize) -> Self {
+        // just set each byte to u8::MAX
+        // we will not access data with index >= length
+        let bytes = vec![0b11111111u8; length.saturating_add(7) / 8];
+        unsafe { Bitmap::from_inner_unchecked(Arc::new(bytes.into()), 0, length, length) }
+    }
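+
+    // e.g. `Bitmap::new_constant(true, 9)` yields the same bits as `Bitmap::from([true; 9])`,
+    // but allocates its (9 + 7) / 8 = 2 backing bytes in one shot instead of pushing bit-by-bit
+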
     /// Counts the nulls (unset bits) starting from `offset` bits and for `length` bits.
     #[inline]
     pub fn null_count_range(&self, offset: usize, length: usize) -> usize {

From 3c61372d7ac76bba149316b1fbad6e981752e502 Mon Sep 17 00:00:00 2001
From: Jk Xu <54522439+Dousir9@users.noreply.github.com>
Date: Wed, 18 Oct 2023 15:26:47 +0800
Subject: [PATCH 75/80] fix bitmap new_trued (#1580)

---
 src/bitmap/immutable.rs      |  2 +-
 tests/it/bitmap/immutable.rs | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/bitmap/immutable.rs b/src/bitmap/immutable.rs
index 57502e911e..6883d3312f 100644
--- a/src/bitmap/immutable.rs
+++ b/src/bitmap/immutable.rs
@@ -307,7 +307,7 @@ impl Bitmap {
         // just set each byte to u8::MAX
         // we will not access data with index >= length
         let bytes = vec![0b11111111u8; length.saturating_add(7) / 8];
-        unsafe { Bitmap::from_inner_unchecked(Arc::new(bytes.into()), 0, length, length) }
+        unsafe { Bitmap::from_inner_unchecked(Arc::new(bytes.into()), 0, length, 0) }
     }
 
     /// Counts the nulls (unset bits) starting from `offset` bits and for `length` bits.
diff --git a/tests/it/bitmap/immutable.rs b/tests/it/bitmap/immutable.rs
index 5e6157413e..cc003009e0 100644
--- a/tests/it/bitmap/immutable.rs
+++ b/tests/it/bitmap/immutable.rs
@@ -32,6 +32,25 @@ fn as_slice_offset_middle() {
     assert_eq!(length, 5);
 }
 
+#[test]
+fn new_constant() {
+    let b = Bitmap::new_constant(true, 9);
+    let (slice, offset, length) = b.as_slice();
+    assert_eq!(slice[0], 0b11111111);
+    assert!((slice[1] & 0b00000001) > 0);
+    assert_eq!(offset, 0);
+    assert_eq!(length, 9);
+    assert_eq!(b.unset_bits(), 0);
+
+    let b = Bitmap::new_constant(false, 9);
+    let (slice, offset, length) = b.as_slice();
+    assert_eq!(slice[0], 0b00000000);
+    assert!((slice[1] & 0b00000001) == 0);
+    assert_eq!(offset, 0);
+    assert_eq!(length, 9);
+    assert_eq!(b.unset_bits(), 9);
+}
+
 #[test]
 fn debug() {
     let b = Bitmap::from([true, true, false, true, true, true, true, true, true]);

From 9a26422d00b83c65245f75e02eb436dedd91b5b8 Mon Sep 17 00:00:00 2001
From: sundyli <543950155@qq.com>
Date: Thu, 19 Oct 2023 17:19:15 -0700
Subject: [PATCH 76/80] chore: add max bytes_estimate to reserve the capacity
 of binary (#1581)

---
 src/io/parquet/read/deserialize/binary/utils.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/io/parquet/read/deserialize/binary/utils.rs b/src/io/parquet/read/deserialize/binary/utils.rs
index ec514766fa..a48063c56e 100644
--- a/src/io/parquet/read/deserialize/binary/utils.rs
+++ b/src/io/parquet/read/deserialize/binary/utils.rs
@@ -48,7 +48,8 @@ impl<O: Offset> Binary<O> {
         if self.offsets.len_proxy() == 100 && self.offsets.capacity() > 100 {
             let bytes_per_row = self.values.len() / 100 + 1;
             let bytes_estimate = bytes_per_row * self.offsets.capacity();
-            if bytes_estimate > self.values.capacity() {
+
+            if bytes_estimate > self.values.capacity() && bytes_estimate < 10 * 1024 * 1024 {
                 self.values.reserve(bytes_estimate - self.values.capacity());
             }
         }

From 346c866c4dbfd9d9517148fd6d18dd2f17b730d1 Mon Sep 17 00:00:00 2001
From: Ryan Marcus
Date: Sat, 21 Oct 2023 20:09:03 -0400
Subject: [PATCH 77/80] Add a "contains" fast-path to `like_utf8_scalar` (#1582)

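The fast-path only kicks in for patterns of the form `%needle%` where the inner
`needle` contains no LIKE metacharacters and the trailing `%` is not escaped;
everything else still goes through the regex path. A condensed sketch of the
predicate and searcher (with `is_like_pattern`, which matches `%`/`_`, inlined
here for illustration):

    let rhs = "%abba%";
    let inner = &rhs[1..rhs.len() - 1];
    if rhs.starts_with('%')
        && rhs.ends_with('%')
        && !rhs.ends_with("\\%")
        && !inner.contains(|c: char| c == '%' || c == '_')
    {
        // one pre-built searcher, reused for every row of the column
        let finder = memchr::memmem::Finder::new(inner);
        assert!(finder.find(b"xxabbayy").is_some());
        assert!(finder.find(b"boat").is_none());
    }
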
---
 Cargo.toml               | 10 +++++++++-
 benches/like_kernels.rs  | 22 ++++++++++++++++++++++
 src/compute/like.rs      | 11 +++++++++++
 tests/it/compute/like.rs |  4 ++++
 4 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 benches/like_kernels.rs

diff --git a/Cargo.toml b/Cargo.toml
index 1bb20a6955..50dcea2e51 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -100,6 +100,9 @@ odbc-api = { version = "0.36", optional = true }
 # Faster hashing
 ahash = "0.8"
+# For `LIKE` matching "contains" fast-path
+memchr = { version = "2.6", optional = true }
+
 # Support conversion to/from arrow-rs
 arrow-buffer = { version = ">=40", optional = true }
 arrow-schema = { version = ">=40", optional = true }
@@ -237,7 +240,7 @@ compute_filter = []
 compute_hash = ["multiversion"]
 compute_if_then_else = []
 compute_length = []
-compute_like = ["regex", "regex-syntax"]
+compute_like = ["regex", "regex-syntax", "dep:memchr"]
 compute_limit = []
 compute_merge_sort = ["itertools", "compute_sort"]
 compute_nullif = ["compute_comparison"]
@@ -394,3 +397,8 @@ harness = false
 [[bench]]
 name = "assign_ops"
 harness = false
+
+[[bench]]
+name = "like_kernels"
+harness = false
+
diff --git a/benches/like_kernels.rs b/benches/like_kernels.rs
new file mode 100644
index 0000000000..24f700244c
--- /dev/null
+++ b/benches/like_kernels.rs
@@ -0,0 +1,22 @@
+use arrow2::util::bench_util::create_string_array;
+use criterion::{criterion_group, criterion_main, Criterion};
+
+use arrow2::array::*;
+use arrow2::compute::like::like_utf8_scalar;
+
+fn bench_like(array: &Utf8Array<i32>, pattern: &str) {
+    criterion::black_box(like_utf8_scalar(array, pattern).unwrap());
+}
+
+fn add_benchmark(c: &mut Criterion) {
+    for size_log2 in 16..21_u32 {
+        let size = 2_usize.pow(size_log2);
+        let array = create_string_array::<i32>(100, size, 0.0, 0);
+        c.bench_function(&format!("LIKE length = 2^{}", size_log2), |b| {
+            b.iter(|| bench_like(&array, "%abba%"))
+        });
+    }
+}
+
+criterion_group!(benches, add_benchmark);
+criterion_main!(benches);
diff --git a/src/compute/like.rs b/src/compute/like.rs
index 98c1ea92f2..d52e9c5e9f 100644
--- a/src/compute/like.rs
+++ b/src/compute/like.rs
@@ -152,6 +152,17 @@ fn a_like_utf8_scalar<O: Offset, F: Fn(bool) -> bool>(
         // fast path, can use ends_with
         let ends_with = &rhs[1..];
         Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x.ends_with(ends_with))))
+    } else if rhs.starts_with('%')
+        && rhs.ends_with('%')
+        && !rhs.ends_with("\\%")
+        && !rhs[1..rhs.len() - 1].contains(is_like_pattern)
+    {
+        let needle = &rhs[1..rhs.len() - 1];
+        let finder = memchr::memmem::Finder::new(needle);
+        Bitmap::from_trusted_len_iter(
+            lhs.values_iter()
+                .map(|x| op(finder.find(x.as_bytes()).is_some())),
+        )
     } else {
         let re_pattern = replace_pattern(rhs);
         let re = Regex::new(&format!("^{re_pattern}$")).map_err(|e| {
diff --git a/tests/it/compute/like.rs b/tests/it/compute/like.rs
index c7026be7ca..8b99beb081 100644
--- a/tests/it/compute/like.rs
+++ b/tests/it/compute/like.rs
@@ -58,6 +58,10 @@ fn test_like_utf8_scalar() -> Result<()> {
     let result = like_utf8_scalar(&array, "A\\_row").unwrap();
     assert_eq!(result, BooleanArray::from_slice([true, false]));
 
+    let array = Utf8Array::<i32>::from_slice(["Arrow", "Arrow", "row your", "boat"]);
+    let result = like_utf8_scalar(&array, "%row%").unwrap();
+    assert_eq!(result, BooleanArray::from_slice([true, true, true, false]));
+
     Ok(())
 }

From 45313f7e1af6e164a7fd45940db2611d81ddeb1d Mon Sep 17 00:00:00 2001
From: sundyli <543950155@qq.com>
Date: Tue, 24 Oct 2023 00:49:14 -0700
Subject: [PATCH 78/80] bump chrono to 0.4.31 (#1584)

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index 50dcea2e51..5deab656f5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,7 +20,7 @@ either = "1.9"
 num-traits = "0.2"
 dyn-clone = "1"
 bytemuck = { version = "1", features = ["derive"] }
-chrono = { version = "0.4", default_features = false, features = ["std"] }
+chrono = { version = "0.4.31", default_features = false, features = ["std"] }
 
 # for decimal i256
 ethnum = "1"

From b0734542c2fef5d2d0c7b6ffce5d094de371168a Mon Sep 17 00:00:00 2001
From: baishen
Date: Tue, 24 Oct 2023 15:54:58 +0800
Subject: [PATCH 79/80] feat: Add `nested_column_iter_to_arrays` to deserialize
 inner columns (#1583)

---
 src/io/parquet/read/deserialize/mod.rs | 19 ++++++
 src/io/parquet/read/mod.rs             |  2 +-
 tests/it/io/parquet/deserialize.rs     | 85 ++++++++++++++++++++++++++
 tests/it/io/parquet/mod.rs             |  1 +
 4 files changed, 106 insertions(+), 1 deletion(-)
 create mode 100644 tests/it/io/parquet/deserialize.rs

diff --git a/src/io/parquet/read/deserialize/mod.rs b/src/io/parquet/read/deserialize/mod.rs
index 8dd55bb877..1079e577a8 100644
--- a/src/io/parquet/read/deserialize/mod.rs
+++ b/src/io/parquet/read/deserialize/mod.rs
@@ -214,3 +214,22 @@ where
         .map(|x| x.map(|x| x.1)),
     ))
 }
+
+/// Basically the same as `column_iter_to_arrays`, with the addition of the `init` parameter
+/// to read the inner columns of the nested type directly, instead of reading the entire nested type.
+pub fn nested_column_iter_to_arrays<'a, I: 'a>(
+    columns: Vec<I>,
+    types: Vec<&PrimitiveType>,
+    field: Field,
+    init: Vec<InitNested>,
+    chunk_size: Option<usize>,
+    num_rows: usize,
+) -> Result<ArrayIter<'a>>
+where
+    I: Pages,
+{
+    Ok(Box::new(
+        nested::columns_to_iter_recursive(columns, types, field, init, num_rows, chunk_size)?
+            .map(|x| x.map(|x| x.1)),
+    ))
+}
diff --git a/src/io/parquet/read/mod.rs b/src/io/parquet/read/mod.rs
index baaffd6d44..ea2b2f46d4 100644
--- a/src/io/parquet/read/mod.rs
+++ b/src/io/parquet/read/mod.rs
@@ -37,7 +37,7 @@ use crate::{array::Array, error::Result};
 use crate::types::{i256, NativeType};
 pub use deserialize::{
     column_iter_to_arrays, create_list, create_map, get_page_iterator, init_nested, n_columns,
-    InitNested, NestedArrayIter, NestedState, StructIterator,
+    nested_column_iter_to_arrays, InitNested, NestedArrayIter, NestedState, StructIterator,
 };
 pub use file::{FileReader, RowGroupReader};
 pub use row_group::*;
diff --git a/tests/it/io/parquet/deserialize.rs b/tests/it/io/parquet/deserialize.rs
new file mode 100644
index 0000000000..3ea1c2846e
--- /dev/null
+++ b/tests/it/io/parquet/deserialize.rs
@@ -0,0 +1,85 @@
+use std::fs::File;
+
+use arrow2::{
+    array::StructArray,
+    datatypes::DataType,
+    error::Result,
+    io::parquet::read::{
+        infer_schema, n_columns, nested_column_iter_to_arrays, read_columns, read_metadata,
+        to_deserializer, BasicDecompressor, InitNested, PageReader,
+    },
+};
+
+#[test]
+fn test_deserialize_nested_column() -> Result<()> {
+    let path = "testing/parquet-testing/data/nested_structs.rust.parquet";
+    let mut reader = File::open(path).unwrap();
+
+    let metadata = read_metadata(&mut reader)?;
+    let schema = infer_schema(&metadata)?;
+
+    let num_rows = metadata.num_rows;
+    let row_group = metadata.row_groups[0].clone();
+
+    let field_columns = schema
+        .fields
+        .iter()
+        .map(|field| read_columns(&mut reader, row_group.columns(), &field.name))
+        .collect::<Result<Vec<_>>>()?;
+
+    let fields = schema.fields.clone();
+    for (mut columns, field) in field_columns.into_iter().zip(fields.iter()) {
+        if let DataType::Struct(inner_fields) = &field.data_type {
+            let mut array_iter =
+                to_deserializer(columns.clone(), field.clone(), num_rows, None, None)?;
+            let array = array_iter.next().transpose()?.unwrap();
+            let expected_array = array
+                .as_any()
+                .downcast_ref::<StructArray>()
+                .unwrap()
+                .clone();
+
+            // deserialize inner values of struct fields.
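+            // each inner field owns `n_columns(...)` parquet leaf columns: drain them off the
+            // front of `columns`, wrap each chunk in a `PageReader` + `BasicDecompressor`, and
+            // hand the pairs to `nested_column_iter_to_arrays` with the parent struct as `init`.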
+            let init = vec![InitNested::Struct(field.is_nullable)];
+            let mut values = Vec::with_capacity(inner_fields.len());
+            for inner_field in inner_fields {
+                let n = n_columns(&inner_field.data_type);
+                let inner_columns: Vec<_> = columns.drain(0..n).collect();
+
+                let (nested_columns, types): (Vec<_>, Vec<_>) = inner_columns
+                    .into_iter()
+                    .map(|(column_meta, chunk)| {
+                        let len = chunk.len();
+                        let pages = PageReader::new(
+                            std::io::Cursor::new(chunk),
+                            column_meta,
+                            std::sync::Arc::new(|_, _| true),
+                            vec![],
+                            len * 2 + 1024,
+                        );
+                        (
+                            BasicDecompressor::new(pages, vec![]),
+                            &column_meta.descriptor().descriptor.primitive_type,
+                        )
+                    })
+                    .unzip();
+
+                let mut inner_array_iter = nested_column_iter_to_arrays(
+                    nested_columns,
+                    types,
+                    inner_field.clone(),
+                    init.clone(),
+                    None,
+                    num_rows,
+                )?;
+                let inner_array = inner_array_iter.next().transpose()?;
+                values.push(inner_array.unwrap());
+            }
+            let struct_array = StructArray::try_new(field.data_type.clone(), values, None)?;
+
+            assert_eq!(expected_array, struct_array);
+        }
+    }
+
+    Ok(())
+}
diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs
index 4803cc9c52..1b38c61c99 100644
--- a/tests/it/io/parquet/mod.rs
+++ b/tests/it/io/parquet/mod.rs
@@ -14,6 +14,7 @@ use arrow2::{
     types::{days_ms, NativeType},
 };
 
+mod deserialize;
 #[cfg(feature = "io_json_integration")]
 mod integration;
 mod read;

From 3ddc6a10c6fbc2d0f85a9f66eeb46112abd07029 Mon Sep 17 00:00:00 2001
From: Ben Levin
Date: Fri, 27 Oct 2023 19:26:38 -0500
Subject: [PATCH 80/80] Move parquet async functionality behind feature flag (#1586)

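With the split, crates that only need blocking parquet IO no longer pull in the
`futures`/`parquet2` async machinery. A hypothetical downstream Cargo.toml
(version number illustrative):

    [dependencies]
    arrow2 = { version = "0.18", default-features = false, features = ["io_parquet"] }

    # the async APIs (`FileSink`, `read_metadata_async`, `read_columns_many_async`, ...)
    # are restored by enabling the new flag instead:
    # arrow2 = { version = "0.18", default-features = false, features = ["io_parquet_async"] }
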
---
 Cargo.toml                       | 10 +++++-----
 src/io/parquet/read/mod.rs       | 13 +++++++++----
 src/io/parquet/read/row_group.rs |  6 ++++++
 src/io/parquet/write/mod.rs      |  3 +++
 4 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 5deab656f5..a8e5933d2f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -100,7 +100,7 @@ odbc-api = { version = "0.36", optional = true }
 # Faster hashing
 ahash = "0.8"
-# For `LIKE` matching "contains" fast-path 
+# For `LIKE` matching "contains" fast-path
 memchr = { version = "2.6", optional = true }
 
 # Support conversion to/from arrow-rs
 arrow-buffer = { version = ">=40", optional = true }
 arrow-schema = { version = ">=40", optional = true }
@@ -117,7 +117,6 @@ getrandom = { version = "0.2", features = ["js"] }
 version = "0.17"
 optional = true
 default_features = false
-features = ["async"]
 
 [dev-dependencies]
 criterion = "0.4"
@@ -160,7 +159,7 @@ full = [
     "io_ipc_compression",
     "io_json_integration",
     "io_print",
-    "io_parquet",
+    "io_parquet_async",
     "io_parquet_compression",
     "io_avro",
     "io_orc",
@@ -189,7 +188,8 @@ io_ipc_compression = ["lz4", "zstd"]
 io_flight = ["io_ipc", "arrow-format/flight-data"]
 
 # base64 + io_ipc because arrow schemas are stored as base64-encoded ipc format.
-io_parquet = ["parquet2", "io_ipc", "base64", "futures", "streaming-iterator", "fallible-streaming-iterator"]
+io_parquet = ["parquet2", "io_ipc", "base64", "streaming-iterator", "fallible-streaming-iterator"]
+io_parquet_async = ["futures", "io_parquet", "parquet2/async"]
 
 io_parquet_compression = [
     "io_parquet_zstd",
@@ -200,7 +200,7 @@ io_parquet_compression = [
 ]
 
 # sample testing of generated arrow data
-io_parquet_sample_test = ["io_parquet"]
+io_parquet_sample_test = ["io_parquet_async"]
 
 # compression backends
 io_parquet_zstd = ["parquet2/zstd"]
diff --git a/src/io/parquet/read/mod.rs b/src/io/parquet/read/mod.rs
index ea2b2f46d4..e856f101af 100644
--- a/src/io/parquet/read/mod.rs
+++ b/src/io/parquet/read/mod.rs
@@ -10,19 +10,22 @@ pub mod statistics;
 
 use std::io::{Read, Seek};
 
+#[cfg(feature = "io_parquet_async")]
 use futures::{AsyncRead, AsyncSeek};
 
 // re-exports of parquet2's relevant APIs
+#[cfg(feature = "io_parquet_async")]
+#[cfg_attr(docsrs, doc(cfg(feature = "io_parquet_async")))]
+pub use parquet2::read::{get_page_stream, read_metadata_async as _read_metadata_async};
 pub use parquet2::{
     error::Error as ParquetError,
     fallible_streaming_iterator,
     metadata::{ColumnChunkMetaData, ColumnDescriptor, RowGroupMetaData},
     page::{CompressedDataPage, DataPageHeader, Page},
     read::{
-        decompress, get_column_iterator, get_page_stream,
-        read_columns_indexes as _read_columns_indexes, read_metadata as _read_metadata,
-        read_metadata_async as _read_metadata_async, read_pages_locations, BasicDecompressor,
-        Decompressor, MutStreamingIterator, PageFilter, PageReader, ReadColumnIterator, State,
+        decompress, get_column_iterator, read_columns_indexes as _read_columns_indexes,
+        read_metadata as _read_metadata, read_pages_locations, BasicDecompressor, Decompressor,
+        MutStreamingIterator, PageFilter, PageReader, ReadColumnIterator, State,
     },
     schema::types::{
         GroupLogicalType, ParquetType, PhysicalType, PrimitiveConvertedType, PrimitiveLogicalType,
@@ -60,6 +63,8 @@ pub fn read_metadata<R: Read + Seek>(reader: &mut R) -> Result<FileMetaData> {
 }
 
 /// Reads parquets' metadata asynchronously.
+#[cfg(feature = "io_parquet_async")]
+#[cfg_attr(docsrs, doc(cfg(feature = "io_parquet_async")))]
 pub async fn read_metadata_async<R: AsyncRead + AsyncSeek + Send + Unpin>(
     reader: &mut R,
 ) -> Result<FileMetaData> {
diff --git a/src/io/parquet/read/row_group.rs b/src/io/parquet/read/row_group.rs
index 176c6e8318..7062df31e4 100644
--- a/src/io/parquet/read/row_group.rs
+++ b/src/io/parquet/read/row_group.rs
@@ -1,5 +1,6 @@
 use std::io::{Read, Seek};
 
+#[cfg(feature = "io_parquet_async")]
 use futures::{
     future::{try_join_all, BoxFuture},
     AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt,
@@ -138,6 +139,7 @@ where
     Ok((meta, chunk))
 }
 
+#[cfg(feature = "io_parquet_async")]
 async fn _read_single_column_async<'b, R, F>(
     reader_factory: F,
     meta: &ColumnChunkMetaData,
@@ -163,6 +165,8 @@ where
 ///
 /// It does so asynchronously via a single `join_all` over all the necessary columns for
 /// `field_name`.
+#[cfg(feature = "io_parquet_async")]
+#[cfg_attr(docsrs, doc(cfg(feature = "io_parquet_async")))]
 pub async fn read_columns_async<
     'a,
     'b,
@@ -303,13 +307,15 @@ pub fn read_columns_many<'a, R: Read + Seek>(
 /// This operation is IO-bounded `O(C)` where C is the number of columns in the row group -
 /// it reads all the columns to memory from the row group associated to the requested fields.
/// It does so asynchronously via `join_all` +#[cfg(feature = "io_parquet_async")] +#[cfg_attr(docsrs, doc(cfg(feature = "io_parquet_async")))] pub async fn read_columns_many_async< 'a, 'b, diff --git a/src/io/parquet/write/mod.rs b/src/io/parquet/write/mod.rs index d4134f27df..6ef1864c6f 100644 --- a/src/io/parquet/write/mod.rs +++ b/src/io/parquet/write/mod.rs @@ -22,6 +22,7 @@ mod pages; mod primitive; mod row_group; mod schema; +#[cfg(feature = "io_parquet_async")] mod sink; mod utf8; mod utils; @@ -68,6 +69,8 @@ use crate::compute::aggregate::estimated_bytes_size; pub use file::FileWriter; pub use row_group::{row_group_iter, RowGroupIterator}; pub use schema::to_parquet_type; +#[cfg(feature = "io_parquet_async")] +#[cfg_attr(docsrs, doc(cfg(feature = "io_parquet_async")))] pub use sink::FileSink; pub use pages::array_to_columns;