Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

deduplicate variadic buffers in MutableArrayData::extend for ByteView arrays #6808

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 29 additions & 23 deletions arrow-data/src/transform/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer};
use arrow_schema::{ArrowError, DataType, IntervalUnit, UnionMode};
use half::f16;
use num::Integer;
use std::collections::HashMap;
use std::mem;

mod boolean;
Expand Down Expand Up @@ -214,7 +215,7 @@ fn build_extend_dictionary(array: &ArrayData, offset: usize, max: usize) -> Opti
}

/// Builds an extend that adds `buffer_offset` to any buffer indices encountered
fn build_extend_view(array: &ArrayData, buffer_offset: u32) -> Extend {
fn build_extend_view(array: &ArrayData, buffer_lookup: Vec<usize>) -> Extend {
let views = array.buffer::<u128>(0);
Box::new(
move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| {
Expand All @@ -226,7 +227,7 @@ fn build_extend_view(array: &ArrayData, buffer_offset: u32) -> Extend {
return *v; // Stored inline
}
let mut view = ByteView::from(*v);
view.buffer_index += buffer_offset;
view.buffer_index = buffer_lookup[view.buffer_index as usize] as u32;
view.into()
}))
},
Expand Down Expand Up @@ -627,13 +628,19 @@ impl<'a> MutableArrayData<'a> {
_ => (None, false),
};

let variadic_data_buffers = match &data_type {
DataType::BinaryView | DataType::Utf8View => arrays
.iter()
.flat_map(|x| x.buffers().iter().skip(1))
.map(Buffer::clone)
.collect(),
_ => vec![],
let (variadic_data_buffers, buffer_to_idx) = match &data_type {
DataType::BinaryView | DataType::Utf8View => {
let mut buffer_to_idx = HashMap::new();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if building a hashmap / vec would be overly expensive (though we would need to run benchmarks to be sure)

cc @XiangpengHao

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

happy to run benchmarks, any particular in mind or should I create one with criterion specific to this?

let mut variadic_buffers = Vec::new();
for buffer in arrays.iter().flat_map(|x| x.buffers().iter().skip(1)) {
buffer_to_idx.entry(buffer.as_ptr()).or_insert_with(|| {
variadic_buffers.push(buffer.clone());
variadic_buffers.len() - 1
});
}
(variadic_buffers, buffer_to_idx)
}
_ => (vec![], HashMap::new()),
};

let extend_nulls = build_extend_nulls(data_type);
Expand Down Expand Up @@ -668,20 +675,19 @@ impl<'a> MutableArrayData<'a> {

extend_values.expect("MutableArrayData::new is infallible")
}
DataType::BinaryView | DataType::Utf8View => {
let mut next_offset = 0u32;
arrays
.iter()
.map(|arr| {
let num_data_buffers = (arr.buffers().len() - 1) as u32;
let offset = next_offset;
next_offset = next_offset
.checked_add(num_data_buffers)
.expect("view buffer index overflow");
build_extend_view(arr, offset)
})
.collect()
}
DataType::BinaryView | DataType::Utf8View => arrays
.iter()
.map(|arr| {
let arr_to_res_buffer_idx: Vec<_> = arr
.buffers()
.iter()
.skip(1)
.map(|buf| *buffer_to_idx.get(&buf.as_ptr()).unwrap())
.collect();

build_extend_view(arr, arr_to_res_buffer_idx)
})
.collect(),
_ => arrays.iter().map(|array| build_extend(array)).collect(),
};

Expand Down
Loading