Skip to content

Commit

Permalink
Add Array::shrink_to_fit(&mut self) (#6790)
Browse files Browse the repository at this point in the history
* Add `Array::shrink_to_fit`

* Test that shrink_to_fit actually frees memory

* Make sure the buffer isn't shared in the test of shrink_to_fit

* Remove `#[inline]`

* Use `realloc` to reallocate the bytes

* Clean up test

* Improve docstring for `Array::shrink_to_fit`

Co-authored-by: Raphael Taylor-Davies <[email protected]>

* `Buffer::shrink_to_fit`: ignore shared buffers

* Improve comment in `ArrayRef::shrink_to_fit`

* Document why `try_realloc` is safe, and actually make it safe :)

* Improve testing of shrink_to_fit

* Fix a few corner cases, and improve test

* Add license header to new test file

---------

Co-authored-by: Raphael Taylor-Davies <[email protected]>
  • Loading branch information
emilk and tustvold authored Nov 29, 2024
1 parent fe7e71a commit fdb06e8
Show file tree
Hide file tree
Showing 22 changed files with 412 additions and 17 deletions.
7 changes: 7 additions & 0 deletions arrow-array/src/array/boolean_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,13 @@ impl Array for BooleanArray {
self.values.is_empty()
}

fn shrink_to_fit(&mut self) {
self.values.shrink_to_fit();
if let Some(nulls) = &mut self.nulls {
nulls.shrink_to_fit();
}
}

fn offset(&self) -> usize {
self.values.offset()
}
Expand Down
8 changes: 8 additions & 0 deletions arrow-array/src/array/byte_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,14 @@ impl<T: ByteArrayType> Array for GenericByteArray<T> {
self.value_offsets.len() <= 1
}

fn shrink_to_fit(&mut self) {
self.value_offsets.shrink_to_fit();
self.value_data.shrink_to_fit();
if let Some(nulls) = &mut self.nulls {
nulls.shrink_to_fit();
}
}

fn offset(&self) -> usize {
0
}
Expand Down
43 changes: 26 additions & 17 deletions arrow-array/src/array/byte_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -430,31 +430,31 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
///
/// Before GC:
/// ```text
/// ┌──────┐
/// │......│
/// │......│
/// ┌────────────────────┐ ┌ ─ ─ ─ ▶ │Data1 │ Large buffer
/// ┌──────┐
/// │......│
/// │......│
/// ┌────────────────────┐ ┌ ─ ─ ─ ▶ │Data1 │ Large buffer
/// │ View 1 │─ ─ ─ ─ │......│ with data that
/// ├────────────────────┤ │......│ is not referred
/// │ View 2 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
/// └────────────────────┘ │......│ View 2
/// │......│
/// 2 views, refer to │......│
/// small portions of a └──────┘
/// large buffer
/// └────────────────────┘ │......│ View 2
/// │......│
/// 2 views, refer to │......│
/// small portions of a └──────┘
/// large buffer
/// ```
///
///
/// After GC:
///
/// ```text
/// ┌────────────────────┐ ┌─────┐ After gc, only
/// │ View 1 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│ data that is
/// ├────────────────────┤ ┌ ─ ─ ─ ▶ │Data2│ pointed to by
/// │ View 2 │─ ─ ─ ─ └─────┘ the views is
/// └────────────────────┘ left
///
///
/// 2 views
/// │ View 1 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│ data that is
/// ├────────────────────┤ ┌ ─ ─ ─ ▶ │Data2│ pointed to by
/// │ View 2 │─ ─ ─ ─ └─────┘ the views is
/// └────────────────────┘ left
///
///
/// 2 views
/// ```
/// This method will compact the data buffers by recreating the view array and only include the data
/// that is pointed to by the views.
Expand Down Expand Up @@ -575,6 +575,15 @@ impl<T: ByteViewType + ?Sized> Array for GenericByteViewArray<T> {
self.views.is_empty()
}

fn shrink_to_fit(&mut self) {
self.views.shrink_to_fit();
self.buffers.iter_mut().for_each(|b| b.shrink_to_fit());
self.buffers.shrink_to_fit();
if let Some(nulls) = &mut self.nulls {
nulls.shrink_to_fit();
}
}

fn offset(&self) -> usize {
0
}
Expand Down
5 changes: 5 additions & 0 deletions arrow-array/src/array/dictionary_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -720,6 +720,11 @@ impl<T: ArrowDictionaryKeyType> Array for DictionaryArray<T> {
self.keys.is_empty()
}

fn shrink_to_fit(&mut self) {
self.keys.shrink_to_fit();
self.values.shrink_to_fit();
}

fn offset(&self) -> usize {
self.keys.offset()
}
Expand Down
7 changes: 7 additions & 0 deletions arrow-array/src/array/fixed_size_binary_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,13 @@ impl Array for FixedSizeBinaryArray {
self.len == 0
}

fn shrink_to_fit(&mut self) {
self.value_data.shrink_to_fit();
if let Some(nulls) = &mut self.nulls {
nulls.shrink_to_fit();
}
}

fn offset(&self) -> usize {
0
}
Expand Down
7 changes: 7 additions & 0 deletions arrow-array/src/array/fixed_size_list_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,13 @@ impl Array for FixedSizeListArray {
self.len == 0
}

fn shrink_to_fit(&mut self) {
self.values.shrink_to_fit();
if let Some(nulls) = &mut self.nulls {
nulls.shrink_to_fit();
}
}

fn offset(&self) -> usize {
0
}
Expand Down
8 changes: 8 additions & 0 deletions arrow-array/src/array/list_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,14 @@ impl<OffsetSize: OffsetSizeTrait> Array for GenericListArray<OffsetSize> {
self.value_offsets.len() <= 1
}

fn shrink_to_fit(&mut self) {
if let Some(nulls) = &mut self.nulls {
nulls.shrink_to_fit();
}
self.values.shrink_to_fit();
self.value_offsets.shrink_to_fit();
}

fn offset(&self) -> usize {
0
}
Expand Down
9 changes: 9 additions & 0 deletions arrow-array/src/array/list_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,15 @@ impl<OffsetSize: OffsetSizeTrait> Array for GenericListViewArray<OffsetSize> {
self.value_sizes.is_empty()
}

fn shrink_to_fit(&mut self) {
if let Some(nulls) = &mut self.nulls {
nulls.shrink_to_fit();
}
self.values.shrink_to_fit();
self.value_offsets.shrink_to_fit();
self.value_sizes.shrink_to_fit();
}

fn offset(&self) -> usize {
0
}
Expand Down
8 changes: 8 additions & 0 deletions arrow-array/src/array/map_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,14 @@ impl Array for MapArray {
self.value_offsets.len() <= 1
}

fn shrink_to_fit(&mut self) {
if let Some(nulls) = &mut self.nulls {
nulls.shrink_to_fit();
}
self.entries.shrink_to_fit();
self.value_offsets.shrink_to_fit();
}

fn offset(&self) -> usize {
0
}
Expand Down
15 changes: 15 additions & 0 deletions arrow-array/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,12 @@ pub trait Array: std::fmt::Debug + Send + Sync {
/// ```
fn is_empty(&self) -> bool;

/// Shrinks the capacity of any exclusively owned buffer as much as possible
///
/// Shared or externally allocated buffers will be ignored, and
/// any buffer offsets will be preserved.
fn shrink_to_fit(&mut self) {}

/// Returns the offset into the underlying data used by this array(-slice).
/// Note that the underlying data can be shared by many arrays.
/// This defaults to `0`.
Expand Down Expand Up @@ -365,6 +371,15 @@ impl Array for ArrayRef {
self.as_ref().is_empty()
}

/// For shared buffers, this is a no-op.
fn shrink_to_fit(&mut self) {
if let Some(slf) = Arc::get_mut(self) {
slf.shrink_to_fit();
} else {
// We ignore shared buffers.
}
}

fn offset(&self) -> usize {
self.as_ref().offset()
}
Expand Down
7 changes: 7 additions & 0 deletions arrow-array/src/array/primitive_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1152,6 +1152,13 @@ impl<T: ArrowPrimitiveType> Array for PrimitiveArray<T> {
self.values.is_empty()
}

fn shrink_to_fit(&mut self) {
self.values.shrink_to_fit();
if let Some(nulls) = &mut self.nulls {
nulls.shrink_to_fit();
}
}

fn offset(&self) -> usize {
0
}
Expand Down
5 changes: 5 additions & 0 deletions arrow-array/src/array/run_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,11 @@ impl<T: RunEndIndexType> Array for RunArray<T> {
self.run_ends.is_empty()
}

fn shrink_to_fit(&mut self) {
self.run_ends.shrink_to_fit();
self.values.shrink_to_fit();
}

fn offset(&self) -> usize {
self.run_ends.offset()
}
Expand Down
7 changes: 7 additions & 0 deletions arrow-array/src/array/struct_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,13 @@ impl Array for StructArray {
self.len == 0
}

fn shrink_to_fit(&mut self) {
if let Some(nulls) = &mut self.nulls {
nulls.shrink_to_fit();
}
self.fields.iter_mut().for_each(|n| n.shrink_to_fit());
}

fn offset(&self) -> usize {
0
}
Expand Down
11 changes: 11 additions & 0 deletions arrow-array/src/array/union_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -744,6 +744,17 @@ impl Array for UnionArray {
self.type_ids.is_empty()
}

fn shrink_to_fit(&mut self) {
self.type_ids.shrink_to_fit();
if let Some(offsets) = &mut self.offsets {
offsets.shrink_to_fit();
}
for array in self.fields.iter_mut().flatten() {
array.shrink_to_fit();
}
self.fields.shrink_to_fit();
}

fn offset(&self) -> usize {
0
}
Expand Down
6 changes: 6 additions & 0 deletions arrow-buffer/src/buffer/boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,12 @@ impl BooleanBuffer {
self.len == 0
}

/// Free up unused memory.
pub fn shrink_to_fit(&mut self) {
// TODO(emilk): we could shrink even more in the case where we are a small sub-slice of the full buffer
self.buffer.shrink_to_fit();
}

/// Returns the boolean value at index `i`.
///
/// # Panics
Expand Down
63 changes: 63 additions & 0 deletions arrow-buffer/src/buffer/immutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,41 @@ impl Buffer {
self.data.capacity()
}

/// Tried to shrink the capacity of the buffer as much as possible, freeing unused memory.
///
/// If the buffer is shared, this is a no-op.
///
/// If the memory was allocated with a custom allocator, this is a no-op.
///
/// If the capacity is already less than or equal to the desired capacity, this is a no-op.
///
/// The memory region will be reallocated using `std::alloc::realloc`.
pub fn shrink_to_fit(&mut self) {
let offset = self.ptr_offset();
let is_empty = self.is_empty();
let desired_capacity = if is_empty {
0
} else {
// For realloc to work, we cannot free the elements before the offset
offset + self.len()
};
if desired_capacity < self.capacity() {
if let Some(bytes) = Arc::get_mut(&mut self.data) {
if bytes.try_realloc(desired_capacity).is_ok() {
// Realloc complete - update our pointer into `bytes`:
self.ptr = if is_empty {
bytes.as_ptr()
} else {
// SAFETY: we kept all elements leading up to the offset
unsafe { bytes.as_ptr().add(offset) }
}
} else {
// Failure to reallocate is fine; we just failed to free up memory.
}
}
}
}

/// Returns whether the buffer is empty.
#[inline]
pub fn is_empty(&self) -> bool {
Expand Down Expand Up @@ -554,6 +589,34 @@ mod tests {
assert_eq!(buf2.slice_with_length(2, 1).as_slice(), &[10]);
}

#[test]
fn test_shrink_to_fit() {
let original = Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7]);
assert_eq!(original.as_slice(), &[0, 1, 2, 3, 4, 5, 6, 7]);
assert_eq!(original.capacity(), 64);

let slice = original.slice_with_length(2, 3);
drop(original); // Make sure the buffer isn't shared (or shrink_to_fit won't work)
assert_eq!(slice.as_slice(), &[2, 3, 4]);
assert_eq!(slice.capacity(), 64);

let mut shrunk = slice;
shrunk.shrink_to_fit();
assert_eq!(shrunk.as_slice(), &[2, 3, 4]);
assert_eq!(shrunk.capacity(), 5); // shrink_to_fit is allowed to keep the elements before the offset

// Test that we can handle empty slices:
let empty_slice = shrunk.slice_with_length(1, 0);
drop(shrunk); // Make sure the buffer isn't shared (or shrink_to_fit won't work)
assert_eq!(empty_slice.as_slice(), &[]);
assert_eq!(empty_slice.capacity(), 5);

let mut shrunk_empty = empty_slice;
shrunk_empty.shrink_to_fit();
assert_eq!(shrunk_empty.as_slice(), &[]);
assert_eq!(shrunk_empty.capacity(), 1); // NOTE: `Buffer` and `Bytes` doesn't support 0-capacity
}

#[test]
#[should_panic(expected = "the offset of the new Buffer cannot exceed the existing length")]
fn test_slice_offset_out_of_bound() {
Expand Down
5 changes: 5 additions & 0 deletions arrow-buffer/src/buffer/null.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,11 @@ impl NullBuffer {
self.buffer.is_empty()
}

/// Free up unused memory.
pub fn shrink_to_fit(&mut self) {
self.buffer.shrink_to_fit();
}

/// Returns the null count for this [`NullBuffer`]
#[inline]
pub fn null_count(&self) -> usize {
Expand Down
5 changes: 5 additions & 0 deletions arrow-buffer/src/buffer/offset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,11 @@ impl<O: ArrowNativeType> OffsetBuffer<O> {
Self(out.into())
}

/// Free up unused memory.
pub fn shrink_to_fit(&mut self) {
self.0.shrink_to_fit();
}

/// Returns the inner [`ScalarBuffer`]
pub fn inner(&self) -> &ScalarBuffer<O> {
&self.0
Expand Down
6 changes: 6 additions & 0 deletions arrow-buffer/src/buffer/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,12 @@ where
self.len == 0
}

/// Free up unused memory.
pub fn shrink_to_fit(&mut self) {
// TODO(emilk): we could shrink even more in the case where we are a small sub-slice of the full buffer
self.run_ends.shrink_to_fit();
}

/// Returns the values of this [`RunEndBuffer`] not including any offset
#[inline]
pub fn values(&self) -> &[E] {
Expand Down
Loading

0 comments on commit fdb06e8

Please sign in to comment.