This repository has been archived by the owner on Feb 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 224
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add
nested_column_iter_to_arrays
to deserialize inner columns (…
- Loading branch information
Showing
4 changed files
with
106 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
use std::fs::File; | ||
|
||
use arrow2::{ | ||
array::StructArray, | ||
datatypes::DataType, | ||
error::Result, | ||
io::parquet::read::{ | ||
infer_schema, n_columns, nested_column_iter_to_arrays, read_columns, read_metadata, | ||
to_deserializer, BasicDecompressor, InitNested, PageReader, | ||
}, | ||
}; | ||
|
||
/// Checks that deserializing the inner columns of every struct field one by one with
/// `nested_column_iter_to_arrays`, and reassembling them with `StructArray::try_new`,
/// yields the same `StructArray` that `to_deserializer` produces for the whole field.
///
/// Only the first row group of the test file is read.
#[test]
fn test_deserialize_nested_column() -> Result<()> {
    let path = "testing/parquet-testing/data/nested_structs.rust.parquet";
    // Propagate I/O failures through the test's `Result` instead of panicking.
    let mut reader = File::open(path)?;

    let metadata = read_metadata(&mut reader)?;
    let schema = infer_schema(&metadata)?;

    // Only the first row group is read below, so the row count handed to the
    // deserializers must be that row group's, not the total of the whole file.
    let row_group = &metadata.row_groups[0];
    let num_rows = row_group.num_rows();

    // Raw column chunks for each top-level field, in schema order.
    let field_columns = schema
        .fields
        .iter()
        .map(|field| read_columns(&mut reader, row_group.columns(), &field.name))
        .collect::<Result<Vec<_>>>()?;

    // Number of struct fields actually compared; guards against a vacuous pass.
    let mut checked_structs = 0usize;

    for (mut columns, field) in field_columns.into_iter().zip(schema.fields.iter()) {
        if let DataType::Struct(inner_fields) = &field.data_type {
            // Reference result: deserialize the struct field as a whole. `columns` is
            // cloned because it is drained piecewise further down.
            let mut array_iter =
                to_deserializer(columns.clone(), field.clone(), num_rows, None, None)?;
            let array = array_iter.next().transpose()?.unwrap();
            let expected_array = array
                .as_any()
                .downcast_ref::<StructArray>()
                .unwrap()
                .clone();

            // deserialize inner values of struct fields.
            let init = vec![InitNested::Struct(field.is_nullable)];
            let mut values = Vec::with_capacity(inner_fields.len());
            for inner_field in inner_fields {
                // Take the leading `n` leaf columns that `n_columns` reports for this
                // inner field.
                let n = n_columns(&inner_field.data_type);
                let inner_columns: Vec<_> = columns.drain(..n).collect();

                let (nested_columns, types): (Vec<_>, Vec<_>) = inner_columns
                    .into_iter()
                    .map(|(column_meta, chunk)| {
                        let len = chunk.len();
                        let pages = PageReader::new(
                            std::io::Cursor::new(chunk),
                            column_meta,
                            std::sync::Arc::new(|_, _| true),
                            vec![],
                            len * 2 + 1024,
                        );
                        (
                            BasicDecompressor::new(pages, vec![]),
                            &column_meta.descriptor().descriptor.primitive_type,
                        )
                    })
                    .unzip();

                let mut inner_array_iter = nested_column_iter_to_arrays(
                    nested_columns,
                    types,
                    inner_field.clone(),
                    init.clone(),
                    None,
                    num_rows,
                )?;
                let inner_array = inner_array_iter.next().transpose()?;
                values.push(inner_array.unwrap());
            }
            let struct_array = StructArray::try_new(field.data_type.clone(), values, None)?;

            assert_eq!(expected_array, struct_array);
            checked_structs += 1;
        }
    }

    // Without this, a schema that contains no struct field would pass silently.
    assert!(
        checked_structs > 0,
        "no struct field found in {path}; nothing was verified"
    );

    Ok(())
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters