Skip to content

Commit 8e22da4

Browse files
authored
add information columns (#19)
1 parent 79a3f52 commit 8e22da4

File tree

4 files changed

+186
-45
lines changed

4 files changed

+186
-45
lines changed

README.md

+2
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,8 @@ streams with little upfront load complexity or time.
9595
([encoding](https://protobuf.dev/programming-guides/encoding/#varints)).
9696
files are a sequence of messages
9797
* `SingleMessagePerFile`: each file contains a single message
98+
* `filename`, `position` and `size`: boolean values enabling columns which add
99+
source information about where each message originated
98100

99101
## features
100102

packages/duckdb_protobuf/src/io.rs

+22-4
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ use protobuf::CodedInputStream;
55
use std::error::Error;
66
use std::fs::File;
77
use std::io;
8+
use std::path::{Path, PathBuf};
89
use strum::{AsRefStr, EnumIter, EnumString, IntoEnumIterator};
910

1011
#[derive(Copy, Clone, EnumString, EnumIter, AsRefStr)]
@@ -39,26 +40,35 @@ pub enum DelimitedLengthKind {
3940
#[self_referencing]
4041
pub struct LengthDelimitedRecordsReader {
4142
length_kind: DelimitedLengthKind,
43+
path: PathBuf,
4244
inner: File,
4345

4446
#[borrows(mut inner)]
4547
#[not_covariant]
4648
reader: CodedInputStream<'this>,
4749
}
4850

51+
pub struct Record {
52+
pub bytes: Vec<u8>,
53+
pub position: u64,
54+
pub size: u32,
55+
}
56+
4957
impl LengthDelimitedRecordsReader {
50-
pub fn create(inner: File, length_kind: DelimitedLengthKind) -> Self {
58+
pub fn create(inner: File, length_kind: DelimitedLengthKind, path: PathBuf) -> Self {
5159
LengthDelimitedRecordsReaderBuilder {
5260
length_kind,
61+
path,
5362
inner,
5463
reader_builder: |it| CodedInputStream::new(it),
5564
}
5665
.build()
5766
}
5867

59-
fn get_next(&mut self) -> Result<Vec<u8>, io::Error> {
68+
fn get_next(&mut self) -> Result<Record, io::Error> {
6069
let length_kind = *self.borrow_length_kind();
6170
Ok(self.with_reader_mut(move |reader| {
71+
let position = reader.pos();
6272
let len = match length_kind {
6373
DelimitedLengthKind::BigEndianFixed => reader.read_u32::<BigEndian>()?,
6474
DelimitedLengthKind::Varint => reader.read_raw_varint32()?,
@@ -67,15 +77,23 @@ impl LengthDelimitedRecordsReader {
6777
let mut buf = vec![0; len as usize];
6878
<CodedInputStream as io::Read>::read_exact(reader, &mut buf)?;
6979

70-
Ok::<_, io::Error>(buf)
80+
Ok::<_, io::Error>(Record {
81+
bytes: buf,
82+
position,
83+
size: len,
84+
})
7185
})?)
7286
}
7387

74-
pub fn try_get_next(&mut self) -> Result<Option<Vec<u8>>, io::Error> {
88+
pub fn try_get_next(&mut self) -> Result<Option<Record>, io::Error> {
7589
match self.get_next() {
7690
Ok(it) => Ok(Some(it)),
7791
Err(err) if err.kind() == io::ErrorKind::UnexpectedEof => Ok(None),
7892
Err(err) => Err(err.into()),
7993
}
8094
}
95+
96+
pub fn path(&self) -> &Path {
97+
self.borrow_path().as_path()
98+
}
8199
}

packages/duckdb_protobuf/src/read.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ pub fn write_message(
5252
Ok(())
5353
}
5454

55-
struct MyFlatVector<T> {
55+
pub struct MyFlatVector<T> {
5656
_phantom_data: PhantomData<T>,
5757
ptr: duckdb::ffi::duckdb_vector,
5858
capacity: usize,

packages/duckdb_protobuf/src/vtab.rs

+161-40
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,16 @@ use duckdb::vtab::{
44
BindInfo, DataChunk, Free, FunctionInfo, InitInfo, LogicalType, LogicalTypeId, VTab,
55
VTabLocalData,
66
};
7-
use prost_reflect::{DescriptorPool, DynamicMessage, MessageDescriptor};
7+
use prost_reflect::{DescriptorPool, DynamicMessage, MessageDescriptor, ReflectMessage};
88
use std::error::Error;
9+
use std::ffi::CString;
910
use std::fs::File;
1011
use std::io::Read;
1112
use std::ops::{Deref, DerefMut};
12-
use std::path::PathBuf;
13+
use std::path::{Path, PathBuf};
1314

14-
use crate::io::{parse, DelimitedLengthKind, LengthDelimitedRecordsReader, LengthKind};
15-
use crate::read::write_to_output;
15+
use crate::io::{parse, DelimitedLengthKind, LengthDelimitedRecordsReader, LengthKind, Record};
16+
use crate::read::{write_to_output, MyFlatVector, VectorAccessor};
1617
use crate::types::into_logical_type;
1718

1819
pub struct Parameters {
@@ -21,6 +22,9 @@ pub struct Parameters {
2122
pub message_name: String,
2223
pub shared_message_descriptor: MessageDescriptor,
2324
pub length_kind: LengthKind,
25+
pub include_filename: bool,
26+
pub include_position: bool,
27+
pub include_size: bool,
2428
}
2529

2630
impl Parameters {
@@ -62,18 +66,35 @@ impl Parameters {
6266
let length_kind = parse::<LengthKind>(&length_kind.to_string())
6367
.map_err(|err| format_err!("when parsing parameter delimiter: {}", err))?;
6468

69+
let include_filename = bind
70+
.get_named_parameter("filename")
71+
.map(|value| value.to_int64() != 0)
72+
.unwrap_or(false);
73+
74+
let include_position = bind
75+
.get_named_parameter("position")
76+
.map(|value| value.to_int64() != 0)
77+
.unwrap_or(false);
78+
79+
let include_size = bind
80+
.get_named_parameter("size")
81+
.map(|value| value.to_int64() != 0)
82+
.unwrap_or(false);
83+
6584
Ok(Self {
6685
files,
6786
descriptor_bytes,
6887
message_name,
6988
shared_message_descriptor: message_descriptor,
7089
length_kind,
90+
include_filename,
91+
include_position,
92+
include_size,
7193
})
7294
}
7395

7496
pub fn message_descriptor(&self) -> Result<MessageDescriptor, anyhow::Error> {
75-
let descriptor_pool =
76-
DescriptorPool::decode(self.descriptor_bytes.as_slice())?;
97+
let descriptor_pool = DescriptorPool::decode(self.descriptor_bytes.as_slice())?;
7798

7899
let message_descriptor = descriptor_pool
79100
.get_message_by_name(&self.message_name)
@@ -100,10 +121,18 @@ impl Parameters {
100121
"delimiter".to_string(),
101122
LogicalType::new(LogicalTypeId::Varchar),
102123
),
124+
(
125+
"filename".to_string(),
126+
LogicalType::new(LogicalTypeId::Boolean),
127+
),
128+
(
129+
"position".to_string(),
130+
LogicalType::new(LogicalTypeId::Boolean),
131+
),
132+
("size".to_string(), LogicalType::new(LogicalTypeId::Boolean)),
103133
]
104134
}
105135
}
106-
107136
pub struct GlobalState {
108137
queue: ArrayQueue<PathBuf>,
109138
}
@@ -180,6 +209,18 @@ impl ProtobufVTab {
180209
);
181210
}
182211

212+
if params.include_filename {
213+
bind.add_result_column("filename", LogicalType::new(LogicalTypeId::Varchar));
214+
}
215+
216+
if params.include_position {
217+
bind.add_result_column("position", LogicalType::new(LogicalTypeId::UBigint));
218+
}
219+
220+
if params.include_size {
221+
bind.add_result_column("size", LogicalType::new(LogicalTypeId::UBigint));
222+
}
223+
183224
data.assign(params);
184225

185226
Ok(())
@@ -221,13 +262,17 @@ impl ProtobufVTab {
221262
let mut column_information = Default::default();
222263

223264
for output_row_idx in 0..available_chunk_size {
224-
let bytes = match state_container.next_message()? {
265+
let StateContainerValue {
266+
path_reference,
267+
size,
268+
bytes,
269+
position,
270+
} = match state_container.next_message()? {
225271
None => break,
226-
Some(bytes) => bytes,
272+
Some(message_info) => message_info,
227273
};
228274

229-
let message =
230-
DynamicMessage::decode(local_descriptor.clone(), bytes.as_slice())?;
275+
let message = DynamicMessage::decode(local_descriptor.clone(), bytes.as_slice())?;
231276

232277
write_to_output(
233278
&mut column_information,
@@ -237,6 +282,48 @@ impl ProtobufVTab {
237282
output_row_idx,
238283
)?;
239284

285+
let mut field_offset = message.descriptor().fields().len();
286+
287+
if parameters.include_filename {
288+
let it = (|| -> Option<CString> {
289+
let value = CString::new(path_reference.path().to_str()?).ok()?;
290+
Some(value)
291+
})();
292+
293+
let column = output.get_vector(field_offset);
294+
295+
match it {
296+
None => unsafe {
297+
let validity = duckdb::ffi::duckdb_vector_get_validity(column);
298+
duckdb::ffi::duckdb_validity_set_row_invalid(validity, output_row_idx as _);
299+
},
300+
Some(value) => unsafe {
301+
duckdb::ffi::duckdb_vector_assign_string_element(
302+
column,
303+
output_row_idx as _,
304+
value.as_ptr(),
305+
)
306+
},
307+
}
308+
309+
field_offset += 1;
310+
}
311+
312+
if parameters.include_position {
313+
let column = output.get_vector(field_offset);
314+
let mut vector =
315+
unsafe { MyFlatVector::<u64>::with_capacity(column, available_chunk_size) };
316+
vector.as_mut_slice()[output_row_idx] = position as _;
317+
field_offset += 1;
318+
}
319+
320+
if parameters.include_size {
321+
let column = output.get_vector(field_offset);
322+
let mut vector =
323+
unsafe { MyFlatVector::<u64>::with_capacity(column, available_chunk_size) };
324+
vector.as_mut_slice()[output_row_idx] = size as _;
325+
}
326+
240327
items += 1;
241328
}
242329

@@ -252,47 +339,81 @@ struct StateContainer<'a> {
252339
parameters: &'a Parameters,
253340
}
254341

255-
impl StateContainer<'_> {
256-
fn next_message(&mut self) -> Result<Option<Vec<u8>>, anyhow::Error> {
257-
let file_reader = if let Some(reader) = &mut self.local_state.current {
258-
reader
259-
} else {
260-
let Some(next_file_path) = self.global_state.queue.pop() else {
261-
return Ok(None);
262-
};
342+
enum PathReference<'a> {
343+
Borrowed(&'a Path),
344+
Owned(PathBuf),
345+
}
346+
347+
impl<'a> PathReference<'a> {
348+
pub fn path(&self) -> &Path {
349+
match self {
350+
PathReference::Borrowed(it) => *it,
351+
PathReference::Owned(it) => it.as_path(),
352+
}
353+
}
354+
}
355+
356+
struct StateContainerValue<'a> {
357+
path_reference: PathReference<'a>,
358+
bytes: Vec<u8>,
359+
size: usize,
360+
position: u64,
361+
}
263362

264-
let mut next_file = File::open(&next_file_path)?;
265-
match self.parameters.length_kind {
266-
LengthKind::BigEndianFixed => {
267-
self.local_state.current = Some(LengthDelimitedRecordsReader::create(
363+
impl StateContainer<'_> {
364+
fn next_message(&mut self) -> Result<Option<StateContainerValue>, anyhow::Error> {
365+
let mut value = match self.local_state.current.take() {
366+
Some(it) => it,
367+
None => {
368+
let Some(next_file_path) = self.global_state.queue.pop() else {
369+
return Ok(None);
370+
};
371+
372+
let mut next_file = File::open(&next_file_path)?;
373+
match self.parameters.length_kind {
374+
LengthKind::BigEndianFixed => LengthDelimitedRecordsReader::create(
268375
next_file,
269376
DelimitedLengthKind::BigEndianFixed,
270-
));
271-
272-
self.local_state.current.as_mut().unwrap()
273-
}
274-
LengthKind::Varint => {
275-
self.local_state.current = Some(LengthDelimitedRecordsReader::create(
377+
next_file_path,
378+
),
379+
LengthKind::Varint => LengthDelimitedRecordsReader::create(
276380
next_file,
277381
DelimitedLengthKind::Varint,
278-
));
279-
280-
self.local_state.current.as_mut().unwrap()
281-
}
282-
LengthKind::SingleMessagePerFile => {
283-
let mut bytes = Vec::new();
284-
next_file.read_to_end(&mut bytes)?;
285-
return Ok(Some(bytes));
382+
next_file_path,
383+
),
384+
LengthKind::SingleMessagePerFile => {
385+
let mut bytes = Vec::new();
386+
next_file.read_to_end(&mut bytes)?;
387+
let size = bytes.len();
388+
return Ok(Some(StateContainerValue {
389+
bytes,
390+
path_reference: PathReference::Owned(next_file_path),
391+
position: 0,
392+
size,
393+
}));
394+
}
286395
}
287396
}
288397
};
289398

290-
let Some(next_message) = file_reader.try_get_next()? else {
291-
self.local_state.current = None;
399+
let Some(Record {
400+
position,
401+
size,
402+
bytes: next_message,
403+
}) = value.try_get_next()?
404+
else {
292405
return Ok(None);
293406
};
294407

295-
Ok(Some(next_message))
408+
self.local_state.current = Some(value);
409+
Ok(Some(StateContainerValue {
410+
path_reference: PathReference::Borrowed(
411+
self.local_state.current.as_ref().unwrap().path(),
412+
),
413+
bytes: next_message,
414+
size: size as _,
415+
position,
416+
}))
296417
}
297418
}
298419

0 commit comments

Comments
 (0)