1
1
from __future__ import annotations
2
2
3
- import sys
4
3
from collections .abc import Sequence
5
- from typing import Generic , Literal , Protocol , TypedDict , TypeVar , overload
4
+ from typing import TYPE_CHECKING , Generic , Protocol , TypedDict , TypeVar
6
5
7
- from ._meta import ObjectMeta
8
- from .arrow import ArrowArrayExportable , ArrowStreamExportable
6
+ if TYPE_CHECKING :
7
+ import sys
8
+ from collections .abc import Sequence
9
9
10
- if sys .version_info >= (3 , 11 ):
11
- from typing import Self
12
- else :
13
- from typing_extensions import Self
10
+ from ._meta import ObjectMeta
14
11
15
- ListChunkType_co = TypeVar (
16
- "ListChunkType_co" ,
17
- Sequence [ObjectMeta ],
18
- ArrowArrayExportable ,
19
- ArrowStreamExportable ,
20
- covariant = True ,
21
- )
22
- """The data structure used for holding list results.
12
+ if sys .version_info >= (3 , 11 ):
13
+ from typing import Self
14
+ else :
15
+ from typing_extensions import Self
23
16
24
- By default, listing APIs return a `list` of [`ObjectMeta`][obspec.ObjectMeta]. However
25
- for improved performance when listing large buckets, you can pass `return_arrow=True`.
26
- Then an Arrow `RecordBatch` will be returned instead.
27
- """
17
+
18
+ ListChunkType_co = TypeVar ("ListChunkType_co" , covariant = True )
19
+ """The data structure used for holding list results."""
28
20
29
21
30
22
class ListResult (TypedDict , Generic [ListChunkType_co ]):
@@ -83,32 +75,13 @@ async def __anext__(self) -> ListChunkType_co:
83
75
84
76
85
77
class List (Protocol ):
86
- @overload
87
- def list (
88
- self ,
89
- prefix : str | None = None ,
90
- * ,
91
- offset : str | None = None ,
92
- chunk_size : int = 50 ,
93
- return_arrow : Literal [True ],
94
- ) -> ListIterator [ArrowArrayExportable ]: ...
95
- @overload
96
78
def list (
97
79
self ,
98
80
prefix : str | None = None ,
99
81
* ,
100
82
offset : str | None = None ,
101
83
chunk_size : int = 50 ,
102
- return_arrow : Literal [False ] = False ,
103
- ) -> ListIterator [Sequence [ObjectMeta ]]: ...
104
- def list (
105
- self ,
106
- prefix : str | None = None ,
107
- * ,
108
- offset : str | None = None ,
109
- chunk_size : int = 50 ,
110
- return_arrow : bool = False ,
111
- ) -> ListIterator [ArrowArrayExportable ] | ListIterator [Sequence [ObjectMeta ]]:
84
+ ) -> ListIterator [Sequence [ObjectMeta ]]:
112
85
"""List all the objects with the given prefix.
113
86
114
87
Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of
@@ -134,28 +107,6 @@ def list(
134
107
break
135
108
```
136
109
137
- Return large list results as [Arrow](https://arrow.apache.org/). This is most
138
- useful with large list operations. In this case you may want to increase the
139
- `chunk_size` parameter.
140
-
141
- ```py
142
- stream = obs.list(store, chunk_size=1000, return_arrow=True)
143
- # Stream is now an iterable/async iterable of `RecordBatch`es
144
- for batch in stream:
145
- print(batch.num_rows) # 100
146
-
147
- # If desired, convert to a pyarrow RecordBatch (zero-copy) with
148
- # `pyarrow.record_batch(batch)`
149
- break
150
- ```
151
-
152
- Collect all list results into a single Arrow `RecordBatch`.
153
-
154
- ```py
155
- stream = obs.list(store, return_arrow=True)
156
- batch = stream.collect()
157
- ```
158
-
159
110
!!! note
160
111
The order of returned [`ObjectMeta`][obspec.ObjectMeta] is not
161
112
guaranteed
@@ -169,10 +120,6 @@ def list(
169
120
chunk_size: The number of items to collect per chunk in the returned
170
121
(async) iterator. All chunks except for the last one will have this many
171
122
items. This is ignored in [`collect`][obspec.ListIterator.collect].
172
- return_arrow: If `True`, return each batch of list items as an Arrow
173
- `RecordBatch`, not as a list of Python `dict`s. Arrow removes
174
- serialization overhead between Rust and Python and so this can be
175
- significantly faster for large list operations. Defaults to `False`.
176
123
177
124
Returns:
178
125
A ListStream, which you can iterate through to access list results.
@@ -182,32 +129,13 @@ def list(
182
129
183
130
184
131
class ListAsync (Protocol ):
185
- @overload
186
- def list_async (
187
- self ,
188
- prefix : str | None = None ,
189
- * ,
190
- offset : str | None = None ,
191
- chunk_size : int = 50 ,
192
- return_arrow : Literal [True ],
193
- ) -> ListStream [ArrowArrayExportable ]: ...
194
- @overload
195
- def list_async (
196
- self ,
197
- prefix : str | None = None ,
198
- * ,
199
- offset : str | None = None ,
200
- chunk_size : int = 50 ,
201
- return_arrow : Literal [False ] = False ,
202
- ) -> ListStream [Sequence [ObjectMeta ]]: ...
203
132
def list_async (
204
133
self ,
205
134
prefix : str | None = None ,
206
135
* ,
207
136
offset : str | None = None ,
208
137
chunk_size : int = 50 ,
209
- return_arrow : bool = False ,
210
- ) -> ListStream [ArrowArrayExportable ] | ListStream [Sequence [ObjectMeta ]]:
138
+ ) -> ListStream [Sequence [ObjectMeta ]]:
211
139
"""List all the objects with the given prefix.
212
140
213
141
Note that this method itself is **not async**. It's a synchronous method but
@@ -241,10 +169,6 @@ def list_async(
241
169
(async) iterator. All chunks except for the last one will have this many
242
170
items. This is ignored in
243
171
[`collect_async`][obspec.ListStream.collect_async].
244
- return_arrow: If `True`, return each batch of list items as an Arrow
245
- `RecordBatch`, not as a list of Python `dict`s. Arrow removes
246
- serialization overhead between Rust and Python and so this can be
247
- significantly faster for large list operations. Defaults to `False`.
248
172
249
173
Returns:
250
174
A ListStream, which you can iterate through to access list results.
@@ -254,26 +178,10 @@ def list_async(
254
178
255
179
256
180
class ListWithDelimiter (Protocol ):
257
- @overload
258
- def list_with_delimiter (
259
- self ,
260
- prefix : str | None = None ,
261
- * ,
262
- return_arrow : Literal [True ],
263
- ) -> ListResult [ArrowStreamExportable ]: ...
264
- @overload
265
- def list_with_delimiter (
266
- self ,
267
- prefix : str | None = None ,
268
- * ,
269
- return_arrow : Literal [False ] = False ,
270
- ) -> ListResult [Sequence [ObjectMeta ]]: ...
271
181
def list_with_delimiter (
272
182
self ,
273
183
prefix : str | None = None ,
274
- * ,
275
- return_arrow : bool = False ,
276
- ) -> ListResult [ArrowStreamExportable ] | ListResult [Sequence [ObjectMeta ]]:
184
+ ) -> ListResult [Sequence [ObjectMeta ]]:
277
185
"""List objects with the given prefix and an implementation specific
278
186
delimiter.
279
187
@@ -292,13 +200,6 @@ def list_with_delimiter(
292
200
Args:
293
201
prefix: The prefix within ObjectStore to use for listing. Defaults to None.
294
202
295
- Keyword Args:
296
- return_arrow: If `True`, return list results as an Arrow
297
- `Table`, not as a list of Python `dict`s. Arrow removes serialization
298
- overhead between Rust and Python and so this can be significantly faster
299
- for large list operations. Defaults to `False`.
300
-
301
-
302
203
Returns:
303
204
ListResult
304
205
@@ -307,26 +208,10 @@ def list_with_delimiter(
307
208
308
209
309
210
class ListWithDelimiterAsync (Protocol ):
310
- @overload
311
- async def list_with_delimiter_async (
312
- self ,
313
- prefix : str | None = None ,
314
- * ,
315
- return_arrow : Literal [True ],
316
- ) -> ListResult [ArrowStreamExportable ]: ...
317
- @overload
318
211
async def list_with_delimiter_async (
319
212
self ,
320
213
prefix : str | None = None ,
321
- * ,
322
- return_arrow : Literal [False ] = False ,
323
- ) -> ListResult [Sequence [ObjectMeta ]]: ...
324
- async def list_with_delimiter_async (
325
- self ,
326
- prefix : str | None = None ,
327
- * ,
328
- return_arrow : bool = False ,
329
- ) -> ListResult [ArrowStreamExportable ] | ListResult [Sequence [ObjectMeta ]]:
214
+ ) -> ListResult [Sequence [ObjectMeta ]]:
330
215
"""Call `list_with_delimiter` asynchronously.
331
216
332
217
Refer to the documentation for
0 commit comments