Skip to content

Commit 82e657f

Browse files
authored
Remove Arrow from list protocol (#26)
1 parent a3edb43 commit 82e657f

File tree

4 files changed

+16
-192
lines changed

4 files changed

+16
-192
lines changed

docs/api/arrow.md

Lines changed: 0 additions & 3 deletions
This file was deleted.

mkdocs.yml

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,6 @@ nav:
4141
- api/put.md
4242
- api/rename.md
4343
- api/attributes.md
44-
- api/arrow.md
4544
- CHANGELOG.md
4645

4746
watch:
@@ -121,8 +120,6 @@ plugins:
121120
signature_crossrefs: true
122121

123122
inventories:
124-
- https://arrow.apache.org/docs/objects.inv
125-
- https://docs.pola.rs/api/python/stable/objects.inv
126123
- https://docs.python.org/3/objects.inv
127124
- https://filesystem-spec.readthedocs.io/en/latest/objects.inv
128125

src/obspec/_list.py

Lines changed: 16 additions & 131 deletions
Original file line number | Diff line number | Diff line change
@@ -1,30 +1,22 @@
11
from __future__ import annotations
22

3-
import sys
43
from collections.abc import Sequence
5-
from typing import Generic, Literal, Protocol, TypedDict, TypeVar, overload
4+
from typing import TYPE_CHECKING, Generic, Protocol, TypedDict, TypeVar
65

7-
from ._meta import ObjectMeta
8-
from .arrow import ArrowArrayExportable, ArrowStreamExportable
6+
if TYPE_CHECKING:
7+
import sys
8+
from collections.abc import Sequence
99

10-
if sys.version_info >= (3, 11):
11-
from typing import Self
12-
else:
13-
from typing_extensions import Self
10+
from ._meta import ObjectMeta
1411

15-
ListChunkType_co = TypeVar(
16-
"ListChunkType_co",
17-
Sequence[ObjectMeta],
18-
ArrowArrayExportable,
19-
ArrowStreamExportable,
20-
covariant=True,
21-
)
22-
"""The data structure used for holding list results.
12+
if sys.version_info >= (3, 11):
13+
from typing import Self
14+
else:
15+
from typing_extensions import Self
2316

24-
By default, listing APIs return a `list` of [`ObjectMeta`][obspec.ObjectMeta]. However
25-
for improved performance when listing large buckets, you can pass `return_arrow=True`.
26-
Then an Arrow `RecordBatch` will be returned instead.
27-
"""
17+
18+
ListChunkType_co = TypeVar("ListChunkType_co", covariant=True)
19+
"""The data structure used for holding list results."""
2820

2921

3022
class ListResult(TypedDict, Generic[ListChunkType_co]):
@@ -83,32 +75,13 @@ async def __anext__(self) -> ListChunkType_co:
8375

8476

8577
class List(Protocol):
86-
@overload
87-
def list(
88-
self,
89-
prefix: str | None = None,
90-
*,
91-
offset: str | None = None,
92-
chunk_size: int = 50,
93-
return_arrow: Literal[True],
94-
) -> ListIterator[ArrowArrayExportable]: ...
95-
@overload
9678
def list(
9779
self,
9880
prefix: str | None = None,
9981
*,
10082
offset: str | None = None,
10183
chunk_size: int = 50,
102-
return_arrow: Literal[False] = False,
103-
) -> ListIterator[Sequence[ObjectMeta]]: ...
104-
def list(
105-
self,
106-
prefix: str | None = None,
107-
*,
108-
offset: str | None = None,
109-
chunk_size: int = 50,
110-
return_arrow: bool = False,
111-
) -> ListIterator[ArrowArrayExportable] | ListIterator[Sequence[ObjectMeta]]:
84+
) -> ListIterator[Sequence[ObjectMeta]]:
11285
"""List all the objects with the given prefix.
11386
11487
Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of
@@ -134,28 +107,6 @@ def list(
134107
break
135108
```
136109
137-
Return large list results as [Arrow](https://arrow.apache.org/). This is most
138-
useful with large list operations. In this case you may want to increase the
139-
`chunk_size` parameter.
140-
141-
```py
142-
stream = obs.list(store, chunk_size=1000, return_arrow=True)
143-
# Stream is now an iterable/async iterable of `RecordBatch`es
144-
for batch in stream:
145-
print(batch.num_rows) # 100
146-
147-
# If desired, convert to a pyarrow RecordBatch (zero-copy) with
148-
# `pyarrow.record_batch(batch)`
149-
break
150-
```
151-
152-
Collect all list results into a single Arrow `RecordBatch`.
153-
154-
```py
155-
stream = obs.list(store, return_arrow=True)
156-
batch = stream.collect()
157-
```
158-
159110
!!! note
160111
The order of returned [`ObjectMeta`][obspec.ObjectMeta] is not
161112
guaranteed
@@ -169,10 +120,6 @@ def list(
169120
chunk_size: The number of items to collect per chunk in the returned
170121
(async) iterator. All chunks except for the last one will have this many
171122
items. This is ignored in [`collect`][obspec.ListIterator.collect].
172-
return_arrow: If `True`, return each batch of list items as an Arrow
173-
`RecordBatch`, not as a list of Python `dict`s. Arrow removes
174-
serialization overhead between Rust and Python and so this can be
175-
significantly faster for large list operations. Defaults to `False`.
176123
177124
Returns:
178125
A ListIterator, which you can iterate through to access list results.
@@ -182,32 +129,13 @@ def list(
182129

183130

184131
class ListAsync(Protocol):
185-
@overload
186-
def list_async(
187-
self,
188-
prefix: str | None = None,
189-
*,
190-
offset: str | None = None,
191-
chunk_size: int = 50,
192-
return_arrow: Literal[True],
193-
) -> ListStream[ArrowArrayExportable]: ...
194-
@overload
195-
def list_async(
196-
self,
197-
prefix: str | None = None,
198-
*,
199-
offset: str | None = None,
200-
chunk_size: int = 50,
201-
return_arrow: Literal[False] = False,
202-
) -> ListStream[Sequence[ObjectMeta]]: ...
203132
def list_async(
204133
self,
205134
prefix: str | None = None,
206135
*,
207136
offset: str | None = None,
208137
chunk_size: int = 50,
209-
return_arrow: bool = False,
210-
) -> ListStream[ArrowArrayExportable] | ListStream[Sequence[ObjectMeta]]:
138+
) -> ListStream[Sequence[ObjectMeta]]:
211139
"""List all the objects with the given prefix.
212140
213141
Note that this method itself is **not async**. It's a synchronous method but
@@ -241,10 +169,6 @@ def list_async(
241169
(async) iterator. All chunks except for the last one will have this many
242170
items. This is ignored in
243171
[`collect_async`][obspec.ListStream.collect_async].
244-
return_arrow: If `True`, return each batch of list items as an Arrow
245-
`RecordBatch`, not as a list of Python `dict`s. Arrow removes
246-
serialization overhead between Rust and Python and so this can be
247-
significantly faster for large list operations. Defaults to `False`.
248172
249173
Returns:
250174
A ListStream, which you can iterate through to access list results.
@@ -254,26 +178,10 @@ def list_async(
254178

255179

256180
class ListWithDelimiter(Protocol):
257-
@overload
258-
def list_with_delimiter(
259-
self,
260-
prefix: str | None = None,
261-
*,
262-
return_arrow: Literal[True],
263-
) -> ListResult[ArrowStreamExportable]: ...
264-
@overload
265-
def list_with_delimiter(
266-
self,
267-
prefix: str | None = None,
268-
*,
269-
return_arrow: Literal[False] = False,
270-
) -> ListResult[Sequence[ObjectMeta]]: ...
271181
def list_with_delimiter(
272182
self,
273183
prefix: str | None = None,
274-
*,
275-
return_arrow: bool = False,
276-
) -> ListResult[ArrowStreamExportable] | ListResult[Sequence[ObjectMeta]]:
184+
) -> ListResult[Sequence[ObjectMeta]]:
277185
"""List objects with the given prefix and an implementation specific
278186
delimiter.
279187
@@ -292,13 +200,6 @@ def list_with_delimiter(
292200
Args:
293201
prefix: The prefix within ObjectStore to use for listing. Defaults to None.
294202
295-
Keyword Args:
296-
return_arrow: If `True`, return list results as an Arrow
297-
`Table`, not as a list of Python `dict`s. Arrow removes serialization
298-
overhead between Rust and Python and so this can be significantly faster
299-
for large list operations. Defaults to `False`.
300-
301-
302203
Returns:
303204
ListResult
304205
@@ -307,26 +208,10 @@ def list_with_delimiter(
307208

308209

309210
class ListWithDelimiterAsync(Protocol):
310-
@overload
311-
async def list_with_delimiter_async(
312-
self,
313-
prefix: str | None = None,
314-
*,
315-
return_arrow: Literal[True],
316-
) -> ListResult[ArrowStreamExportable]: ...
317-
@overload
318211
async def list_with_delimiter_async(
319212
self,
320213
prefix: str | None = None,
321-
*,
322-
return_arrow: Literal[False] = False,
323-
) -> ListResult[Sequence[ObjectMeta]]: ...
324-
async def list_with_delimiter_async(
325-
self,
326-
prefix: str | None = None,
327-
*,
328-
return_arrow: bool = False,
329-
) -> ListResult[ArrowStreamExportable] | ListResult[Sequence[ObjectMeta]]:
214+
) -> ListResult[Sequence[ObjectMeta]]:
330215
"""Call `list_with_delimiter` asynchronously.
331216
332217
Refer to the documentation for

src/obspec/arrow.py

Lines changed: 0 additions & 55 deletions
This file was deleted.

0 commit comments

Comments
 (0)