
feat!: Remove Arrow from list protocol #26


Merged: 2 commits, May 21, 2025
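
For callers that previously passed `return_arrow=True`, the migration is to drop that argument and, if Arrow output is still wanted, convert the plain `ObjectMeta` chunks downstream. A minimal sketch, assuming a concrete implementation imported as `obs` and an already-constructed `store` (both illustrative, following the docstring examples in this diff), with the pyarrow conversion being a suggested workaround rather than part of this change:

```py
import pyarrow as pa

# list() now always yields chunks of ObjectMeta dicts; return_arrow is gone.
stream = obs.list(store, chunk_size=1000)

for chunk in stream:
    # Convert a chunk (a sequence of dicts) to Arrow yourself if needed.
    table = pa.Table.from_pylist(list(chunk))
    print(table.num_rows)
    break
```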
3 changes: 0 additions & 3 deletions docs/api/arrow.md

This file was deleted.

3 changes: 0 additions & 3 deletions mkdocs.yml
@@ -41,7 +41,6 @@ nav:
- api/put.md
- api/rename.md
- api/attributes.md
- api/arrow.md
- CHANGELOG.md

watch:
@@ -121,8 +120,6 @@ plugins:
signature_crossrefs: true

inventories:
- https://arrow.apache.org/docs/objects.inv
- https://docs.pola.rs/api/python/stable/objects.inv
- https://docs.python.org/3/objects.inv
- https://filesystem-spec.readthedocs.io/en/latest/objects.inv

147 changes: 16 additions & 131 deletions src/obspec/_list.py
@@ -1,30 +1,22 @@
from __future__ import annotations

import sys
from collections.abc import Sequence
from typing import Generic, Literal, Protocol, TypedDict, TypeVar, overload
from typing import TYPE_CHECKING, Generic, Protocol, TypedDict, TypeVar

from ._meta import ObjectMeta
from .arrow import ArrowArrayExportable, ArrowStreamExportable
if TYPE_CHECKING:
import sys
from collections.abc import Sequence

if sys.version_info >= (3, 11):
from typing import Self
else:
from typing_extensions import Self
from ._meta import ObjectMeta

ListChunkType_co = TypeVar(
"ListChunkType_co",
Sequence[ObjectMeta],
ArrowArrayExportable,
ArrowStreamExportable,
covariant=True,
)
"""The data structure used for holding list results.
if sys.version_info >= (3, 11):
from typing import Self
else:
from typing_extensions import Self

By default, listing APIs return a `list` of [`ObjectMeta`][obspec.ObjectMeta]. However
for improved performance when listing large buckets, you can pass `return_arrow=True`.
Then an Arrow `RecordBatch` will be returned instead.
"""

ListChunkType_co = TypeVar("ListChunkType_co", covariant=True)
"""The data structure used for holding list results."""


class ListResult(TypedDict, Generic[ListChunkType_co]):
@@ -83,32 +75,13 @@ async def __anext__(self) -> ListChunkType_co:


class List(Protocol):
@overload
def list(
self,
prefix: str | None = None,
*,
offset: str | None = None,
chunk_size: int = 50,
return_arrow: Literal[True],
) -> ListIterator[ArrowArrayExportable]: ...
@overload
def list(
self,
prefix: str | None = None,
*,
offset: str | None = None,
chunk_size: int = 50,
return_arrow: Literal[False] = False,
) -> ListIterator[Sequence[ObjectMeta]]: ...
def list(
self,
prefix: str | None = None,
*,
offset: str | None = None,
chunk_size: int = 50,
return_arrow: bool = False,
) -> ListIterator[ArrowArrayExportable] | ListIterator[Sequence[ObjectMeta]]:
) -> ListIterator[Sequence[ObjectMeta]]:
"""List all the objects with the given prefix.

Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of
@@ -134,28 +107,6 @@ def list(
break
```

Return large list results as [Arrow](https://arrow.apache.org/). This is most
useful with large list operations. In this case you may want to increase the
`chunk_size` parameter.

```py
stream = obs.list(store, chunk_size=1000, return_arrow=True)
# Stream is now an iterable/async iterable of `RecordBatch`es
for batch in stream:
print(batch.num_rows) # 100

# If desired, convert to a pyarrow RecordBatch (zero-copy) with
# `pyarrow.record_batch(batch)`
break
```

Collect all list results into a single Arrow `RecordBatch`.

```py
stream = obs.list(store, return_arrow=True)
batch = stream.collect()
```

!!! note
The order of returned [`ObjectMeta`][obspec.ObjectMeta] is not
guaranteed
@@ -169,10 +120,6 @@ def list(
chunk_size: The number of items to collect per chunk in the returned
(async) iterator. All chunks except for the last one will have this many
items. This is ignored in [`collect`][obspec.ListIterator.collect].
return_arrow: If `True`, return each batch of list items as an Arrow
`RecordBatch`, not as a list of Python `dict`s. Arrow removes
serialization overhead between Rust and Python and so this can be
significantly faster for large list operations. Defaults to `False`.

Returns:
A ListStream, which you can iterate through to access list results.
@@ -182,32 +129,13 @@ def list(


class ListAsync(Protocol):
@overload
def list_async(
self,
prefix: str | None = None,
*,
offset: str | None = None,
chunk_size: int = 50,
return_arrow: Literal[True],
) -> ListStream[ArrowArrayExportable]: ...
@overload
def list_async(
self,
prefix: str | None = None,
*,
offset: str | None = None,
chunk_size: int = 50,
return_arrow: Literal[False] = False,
) -> ListStream[Sequence[ObjectMeta]]: ...
def list_async(
self,
prefix: str | None = None,
*,
offset: str | None = None,
chunk_size: int = 50,
return_arrow: bool = False,
) -> ListStream[ArrowArrayExportable] | ListStream[Sequence[ObjectMeta]]:
) -> ListStream[Sequence[ObjectMeta]]:
"""List all the objects with the given prefix.

Note that this method itself is **not async**. It's a synchronous method but
@@ -241,10 +169,6 @@ def list_async(
(async) iterator. All chunks except for the last one will have this many
items. This is ignored in
[`collect_async`][obspec.ListStream.collect_async].
return_arrow: If `True`, return each batch of list items as an Arrow
`RecordBatch`, not as a list of Python `dict`s. Arrow removes
serialization overhead between Rust and Python and so this can be
significantly faster for large list operations. Defaults to `False`.

Returns:
A ListStream, which you can iterate through to access list results.
@@ -254,26 +178,10 @@ def list_async(


class ListWithDelimiter(Protocol):
@overload
def list_with_delimiter(
self,
prefix: str | None = None,
*,
return_arrow: Literal[True],
) -> ListResult[ArrowStreamExportable]: ...
@overload
def list_with_delimiter(
self,
prefix: str | None = None,
*,
return_arrow: Literal[False] = False,
) -> ListResult[Sequence[ObjectMeta]]: ...
def list_with_delimiter(
self,
prefix: str | None = None,
*,
return_arrow: bool = False,
) -> ListResult[ArrowStreamExportable] | ListResult[Sequence[ObjectMeta]]:
) -> ListResult[Sequence[ObjectMeta]]:
"""List objects with the given prefix and an implementation specific
delimiter.

@@ -292,13 +200,6 @@ def list_with_delimiter(
Args:
prefix: The prefix within ObjectStore to use for listing. Defaults to None.

Keyword Args:
return_arrow: If `True`, return list results as an Arrow
`Table`, not as a list of Python `dict`s. Arrow removes serialization
overhead between Rust and Python and so this can be significantly faster
for large list operations. Defaults to `False`.


Returns:
ListResult

@@ -307,26 +208,10 @@ def list_with_delimiter(


class ListWithDelimiterAsync(Protocol):
@overload
async def list_with_delimiter_async(
self,
prefix: str | None = None,
*,
return_arrow: Literal[True],
) -> ListResult[ArrowStreamExportable]: ...
@overload
async def list_with_delimiter_async(
self,
prefix: str | None = None,
*,
return_arrow: Literal[False] = False,
) -> ListResult[Sequence[ObjectMeta]]: ...
async def list_with_delimiter_async(
self,
prefix: str | None = None,
*,
return_arrow: bool = False,
) -> ListResult[ArrowStreamExportable] | ListResult[Sequence[ObjectMeta]]:
) -> ListResult[Sequence[ObjectMeta]]:
"""Call `list_with_delimiter` asynchronously.

Refer to the documentation for
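
With the Arrow overloads removed, code written against the `List` protocol only has the `Sequence[ObjectMeta]` chunk shape to handle. A small sketch of a consumer typed against the protocol, assuming `List` is importable from the package root as the docstring cross-references (`obspec.ObjectMeta`, `obspec.ListIterator`) suggest:

```py
from __future__ import annotations

from obspec import List


def count_objects(client: List, prefix: str | None = None) -> int:
    """Count objects under a prefix using the simplified list protocol."""
    total = 0
    for chunk in client.list(prefix, chunk_size=500):
        # Each chunk is a Sequence[ObjectMeta] (plain Python dicts).
        total += len(chunk)
    return total
```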
55 changes: 0 additions & 55 deletions src/obspec/arrow.py

This file was deleted.
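
For reference, the deleted module defined the `ArrowArrayExportable` and `ArrowStreamExportable` protocols that the list methods previously returned (see the old import in `_list.py` above). A rough reconstruction based on the Arrow PyCapsule Interface, not the exact removed code:

```py
from typing import Protocol


class ArrowArrayExportable(Protocol):
    """An object exporting Arrow data via the Arrow PyCapsule Interface."""

    def __arrow_c_array__(self, requested_schema=None) -> tuple[object, object]:
        """Return (ArrowSchema, ArrowArray) PyCapsules."""
        ...


class ArrowStreamExportable(Protocol):
    """An object exporting an Arrow stream via the Arrow PyCapsule Interface."""

    def __arrow_c_stream__(self, requested_schema=None) -> object:
        """Return an ArrowArrayStream PyCapsule."""
        ...
```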
