diff --git a/docs/api/arrow.md b/docs/api/arrow.md
deleted file mode 100644
index 1535ee5..0000000
--- a/docs/api/arrow.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Arrow
-
-::: obspec.arrow
diff --git a/mkdocs.yml b/mkdocs.yml
index 7ac3ae8..076265a 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -41,7 +41,6 @@ nav:
       - api/put.md
       - api/rename.md
       - api/attributes.md
-      - api/arrow.md
   - CHANGELOG.md
 
 watch:
@@ -121,8 +120,6 @@ plugins:
            signature_crossrefs: true
          inventories:
-            - https://arrow.apache.org/docs/objects.inv
-            - https://docs.pola.rs/api/python/stable/objects.inv
            - https://docs.python.org/3/objects.inv
            - https://filesystem-spec.readthedocs.io/en/latest/objects.inv
diff --git a/src/obspec/_list.py b/src/obspec/_list.py
index 04e2d9b..89e501a 100644
--- a/src/obspec/_list.py
+++ b/src/obspec/_list.py
@@ -1,30 +1,22 @@
 from __future__ import annotations
 
-import sys
-from collections.abc import Sequence
-from typing import Generic, Literal, Protocol, TypedDict, TypeVar, overload
+from typing import TYPE_CHECKING, Generic, Protocol, TypedDict, TypeVar
 
-from ._meta import ObjectMeta
-from .arrow import ArrowArrayExportable, ArrowStreamExportable
+if TYPE_CHECKING:
+    import sys
+    from collections.abc import Sequence
 
-if sys.version_info >= (3, 11):
-    from typing import Self
-else:
-    from typing_extensions import Self
+    from ._meta import ObjectMeta
 
-ListChunkType_co = TypeVar(
-    "ListChunkType_co",
-    Sequence[ObjectMeta],
-    ArrowArrayExportable,
-    ArrowStreamExportable,
-    covariant=True,
-)
-"""The data structure used for holding list results.
+    if sys.version_info >= (3, 11):
+        from typing import Self
+    else:
+        from typing_extensions import Self
 
-By default, listing APIs return a `list` of [`ObjectMeta`][obspec.ObjectMeta]. However
-for improved performance when listing large buckets, you can pass `return_arrow=True`.
-Then an Arrow `RecordBatch` will be returned instead.
-"""
+
+ListChunkType_co = TypeVar("ListChunkType_co", covariant=True)
+"""The data structure used for holding list results."""
 
 
 class ListResult(TypedDict, Generic[ListChunkType_co]):
@@ -83,32 +75,13 @@ async def __anext__(self) -> ListChunkType_co:
 
 
 class List(Protocol):
-    @overload
-    def list(
-        self,
-        prefix: str | None = None,
-        *,
-        offset: str | None = None,
-        chunk_size: int = 50,
-        return_arrow: Literal[True],
-    ) -> ListIterator[ArrowArrayExportable]: ...
-    @overload
     def list(
         self,
         prefix: str | None = None,
         *,
         offset: str | None = None,
         chunk_size: int = 50,
-        return_arrow: Literal[False] = False,
-    ) -> ListIterator[Sequence[ObjectMeta]]: ...
-    def list(
-        self,
-        prefix: str | None = None,
-        *,
-        offset: str | None = None,
-        chunk_size: int = 50,
-        return_arrow: bool = False,
-    ) -> ListIterator[ArrowArrayExportable] | ListIterator[Sequence[ObjectMeta]]:
+    ) -> ListIterator[Sequence[ObjectMeta]]:
         """List all the objects with the given prefix.
 
         Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of
@@ -134,28 +107,6 @@ def list(
             break
         ```
 
-        Return large list results as [Arrow](https://arrow.apache.org/). This is most
-        useful with large list operations. In this case you may want to increase the
-        `chunk_size` parameter.
-
-        ```py
-        stream = obs.list(store, chunk_size=1000, return_arrow=True)
-        # Stream is now an iterable/async iterable of `RecordBatch`es
-        for batch in stream:
-            print(batch.num_rows)  # 100
-
-            # If desired, convert to a pyarrow RecordBatch (zero-copy) with
-            # `pyarrow.record_batch(batch)`
-            break
-        ```
-
-        Collect all list results into a single Arrow `RecordBatch`.
-
-        ```py
-        stream = obs.list(store, return_arrow=True)
-        batch = stream.collect()
-        ```
-
         !!! note
             The order of returned [`ObjectMeta`][obspec.ObjectMeta] is not guaranteed
@@ -169,10 +120,6 @@ def list(
             chunk_size: The number of items to collect per chunk in the returned
                 (async) iterator. All chunks except for the last one will have this
                 many items. This is ignored in [`collect`][obspec.ListIterator.collect].
-            return_arrow: If `True`, return each batch of list items as an Arrow
-                `RecordBatch`, not as a list of Python `dict`s. Arrow removes
-                serialization overhead between Rust and Python and so this can be
-                significantly faster for large list operations. Defaults to `False`.
 
         Returns:
             A ListStream, which you can iterate through to access list results.
@@ -182,32 +129,13 @@ class ListAsync(Protocol):
-    @overload
-    def list_async(
-        self,
-        prefix: str | None = None,
-        *,
-        offset: str | None = None,
-        chunk_size: int = 50,
-        return_arrow: Literal[True],
-    ) -> ListStream[ArrowArrayExportable]: ...
-    @overload
-    def list_async(
-        self,
-        prefix: str | None = None,
-        *,
-        offset: str | None = None,
-        chunk_size: int = 50,
-        return_arrow: Literal[False] = False,
-    ) -> ListStream[Sequence[ObjectMeta]]: ...
     def list_async(
         self,
         prefix: str | None = None,
         *,
         offset: str | None = None,
         chunk_size: int = 50,
-        return_arrow: bool = False,
-    ) -> ListStream[ArrowArrayExportable] | ListStream[Sequence[ObjectMeta]]:
+    ) -> ListStream[Sequence[ObjectMeta]]:
         """List all the objects with the given prefix.
 
         Note that this method itself is **not async**. It's a synchronous method but
@@ -241,10 +169,6 @@ def list_async(
                 (async) iterator. All chunks except for the last one will have this
                 many items. This is ignored in
                 [`collect_async`][obspec.ListStream.collect_async].
-            return_arrow: If `True`, return each batch of list items as an Arrow
-                `RecordBatch`, not as a list of Python `dict`s. Arrow removes
-                serialization overhead between Rust and Python and so this can be
-                significantly faster for large list operations. Defaults to `False`.
 
         Returns:
             A ListStream, which you can iterate through to access list results.
@@ -254,26 +178,10 @@ class ListWithDelimiter(Protocol):
-    @overload
-    def list_with_delimiter(
-        self,
-        prefix: str | None = None,
-        *,
-        return_arrow: Literal[True],
-    ) -> ListResult[ArrowStreamExportable]: ...
-    @overload
-    def list_with_delimiter(
-        self,
-        prefix: str | None = None,
-        *,
-        return_arrow: Literal[False] = False,
-    ) -> ListResult[Sequence[ObjectMeta]]: ...
     def list_with_delimiter(
         self,
         prefix: str | None = None,
-        *,
-        return_arrow: bool = False,
-    ) -> ListResult[ArrowStreamExportable] | ListResult[Sequence[ObjectMeta]]:
+    ) -> ListResult[Sequence[ObjectMeta]]:
         """List objects with the given prefix and an implementation specific
         delimiter.
 
         Returns common prefixes (directories) in addition to object metadata.
@@ -292,13 +200,6 @@ def list_with_delimiter(
         Args:
             prefix: The prefix within ObjectStore to use for listing. Defaults to None.
 
-        Keyword Args:
-            return_arrow: If `True`, return list results as an Arrow
-                `Table`, not as a list of Python `dict`s. Arrow removes serialization
-                overhead between Rust and Python and so this can be significantly faster
-                for large list operations. Defaults to `False`.
-
-
         Returns:
             ListResult
@@ -307,26 +208,10 @@ class ListWithDelimiterAsync(Protocol):
-    @overload
-    async def list_with_delimiter_async(
-        self,
-        prefix: str | None = None,
-        *,
-        return_arrow: Literal[True],
-    ) -> ListResult[ArrowStreamExportable]: ...
-    @overload
     async def list_with_delimiter_async(
         self,
         prefix: str | None = None,
-        *,
-        return_arrow: Literal[False] = False,
-    ) -> ListResult[Sequence[ObjectMeta]]: ...
-    async def list_with_delimiter_async(
-        self,
-        prefix: str | None = None,
-        *,
-        return_arrow: bool = False,
-    ) -> ListResult[ArrowStreamExportable] | ListResult[Sequence[ObjectMeta]]:
+    ) -> ListResult[Sequence[ObjectMeta]]:
         """Call `list_with_delimiter` asynchronously.
 
         Refer to the documentation for
diff --git a/src/obspec/arrow.py b/src/obspec/arrow.py
deleted file mode 100644
index 3cdf2e2..0000000
--- a/src/obspec/arrow.py
+++ /dev/null
@@ -1,55 +0,0 @@
-"""Arrow protocol type hints for use in [list][obspec.List] calls."""
-
-from __future__ import annotations
-
-from typing import Protocol
-
-
-class ArrowArrayExportable(Protocol):
-    """An object with an `__arrow_c_array__` method.
-
-    Supported objects include:
-
-    - arro3 `Array` or `RecordBatch` objects.
-    - pyarrow `Array` or `RecordBatch` objects
-
-    Such an object implements the [Arrow C Data Interface
-    interface](https://arrow.apache.org/docs/format/CDataInterface.html) via the
-    [Arrow PyCapsule
-    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
-    This allows for zero-copy Arrow data interchange across libraries.
-    """
-
-    def __arrow_c_array__(
-        self,
-        requested_schema: object | None = None,
-    ) -> tuple[object, object]:
-        """Return Arrow C data interface PyCapsules for the object."""
-        ...
-
-
-class ArrowStreamExportable(Protocol):
-    """An object with an `__arrow_c_stream__` method.
-
-    Supported objects include:
-
-    - arro3 `Table`, `RecordBatchReader`, `ChunkedArray`, or `ArrayReader` objects.
-    - Polars `Series` or `DataFrame` objects (polars v1.2 or higher)
-    - pyarrow `RecordBatchReader`, `Table`, or `ChunkedArray` objects (pyarrow v14 or
-      higher)
-    - pandas `DataFrame`s (pandas v2.2 or higher)
-    - ibis `Table` objects.
-
-    For an up to date list of supported objects, see [this
-    issue](https://github.com/apache/arrow/issues/39195#issuecomment-2245718008).
-
-    Such an object implements the [Arrow C Stream
-    interface](https://arrow.apache.org/docs/format/CStreamInterface.html) via the
-    [Arrow PyCapsule
-    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
-    This allows for zero-copy Arrow data interchange across libraries.
-    """
-
-    def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
-        """Return an Arrow C stream interface PyCapsule for the object."""
-        ...
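With this change, every obspec listing API resolves to plain Python results: `list` and `list_async` yield chunks typed as `Sequence[ObjectMeta]`, and the `list_with_delimiter` variants return `ListResult[Sequence[ObjectMeta]]`. Below is a minimal sketch of consuming the simplified API; it mirrors the `obs.list(store, ...)` style of the docstring examples above, and assumes `store` is some concrete implementation of these protocols (for example an obstore store), which this diff does not provide.

```py
# Sketch only: `obs` and `store` are assumed here, matching the docstring
# examples above; any object satisfying the obspec.List protocol behaves
# the same way.
stream = obs.list(store, prefix="data/", chunk_size=100)

for chunk in stream:
    # Each chunk is a Sequence[ObjectMeta] (plain Python dicts); the Arrow
    # fast path no longer exists in obspec.
    for meta in chunk:
        print(meta["path"], meta["size"])
    break

# Alternatively, gather every remaining result into one flat sequence.
# chunk_size is ignored by collect().
all_objects = obs.list(store).collect()
```

Callers that still want Arrow output can build it themselves from the returned `ObjectMeta` dicts (for example with `pyarrow.RecordBatch.from_pylist`), since the `return_arrow` option and the `ArrowArrayExportable`/`ArrowStreamExportable` protocols are removed from obspec itself.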