From 596901800a271ab1f9dadfac6b2f0e211d269a10 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Thu, 23 Jan 2025 15:04:51 +0100 Subject: [PATCH 01/12] test deterministic memory store --- tests/test_store/test_memory.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/test_store/test_memory.py b/tests/test_store/test_memory.py index ba38889b52..02deb4db25 100644 --- a/tests/test_store/test_memory.py +++ b/tests/test_store/test_memory.py @@ -1,12 +1,18 @@ from __future__ import annotations +from typing import TYPE_CHECKING + import pytest +import zarr from zarr.core.buffer import Buffer, cpu, gpu from zarr.storage import GpuMemoryStore, MemoryStore from zarr.testing.store import StoreTests from zarr.testing.utils import gpu_test +if TYPE_CHECKING: + from zarr.core.common import ZarrFormat + class TestMemoryStore(StoreTests[MemoryStore, cpu.Buffer]): store_cls = MemoryStore @@ -46,6 +52,28 @@ def test_store_supports_partial_writes(self, store: MemoryStore) -> None: def test_list_prefix(self, store: MemoryStore) -> None: assert True + @pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) + @pytest.mark.parametrize("zarr_format", [2, 3]) + async def test_deterministic_size( + self, store: MemoryStore, dtype, zarr_format: ZarrFormat + ) -> None: + def padding_size() -> int: + a = zarr.empty( + store=store, + shape=(3,), + chunks=(1000,), + dtype=dtype, + zarr_format=zarr_format, + overwrite=True, + ) + a[...] = b"1" if dtype == "str" else 1 + key = "0" if zarr_format == 2 else "c/0" + return len(store._store_dict[key]) + + l1 = padding_size() + l2 = padding_size() + assert l1 == l2 + @gpu_test class TestGpuMemoryStore(StoreTests[GpuMemoryStore, gpu.Buffer]): From 167afe3e89f898bb77a2307b90f01268bf171b2c Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Thu, 23 Jan 2025 15:07:53 +0100 Subject: [PATCH 02/12] deterministic memory store --- src/zarr/core/buffer/cpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/buffer/cpu.py b/src/zarr/core/buffer/cpu.py index 5019075496..8aff2e5cfc 100644 --- a/src/zarr/core/buffer/cpu.py +++ b/src/zarr/core/buffer/cpu.py @@ -154,7 +154,7 @@ def create( order: Literal["C", "F"] = "C", fill_value: Any | None = None, ) -> Self: - ret = cls(np.empty(shape=tuple(shape), dtype=dtype, order=order)) + ret = cls(np.zeros(shape=tuple(shape), dtype=dtype, order=order)) if fill_value is not None: ret.fill(fill_value) return ret From b119ea9c86226f909e078d92400bd4ac88a653c4 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Thu, 23 Jan 2025 15:22:05 +0100 Subject: [PATCH 03/12] simplify test --- tests/test_store/test_memory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_store/test_memory.py b/tests/test_store/test_memory.py index 02deb4db25..cacec95e96 100644 --- a/tests/test_store/test_memory.py +++ b/tests/test_store/test_memory.py @@ -52,7 +52,7 @@ def test_store_supports_partial_writes(self, store: MemoryStore) -> None: def test_list_prefix(self, store: MemoryStore) -> None: assert True - @pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) + @pytest.mark.parametrize("dtype", ["uint8", "float32", "int64"]) @pytest.mark.parametrize("zarr_format", [2, 3]) async def test_deterministic_size( self, store: MemoryStore, dtype, zarr_format: ZarrFormat @@ -66,7 +66,7 @@ def padding_size() -> int: zarr_format=zarr_format, overwrite=True, ) - a[...] = b"1" if dtype == "str" else 1 + a[...] = 1 key = "0" if zarr_format == 2 else "c/0" return len(store._store_dict[key]) From ee8116641d9ddb7ef2d966a46300c962705e58f0 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Thu, 23 Jan 2025 16:04:01 +0100 Subject: [PATCH 04/12] document changes --- changes/2755.bugfix.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/2755.bugfix.rst diff --git a/changes/2755.bugfix.rst b/changes/2755.bugfix.rst new file mode 100644 index 0000000000..b8eb4b6efa --- /dev/null +++ b/changes/2755.bugfix.rst @@ -0,0 +1 @@ +Upon creation, an empty ``zarr.core.buffer.cpu.NDBuffer`` is filled with zeros to ensure deterministic behavior. \ No newline at end of file From 21d9dc700fe4229e9424cf55a2d30d279197f716 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Fri, 24 Jan 2025 11:22:37 +0100 Subject: [PATCH 05/12] Update src/zarr/core/buffer/cpu.py Co-authored-by: Joe Hamman --- src/zarr/core/buffer/cpu.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/zarr/core/buffer/cpu.py b/src/zarr/core/buffer/cpu.py index 8aff2e5cfc..6c4b1a3587 100644 --- a/src/zarr/core/buffer/cpu.py +++ b/src/zarr/core/buffer/cpu.py @@ -154,9 +154,7 @@ def create( order: Literal["C", "F"] = "C", fill_value: Any | None = None, ) -> Self: - ret = cls(np.zeros(shape=tuple(shape), dtype=dtype, order=order)) - if fill_value is not None: - ret.fill(fill_value) + ret = cls(np.full(shape=tuple(shape), fill_value=fill_value or 0, dtype=dtype, order=order)) return ret @classmethod From 943da8dcd0fc8581998d657c7714d38a7df1bd87 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Fri, 24 Jan 2025 11:25:36 +0100 Subject: [PATCH 06/12] lint --- src/zarr/core/buffer/cpu.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/buffer/cpu.py b/src/zarr/core/buffer/cpu.py index 6c4b1a3587..1342185950 100644 --- a/src/zarr/core/buffer/cpu.py +++ b/src/zarr/core/buffer/cpu.py @@ -154,8 +154,9 @@ def create( order: Literal["C", "F"] = "C", fill_value: Any | None = None, ) -> Self: - ret = cls(np.full(shape=tuple(shape), fill_value=fill_value or 0, dtype=dtype, order=order)) - return ret + return cls( + np.full(shape=tuple(shape), fill_value=fill_value or 0, dtype=dtype, order=order) + ) @classmethod def from_numpy_array(cls, array_like: npt.ArrayLike) -> Self: From 72b3b9a5d5808081b40dac2a573b0b584eb30dce Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Fri, 24 Jan 2025 14:13:57 +0100 Subject: [PATCH 07/12] handle fill_value==None --- src/zarr/core/buffer/cpu.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/buffer/cpu.py b/src/zarr/core/buffer/cpu.py index 1342185950..225adb6f5c 100644 --- a/src/zarr/core/buffer/cpu.py +++ b/src/zarr/core/buffer/cpu.py @@ -154,9 +154,10 @@ def create( order: Literal["C", "F"] = "C", fill_value: Any | None = None, ) -> Self: - return cls( - np.full(shape=tuple(shape), fill_value=fill_value or 0, dtype=dtype, order=order) - ) + if fill_value is None: + return cls(np.zeros(shape=tuple(shape), dtype=dtype, order=order)) + else: + return cls(np.full(shape=tuple(shape), fill_value=fill_value, dtype=dtype, order=order)) @classmethod def from_numpy_array(cls, array_like: npt.ArrayLike) -> Self: From e38789b6ebdb15012769fb286caa40fa2366bcb4 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Fri, 24 Jan 2025 14:31:33 +0100 Subject: [PATCH 08/12] better test --- tests/test_store/test_memory.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/tests/test_store/test_memory.py b/tests/test_store/test_memory.py index cacec95e96..f00d75a8f0 100644 --- a/tests/test_store/test_memory.py +++ b/tests/test_store/test_memory.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING +import numpy as np import pytest import zarr @@ -57,22 +58,19 @@ def test_list_prefix(self, store: MemoryStore) -> None: async def test_deterministic_size( self, store: MemoryStore, dtype, zarr_format: ZarrFormat ) -> None: - def padding_size() -> int: - a = zarr.empty( - store=store, - shape=(3,), - chunks=(1000,), - dtype=dtype, - zarr_format=zarr_format, - overwrite=True, - ) - a[...] = 1 - key = "0" if zarr_format == 2 else "c/0" - return len(store._store_dict[key]) - - l1 = padding_size() - l2 = padding_size() - assert l1 == l2 + a = zarr.empty( + store=store, + shape=(3,), + chunks=(1000,), + dtype=dtype, + zarr_format=zarr_format, + overwrite=True, + ) + a[...] = 1 + a.resize((1000,)) + + np.testing.assert_array_equal(a[:3], 1) + np.testing.assert_array_equal(a[3:], 0) @gpu_test From dc368121b2c68ea9bef7bd77285837e63c300f6a Mon Sep 17 00:00:00 2001 From: Hannes Spitz <44113112+brokkoli71@users.noreply.github.com> Date: Wed, 29 Jan 2025 11:38:20 +0100 Subject: [PATCH 09/12] improve changes documentation Co-authored-by: David Stansby --- changes/2755.bugfix.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/changes/2755.bugfix.rst b/changes/2755.bugfix.rst index b8eb4b6efa..a10ca0eb92 100644 --- a/changes/2755.bugfix.rst +++ b/changes/2755.bugfix.rst @@ -1 +1,2 @@ -Upon creation, an empty ``zarr.core.buffer.cpu.NDBuffer`` is filled with zeros to ensure deterministic behavior. \ No newline at end of file +Upon creation, an empty ``zarr.core.buffer.cpu.NDBuffer`` is filled with zeros to ensure deterministic behavior. +This fixes a bug where Zarr format 2 data with no fill value was written with un-predictable chunk sizes. \ No newline at end of file From 39a3540ec5f6c9eae547435117f9c94b43684066 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 29 Jan 2025 12:02:18 +0100 Subject: [PATCH 10/12] update docstrings --- src/zarr/api/asynchronous.py | 12 ++++-------- src/zarr/api/synchronous.py | 12 ++++-------- src/zarr/core/group.py | 17 ++++++++--------- 3 files changed, 16 insertions(+), 25 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 8eba4fc152..91431ad76b 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -1065,7 +1065,8 @@ async def create( async def empty( shape: ChunkCoords, **kwargs: Any ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: - """Create an empty array. + """Create an empty array with the specified shape. The contents will be filled with the + array's fill value or zeros if no fill value is provided. Parameters ---------- @@ -1073,12 +1074,6 @@ async def empty( Shape of the empty array. **kwargs Keyword arguments passed to :func:`zarr.api.asynchronous.create`. - - Notes - ----- - The contents of an empty Zarr array are not defined. On attempting to - retrieve data from an empty Zarr array, any values may be returned, - and these are not guaranteed to be stable from one access to the next. """ return await create(shape=shape, fill_value=None, **kwargs) @@ -1087,7 +1082,8 @@ async def empty( async def empty_like( a: ArrayLike, **kwargs: Any ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: - """Create an empty array like `a`. + """Create an empty array like `a`. The contents will be filled with the + array's fill value or zeros if no fill value is provided. Parameters ---------- diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index f8bee9fcef..c5e31ca452 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -895,7 +895,8 @@ def create_array( # TODO: add type annotations for kwargs def empty(shape: ChunkCoords, **kwargs: Any) -> Array: - """Create an empty array. + """Create an empty array with the specified shape. The contents will be filled with the + array's fill value or zeros if no fill value is provided. Parameters ---------- @@ -908,12 +909,6 @@ def empty(shape: ChunkCoords, **kwargs: Any) -> Array: ------- Array The new array. - - Notes - ----- - The contents of an empty Zarr array are not defined. On attempting to - retrieve data from an empty Zarr array, any values may be returned, - and these are not guaranteed to be stable from one access to the next. """ return Array(sync(async_api.empty(shape, **kwargs))) @@ -921,7 +916,8 @@ def empty(shape: ChunkCoords, **kwargs: Any) -> Array: # TODO: move ArrayLike to common module # TODO: add type annotations for kwargs def empty_like(a: ArrayLike, **kwargs: Any) -> Array: - """Create an empty array like another array. + """Create an empty array like another array. The contents will be filled with the + array's fill value or zeros if no fill value is provided. Parameters ---------- diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 4760923e0b..880ad8945d 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1498,7 +1498,8 @@ async def tree(self, expand: bool | None = None, level: int | None = None) -> An async def empty( self, *, name: str, shape: ChunkCoords, **kwargs: Any ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: - """Create an empty array in this Group. + """Create an empty array with the specified shape in this Group. The contents will + be filled with the array's fill value or zeros if no fill value is provided. Parameters ---------- @@ -1592,7 +1593,8 @@ async def full( async def empty_like( self, *, name: str, data: async_api.ArrayLike, **kwargs: Any ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: - """Create an empty sub-array like `data`. + """Create an empty sub-array like `data`. The contents will be filled with + the array's fill value or zeros if no fill value is provided. Parameters ---------- @@ -2442,7 +2444,8 @@ def require_array(self, name: str, *, shape: ShapeLike, **kwargs: Any) -> Array: @_deprecate_positional_args def empty(self, *, name: str, shape: ChunkCoords, **kwargs: Any) -> Array: - """Create an empty array in this Group. + """Create an empty array with the specified shape in this Group. The contents will be filled with + the array's fill value or zeros if no fill value is provided. Parameters ---------- @@ -2453,11 +2456,6 @@ def empty(self, *, name: str, shape: ChunkCoords, **kwargs: Any) -> Array: **kwargs Keyword arguments passed to :func:`zarr.api.asynchronous.create`. - Notes - ----- - The contents of an empty Zarr array are not defined. On attempting to - retrieve data from an empty Zarr array, any values may be returned, - and these are not guaranteed to be stable from one access to the next. """ return Array(self._sync(self._async_group.empty(name=name, shape=shape, **kwargs))) @@ -2531,7 +2529,8 @@ def full( @_deprecate_positional_args def empty_like(self, *, name: str, data: async_api.ArrayLike, **kwargs: Any) -> Array: - """Create an empty sub-array like `data`. + """Create an empty sub-array like `data`. The contents will be filled + with the array's fill value or zeros if no fill value is provided. Parameters ---------- From cf89d068736e07e6acce747fd46ff578666a1a47 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 29 Jan 2025 12:18:38 +0100 Subject: [PATCH 11/12] document changed `zarr.empty` --- changes/2755.bugfix.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/changes/2755.bugfix.rst b/changes/2755.bugfix.rst index a10ca0eb92..2555369544 100644 --- a/changes/2755.bugfix.rst +++ b/changes/2755.bugfix.rst @@ -1,2 +1,3 @@ -Upon creation, an empty ``zarr.core.buffer.cpu.NDBuffer`` is filled with zeros to ensure deterministic behavior. +The array returned by ``zarr.empty`` and an empty ``zarr.core.buffer.cpu.NDBuffer`` will now be filled with the +specified fill value, or with zeros if no fill value is provided. This fixes a bug where Zarr format 2 data with no fill value was written with un-predictable chunk sizes. \ No newline at end of file From 90f4f731198de1fc16447e8d5e4655b7dd03dabd Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 5 Feb 2025 11:41:01 +0100 Subject: [PATCH 12/12] add notes to empty() and empty_like() --- src/zarr/api/asynchronous.py | 12 ++++++++++++ src/zarr/api/synchronous.py | 12 ++++++++++++ src/zarr/core/group.py | 12 +++++++++++- 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 91431ad76b..0584f19c3f 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -1074,6 +1074,12 @@ async def empty( Shape of the empty array. **kwargs Keyword arguments passed to :func:`zarr.api.asynchronous.create`. + + Notes + ----- + The contents of an empty Zarr array are not defined. On attempting to + retrieve data from an empty Zarr array, any values may be returned, + and these are not guaranteed to be stable from one access to the next. """ return await create(shape=shape, fill_value=None, **kwargs) @@ -1096,6 +1102,12 @@ async def empty_like( ------- Array The new array. + + Notes + ----- + The contents of an empty Zarr array are not defined. On attempting to + retrieve data from an empty Zarr array, any values may be returned, + and these are not guaranteed to be stable from one access to the next. """ like_kwargs = _like_args(a, kwargs) return await empty(**like_kwargs) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index efeb65265f..fe68981cb9 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -916,6 +916,12 @@ def empty(shape: ChunkCoords, **kwargs: Any) -> Array: ------- Array The new array. + + Notes + ----- + The contents of an empty Zarr array are not defined. On attempting to + retrieve data from an empty Zarr array, any values may be returned, + and these are not guaranteed to be stable from one access to the next. """ return Array(sync(async_api.empty(shape, **kwargs))) @@ -937,6 +943,12 @@ def empty_like(a: ArrayLike, **kwargs: Any) -> Array: ------- Array The new array. + + Notes + ----- + The contents of an empty Zarr array are not defined. On attempting to + retrieve data from an empty Zarr array, any values may be returned, + and these are not guaranteed to be stable from one access to the next. """ return Array(sync(async_api.empty_like(a, **kwargs))) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 880ad8945d..1f5d57c0ab 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1516,7 +1516,6 @@ async def empty( retrieve data from an empty Zarr array, any values may be returned, and these are not guaranteed to be stable from one access to the next. """ - return await async_api.empty(shape=shape, store=self.store_path, path=name, **kwargs) async def zeros( @@ -2456,6 +2455,11 @@ def empty(self, *, name: str, shape: ChunkCoords, **kwargs: Any) -> Array: **kwargs Keyword arguments passed to :func:`zarr.api.asynchronous.create`. + Notes + ----- + The contents of an empty Zarr array are not defined. On attempting to + retrieve data from an empty Zarr array, any values may be returned, + and these are not guaranteed to be stable from one access to the next. """ return Array(self._sync(self._async_group.empty(name=name, shape=shape, **kwargs))) @@ -2545,6 +2549,12 @@ def empty_like(self, *, name: str, data: async_api.ArrayLike, **kwargs: Any) -> ------- Array The new array. + + Notes + ----- + The contents of an empty Zarr array are not defined. On attempting to + retrieve data from an empty Zarr array, any values may be returned, + and these are not guaranteed to be stable from one access to the next. """ return Array(self._sync(self._async_group.empty_like(name=name, data=data, **kwargs)))