Make threads the default executor (#621)
* Make `threads` the default executor

* Fix chunking in store tests and run on multiple executors

* Use single-threaded executor in test_default_spec_config_override to avoid memory error

* Don't measure peak mem with threads executor
tomwhite authored Nov 25, 2024
1 parent 4d79a26 commit 003bf92
Showing 6 changed files with 60 additions and 23 deletions.
4 changes: 2 additions & 2 deletions cubed/core/array.py
@@ -274,9 +274,9 @@ def compute(
     if executor is None:
         executor = arrays[0].spec.executor
         if executor is None:
-            from cubed.runtime.executors.local import SingleThreadedExecutor
+            from cubed.runtime.executors.local import ThreadsExecutor

-            executor = SingleThreadedExecutor()
+            executor = ThreadsExecutor()

    _return_in_memory_array = kwargs.pop("_return_in_memory_array", True)
    plan.execute(
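The practical effect of this hunk, as a minimal sketch (the array below is illustrative, not from the commit): with no executor passed to `compute` and none set on the `Spec`, Cubed now falls back to the multi-threaded local executor, while the previous default remains available explicitly.

```python
import cubed.array_api as xp
from cubed.runtime.executors.local import SingleThreadedExecutor

a = xp.ones((4, 4), chunks=(2, 2))

# No executor argument and none on the Spec: falls back to ThreadsExecutor.
a.compute()

# The old default can still be requested explicitly.
a.compute(executor=SingleThreadedExecutor())
```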
20 changes: 16 additions & 4 deletions cubed/runtime/executors/local.py
@@ -18,6 +18,7 @@
 from cubed.runtime.types import Callback, CubedPipeline, DagExecutor, TaskEndEvent
 from cubed.runtime.utils import (
     execution_stats,
+    execution_timing,
     handle_callbacks,
     handle_operation_start_callbacks,
     profile_memray,
@@ -61,9 +62,14 @@ def execute_dag(
             [callback.on_task_end(event) for callback in callbacks]


+@execution_timing
+def run_func_threads(input, func=None, config=None, name=None, compute_id=None):
+    return func(input, config=config)
+
+
 @profile_memray
 @execution_stats
-def run_func(input, func=None, config=None, name=None, compute_id=None):
+def run_func_processes(input, func=None, config=None, name=None, compute_id=None):
     return func(input, config=config)


@@ -142,7 +148,11 @@ def create_futures_func_multiprocessing(input, **kwargs):


 def pipeline_to_stream(
-    concurrent_executor: Executor, name: str, pipeline: CubedPipeline, **kwargs
+    concurrent_executor: Executor,
+    run_func: Callable,
+    name: str,
+    pipeline: CubedPipeline,
+    **kwargs,
 ) -> Stream:
     return stream.iterate(
         map_unordered(
@@ -200,15 +210,17 @@ async def async_execute_dag(
                 mp_context=context,
                 max_tasks_per_child=max_tasks_per_child,
             )
+            run_func = run_func_processes
         else:
             concurrent_executor = ThreadPoolExecutor(max_workers=max_workers)
+            run_func = run_func_threads
         try:
             if not compute_arrays_in_parallel:
                 # run one pipeline at a time
                 for name, node in visit_nodes(dag, resume=resume):
                     handle_operation_start_callbacks(callbacks, name)
                     st = pipeline_to_stream(
-                        concurrent_executor, name, node["pipeline"], **kwargs
+                        concurrent_executor, run_func, name, node["pipeline"], **kwargs
                     )
                     async with st.stream() as streamer:
                         async for _, stats in streamer:
@@ -218,7 +230,7 @@
                 # run pipelines in the same topological generation in parallel by merging their streams
                 streams = [
                     pipeline_to_stream(
-                        concurrent_executor, name, node["pipeline"], **kwargs
+                        concurrent_executor, run_func, name, node["pipeline"], **kwargs
                     )
                     for name, node in gen
                 ]
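The hunks above split the old `run_func` into per-executor variants, so that memray profiling and peak-memory stats apply only on the processes path, and thread the chosen function through `pipeline_to_stream`. A standalone sketch of the same dispatch pattern, simplified and not Cubed's actual code:

```python
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor


def run_task_threads(x):
    # threads path: timing-only instrumentation; per-task peak memory
    # is not meaningful when many tasks share one process
    return x * 2


def run_task_processes(x):
    # processes path: would additionally carry per-task memory profiling
    return x * 2


def make_pool(use_processes: bool, max_workers: int = 4):
    # choose the pool and the matching task wrapper together, mirroring
    # how async_execute_dag selects run_func in the diff above
    if use_processes:
        return ProcessPoolExecutor(max_workers=max_workers), run_task_processes
    return ThreadPoolExecutor(max_workers=max_workers), run_task_threads


if __name__ == "__main__":
    pool, run = make_pool(use_processes=False)
    with pool:
        print(list(pool.map(run, range(4))))  # [0, 2, 4, 6]
```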
21 changes: 21 additions & 0 deletions cubed/runtime/utils.py
@@ -40,12 +40,33 @@ def execute_with_stats(function, *args, **kwargs):
     )


+def execute_with_timing(function, *args, **kwargs):
+    """Invoke function and measure timing information.
+    Returns the result of the function call and a stats dictionary.
+    """
+
+    function_start_tstamp = time.time()
+    result = function(*args, **kwargs)
+    function_end_tstamp = time.time()
+    return result, dict(
+        function_start_tstamp=function_start_tstamp,
+        function_end_tstamp=function_end_tstamp,
+    )
+
+
 def execution_stats(func):
     """Decorator to measure timing information and peak memory usage of a function call."""

     return partial(execute_with_stats, func)


+def execution_timing(func):
+    """Decorator to measure timing information of a function call."""
+
+    return partial(execute_with_timing, func)
+
+
 def execute_with_memray(function, input, **kwargs):
     # only run memray if installed, and only for first input (for operations that run on block locations)
     if (
def execute_with_memray(function, input, **kwargs):
# only run memray if installed, and only for first input (for operations that run on block locations)
if (
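A quick usage sketch of the new decorator (the function `double` is hypothetical): like `execution_stats`, the decorated call returns a `(result, stats)` pair, but the stats carry timestamps only, with no memory measurements.

```python
from cubed.runtime.utils import execution_timing


@execution_timing
def double(x):
    return x * 2


result, stats = double(21)
assert result == 42
assert "function_start_tstamp" in stats
assert "function_end_tstamp" in stats
```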
26 changes: 14 additions & 12 deletions cubed/tests/test_core.py
@@ -127,43 +127,43 @@ def test_from_zarr(tmp_path, spec, executor, path):
 )


-def test_store(tmp_path, spec):
+def test_store(tmp_path, spec, executor):
     a = xp.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]], chunks=(2, 2), spec=spec)

     store = tmp_path / "source.zarr"
-    target = zarr.empty(a.shape, store=store)
+    target = zarr.empty(a.shape, chunks=a.chunksize, store=store)

-    cubed.store(a, target)
+    cubed.store(a, target, executor=executor)
     assert_array_equal(target[:], np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]))


-def test_store_multiple(tmp_path, spec):
+def test_store_multiple(tmp_path, spec, executor):
     a = xp.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]], chunks=(2, 2), spec=spec)
     b = xp.asarray([[1, 1, 1], [1, 1, 1], [1, 1, 1]], chunks=(2, 2), spec=spec)

     store1 = tmp_path / "source1.zarr"
-    target1 = zarr.empty(a.shape, store=store1)
+    target1 = zarr.empty(a.shape, chunks=a.chunksize, store=store1)
     store2 = tmp_path / "source2.zarr"
-    target2 = zarr.empty(b.shape, store=store2)
+    target2 = zarr.empty(b.shape, chunks=b.chunksize, store=store2)

-    cubed.store([a, b], [target1, target2])
+    cubed.store([a, b], [target1, target2], executor=executor)
     assert_array_equal(target1[:], np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]))
     assert_array_equal(target2[:], np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]]))


-def test_store_fails(tmp_path, spec):
+def test_store_fails(tmp_path, spec, executor):
     a = xp.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]], chunks=(2, 2), spec=spec)
     b = xp.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]], chunks=(2, 2), spec=spec)
     store = tmp_path / "source.zarr"
-    target = zarr.empty(a.shape, store=store)
+    target = zarr.empty(a.shape, chunks=a.chunksize, store=store)

     with pytest.raises(
         ValueError, match=r"Different number of sources \(2\) and targets \(1\)"
     ):
-        cubed.store([a, b], [target])
+        cubed.store([a, b], [target], executor=executor)

     with pytest.raises(ValueError, match="All sources must be cubed array objects"):
-        cubed.store([1], [target])
+        cubed.store([1], [target], executor=executor)


 @pytest.mark.parametrize("path", [None, "sub", "sub/group"])
@@ -370,7 +370,9 @@ def test_default_spec_config_override():
     # override default spec to increase allowed_mem
     from cubed import config

-    with config.set({"spec.allowed_mem": "4GB"}):
+    with config.set(
+        {"spec.allowed_mem": "4GB", "spec.executor_name": "single-threaded"}
+    ):
         a = xp.ones((20000, 10000), chunks=(10000, 10000))
         b = xp.negative(a)
         assert_array_equal(b.compute(), -np.ones((20000, 10000)))
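Why the `chunks=a.chunksize` fix matters, as a hedged illustration: if the target's chunk grid is left to Zarr's defaults it may not match the source's, so two tasks on a parallel executor could write into the same target chunk concurrently. Aligning the grids gives each store task exclusive ownership of the chunks it writes.

```python
import zarr

# Zarr may default to a single (3, 3) chunk for a tiny array; aligning
# with the cubed array's (2, 2) chunks avoids two tasks writing one chunk.
target = zarr.empty((3, 3), chunks=(2, 2), dtype="f8")
print(target.chunks)  # (2, 2)
```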
6 changes: 3 additions & 3 deletions docs/configuration.md
@@ -95,9 +95,9 @@ These properties can be passed directly to the {py:class}`Spec <cubed.Spec>` con
 | Property | Default | Description |
 |--------------------|-------------------|-------------|
 | `work_dir` | `None` | The directory path (specified as an fsspec URL) used for storing intermediate data. If not set, the user's temporary directory is used. |
-| `allowed_mem` | `"2GB"` | The total memory available to a worker for running a task. This includes any `reserved_mem` that has been set. |
-| `reserved_mem` | `"100MB"` | The memory reserved on a worker for non-data use when running a task |
-| `executor_name` | `"single-threaded"` | The executor for running computations. One of `"single-threaded"`, `"threads"`, `"processes"`, `"beam"`, `"coiled"`, `"dask"`, `"lithops"`, `"modal"`. |
+| `allowed_mem` | `"2GB"` | The total memory available to a worker for running a task. This includes any `reserved_mem` that has been set. |
+| `reserved_mem` | `"100MB"` | The memory reserved on a worker for non-data use when running a task |
+| `executor_name` | `"threads"` | The executor for running computations. One of `"single-threaded"`, `"threads"`, `"processes"`, `"beam"`, `"coiled"`, `"dask"`, `"lithops"`, `"modal"`. |
 | `executor_options` | `None` | Options to pass to the executor on construction. See below for possible options for each executor. |
 | `zarr_compressor` | `"default"` | The compressor used by Zarr for intermediate data. If not specified, or set to `"default"`, Zarr will use the default Blosc compressor. If set to `None`, compression is disabled, which can be a good option when using local storage. Use a dictionary (or nested YAML) to configure arbitrary compression using Numcodecs. |

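For reference, a hedged YAML sketch of overriding the new default via configuration (the layout follows the `spec.*` keys above; the values are illustrative):

```yaml
spec:
  allowed_mem: "2GB"
  executor_name: "processes"  # override the new "threads" default
```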
6 changes: 4 additions & 2 deletions docs/user-guide/executors.md
@@ -6,9 +6,11 @@ Cubed provides a variety of executors for running the tasks in a computation, wh

 ## Local single-machine executors

-If you don't specify an executor then the local in-process single-threaded Python executor is used. This is a very simple executor (called `single-threaded`) that is intended for testing on small amounts of data before running larger computations using the `processes` executor on a single machine, or a distributed executor in the cloud.
+If you don't specify an executor then the local in-process multi-threaded Python executor is used by default. This is called the `threads` executor. It doesn't require any setup, so it is useful for quickly getting started and for running on datasets that don't fit in memory but can fit on a single machine's disk.

-The `processes` executor runs on a single machine, and uses all the cores on the machine. It doesn't require any set up so it is useful for quickly getting started and running on datasets that don't fit in memory, but can fit on a single machine's disk.
+The `processes` executor also runs on a single machine, and uses all the cores on the machine. However, unlike the `threads` executor, each task runs in a separate process, which avoids GIL contention but adds some overhead in process startup time and communication. Typically, running with `processes` is more performant than `threads`, but it is worth trying both on your workload to see which is best.
+
+There is a third local executor called `single-threaded` that runs tasks sequentially in a single thread, and is intended for testing on small amounts of data.

 ## Which cloud service executor should I use?
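A sketch of the "try both" advice above, assuming the `Spec` constructor accepts these properties as described in the configuration docs (array sizes and timing harness are illustrative):

```python
import time

import cubed
import cubed.array_api as xp

for name in ("threads", "processes"):
    spec = cubed.Spec(allowed_mem="2GB", executor_name=name)
    a = xp.ones((5000, 5000), chunks=(1000, 1000), spec=spec)
    start = time.time()
    xp.negative(a).compute()
    print(f"{name}: {time.time() - start:.2f}s")
```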
