From 5cce6fa49244bd9b6477d5b1b185e8c2f062e152 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= <158148890+caugonnet@users.noreply.github.com> Date: Thu, 24 Oct 2024 08:48:32 +0200 Subject: [PATCH] Integrate CUDASTF -> CudaX (#2572) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CUDASTF is an implementation of the Sequential Task Flow model for CUDA. The availability of parallelism within modern hardware has dramatically increased, with large nodes now featuring multiple accelerators. As a result, maximizing concurrency at the application level in a scalable manner has become a crucial priority. To effectively hide latencies, it is essential to achieve the highest level of asynchrony possible. CUDASTF introduces a tasking model that automates data transfers while enforcing implicit data-driven dependencies. Implemented as a header-only C++ library, CUDASTF builds on top of CUDA APIs to simplify the development of multi-GPU applications. CUDASTF is currently capable of generating parallel applications using either the CUDA stream API or the CUDA graph API. --------- Co-authored-by: Cédric Augonnet Co-authored-by: Andrei Alexandrescu --- CMakePresets.json | 13 +- ci/windows/build_common.psm1 | 2 +- cudax/CMakeLists.txt | 16 +- cudax/cmake/cudaxBuildCompilerTargets.cmake | 20 + cudax/cmake/cudaxHeaderTesting.cmake | 39 +- cudax/cmake/cudaxSTFConfigureTarget.cmake | 41 + cudax/cmake/stf_header_unittest.in.cu | 9 + cudax/examples/CMakeLists.txt | 14 + cudax/examples/stf/01-axpy-cuda_kernel.cu | 73 + .../examples/stf/01-axpy-cuda_kernel_chain.cu | 80 + cudax/examples/stf/01-axpy-launch.cu | 62 + cudax/examples/stf/01-axpy-parallel_for.cu | 61 + cudax/examples/stf/01-axpy.cu | 72 + cudax/examples/stf/02-axpy-host_launch.cu | 81 + cudax/examples/stf/03-temporary-data.cu | 69 + cudax/examples/stf/04-fibonacci-run_once.cu | 79 + cudax/examples/stf/04-fibonacci.cu | 75 + cudax/examples/stf/08-cub-reduce.cu | 95 + cudax/examples/stf/1f1b.cu | 121 + cudax/examples/stf/CMakeLists.txt | 124 + cudax/examples/stf/axpy-annotated.cu | 82 + cudax/examples/stf/binary_fhe.cu | 201 ++ cudax/examples/stf/cfd.cu | 600 ++++ cudax/examples/stf/custom_data_interface.cu | 325 +++ cudax/examples/stf/explicit_data_places.cu | 87 + cudax/examples/stf/fdtd_mgpu.cu | 296 ++ cudax/examples/stf/frozen_data_init.cu | 53 + .../stf/graph_algorithms/degree_centrality.cu | 64 + .../examples/stf/graph_algorithms/jaccard.cu | 137 + .../examples/stf/graph_algorithms/pagerank.cu | 146 + .../examples/stf/graph_algorithms/tricount.cu | 99 + cudax/examples/stf/heat.cu | 126 + cudax/examples/stf/heat_mgpu.cu | 163 ++ cudax/examples/stf/jacobi.cu | 153 + cudax/examples/stf/launch_histogram.cu | 170 ++ cudax/examples/stf/launch_scan.cu | 162 ++ cudax/examples/stf/launch_sum.cu | 81 + cudax/examples/stf/launch_sum_cub.cu | 73 + .../examples/stf/linear_algebra/06-pdgemm.cu | 401 +++ .../stf/linear_algebra/07-cholesky.cu | 746 +++++ cudax/examples/stf/linear_algebra/07-potri.cu | 1689 +++++++++++ cudax/examples/stf/linear_algebra/cg_csr.cu | 357 +++ .../stf/linear_algebra/cg_dense_2D.cu | 460 +++ cudax/examples/stf/linear_algebra/strassen.cu | 489 ++++ .../examples/stf/logical_gates_composition.cu | 73 + cudax/examples/stf/mandelbrot.cu | 129 + cudax/examples/stf/parallel_for_2D.cu | 73 + cudax/examples/stf/scan.cu | 204 ++ cudax/examples/stf/standalone-launches.cu | 77 + cudax/examples/stf/thrust_zip_iterator.cu | 135 + cudax/examples/stf/void_data_interface.cu | 38 + 
cudax/examples/stf/word_count.cu | 96 + .../__stf/allocators/adapters.cuh | 223 ++ .../__stf/allocators/block_allocator.cuh | 186 ++ .../__stf/allocators/buddy_allocator.cuh | 366 +++ .../__stf/allocators/cached_allocator.cuh | 183 ++ .../__stf/allocators/pooled_allocator.cuh | 530 ++++ .../__stf/allocators/uncached_allocator.cuh | 75 + .../experimental/__stf/graph/graph_ctx.cuh | 1034 +++++++ .../__stf/graph/graph_data_interface.cuh | 89 + .../experimental/__stf/graph/graph_task.cuh | 534 ++++ .../__stf/graph/interfaces/slice.cuh | 309 ++ .../__stf/graph/internal/event_types.cuh | 96 + .../__stf/internal/acquire_release.cuh | 329 +++ .../__stf/internal/async_prereq.cuh | 490 ++++ .../__stf/internal/async_resources_handle.cuh | 528 ++++ .../internal/backend_allocator_setup.cuh | 121 + .../__stf/internal/backend_ctx.cuh | 1110 ++++++++ .../experimental/__stf/internal/constants.cuh | 88 + .../internal/cooperative_group_system.cuh | 129 + .../__stf/internal/data_interface.cuh | 450 +++ .../cuda/experimental/__stf/internal/dot.cuh | 528 ++++ .../__stf/internal/exec_affinity.cuh | 109 + .../__stf/internal/execution_policy.cuh | 776 ++++++ .../__stf/internal/frozen_logical_data.cuh | 206 ++ .../internal/hashtable_linearprobing.cuh | 266 ++ .../experimental/__stf/internal/hooks.cuh | 222 ++ .../internal/interpreted_execution_policy.cuh | 323 +++ .../experimental/__stf/internal/launch.cuh | 509 ++++ .../__stf/internal/logical_data.cuh | 2482 +++++++++++++++++ .../experimental/__stf/internal/machine.cuh | 132 + .../cuda/experimental/__stf/internal/msir.cuh | 226 ++ .../__stf/internal/parallel_for_scope.cuh | 475 ++++ .../__stf/internal/reduction_base.cuh | 72 + .../experimental/__stf/internal/reorderer.cuh | 381 +++ .../experimental/__stf/internal/repeat.cuh | 91 + .../experimental/__stf/internal/scheduler.cuh | 467 ++++ .../experimental/__stf/internal/slice.cuh | 1193 ++++++++ .../cuda/experimental/__stf/internal/task.cuh | 693 +++++ .../experimental/__stf/internal/task_dep.cuh | 294 ++ .../__stf/internal/task_state.cuh | 254 ++ .../__stf/internal/task_statistics.cuh | 303 ++ .../__stf/internal/thread_hierarchy.cuh | 531 ++++ .../__stf/internal/void_interface.cuh | 206 ++ .../__stf/localization/composite_slice.cuh | 376 +++ .../__stf/places/blocked_partition.cuh | 130 + .../__stf/places/cyclic_shape.cuh | 332 +++ .../__stf/places/exec/cuda_stream.cuh | 99 + .../__stf/places/exec/green_context.cuh | 276 ++ .../__stf/places/exec/green_ctx_view.cuh | 69 + .../places/exec/host/callback_queues.cuh | 606 ++++ .../experimental/__stf/places/inner_shape.cuh | 136 + .../__stf/places/loop_dispatch.cuh | 171 ++ .../__stf/places/place_partition.cuh | 218 ++ .../cuda/experimental/__stf/places/places.cuh | 1701 +++++++++++ .../__stf/places/tiled_partition.cuh | 173 ++ .../interfaces/hashtable_linearprobing.cuh | 150 + .../__stf/stream/interfaces/slice.cuh | 303 ++ .../stream/interfaces/slice_reduction_ops.cuh | 227 ++ .../__stf/stream/internal/event_types.cuh | 501 ++++ .../experimental/__stf/stream/reduction.cuh | 168 ++ .../experimental/__stf/stream/stream_ctx.cuh | 1320 +++++++++ .../__stf/stream/stream_data_interface.cuh | 194 ++ .../experimental/__stf/stream/stream_task.cuh | 852 ++++++ .../__stf/utility/cartesian_iterator.cuh | 523 ++++ .../__stf/utility/constant_logical_data.cuh | 125 + .../cuda/experimental/__stf/utility/core.cuh | 696 +++++ .../__stf/utility/cuda_attributes.cuh | 29 + .../__stf/utility/cuda_safe_call.cuh | 433 +++ .../experimental/__stf/utility/dimensions.cuh | 538 ++++ 
.../__stf/utility/getenv_cache.cuh | 66 + .../experimental/__stf/utility/handle.cuh | 292 ++ .../cuda/experimental/__stf/utility/hash.cuh | 225 ++ .../experimental/__stf/utility/memory.cuh | 1088 ++++++++ .../cuda/experimental/__stf/utility/nvtx.cuh | 62 + .../__stf/utility/pretty_print.cuh | 266 ++ .../experimental/__stf/utility/run_once.cuh | 108 + .../__stf/utility/scope_guard.cuh | 214 ++ .../__stf/utility/source_location.cuh | 90 + .../experimental/__stf/utility/stopwatch.cuh | 134 + .../__stf/utility/stream_to_dev.cuh | 56 + .../experimental/__stf/utility/threads.cuh | 198 ++ .../experimental/__stf/utility/traits.cuh | 583 ++++ .../experimental/__stf/utility/unique_id.cuh | 87 + .../experimental/__stf/utility/unittest.cuh | 709 +++++ .../__stf/utility/unstable_unique.cuh | 175 ++ cudax/include/cuda/experimental/stf.cuh | 1834 ++++++++++++ cudax/test/CMakeLists.txt | 9 +- .../memory_resource/any_async_resource.cu | 2 +- cudax/test/memory_resource/any_resource.cu | 2 +- cudax/test/memory_resource/shared_resource.cu | 2 +- .../{test_resource.h => test_resource.cuh} | 0 cudax/test/stf/CMakeLists.txt | 282 ++ .../test/stf/algorithm/algorithm_with_read.cu | 58 + cudax/test/stf/algorithm/graph_algorithms.cu | 108 + cudax/test/stf/algorithm/in_graph_ctx.cu | 68 + cudax/test/stf/algorithm/nested.cu | 67 + cudax/test/stf/allocators/adapter.cu | 56 + cudax/test/stf/allocators/buddy_allocator.cu | 43 + cudax/test/stf/allocators/cap_tmp_buffers.cu | 56 + cudax/test/stf/cpp/concurrency_test.cu | 98 + cudax/test/stf/cpp/read_const.cu | 83 + cudax/test/stf/cpp/redundant_data.cu | 86 + .../stf/cpp/redundant_data_different_modes.cu | 56 + cudax/test/stf/cpp/reuse_computation.cu | 80 + cudax/test/stf/cpp/reuse_computation_2.cu | 65 + cudax/test/stf/cpp/scoped_graph_task.cu | 101 + cudax/test/stf/cpp/user_streams.cu | 99 + .../0_Introduction/vectorAdd/README.md | 69 + .../0_Introduction/vectorAdd/vectorAdd.cu | 44 + .../vectorAdd/vectorAdd_cudastf.cu | 151 + .../jacobiCudaGraphs/README.md | 69 + .../jacobiCudaGraphs/jacobi.cu | 626 +++++ .../jacobiCudaGraphs/jacobi_cudastf.cu | 407 +++ .../conjugateGradientMultiDeviceCG_custf.cu | 521 ++++ .../MonteCarloMultiGPU.cu | 488 ++++ .../MonteCarlo_common.cuh | 127 + .../MonteCarlo_gold.cu | 160 ++ .../MonteCarlo_kernel.cu | 264 ++ .../MonteCarlo_reduction.cuh | 84 + .../MonteCarloMultiGPU_cudastf/README.md | 75 + .../MonteCarloMultiGPU_cudastf/realtype.cuh | 49 + cudax/test/stf/dot/basic.cu | 48 + cudax/test/stf/dot/graph_print_to_dot.cu | 50 + cudax/test/stf/dot/with_events.cu | 49 + cudax/test/stf/error_checks/ctx_mismatch.cu | 92 + .../error_checks/data_interface_mismatch.cu | 102 + .../test/stf/error_checks/double_finalize.cu | 81 + cudax/test/stf/error_checks/erase_frozen.cu | 78 + .../error_checks/misformed_tasks_dbl_end.cu | 79 + .../error_checks/misformed_tasks_dbl_start.cu | 76 + .../test/stf/error_checks/non_managed_data.cu | 71 + .../stf/error_checks/slice_check_bounds.cu | 84 + .../stf/error_checks/uninitialized_data.cu | 73 + .../stf/error_checks/unsatisfiable_spec.cu | 73 + cudax/test/stf/error_checks/write_frozen.cu | 80 + .../stf/examples/01-axpy-launch-ranges-cg.cu | 75 + cudax/test/stf/examples/01-axpy-places.cu | 123 + cudax/test/stf/examples/05-stencil-no-copy.cu | 256 ++ cudax/test/stf/examples/05-stencil-places.cu | 92 + cudax/test/stf/examples/05-stencil.cu | 265 ++ .../test/stf/examples/05-stencil2d-places.cu | 114 + cudax/test/stf/examples/07-cholesky-redux.cu | 751 +++++ .../test/stf/examples/07-cholesky-unified.cu | 693 +++++ 
cudax/test/stf/examples/09-nbody-algorithm.cu | 129 + .../stf/examples/09-nbody-blocked-graph.cu | 320 +++ cudax/test/stf/examples/09-nbody-blocked.cu | 283 ++ cudax/test/stf/examples/09-nbody.cu | 122 + .../batcher_sort_interpreted_cufhe.bool.ir | 355 +++ .../kernel_sharpen_interpreted_cufhe.bool.ir | 556 ++++ cudax/test/stf/fhe/parse_arctyrex.cu | 433 +++ .../test/stf/freeze/constant_logical_data.cu | 72 + cudax/test/stf/freeze/freeze.cu | 82 + cudax/test/stf/freeze/freeze_rw.cu | 83 + cudax/test/stf/freeze/task_fence.cu | 98 + cudax/test/stf/gnu/06-pdgemm.cpp | 371 +++ cudax/test/stf/gnu/07-cholesky.cpp | 718 +++++ cudax/test/stf/gnu/include_only.cpp | 23 + cudax/test/stf/graph/concurrency_test.cu | 84 + cudax/test/stf/graph/epoch.cu | 56 + cudax/test/stf/graph/for_each_batched.cu | 128 + .../test/stf/graph/for_each_batched_write.cu | 80 + cudax/test/stf/graph/freeze_for_graph.cu | 74 + cudax/test/stf/graph/graph_composition.cu | 117 + cudax/test/stf/graph/graph_ctx_low_level.cu | 54 + cudax/test/stf/graph/graph_tmp_data.cu | 60 + cudax/test/stf/graph/many.cu | 34 + cudax/test/stf/graph/multiple_graph_ctx.cu | 69 + cudax/test/stf/graph/static_graph_ctx.cu | 51 + cudax/test/stf/green_context/axpy_gc.cu | 112 + cudax/test/stf/green_context/cuda_graph.cu | 80 + cudax/test/stf/green_context/gc_grid.cu | 110 + cudax/test/stf/hash/ctx_hash.cu | 69 + cudax/test/stf/hash/logical_data.cu | 51 + cudax/test/stf/hashtable/fusion.cu | 64 + cudax/test/stf/hashtable/fusion_reduction.cu | 122 + cudax/test/stf/hashtable/parallel_for.cu | 38 + .../test/stf/hashtable/parallel_for_shape.cu | 39 + cudax/test/stf/hashtable/test.cu | 64 + cudax/test/stf/interface/data_from_device.cu | 82 + .../test/stf/interface/data_from_device_2.cu | 97 + .../stf/interface/data_from_device_async.cu | 102 + .../test/stf/interface/data_from_device_wb.cu | 78 + .../stf/interface/graph_use_device_data.cu | 100 + .../stf/interface/mix_stream_and_graph.cu | 87 + .../stf/interface/mix_stream_and_graph_2.cu | 119 + cudax/test/stf/interface/move_operator.cu | 64 + cudax/test/stf/interface/scal.cu | 71 + cudax/test/stf/interface/scalar_div.cu | 124 + .../test/stf/interface/stream_add_callback.cu | 50 + cudax/test/stf/local_stf/interop_cuda.cu | 70 + cudax/test/stf/local_stf/legacy_to_stf.cu | 215 ++ .../stf/loop_dispatch/dispatch_on_streams.cu | 44 + cudax/test/stf/loop_dispatch/loop_dispatch.cu | 41 + .../stf/loop_dispatch/nested_loop_dispatch.cu | 52 + cudax/test/stf/parallel_for/fdtd.cu | 254 ++ cudax/test/stf/parallel_for/fdtd_acc.cpp | 300 ++ .../stf/parallel_for/parallel_for_all_devs.cu | 82 + .../test/stf/parallel_for/parallel_for_box.cu | 91 + .../stf/parallel_for/parallel_for_repeat.cu | 75 + .../test2_parallel_for_context.cu | 73 + .../stf/parallel_for/test_parallel_for.cu | 134 + cudax/test/stf/parallel_for/tiled_loops.cu | 84 + cudax/test/stf/places/cuda_stream_place.cu | 89 + cudax/test/stf/places/managed.cu | 87 + cudax/test/stf/places/managed_from_shape.cu | 92 + cudax/test/stf/places/managed_from_user.cu | 90 + cudax/test/stf/places/non_current_device.cu | 77 + cudax/test/stf/places/place_partition.cu | 52 + cudax/test/stf/places/recursion.cu | 41 + cudax/test/stf/reclaiming/graph.cu | 72 + cudax/test/stf/reclaiming/graph_2.cu | 66 + cudax/test/stf/reclaiming/graph_real_oom.cu | 78 + cudax/test/stf/reclaiming/stream.cu | 142 + cudax/test/stf/reductions/many_inc.cu | 51 + cudax/test/stf/reductions/redux_test.cu | 42 + cudax/test/stf/reductions/redux_test2.cu | 78 + .../test/stf/reductions/slice2d_reduction.cu | 59 + 
cudax/test/stf/reductions/slice_custom_op.cu | 72 + .../stf/reductions/successive_reductions.cu | 67 + .../reductions/successive_reductions_pfor.cu | 45 + cudax/test/stf/reductions/sum.cu | 108 + cudax/test/stf/reductions/sum_array.cu | 119 + .../stf/reductions/sum_multiple_places.cu | 62 + .../sum_multiple_places_no_refvalue.cu | 118 + .../stf/reductions/write_back_after_redux.cu | 67 + cudax/test/stf/slice/pinning.cu | 59 + .../stf/static_error_checks/CMakeLists.txt | 53 + .../stf/static_error_checks/gnu_launch.cpp | 27 + .../static_error_checks/gnu_parallel_for.cpp | 28 + .../graph_task_prototype.cu | 28 + cudax/test/stf/static_error_checks/static.cu | 19 + .../stream_task_prototype.cu | 28 + .../stf/static_error_checks/task_prototype.cu | 28 + cudax/test/stf/stencil/stencil-1D.cu | 210 ++ cudax/test/stf/stress/empty_tasks.cu | 54 + cudax/test/stf/stress/empty_tasks_alloc.cu | 46 + cudax/test/stf/stress/kernel_chain.cu | 72 + cudax/test/stf/stress/kernel_chain_fused.cu | 70 + cudax/test/stf/stress/launch_overhead.cu | 65 + .../test/stf/stress/launch_vs_parallelfor.cu | 125 + cudax/test/stf/stress/many_read.cu | 70 + .../test/stf/stress/parallel_for_overhead.cu | 59 + cudax/test/stf/stress/task_bench.cu | 319 +++ cudax/test/stf/threads/axpy-threads-2.cu | 115 + cudax/test/stf/threads/axpy-threads-pfor.cu | 114 + cudax/test/stf/threads/axpy-threads.cu | 112 + cudax/test/stf/tools/auto_dump/auto_dump.cu | 64 + cudax/test/stf/utility/timing_with_fences.cu | 95 + docs/cudax/index.rst | 4 +- docs/cudax/stf.rst | 2056 ++++++++++++++ docs/cudax/stf/custom_data_interface.rst | 330 +++ .../stf/images/dot-output-axpy-annotated.png | Bin 0 -> 11489 bytes .../stf/images/dot-output-axpy-events.png | Bin 0 -> 135415 bytes docs/cudax/stf/images/dot-output-axpy.png | Bin 0 -> 11351 bytes docs/cudax/stf/images/dot-output-heat.png | Bin 0 -> 116112 bytes docs/cudax/stf/images/graph_01.dot | 10 + docs/cudax/stf/images/graph_01.png | Bin 0 -> 25117 bytes docs/cudax/stf/images/graph_02.dot | 16 + docs/cudax/stf/images/graph_02.png | Bin 0 -> 33089 bytes docs/cudax/stf/images/ncu-ui.png | Bin 0 -> 175221 bytes docs/cudax/stf/images/task-sequence-user.dot | 26 + docs/cudax/stf/images/task-sequence-user.png | Bin 0 -> 19913 bytes docs/cudax/stf/images/task-sequence.dot | 68 + docs/cudax/stf/images/task-sequence.png | Bin 0 -> 50433 bytes docs/cudax/stf/lower_level_api.rst | 114 + docs/repo.toml | 25 +- examples/cudax_stf/CMakeLists.txt | 54 + examples/cudax_stf/cmake/CPM.cmake | 1269 +++++++++ examples/cudax_stf/simple_stf.cu | 40 + 320 files changed, 66556 insertions(+), 13 deletions(-) create mode 100644 cudax/cmake/cudaxSTFConfigureTarget.cmake create mode 100644 cudax/cmake/stf_header_unittest.in.cu create mode 100644 cudax/examples/CMakeLists.txt create mode 100644 cudax/examples/stf/01-axpy-cuda_kernel.cu create mode 100644 cudax/examples/stf/01-axpy-cuda_kernel_chain.cu create mode 100644 cudax/examples/stf/01-axpy-launch.cu create mode 100644 cudax/examples/stf/01-axpy-parallel_for.cu create mode 100644 cudax/examples/stf/01-axpy.cu create mode 100644 cudax/examples/stf/02-axpy-host_launch.cu create mode 100644 cudax/examples/stf/03-temporary-data.cu create mode 100644 cudax/examples/stf/04-fibonacci-run_once.cu create mode 100644 cudax/examples/stf/04-fibonacci.cu create mode 100644 cudax/examples/stf/08-cub-reduce.cu create mode 100644 cudax/examples/stf/1f1b.cu create mode 100644 cudax/examples/stf/CMakeLists.txt create mode 100644 cudax/examples/stf/axpy-annotated.cu create mode 100644 
cudax/examples/stf/binary_fhe.cu create mode 100644 cudax/examples/stf/cfd.cu create mode 100644 cudax/examples/stf/custom_data_interface.cu create mode 100644 cudax/examples/stf/explicit_data_places.cu create mode 100644 cudax/examples/stf/fdtd_mgpu.cu create mode 100644 cudax/examples/stf/frozen_data_init.cu create mode 100644 cudax/examples/stf/graph_algorithms/degree_centrality.cu create mode 100644 cudax/examples/stf/graph_algorithms/jaccard.cu create mode 100644 cudax/examples/stf/graph_algorithms/pagerank.cu create mode 100644 cudax/examples/stf/graph_algorithms/tricount.cu create mode 100644 cudax/examples/stf/heat.cu create mode 100644 cudax/examples/stf/heat_mgpu.cu create mode 100644 cudax/examples/stf/jacobi.cu create mode 100644 cudax/examples/stf/launch_histogram.cu create mode 100644 cudax/examples/stf/launch_scan.cu create mode 100644 cudax/examples/stf/launch_sum.cu create mode 100644 cudax/examples/stf/launch_sum_cub.cu create mode 100644 cudax/examples/stf/linear_algebra/06-pdgemm.cu create mode 100644 cudax/examples/stf/linear_algebra/07-cholesky.cu create mode 100644 cudax/examples/stf/linear_algebra/07-potri.cu create mode 100644 cudax/examples/stf/linear_algebra/cg_csr.cu create mode 100644 cudax/examples/stf/linear_algebra/cg_dense_2D.cu create mode 100644 cudax/examples/stf/linear_algebra/strassen.cu create mode 100644 cudax/examples/stf/logical_gates_composition.cu create mode 100644 cudax/examples/stf/mandelbrot.cu create mode 100644 cudax/examples/stf/parallel_for_2D.cu create mode 100644 cudax/examples/stf/scan.cu create mode 100644 cudax/examples/stf/standalone-launches.cu create mode 100644 cudax/examples/stf/thrust_zip_iterator.cu create mode 100644 cudax/examples/stf/void_data_interface.cu create mode 100644 cudax/examples/stf/word_count.cu create mode 100644 cudax/include/cuda/experimental/__stf/allocators/adapters.cuh create mode 100644 cudax/include/cuda/experimental/__stf/allocators/block_allocator.cuh create mode 100644 cudax/include/cuda/experimental/__stf/allocators/buddy_allocator.cuh create mode 100644 cudax/include/cuda/experimental/__stf/allocators/cached_allocator.cuh create mode 100644 cudax/include/cuda/experimental/__stf/allocators/pooled_allocator.cuh create mode 100644 cudax/include/cuda/experimental/__stf/allocators/uncached_allocator.cuh create mode 100644 cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh create mode 100644 cudax/include/cuda/experimental/__stf/graph/graph_data_interface.cuh create mode 100644 cudax/include/cuda/experimental/__stf/graph/graph_task.cuh create mode 100644 cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh create mode 100644 cudax/include/cuda/experimental/__stf/graph/internal/event_types.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/acquire_release.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/backend_allocator_setup.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/constants.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/cooperative_group_system.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/data_interface.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/dot.cuh create mode 100644 
cudax/include/cuda/experimental/__stf/internal/exec_affinity.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/execution_policy.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/frozen_logical_data.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/hashtable_linearprobing.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/hooks.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/interpreted_execution_policy.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/launch.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/logical_data.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/machine.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/msir.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/reduction_base.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/reorderer.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/repeat.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/scheduler.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/slice.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/task.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/task_dep.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/task_state.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/task_statistics.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/thread_hierarchy.cuh create mode 100644 cudax/include/cuda/experimental/__stf/internal/void_interface.cuh create mode 100644 cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh create mode 100644 cudax/include/cuda/experimental/__stf/places/blocked_partition.cuh create mode 100644 cudax/include/cuda/experimental/__stf/places/cyclic_shape.cuh create mode 100644 cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh create mode 100644 cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh create mode 100644 cudax/include/cuda/experimental/__stf/places/exec/green_ctx_view.cuh create mode 100644 cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh create mode 100644 cudax/include/cuda/experimental/__stf/places/inner_shape.cuh create mode 100644 cudax/include/cuda/experimental/__stf/places/loop_dispatch.cuh create mode 100644 cudax/include/cuda/experimental/__stf/places/place_partition.cuh create mode 100644 cudax/include/cuda/experimental/__stf/places/places.cuh create mode 100644 cudax/include/cuda/experimental/__stf/places/tiled_partition.cuh create mode 100644 cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh create mode 100644 cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh create mode 100644 cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh create mode 100644 cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh create mode 100644 cudax/include/cuda/experimental/__stf/stream/reduction.cuh create mode 100644 cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh create mode 100644 cudax/include/cuda/experimental/__stf/stream/stream_data_interface.cuh create mode 100644 cudax/include/cuda/experimental/__stf/stream/stream_task.cuh create mode 100644 
cudax/include/cuda/experimental/__stf/utility/cartesian_iterator.cuh create mode 100644 cudax/include/cuda/experimental/__stf/utility/constant_logical_data.cuh create mode 100644 cudax/include/cuda/experimental/__stf/utility/core.cuh create mode 100644 cudax/include/cuda/experimental/__stf/utility/cuda_attributes.cuh create mode 100644 cudax/include/cuda/experimental/__stf/utility/cuda_safe_call.cuh create mode 100644 cudax/include/cuda/experimental/__stf/utility/dimensions.cuh create mode 100644 cudax/include/cuda/experimental/__stf/utility/getenv_cache.cuh create mode 100644 cudax/include/cuda/experimental/__stf/utility/handle.cuh create mode 100644 cudax/include/cuda/experimental/__stf/utility/hash.cuh create mode 100644 cudax/include/cuda/experimental/__stf/utility/memory.cuh create mode 100644 cudax/include/cuda/experimental/__stf/utility/nvtx.cuh create mode 100644 cudax/include/cuda/experimental/__stf/utility/pretty_print.cuh create mode 100644 cudax/include/cuda/experimental/__stf/utility/run_once.cuh create mode 100644 cudax/include/cuda/experimental/__stf/utility/scope_guard.cuh create mode 100644 cudax/include/cuda/experimental/__stf/utility/source_location.cuh create mode 100644 cudax/include/cuda/experimental/__stf/utility/stopwatch.cuh create mode 100644 cudax/include/cuda/experimental/__stf/utility/stream_to_dev.cuh create mode 100644 cudax/include/cuda/experimental/__stf/utility/threads.cuh create mode 100644 cudax/include/cuda/experimental/__stf/utility/traits.cuh create mode 100644 cudax/include/cuda/experimental/__stf/utility/unique_id.cuh create mode 100644 cudax/include/cuda/experimental/__stf/utility/unittest.cuh create mode 100644 cudax/include/cuda/experimental/__stf/utility/unstable_unique.cuh create mode 100644 cudax/include/cuda/experimental/stf.cuh rename cudax/test/memory_resource/{test_resource.h => test_resource.cuh} (100%) create mode 100644 cudax/test/stf/CMakeLists.txt create mode 100644 cudax/test/stf/algorithm/algorithm_with_read.cu create mode 100644 cudax/test/stf/algorithm/graph_algorithms.cu create mode 100644 cudax/test/stf/algorithm/in_graph_ctx.cu create mode 100644 cudax/test/stf/algorithm/nested.cu create mode 100644 cudax/test/stf/allocators/adapter.cu create mode 100644 cudax/test/stf/allocators/buddy_allocator.cu create mode 100644 cudax/test/stf/allocators/cap_tmp_buffers.cu create mode 100644 cudax/test/stf/cpp/concurrency_test.cu create mode 100644 cudax/test/stf/cpp/read_const.cu create mode 100644 cudax/test/stf/cpp/redundant_data.cu create mode 100644 cudax/test/stf/cpp/redundant_data_different_modes.cu create mode 100644 cudax/test/stf/cpp/reuse_computation.cu create mode 100644 cudax/test/stf/cpp/reuse_computation_2.cu create mode 100644 cudax/test/stf/cpp/scoped_graph_task.cu create mode 100644 cudax/test/stf/cpp/user_streams.cu create mode 100644 cudax/test/stf/cuda-samples/0_Introduction/vectorAdd/README.md create mode 100644 cudax/test/stf/cuda-samples/0_Introduction/vectorAdd/vectorAdd.cu create mode 100644 cudax/test/stf/cuda-samples/0_Introduction/vectorAdd/vectorAdd_cudastf.cu create mode 100644 cudax/test/stf/cuda-samples/3_CUDA_Features/jacobiCudaGraphs/README.md create mode 100644 cudax/test/stf/cuda-samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.cu create mode 100644 cudax/test/stf/cuda-samples/3_CUDA_Features/jacobiCudaGraphs/jacobi_cudastf.cu create mode 100644 cudax/test/stf/cuda-samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_custf.cu create mode 100644 
cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarloMultiGPU.cu create mode 100644 cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarlo_common.cuh create mode 100644 cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarlo_gold.cu create mode 100644 cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarlo_kernel.cu create mode 100644 cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarlo_reduction.cuh create mode 100644 cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/README.md create mode 100644 cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/realtype.cuh create mode 100644 cudax/test/stf/dot/basic.cu create mode 100644 cudax/test/stf/dot/graph_print_to_dot.cu create mode 100644 cudax/test/stf/dot/with_events.cu create mode 100644 cudax/test/stf/error_checks/ctx_mismatch.cu create mode 100644 cudax/test/stf/error_checks/data_interface_mismatch.cu create mode 100644 cudax/test/stf/error_checks/double_finalize.cu create mode 100644 cudax/test/stf/error_checks/erase_frozen.cu create mode 100644 cudax/test/stf/error_checks/misformed_tasks_dbl_end.cu create mode 100644 cudax/test/stf/error_checks/misformed_tasks_dbl_start.cu create mode 100644 cudax/test/stf/error_checks/non_managed_data.cu create mode 100644 cudax/test/stf/error_checks/slice_check_bounds.cu create mode 100644 cudax/test/stf/error_checks/uninitialized_data.cu create mode 100644 cudax/test/stf/error_checks/unsatisfiable_spec.cu create mode 100644 cudax/test/stf/error_checks/write_frozen.cu create mode 100644 cudax/test/stf/examples/01-axpy-launch-ranges-cg.cu create mode 100644 cudax/test/stf/examples/01-axpy-places.cu create mode 100644 cudax/test/stf/examples/05-stencil-no-copy.cu create mode 100644 cudax/test/stf/examples/05-stencil-places.cu create mode 100644 cudax/test/stf/examples/05-stencil.cu create mode 100644 cudax/test/stf/examples/05-stencil2d-places.cu create mode 100644 cudax/test/stf/examples/07-cholesky-redux.cu create mode 100644 cudax/test/stf/examples/07-cholesky-unified.cu create mode 100644 cudax/test/stf/examples/09-nbody-algorithm.cu create mode 100644 cudax/test/stf/examples/09-nbody-blocked-graph.cu create mode 100644 cudax/test/stf/examples/09-nbody-blocked.cu create mode 100644 cudax/test/stf/examples/09-nbody.cu create mode 100755 cudax/test/stf/fhe/batcher_sort_interpreted_cufhe.bool.ir create mode 100644 cudax/test/stf/fhe/kernel_sharpen_interpreted_cufhe.bool.ir create mode 100644 cudax/test/stf/fhe/parse_arctyrex.cu create mode 100644 cudax/test/stf/freeze/constant_logical_data.cu create mode 100644 cudax/test/stf/freeze/freeze.cu create mode 100644 cudax/test/stf/freeze/freeze_rw.cu create mode 100644 cudax/test/stf/freeze/task_fence.cu create mode 100644 cudax/test/stf/gnu/06-pdgemm.cpp create mode 100644 cudax/test/stf/gnu/07-cholesky.cpp create mode 100644 cudax/test/stf/gnu/include_only.cpp create mode 100644 cudax/test/stf/graph/concurrency_test.cu create mode 100644 cudax/test/stf/graph/epoch.cu create mode 100644 cudax/test/stf/graph/for_each_batched.cu create mode 100644 cudax/test/stf/graph/for_each_batched_write.cu create mode 100644 cudax/test/stf/graph/freeze_for_graph.cu create mode 100644 cudax/test/stf/graph/graph_composition.cu create mode 100644 cudax/test/stf/graph/graph_ctx_low_level.cu create mode 100644 cudax/test/stf/graph/graph_tmp_data.cu create mode 100644 
cudax/test/stf/graph/many.cu create mode 100644 cudax/test/stf/graph/multiple_graph_ctx.cu create mode 100644 cudax/test/stf/graph/static_graph_ctx.cu create mode 100644 cudax/test/stf/green_context/axpy_gc.cu create mode 100644 cudax/test/stf/green_context/cuda_graph.cu create mode 100644 cudax/test/stf/green_context/gc_grid.cu create mode 100644 cudax/test/stf/hash/ctx_hash.cu create mode 100644 cudax/test/stf/hash/logical_data.cu create mode 100644 cudax/test/stf/hashtable/fusion.cu create mode 100644 cudax/test/stf/hashtable/fusion_reduction.cu create mode 100644 cudax/test/stf/hashtable/parallel_for.cu create mode 100644 cudax/test/stf/hashtable/parallel_for_shape.cu create mode 100644 cudax/test/stf/hashtable/test.cu create mode 100644 cudax/test/stf/interface/data_from_device.cu create mode 100644 cudax/test/stf/interface/data_from_device_2.cu create mode 100644 cudax/test/stf/interface/data_from_device_async.cu create mode 100644 cudax/test/stf/interface/data_from_device_wb.cu create mode 100644 cudax/test/stf/interface/graph_use_device_data.cu create mode 100644 cudax/test/stf/interface/mix_stream_and_graph.cu create mode 100644 cudax/test/stf/interface/mix_stream_and_graph_2.cu create mode 100644 cudax/test/stf/interface/move_operator.cu create mode 100644 cudax/test/stf/interface/scal.cu create mode 100644 cudax/test/stf/interface/scalar_div.cu create mode 100644 cudax/test/stf/interface/stream_add_callback.cu create mode 100644 cudax/test/stf/local_stf/interop_cuda.cu create mode 100644 cudax/test/stf/local_stf/legacy_to_stf.cu create mode 100644 cudax/test/stf/loop_dispatch/dispatch_on_streams.cu create mode 100644 cudax/test/stf/loop_dispatch/loop_dispatch.cu create mode 100644 cudax/test/stf/loop_dispatch/nested_loop_dispatch.cu create mode 100644 cudax/test/stf/parallel_for/fdtd.cu create mode 100644 cudax/test/stf/parallel_for/fdtd_acc.cpp create mode 100644 cudax/test/stf/parallel_for/parallel_for_all_devs.cu create mode 100644 cudax/test/stf/parallel_for/parallel_for_box.cu create mode 100644 cudax/test/stf/parallel_for/parallel_for_repeat.cu create mode 100644 cudax/test/stf/parallel_for/test2_parallel_for_context.cu create mode 100644 cudax/test/stf/parallel_for/test_parallel_for.cu create mode 100644 cudax/test/stf/parallel_for/tiled_loops.cu create mode 100644 cudax/test/stf/places/cuda_stream_place.cu create mode 100644 cudax/test/stf/places/managed.cu create mode 100644 cudax/test/stf/places/managed_from_shape.cu create mode 100644 cudax/test/stf/places/managed_from_user.cu create mode 100644 cudax/test/stf/places/non_current_device.cu create mode 100644 cudax/test/stf/places/place_partition.cu create mode 100644 cudax/test/stf/places/recursion.cu create mode 100644 cudax/test/stf/reclaiming/graph.cu create mode 100644 cudax/test/stf/reclaiming/graph_2.cu create mode 100644 cudax/test/stf/reclaiming/graph_real_oom.cu create mode 100644 cudax/test/stf/reclaiming/stream.cu create mode 100644 cudax/test/stf/reductions/many_inc.cu create mode 100644 cudax/test/stf/reductions/redux_test.cu create mode 100644 cudax/test/stf/reductions/redux_test2.cu create mode 100644 cudax/test/stf/reductions/slice2d_reduction.cu create mode 100644 cudax/test/stf/reductions/slice_custom_op.cu create mode 100644 cudax/test/stf/reductions/successive_reductions.cu create mode 100644 cudax/test/stf/reductions/successive_reductions_pfor.cu create mode 100644 cudax/test/stf/reductions/sum.cu create mode 100644 cudax/test/stf/reductions/sum_array.cu create mode 100644 
cudax/test/stf/reductions/sum_multiple_places.cu create mode 100644 cudax/test/stf/reductions/sum_multiple_places_no_refvalue.cu create mode 100644 cudax/test/stf/reductions/write_back_after_redux.cu create mode 100644 cudax/test/stf/slice/pinning.cu create mode 100644 cudax/test/stf/static_error_checks/CMakeLists.txt create mode 100644 cudax/test/stf/static_error_checks/gnu_launch.cpp create mode 100644 cudax/test/stf/static_error_checks/gnu_parallel_for.cpp create mode 100644 cudax/test/stf/static_error_checks/graph_task_prototype.cu create mode 100644 cudax/test/stf/static_error_checks/static.cu create mode 100644 cudax/test/stf/static_error_checks/stream_task_prototype.cu create mode 100644 cudax/test/stf/static_error_checks/task_prototype.cu create mode 100644 cudax/test/stf/stencil/stencil-1D.cu create mode 100644 cudax/test/stf/stress/empty_tasks.cu create mode 100644 cudax/test/stf/stress/empty_tasks_alloc.cu create mode 100644 cudax/test/stf/stress/kernel_chain.cu create mode 100644 cudax/test/stf/stress/kernel_chain_fused.cu create mode 100644 cudax/test/stf/stress/launch_overhead.cu create mode 100644 cudax/test/stf/stress/launch_vs_parallelfor.cu create mode 100644 cudax/test/stf/stress/many_read.cu create mode 100644 cudax/test/stf/stress/parallel_for_overhead.cu create mode 100644 cudax/test/stf/stress/task_bench.cu create mode 100644 cudax/test/stf/threads/axpy-threads-2.cu create mode 100644 cudax/test/stf/threads/axpy-threads-pfor.cu create mode 100644 cudax/test/stf/threads/axpy-threads.cu create mode 100644 cudax/test/stf/tools/auto_dump/auto_dump.cu create mode 100644 cudax/test/stf/utility/timing_with_fences.cu create mode 100644 docs/cudax/stf.rst create mode 100644 docs/cudax/stf/custom_data_interface.rst create mode 100644 docs/cudax/stf/images/dot-output-axpy-annotated.png create mode 100644 docs/cudax/stf/images/dot-output-axpy-events.png create mode 100644 docs/cudax/stf/images/dot-output-axpy.png create mode 100644 docs/cudax/stf/images/dot-output-heat.png create mode 100644 docs/cudax/stf/images/graph_01.dot create mode 100644 docs/cudax/stf/images/graph_01.png create mode 100644 docs/cudax/stf/images/graph_02.dot create mode 100644 docs/cudax/stf/images/graph_02.png create mode 100644 docs/cudax/stf/images/ncu-ui.png create mode 100644 docs/cudax/stf/images/task-sequence-user.dot create mode 100644 docs/cudax/stf/images/task-sequence-user.png create mode 100644 docs/cudax/stf/images/task-sequence.dot create mode 100644 docs/cudax/stf/images/task-sequence.png create mode 100644 docs/cudax/stf/lower_level_api.rst create mode 100644 examples/cudax_stf/CMakeLists.txt create mode 100755 examples/cudax_stf/cmake/CPM.cmake create mode 100644 examples/cudax_stf/simple_stf.cu diff --git a/CMakePresets.json b/CMakePresets.json index 03bbef3c948..8650c9b1290 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -62,6 +62,10 @@ "THRUST_MULTICONFIG_ENABLE_SYSTEM_TBB": true, "cudax_ENABLE_HEADER_TESTING": true, "cudax_ENABLE_TESTING": true, + "cudax_ENABLE_EXAMPLES": true, + "cudax_ENABLE_CUDASTF_BOUNDSCHECK": false, + "cudax_ENABLE_CUDASTF_DEBUG": false, + "cudax_ENABLE_CUDASTF_MATHLIBS": false, "cudax_ENABLE_DIALECT_CPP17": true, "cudax_ENABLE_DIALECT_CPP20": true, "CCCL_C_Parallel_ENABLE_TESTING": true, @@ -73,9 +77,11 @@ "displayName": "all-dev debug", "inherits": "all-dev", "cacheVariables": { - "CCCL_ENABLE_BENCHMARKS": false, "CMAKE_BUILD_TYPE": "Debug", - "CMAKE_CUDA_FLAGS": "-G" + "CMAKE_CUDA_FLAGS": "-G", + "CCCL_ENABLE_BENCHMARKS": false, + 
"cudax_ENABLE_CUDASTF_BOUNDSCHECK": true, + "cudax_ENABLE_CUDASTF_DEBUG": true } }, { @@ -297,6 +303,9 @@ "CCCL_ENABLE_CUDAX": true, "cudax_ENABLE_HEADER_TESTING": true, "cudax_ENABLE_TESTING": true, + "cudax_ENABLE_EXAMPLES": true, + "cudax_ENABLE_CUDASTF_BOUNDSCHECK": false, + "cudax_ENABLE_CUDASTF_MATHLIBS": false, "cudax_ENABLE_DIALECT_CPP17": false, "cudax_ENABLE_DIALECT_CPP20": false } diff --git a/ci/windows/build_common.psm1 b/ci/windows/build_common.psm1 index 4d122d25608..bf40142ef13 100644 --- a/ci/windows/build_common.psm1 +++ b/ci/windows/build_common.psm1 @@ -112,7 +112,7 @@ function build_preset { sccache_stats('Start') - cmake --build --preset $PRESET -v + cmake --build --preset $PRESET -v -- -k 0 $test_result = $LastExitCode $preset_dir = "${BUILD_DIR}/${PRESET}" diff --git a/cudax/CMakeLists.txt b/cudax/CMakeLists.txt index 087064653e9..65e94914f0f 100644 --- a/cudax/CMakeLists.txt +++ b/cudax/CMakeLists.txt @@ -19,9 +19,19 @@ enable_language(CUDA) option(cudax_ENABLE_HEADER_TESTING "Test that CUDA Experimental's public headers compile." ON) option(cudax_ENABLE_TESTING "Build CUDA Experimental's tests." ON) +option(cudax_ENABLE_EXAMPLES "Build CUDA Experimental's examples." ON) +option(cudax_ENABLE_CUDASTF_BOUNDSCHECK "Enable bounds checks for STF targets. Requires debug build." OFF) +option(cudax_ENABLE_CUDASTF_DEBUG "Enable additional debugging for STF targets. Requires debug build." OFF) +option(cudax_ENABLE_CUDASTF_MATHLIBS "Enable STF tests/examples that use cublas/cusolver." OFF) + +if ((cudax_ENABLE_CUDASTF_BOUNDSCHECK OR cudax_ENABLE_CUDASTF_DEBUG) AND + NOT CMAKE_BUILD_TYPE MATCHES "Debug" AND NOT CMAKE_BUILD_TYPE MATCHES "RelWithDebInfo") + message(FATAL_ERROR "cudax_ENABLE_CUDASTF_BOUNDSCHECK and cudax_ENABLE_CUDASTF_DEBUG require a Debug build.") +endif() include(cmake/cudaxBuildCompilerTargets.cmake) include(cmake/cudaxBuildTargetList.cmake) +include(cmake/cudaxSTFConfigureTarget.cmake) cudax_build_compiler_targets() cudax_build_target_list() @@ -31,7 +41,9 @@ if (cudax_ENABLE_HEADER_TESTING) endif() if (cudax_ENABLE_TESTING) - include(CTest) - enable_testing() # Must be in root directory add_subdirectory(test) endif() + +if (cudax_ENABLE_EXAMPLES) + add_subdirectory(examples) +endif() diff --git a/cudax/cmake/cudaxBuildCompilerTargets.cmake b/cudax/cmake/cudaxBuildCompilerTargets.cmake index df540a24018..6443098092b 100644 --- a/cudax/cmake/cudaxBuildCompilerTargets.cmake +++ b/cudax/cmake/cudaxBuildCompilerTargets.cmake @@ -23,6 +23,26 @@ function(cudax_build_compiler_targets) # cudax requires dim3 to be usable from a constexpr context, and the CUDART headers require # __cplusplus to be defined for this to work: append_option_if_available("/Zc:__cplusplus" cxx_compile_options) + + # cudax requires __VA_OPT__ for its unit tests + append_option_if_available("/Zc:preprocessor" cxx_compile_options) + + # XXX Temporary hack for STF ! 
+ # C4267: conversion from 'meow' to 'purr', possible loss of data + append_option_if_available("/wd4267" cxx_compile_options) + + # C4459 : declaration of 'identifier' hides global declaration + # We work around std::chrono::last which hides some internal "last" variable + append_option_if_available("/wd4459" cxx_compile_options) + + # stf used getenv which is potentially unsafe but not in our context + list(APPEND cxx_compile_definitions "_CRT_SECURE_NO_WARNINGS") + endif() + + if("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}") + # stf heavily uses host device lambdas which break on clang due to a warning about the implicitly + # deleted copy constructor + append_option_if_available("-Wno-deprecated-copy" cxx_compile_options) endif() cccl_build_compiler_interface(cudax.compiler_interface diff --git a/cudax/cmake/cudaxHeaderTesting.cmake b/cudax/cmake/cudaxHeaderTesting.cmake index 6c81b529525..c314fec6d5a 100644 --- a/cudax/cmake/cudaxHeaderTesting.cmake +++ b/cudax/cmake/cudaxHeaderTesting.cmake @@ -12,7 +12,9 @@ function(cudax_add_header_test label definitions) cudax_get_target_property(config_dialect ${cn_target} DIALECT) cudax_get_target_property(config_prefix ${cn_target} PREFIX) - set(headertest_target ${config_prefix}.headers.${label}) + ################### + # Non-STF headers # + set(headertest_target ${config_prefix}.headers.${label}.no_stf) cccl_generate_header_tests(${headertest_target} cudax/include DIALECT ${config_dialect} # The cudax header template removes the check for the `small` macro. @@ -22,6 +24,9 @@ function(cudax_add_header_test label definitions) # The following internal headers are not required to compile independently: "cuda/experimental/__async/prologue.cuh" "cuda/experimental/__async/epilogue.cuh" + # STF headers are compiled separately: + "cuda/experimental/stf.cuh" + "cuda/experimental/__stf/*" ) target_link_libraries(${headertest_target} PUBLIC ${cn_target}) target_compile_definitions(${headertest_target} PRIVATE @@ -32,6 +37,38 @@ function(cudax_add_header_test label definitions) add_dependencies(cudax.all.headers ${headertest_target}) add_dependencies(${config_prefix}.all ${headertest_target}) + + # FIXME: Enable MSVC + if (NOT "MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") + ############### + # STF headers # + set(headertest_target ${config_prefix}.headers.${label}.stf) + cccl_generate_header_tests(${headertest_target} cudax/include + DIALECT ${config_dialect} + GLOBS + "cuda/experimental/stf.cuh" + "cuda/experimental/__stf/*.cuh" + + # FIXME: The cudax header template removes the check for the `small` macro. + # cuda/experimental/__stf/utility/memory.cuh defines functions named `small`. + # These should be renamed to avoid conflicts with windows system headers, and + # the following line removed: + HEADER_TEMPLATE "${cudax_SOURCE_DIR}/cmake/header_test.in.cu" + ) + target_link_libraries(${headertest_target} PUBLIC ${cn_target}) + target_compile_options(${headertest_target} PRIVATE + # Required by stf headers: + $<$:--extended-lambda> + # FIXME: We should be able to refactor away from needing this by + # using _CCCL_HOST_DEVICE and friends + `::cuda::std` utilities where + # necessary. 
+ $<$:--expt-relaxed-constexpr> + ) + cudax_clone_target_properties(${headertest_target} ${cn_target}) + endif() + + add_dependencies(cudax.all.headers ${headertest_target}) + add_dependencies(${config_prefix}.all ${headertest_target}) endforeach() endfunction() diff --git a/cudax/cmake/cudaxSTFConfigureTarget.cmake b/cudax/cmake/cudaxSTFConfigureTarget.cmake new file mode 100644 index 00000000000..74765af8ff3 --- /dev/null +++ b/cudax/cmake/cudaxSTFConfigureTarget.cmake @@ -0,0 +1,41 @@ +# Configures a target for the STF framework. +function(cudax_stf_configure_target target_name) + set(options LINK_MATHLIBS) + set(oneValueArgs) + set(multiValueArgs) + cmake_parse_arguments(CSCT "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + target_link_libraries(${target_name} PRIVATE + ${cn_target} + CUDA::cudart + CUDA::curand + CUDA::cuda_driver + ) + target_compile_options(${target_name} PRIVATE + $<$:--extended-lambda> + $<$:--expt-relaxed-constexpr> + ) + set_target_properties(${target_name} PROPERTIES + CUDA_RUNTIME_LIBRARY Static + CUDA_SEPARABLE_COMPILATION ON + ) + + if (CSCT_LINK_MATHLIBS) + target_link_libraries(${target_name} PRIVATE + CUDA::cublas + CUDA::cusolver + ) + endif() + + if (cudax_ENABLE_CUDASTF_BOUNDSCHECK) + target_compile_definitions(${target_name} PRIVATE + "CUDASTF_BOUNDSCHECK" + ) + endif() + + if (cudax_ENABLE_CUDASTF_DEBUG) + target_compile_definitions(${target_name} PRIVATE + "CUDASTF_DEBUG" + ) + endif() +endfunction() diff --git a/cudax/cmake/stf_header_unittest.in.cu b/cudax/cmake/stf_header_unittest.in.cu new file mode 100644 index 00000000000..6b2354ce49a --- /dev/null +++ b/cudax/cmake/stf_header_unittest.in.cu @@ -0,0 +1,9 @@ +// This file is autogenerated by configuring stf_header_unittest.in.cu. + +// clang-format off +#define UNITTESTED_FILE "@source@" + +#include + +#include <@source@> +//clang-format on diff --git a/cudax/examples/CMakeLists.txt b/cudax/examples/CMakeLists.txt new file mode 100644 index 00000000000..f766bd25b2c --- /dev/null +++ b/cudax/examples/CMakeLists.txt @@ -0,0 +1,14 @@ +foreach(cn_target IN LISTS cudax_TARGETS) + cudax_get_target_property(config_prefix ${cn_target} PREFIX) + + # Metatarget for the current configuration's tests: + set(config_meta_target ${config_prefix}.examples) + add_custom_target(${config_meta_target}) + add_dependencies(${config_prefix}.all ${config_meta_target}) +endforeach() + +# FIXME: Enable MSVC +if (NOT "MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") + # STF tests are handled separately: + add_subdirectory(stf) +endif() diff --git a/cudax/examples/stf/01-axpy-cuda_kernel.cu b/cudax/examples/stf/01-axpy-cuda_kernel.cu new file mode 100644 index 00000000000..4613bac5fd9 --- /dev/null +++ b/cudax/examples/stf/01-axpy-cuda_kernel.cu @@ -0,0 +1,73 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+//
+//===----------------------------------------------------------------------===//
+
+/**
+ * @file
+ *
+ * @brief An AXPY kernel described using a cuda_kernel construct
+ *
+ */
+
+#include <cuda/experimental/stf.cuh>
+
+using namespace cuda::experimental::stf;
+
+__global__ void axpy(double a, slice<const double> x, slice<double> y)
+{
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int nthreads = gridDim.x * blockDim.x;
+
+  for (int i = tid; i < x.size(); i += nthreads)
+  {
+    y(i) += a * x(i);
+  }
+}
+
+double X0(int i)
+{
+  return sin((double) i);
+}
+
+double Y0(int i)
+{
+  return cos((double) i);
+}
+
+int main()
+{
+  context ctx = graph_ctx();
+  const size_t N = 16;
+  double X[N], Y[N];
+
+  for (size_t i = 0; i < N; i++)
+  {
+    X[i] = X0(i);
+    Y[i] = Y0(i);
+  }
+
+  double alpha = 3.14;
+
+  auto lX = ctx.logical_data(X);
+  auto lY = ctx.logical_data(Y);
+
+  /* Compute Y = Y + alpha X */
+  ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) {
+    // axpy<<<16, 128, 0, ...>>>(alpha, dX, dY)
+    return cuda_kernel_desc{axpy, 16, 128, 0, alpha, dX, dY};
+  };
+
+  ctx.finalize();
+
+  for (size_t i = 0; i < N; i++)
+  {
+    assert(fabs(Y[i] - (Y0(i) + alpha * X0(i))) < 0.0001);
+    assert(fabs(X[i] - X0(i)) < 0.0001);
+  }
+}
diff --git a/cudax/examples/stf/01-axpy-cuda_kernel_chain.cu b/cudax/examples/stf/01-axpy-cuda_kernel_chain.cu
new file mode 100644
index 00000000000..fb3724dceff
--- /dev/null
+++ b/cudax/examples/stf/01-axpy-cuda_kernel_chain.cu
@@ -0,0 +1,80 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDASTF in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Example of task implementing a chain of CUDA kernels + * + */ + +#include + +using namespace cuda::experimental::stf; + +__global__ void axpy(double a, slice x, slice y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < x.size(); i += nthreads) + { + y(i) += a * x(i); + } +} + +double X0(int i) +{ + return sin((double) i); +} + +double Y0(int i) +{ + return cos((double) i); +} + +int main() +{ + context ctx = graph_ctx(); + const size_t N = 16; + double X[N], Y[N]; + + for (size_t i = 0; i < N; i++) + { + X[i] = X0(i); + Y[i] = Y0(i); + } + + double alpha = 3.14; + double beta = 4.5; + double gamma = -4.1; + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + /* Compute Y = Y + alpha X, Y = Y + beta X and then Y = Y + gamma X */ + ctx.cuda_kernel_chain(lX.read(), lY.rw())->*[&](auto dX, auto dY) { + // clang-format off + return std::vector { + { axpy, 16, 128, 0, alpha, dX, dY }, + { axpy, 16, 128, 0, beta, dX, dY }, + { axpy, 16, 128, 0, gamma, dX, dY } + }; + // clang-format on + }; + + ctx.finalize(); + + for (size_t i = 0; i < N; i++) + { + assert(fabs(Y[i] - (Y0(i) + (alpha + beta + gamma) * X0(i))) < 0.0001); + assert(fabs(X[i] - X0(i)) < 0.0001); + } +} diff --git a/cudax/examples/stf/01-axpy-launch.cu b/cudax/examples/stf/01-axpy-launch.cu new file mode 100644 index 00000000000..4dcd01ab46b --- /dev/null +++ b/cudax/examples/stf/01-axpy-launch.cu @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Example of AXPY kernel implemented with the launch API + */ + +#include + +using namespace cuda::experimental::stf; + +double X0(int i) +{ + return sin((double) i); +} + +double Y0(int i) +{ + return cos((double) i); +} + +int main() +{ + context ctx; + const size_t N = 16; + double X[N], Y[N]; + + for (size_t i = 0; i < N; i++) + { + X[i] = X0(i); + Y[i] = Y0(i); + } + + double alpha = 3.14; + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + /* Compute Y = Y + alpha X */ + ctx.launch(lX.read(), lY.rw())->*[=] _CCCL_DEVICE(auto t, auto dX, auto dY) { + for (auto ind : t.apply_partition(shape(dX))) + { + dY(ind) += alpha * dX(ind); + } + }; + + ctx.finalize(); + + for (size_t i = 0; i < N; i++) + { + assert(fabs(Y[i] - (Y0(i) + alpha * X0(i))) < 0.0001); + assert(fabs(X[i] - X0(i)) < 0.0001); + } +} diff --git a/cudax/examples/stf/01-axpy-parallel_for.cu b/cudax/examples/stf/01-axpy-parallel_for.cu new file mode 100644 index 00000000000..be69b8dbadd --- /dev/null +++ b/cudax/examples/stf/01-axpy-parallel_for.cu @@ -0,0 +1,61 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief An AXPY kernel implemented using the parallel_for construct + * + */ + +#include + +using namespace cuda::experimental::stf; + +double X0(int i) +{ + return sin((double) i); +} + +double Y0(int i) +{ + return cos((double) i); +} + +int main() +{ + context ctx; + const size_t N = 16; + double X[N], Y[N]; + + for (size_t i = 0; i < N; i++) + { + X[i] = X0(i); + Y[i] = Y0(i); + } + + double alpha = 3.14; + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + /* Compute Y = Y + alpha X */ + ctx.parallel_for(lY.shape(), lX.read(), lY.rw())->*[alpha] __device__(size_t i, auto dX, auto dY) { + dY(i) += alpha * dX(i); + }; + + ctx.finalize(); + + for (size_t i = 0; i < N; i++) + { + assert(fabs(Y[i] - (Y0(i) + alpha * X0(i))) < 0.0001); + assert(fabs(X[i] - X0(i)) < 0.0001); + } +} diff --git a/cudax/examples/stf/01-axpy.cu b/cudax/examples/stf/01-axpy.cu new file mode 100644 index 00000000000..5c1d684fab7 --- /dev/null +++ b/cudax/examples/stf/01-axpy.cu @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief An AXPY kernel implemented with CUDA kernel in a task + * + */ + +#include + +using namespace cuda::experimental::stf; + +__global__ void axpy(double a, slice x, slice y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < x.size(); i += nthreads) + { + y(i) += a * x(i); + } +} + +double X0(int i) +{ + return sin((double) i); +} + +double Y0(int i) +{ + return cos((double) i); +} + +int main() +{ + context ctx; + const size_t N = 16; + double X[N], Y[N]; + + for (size_t i = 0; i < N; i++) + { + X[i] = X0(i); + Y[i] = Y0(i); + } + + double alpha = 3.14; + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + /* Compute Y = Y + alpha X */ + ctx.task(lX.read(), lY.rw())->*[&](cudaStream_t s, auto dX, auto dY) { + axpy<<<16, 128, 0, s>>>(alpha, dX, dY); + }; + + ctx.finalize(); + + for (size_t i = 0; i < N; i++) + { + assert(fabs(Y[i] - (Y0(i) + alpha * X0(i))) < 0.0001); + assert(fabs(X[i] - X0(i)) < 0.0001); + } +} diff --git a/cudax/examples/stf/02-axpy-host_launch.cu b/cudax/examples/stf/02-axpy-host_launch.cu new file mode 100644 index 00000000000..5c7bb85bdd4 --- /dev/null +++ b/cudax/examples/stf/02-axpy-host_launch.cu @@ -0,0 +1,81 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief An AXPY kernel implemented with a task of the CUDA graph backend and + * a host callback + * + * The host_launch mechanism is also illustrated + * + */ + +#include + +using namespace cuda::experimental::stf; + +__global__ void axpy(double a, slice x, slice y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < x.size(); i += nthreads) + { + y(i) += a * x(i); + } +} + +double X0(int i) +{ + return sin((double) i); +} + +double Y0(int i) +{ + return cos((double) i); +} + +int main() +{ + graph_ctx ctx; + const size_t N = 16; + double X[N], Y[N]; + + for (size_t i = 0; i < N; i++) + { + X[i] = X0(i); + Y[i] = Y0(i); + } + + double alpha = 3.14; + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + /* Compute Y = Y + alpha X */ + ctx.task(lX.read(), lY.rw())->*[&](cudaStream_t s, auto dX, auto dY) { + axpy<<<16, 128, 0, s>>>(alpha, dX, dY); + }; + + /* Asynchronously check the result on the host */ + ctx.host_launch(lX.read(), lY.read())->*[&](auto hX, auto hY) { + for (size_t ind = 0; ind < hX.extent(0); ind++) + { + // Y should be Y0 + alpha X0 + EXPECT(fabs(hY(ind) - (Y0(ind) + alpha * X0(ind))) < 0.0001); + + // X should be X0 + EXPECT(fabs(hX(ind) - X0(ind)) < 0.0001); + } + }; + + ctx.finalize(); +} diff --git a/cudax/examples/stf/03-temporary-data.cu b/cudax/examples/stf/03-temporary-data.cu new file mode 100644 index 00000000000..491cdd57d72 --- /dev/null +++ b/cudax/examples/stf/03-temporary-data.cu @@ -0,0 +1,69 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief This example illustrates how we can create temporary data from shapes, and use them in tasks + */ + +#include + +using namespace cuda::experimental::stf; + +int main() +{ + const int n = 4096; + int X[n]; + int Y[n]; + + for (size_t i = 0; i < n; i++) + { + X[i] = 3 * i; + Y[i] = 2 * i - 3; + } + + context ctx; + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + // Select an odd number + int niter = 19; + assert(niter % 2 == 1); + + for (int iter = 0; iter < niter; iter++) + { + // We here define a temporary vector with the same shape as X, for which there is no existing copy + // This data handle has a limited scope, so that it is automatically destroyed at each iteration of the loop + auto tmp = ctx.logical_data(lX.shape()); + + ctx.task(lY.rw(), lX.rw(), tmp.write())->*[](cudaStream_t s, auto sY, auto sX, auto sTMP) { + // We swap X and Y using TMP as temporary buffer + // TMP = X + cuda_safe_call( + cudaMemcpyAsync(sTMP.data_handle(), sX.data_handle(), n * sizeof(int), cudaMemcpyDeviceToDevice, s)); + // X = Y + cuda_safe_call(cudaMemcpyAsync(sX.data_handle(), sY.data_handle(), n * sizeof(int), cudaMemcpyDeviceToDevice, s)); + // Y = TMP + cuda_safe_call( + cudaMemcpyAsync(sY.data_handle(), sTMP.data_handle(), n * sizeof(int), cudaMemcpyDeviceToDevice, s)); + }; + } + + ctx.finalize(); + + // We have exchanged an odd number of times, so they must be inverted + for (size_t i = 0; i < n; i++) + { + assert(X[i] == 2 * i - 3); + assert(Y[i] == 3 * i); + } +} diff --git a/cudax/examples/stf/04-fibonacci-run_once.cu b/cudax/examples/stf/04-fibonacci-run_once.cu new file mode 100644 index 00000000000..f4975f46b8d --- /dev/null +++ b/cudax/examples/stf/04-fibonacci-run_once.cu @@ -0,0 +1,79 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief An example of Fibonacci sequence illustrating how we can use + * dynamically created logical data and the run_once utility + */ + +#include + +using namespace cuda::experimental::stf; + +int fibo_ref(int n) +{ + if (n < 2) + { + return n; + } + else + { + return fibo_ref(n - 1) + fibo_ref(n - 2); + } +} + +__global__ void add(slice out, const slice in1, const slice in2) +{ + out(0) = in1(0) + in2(0); +} + +__global__ void set(slice out, int val) +{ + out(0) = val; +} + +logical_data> compute_fibo(context& ctx, int n) +{ + // The result for a given value n is memoized in a logical_data that will be reused every time we compute the same + // value + return run_once(n)->*[&](int n) { + auto result = ctx.logical_data(shape_of>(1)).set_symbol(std::to_string(n)); + if (n < 2) + { + ctx.task(result.write()).set_symbol("fibo" + std::to_string(n))->*[=](cudaStream_t s, auto sresult) { + set<<<1, 1, 0, s>>>(sresult, n); + }; + } + else + { + auto fib2 = compute_fibo(ctx, n - 2); + auto fib1 = compute_fibo(ctx, n - 1); + ctx.task(fib1.read(), fib2.read(), result.write()).set_symbol("fibo" + std::to_string(n)) + ->*[=](cudaStream_t s, auto s1, auto s2, auto sresult) { + add<<<1, 1, 0, s>>>(sresult, s1, s2); + }; + } + return result; + }; +} + +int main(int argc, char** argv) +{ + int n = (argc > 1) ? atoi(argv[1]) : 4; + + context ctx; + auto result = compute_fibo(ctx, n); + ctx.host_launch(result.read())->*[&](auto res) { + EXPECT(res(0) == fibo_ref(n)); + }; + ctx.finalize(); +} diff --git a/cudax/examples/stf/04-fibonacci.cu b/cudax/examples/stf/04-fibonacci.cu new file mode 100644 index 00000000000..c9c2af840b4 --- /dev/null +++ b/cudax/examples/stf/04-fibonacci.cu @@ -0,0 +1,75 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief An example of Fibonacci sequence illustrating how we can use + * dynamically created logical data + */ + +#include + +using namespace cuda::experimental::stf; + +int fibo_ref(int n) +{ + if (n < 2) + { + return n; + } + else + { + return fibo_ref(n - 1) + fibo_ref(n - 2); + } +} + +__global__ void add(slice out, const slice in1, const slice in2) +{ + out(0) = in1(0) + in2(0); +} + +__global__ void set(slice out, int val) +{ + out(0) = val; +} + +logical_data> compute_fibo(context& ctx, int n) +{ + auto out = ctx.logical_data(shape_of>(1)); + if (n < 2) + { + ctx.task(out.write())->*[=](cudaStream_t s, auto sout) { + set<<<1, 1, 0, s>>>(sout, n); + }; + } + else + { + auto fib1 = compute_fibo(ctx, n - 1); + auto fib2 = compute_fibo(ctx, n - 2); + ctx.task(fib1.read(), fib2.read(), out.write())->*[=](cudaStream_t s, auto s1, auto s2, auto sout) { + add<<<1, 1, 0, s>>>(sout, s1, s2); + }; + } + + return out; +} + +int main(int argc, char** argv) +{ + int n = (argc > 1) ? 
atoi(argv[1]) : 4; + + context ctx; // = graph_ctx(); + auto result = compute_fibo(ctx, n); + ctx.host_launch(result.read())->*[&](auto res) { + EXPECT(res(0) == fibo_ref(n)); + }; + ctx.finalize(); +} diff --git a/cudax/examples/stf/08-cub-reduce.cu b/cudax/examples/stf/08-cub-reduce.cu new file mode 100644 index 00000000000..0695c90d3b5 --- /dev/null +++ b/cudax/examples/stf/08-cub-reduce.cu @@ -0,0 +1,95 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Example of reduction implementing using CUB kernels + */ + +#include + +#include + +using namespace cuda::experimental::stf; + +template +__global__ void reduce(slice values, slice partials, size_t nelems) +{ + using namespace cub; + typedef BlockReduce BlockReduceT; + + auto thread_id = BLOCK_THREADS * blockIdx.x + threadIdx.x; + + // Local reduction + T local_sum = 0; + for (size_t ind = thread_id; ind < nelems; ind += blockDim.x * gridDim.x) + { + local_sum += values(ind); + } + + __shared__ typename BlockReduceT::TempStorage temp_storage; + + // Per-thread tile data + T result = BlockReduceT(temp_storage).Sum(local_sum); + + if (threadIdx.x == 0) + { + partials(blockIdx.x) = result; + } +} + +template +void run() +{ + Ctx ctx; + + const size_t N = 1024 * 16; + const size_t BLOCK_SIZE = 128; + const size_t num_blocks = 32; + + int *X, ref_tot; + + X = new int[N]; + ref_tot = 0; + + for (size_t ind = 0; ind < N; ind++) + { + X[ind] = rand() % N; + ref_tot += X[ind]; + } + + auto values = ctx.logical_data(X, {N}); + auto partials = ctx.logical_data(shape_of>(num_blocks)); + auto result = ctx.logical_data(shape_of>(1)); + + ctx.task(values.read(), partials.write(), result.write())->*[&](auto stream, auto values, auto partials, auto result) { + // reduce values into partials + reduce<<>>(values, partials, N); + + // reduce partials on a single block into result + reduce<<<1, BLOCK_SIZE, 0, stream>>>(partials, result, num_blocks); + }; + + ctx.host_launch(result.read())->*[&](auto p) { + if (p(0) != ref_tot) + { + fprintf(stderr, "INCORRECT RESULT: p sum = %d, ref tot = %d\n", p(0), ref_tot); + abort(); + } + }; + + ctx.finalize(); +} + +int main() +{ + run(); + run(); +} diff --git a/cudax/examples/stf/1f1b.cu b/cudax/examples/stf/1f1b.cu new file mode 100644 index 00000000000..47b61a4b23e --- /dev/null +++ b/cudax/examples/stf/1f1b.cu @@ -0,0 +1,121 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Toy example to reproduce the asynchrony of a 1F1B pipeline + */ + +#include + +using namespace cuda::experimental::stf; + +__global__ void forward(slice, long long int clock_cnt) +{ + long long int start_clock = clock64(); + long long int clock_offset = 0; + while (clock_offset < clock_cnt) + { + clock_offset = clock64() - start_clock; + } +} + +__global__ void backward(slice, long long int clock_cnt) +{ + long long int start_clock = clock64(); + long long int clock_offset = 0; + while (clock_offset < clock_cnt) + { + clock_offset = clock64() - start_clock; + } +} + +int main(int argc, char** argv) +{ + context ctx; + // Use a graph context if the second argument is set and not null + if (argc > 2 && atoi(argv[2])) + { + ctx = graph_ctx(); + } + + int device; + cudaGetDevice(&device); + + // cudaDevAttrClockRate: Peak clock frequency in kilohertz; + int clock_rate; + cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, device); + + int f_grid_size, f_block_size; + std::tie(f_grid_size, f_block_size) = reserved::compute_occupancy(forward); + + int b_grid_size, b_block_size; + std::tie(b_grid_size, b_block_size) = reserved::compute_occupancy(backward); + + int factor = 1; + if (argc > 1) + { + factor = atoi(argv[1]); + } + + size_t num_batches = 8 * factor; + int num_devs = 8; + int real_devs; + cuda_safe_call(cudaGetDeviceCount(&real_devs)); + + std::vector>> data; + + for (size_t b = 0; b < num_batches; b++) + { + auto batch_data = ctx.logical_data(shape_of>(1024)); + data.push_back(batch_data); + + ctx.task(exec_place::device(0), data[b].write())->*[](cudaStream_t, auto) { + // Init ... + }; + } + + cuda_safe_call(cudaStreamSynchronize(ctx.task_fence())); + + size_t niter = 10; + + for (size_t iter = 0; iter < niter; iter++) + { + for (size_t b = 0; b < num_batches; b++) + { + for (int d = 0; d < num_devs; d++) + { + ctx.task(exec_place::device(d % real_devs), data[b].rw())->*[=](cudaStream_t s, auto bd) { + int ms = 10; + long long int clock_cnt = (long long int) (ms * clock_rate / factor); + forward<<>>(bd, clock_cnt); + }; + } + // } + // + // for (size_t b = 0; b < num_batches; b++) { + for (int d = num_devs; d-- > 0;) + { + ctx.task(exec_place::device(d % real_devs), data[b].rw())->*[=](cudaStream_t s, auto bd) { + int ms = 20; + long long int clock_cnt = (long long int) (ms * clock_rate / factor); + backward<<>>(bd, clock_cnt); + }; + } + } + + /* We introduce a fence because the actual pipeline would introduce + * some all to all communication to update coefficients */ + cuda_safe_call(cudaStreamSynchronize(ctx.task_fence())); + } + + ctx.finalize(); + return 0; +} diff --git a/cudax/examples/stf/CMakeLists.txt b/cudax/examples/stf/CMakeLists.txt new file mode 100644 index 00000000000..0ae4ccbb0e2 --- /dev/null +++ b/cudax/examples/stf/CMakeLists.txt @@ -0,0 +1,124 @@ +set(stf_example_sources + 01-axpy.cu + 01-axpy-cuda_kernel.cu + 01-axpy-cuda_kernel_chain.cu + 01-axpy-parallel_for.cu + 01-axpy-launch.cu + 02-axpy-host_launch.cu + 03-temporary-data.cu + 04-fibonacci.cu + 04-fibonacci-run_once.cu + 08-cub-reduce.cu + axpy-annotated.cu + binary_fhe.cu + cfd.cu + custom_data_interface.cu + void_data_interface.cu + frozen_data_init.cu + graph_algorithms/degree_centrality.cu + graph_algorithms/pagerank.cu + graph_algorithms/tricount.cu + graph_algorithms/jaccard.cu + explicit_data_places.cu + fdtd_mgpu.cu + heat.cu + heat_mgpu.cu + jacobi.cu + launch_histogram.cu + 
launch_scan.cu + launch_sum.cu + launch_sum_cub.cu + logical_gates_composition.cu + parallel_for_2D.cu + scan.cu + mandelbrot.cu + standalone-launches.cu + thrust_zip_iterator.cu + word_count.cu + 1f1b.cu +) + +# Examples using CUBLAS, CUSOLVER... +set(stf_example_mathlib_sources + linear_algebra/06-pdgemm.cu + linear_algebra/07-cholesky.cu + linear_algebra/07-potri.cu + linear_algebra/cg_csr.cu + linear_algebra/cg_dense_2D.cu + linear_algebra/strassen.cu +) + +find_package(cudax) # already found, bring in version info. + +find_package(CUB ${cudax_VERSION} EXACT CONFIG + NO_DEFAULT_PATH # Only check the explicit path in HINTS: + HINTS "${CCCL_SOURCE_DIR}/lib/cmake/cub/" +) + +find_package(Thrust ${cudax_VERSION} EXACT CONFIG + NO_DEFAULT_PATH # Only check the explicit path in HINTS: + HINTS "${CCCL_SOURCE_DIR}/lib/cmake/thrust/" +) +thrust_create_target(cudax.stf.examples.thrust) + +find_package(CUDAToolkit REQUIRED) + +## cudax_add_stf_example +# +# Add an stf example executable and register it with ctest. +# +# target_name_var: Variable name to overwrite with the name of the example +# target. Useful for post-processing target information. +# source: The source file for the example. +# cn_target: The reference cudax target with configuration information. +# Additional args are passed to cudax_stf_configure_target. +function(cudax_add_stf_example target_name_var source cn_target) + cudax_get_target_property(config_dialect ${cn_target} DIALECT) + cudax_get_target_property(config_prefix ${cn_target} PREFIX) + + get_filename_component(dir ${source} DIRECTORY) + get_filename_component(filename ${source} NAME_WE) + if (dir) + set(filename "${dir}/${filename}") + endif() + string(REPLACE "/" "." example_name "stf/${filename}") + + set(example_target ${config_prefix}.example.${example_name}) + + add_executable(${example_target} ${source}) + cccl_configure_target(${example_target} DIALECT ${config_dialect}) + cudax_clone_target_properties(${example_target} ${cn_target}) + cudax_stf_configure_target(${example_target} ${ARGN}) + target_link_libraries(${example_target} PRIVATE + cudax.stf.examples.thrust + CUB::CUB + ) + + set(stf_meta_target ${config_prefix}.examples.stf) + add_dependencies(${stf_meta_target} ${example_target}) + + add_test(NAME ${example_target} COMMAND "$") + + set(${target_name_var} ${example_target} PARENT_SCOPE) +endfunction() + +# Create examples for each enabled configuration: +foreach(cn_target IN LISTS cudax_TARGETS) + cudax_get_target_property(config_prefix ${cn_target} PREFIX) + + # Metatarget for the current configuration's examples: + set(config_meta_target ${config_prefix}.examples) + set(stf_meta_target ${config_prefix}.examples.stf) + add_custom_target(${stf_meta_target}) + add_dependencies(${config_meta_target} ${stf_meta_target}) + + foreach(source IN LISTS stf_example_sources) + cudax_add_stf_example(example_target "${source}" ${cn_target}) + endforeach() + + if (cudax_ENABLE_CUDASTF_MATHLIBS) + foreach(source IN LISTS stf_example_mathlib_sources) + cudax_add_stf_example(example_target "${source}" ${cn_target} LINK_MATHLIBS) + endforeach() + endif() +endforeach() diff --git a/cudax/examples/stf/axpy-annotated.cu b/cudax/examples/stf/axpy-annotated.cu new file mode 100644 index 00000000000..f4b81f5e96d --- /dev/null +++ b/cudax/examples/stf/axpy-annotated.cu @@ -0,0 +1,82 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief This example illustrates how we can annotate tasks and logical data with debugging symbol + * + * CUDASTF_DOT_FILE=axpy.dot build/examples/axpy-annotated + * + * # Generate the visualization from this dot file in PDF or PNG format + * dot -Tpdf axpy.dot -o axpy.pdf + * dot -Tpng axpy.dot -o axpy.png + * + * # Generate visualization with events (for advanced users) + * CUDASTF_DOT_IGNORE_PREREQS=0 CUDASTF_DOT_FILE=axpy-with-events.dot build/examples/axpy-annotated + * dot -Tpng axpy-with-events.dot -o axpy-with-events.png + * + */ + +#include + +using namespace cuda::experimental::stf; + +__global__ void axpy(double a, slice x, slice y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < x.size(); i += nthreads) + { + y(i) += a * x(i); + } +} + +double X0(int i) +{ + return sin((double) i); +} + +double Y0(int i) +{ + return cos((double) i); +} + +int main() +{ + context ctx; + const size_t N = 16; + double X[N], Y[N]; + + for (size_t i = 0; i < N; i++) + { + X[i] = X0(i); + Y[i] = Y0(i); + } + + double alpha = 3.14; + + auto lX = ctx.logical_data(X).set_symbol("X"); + auto lY = ctx.logical_data(Y).set_symbol("Y"); + + /* Compute Y = Y + alpha X */ + ctx.task(lX.read(), lY.rw()).set_symbol("axpy")->*[&](cudaStream_t s, auto dX, auto dY) { + axpy<<<16, 128, 0, s>>>(alpha, dX, dY); + }; + + ctx.finalize(); + + for (size_t i = 0; i < N; i++) + { + assert(fabs(Y[i] - (Y0(i) + alpha * X0(i))) < 0.0001); + assert(fabs(X[i] - X0(i)) < 0.0001); + } +} diff --git a/cudax/examples/stf/binary_fhe.cu b/cudax/examples/stf/binary_fhe.cu new file mode 100644 index 00000000000..6b6e86154b4 --- /dev/null +++ b/cudax/examples/stf/binary_fhe.cu @@ -0,0 +1,201 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief A toy example to illustrate how we can compose logical operations + * over encrypted data + */ + +#include + +using namespace cuda::experimental::stf; + +class ciphertext; + +class plaintext +{ +public: + plaintext(const context& ctx) + : ctx(ctx) + {} + + plaintext(context& ctx, std::vector v) + : values(v) + , ctx(ctx) + { + l = ctx.logical_data(&values[0], values.size()); + } + + void set_symbol(std::string s) + { + l.set_symbol(s); + symbol = s; + } + + std::string get_symbol() const + { + return symbol; + } + + std::string symbol; + + const logical_data>& data() const + { + return l; + } + + logical_data>& data() + { + return l; + } + + // This will asynchronously fill string s + void convert_to_vector(std::vector& v) + { + ctx.host_launch(l.read()).set_symbol("to_vector")->*[&](auto dl) { + v.resize(dl.size()); + for (size_t i = 0; i < dl.size(); i++) + { + v[i] = dl(i); + } + }; + } + + ciphertext encrypt() const; + + logical_data> l; + +private: + std::vector values; + mutable context ctx; +}; + +class ciphertext +{ +public: + ciphertext(const context& ctx) + : ctx(ctx) + {} + + plaintext decrypt() const + { + plaintext p(ctx); + p.l = ctx.logical_data(shape_of>(l.shape().size())); + // fprintf(stderr, "Decrypting...\n"); + ctx.parallel_for(l.shape(), l.read(), p.l.write()).set_symbol("decrypt")->* + [] _CCCL_DEVICE(size_t i, auto dctxt, auto dptxt) { + dptxt(i) = char((dctxt(i) >> 32)); + // printf("DECRYPT %ld : %lx -> %x\n", i, dctxt(i), (int) dptxt(i)); + }; + return p; + } + + ciphertext operator|(const ciphertext& other) const + { + ciphertext result(ctx); + result.l = ctx.logical_data(data().shape()); + + ctx.parallel_for(data().shape(), data().read(), other.data().read(), result.data().write()).set_symbol("OR")->* + [] _CCCL_DEVICE(size_t i, auto d_c1, auto d_c2, auto d_res) { + d_res(i) = d_c1(i) | d_c2(i); + }; + + return result; + } + + ciphertext operator&(const ciphertext& other) const + { + ciphertext result(ctx); + result.l = ctx.logical_data(data().shape()); + + ctx.parallel_for(data().shape(), data().read(), other.data().read(), result.data().write()).set_symbol("AND")->* + [] _CCCL_DEVICE(size_t i, auto d_c1, auto d_c2, auto d_res) { + d_res(i) = d_c1(i) & d_c2(i); + }; + + return result; + } + + ciphertext operator~() const + { + ciphertext result(ctx); + result.l = ctx.logical_data(data().shape()); + ctx.parallel_for(data().shape(), data().read(), result.data().write()).set_symbol("NOT")->* + [] _CCCL_DEVICE(size_t i, auto d_c, auto d_res) { + d_res(i) = ~d_c(i); + }; + + return result; + } + + const logical_data>& data() const + { + return l; + } + + logical_data>& data() + { + return l; + } + + logical_data> l; + +private: + mutable context ctx; +}; + +ciphertext plaintext::encrypt() const +{ + ciphertext c(ctx); + c.l = ctx.logical_data(shape_of>(l.shape().size())); + + ctx.parallel_for(l.shape(), l.read(), c.l.write()).set_symbol("encrypt")->* + [] _CCCL_DEVICE(size_t i, auto dptxt, auto dctxt) { + // A super safe encryption ! 
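+              // The plaintext value is kept in the upper 32 bits of the 64-bit ciphertext word,
+              // with a constant tag in the lower bits; decrypt() recovers it with a right shift by 32.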
+ dctxt(i) = ((uint64_t) (dptxt(i)) << 32 | 0x4); + }; + + return c; +} + +template +T circuit(const T& a, const T& b) +{ + return (~((a | ~b) & (~a | b))); +} + +int main() +{ + context ctx; + + std::vector vA{3, 3, 2, 2, 17}; + plaintext pA(ctx, vA); + pA.set_symbol("A"); + + std::vector vB{1, 7, 7, 7, 49}; + plaintext pB(ctx, vB); + pB.set_symbol("B"); + + auto eA = pA.encrypt(); + auto eB = pB.encrypt(); + auto out = circuit(eA, eB); + + std::vector v_out; + out.decrypt().convert_to_vector(v_out); + + ctx.finalize(); + + for (size_t i = 0; i < v_out.size(); i++) + { + char expected = circuit(vA[i], vB[i]); + EXPECT(expected == v_out[i]); + } +} diff --git a/cudax/examples/stf/cfd.cu b/cudax/examples/stf/cfd.cu new file mode 100644 index 00000000000..ee67e3d5a3a --- /dev/null +++ b/cudax/examples/stf/cfd.cu @@ -0,0 +1,600 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Simulation of a fluid over a regular grid with an implicit Jacobi solver + */ + +#include + +#include + +#include +#include +#include + +using namespace std::chrono; +using namespace cuda::experimental::stf; + +/* wall-clock time */ +double gettime() +{ + auto now = system_clock::now().time_since_epoch(); + return duration_cast>(now).count(); +} + +void writeplotfile(int m, int n, int scale) +{ + FILE* gnuplot = EXPECT(fopen("cfd.plt", "w")); + SCOPE(exit) + { + EXPECT(fclose(gnuplot) == 0); + }; + + fprintf(gnuplot, + "set terminal pngcairo\n" + "set output 'cfd_output.png'\n" + "set size square\n" + "set key off\n" + "unset xtics\n" + "unset ytics\n"); + + fprintf(gnuplot, "set xrange [%i:%i]\n", 1 - scale, m + scale); + fprintf(gnuplot, "set yrange [%i:%i]\n", 1 - scale, n + scale); + + fprintf(gnuplot, + "plot \"colourmap.dat\" w rgbimage, \"velocity.dat\" u " + "1:2:(%d*0.75*$3/sqrt($3**2+$4**2)):(%d*0.75*$4/sqrt($3**2+$4**2)) with vectors lc rgb \"#7F7F7F\"", + scale, + scale); + + // printf("\nWritten gnuplot script 'cfd.plt'\n"); +} + +double colfunc(double x) +{ + double x1 = 0.2; + double x2 = 0.5; + + double absx = fabs(x); + + if (absx > x2) + { + return 0.0; + } + else if (absx < x1) + { + return 1.0; + } + else + { + return 1.0 - pow((absx - x1) / (x2 - x1), 2); + } +} + +void hue2rgb(double hue, int& r, int& g, int& b) +{ + int rgbmax = 255; + + r = (int) (rgbmax * colfunc(hue - 1.0)); + g = (int) (rgbmax * colfunc(hue - 0.5)); + b = (int) (rgbmax * colfunc(hue)); +} + +void writedatafiles(context& ctx, logical_data> lpsi, int m, int n, int scale) +{ + auto lvel = ctx.logical_data(shape_of>(m, n, 2)).set_symbol("vel"); + auto lrgb = ctx.logical_data(shape_of>(m, n, 3)).set_symbol("rgb"); + + ctx.host_launch(lpsi.read(), lvel.write(), lrgb.write()).set_symbol("writedatafiles") + ->*[=](auto psi, auto vel, auto rgb) { + // printf("\n\nWriting data files ...\n"); + + // calculate velocities and hues + + for (int i = 0; i < m; i++) + { + for (int j = 0; j < n; j++) + { + vel(i, j, 0) = (psi(i + 1, j + 2) - psi(i + 1, j)) / 2.0; + vel(i, j, 1) = -(psi(i + 2, j + 1) - psi(i, j + 1)) / 2.0; + + double v1 = vel(i, j, 0); + double v2 = vel(i, j, 1); + + 
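+          // Squared velocity magnitude at cell (i, j); its 0.4 power below gives the colour hue.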
double modvsq = v1 * v1 + v2 * v2; + + double hue = pow(modvsq, 0.4); + + hue2rgb(hue, rgb(i, j, 0), rgb(i, j, 1), rgb(i, j, 2)); + } + } + + // write data + + FILE* cfile = EXPECT(fopen("colourmap.dat", "w")); + SCOPE(exit) + { + fclose(cfile); + }; + FILE* vfile = EXPECT(fopen("velocity.dat", "w")); + SCOPE(exit) + { + fclose(vfile); + }; + + for (int i = 0; i < m; i++) + { + int ix = i + 1; + + for (int j = 0; j < n; j++) + { + int iy = j + 1; + + fprintf(cfile, "%i %i %i %i %i\n", ix, iy, rgb(i, j, 0), rgb(i, j, 1), rgb(i, j, 2)); + + if ((ix - 1) % scale == (scale - 1) / 2 && (iy - 1) % scale == (scale - 1) / 2) + { + fprintf(vfile, "%i %i %f %f\n", ix, iy, vel(i, j, 0), vel(i, j, 1)); + } + } + } + + // printf("... done!\n"); + + writeplotfile(m, n, scale); + }; +} + +void jacobistep(context& ctx, logical_data> lpsinew, logical_data> lpsi, int m, int n) +{ + ctx.parallel_for(box<2>({1, m + 1}, {1, n + 1}), lpsinew.write(), lpsi.read()).set_symbol("jacobi_step") + ->*[] __device__(size_t i, size_t j, auto psinew, auto psi) { + psinew(i, j) = 0.25 * (psi(i - 1, j) + psi(i + 1, j) + psi(i, j + 1) + psi(i, j - 1)); + }; +} + +void jacobistepvort( + context& ctx, + logical_data> lzetnew, + logical_data> lpsinew, + logical_data> lzet, + logical_data> lpsi, + int m, + int n, + double re) +{ + ctx.parallel_for(box<2>({1, m + 1}, {1, n + 1}), lpsinew.write(), lpsi.read(), lzet.read()) + .set_symbol("jacobi_step_psi") + ->*[] __device__(size_t i, size_t j, auto psinew, auto psi, auto zet) { + psinew(i, j) = 0.25 * (psi(i - 1, j) + psi(i + 1, j) + psi(i, j + 1) + psi(i, j - 1)) - zet(i, j); + }; + + ctx.parallel_for(box<2>({1, m + 1}, {1, n + 1}), lzetnew.write(), lzet.read(), lpsi.read()) + .set_symbol("jacobi_step_zet") + ->*[=] __device__(size_t i, size_t j, auto zetnew, auto zet, auto psi) { + zetnew(i, j) = 0.25 * (zet(i - 1, j) + zet(i + 1, j) + zet(i, j + 1) + zet(i, j - 1)) + - re / 16.0 + * ((psi(i, j + 1) - psi(i, j - 1)) * (zet(i + 1, j) - zet(i - 1, j)) + - (psi(i + 1, j) - psi(i - 1, j)) * (zet(i, j + 1) - zet(i, j - 1))); + }; +} + +template +T transfer_host(context& ctx, logical_data>& ldata) +{ + T out; + + bool is_graph = ctx.is_graph_ctx(); + + if (is_graph) + { + ctx.host_launch(ldata.read()).set_symbol("transfer_host")->*[&](auto data) { + out = data(0); + }; + + /* This forces the completion of the host callback, so that the host + * thread can use the content for dynamic control flow */ + cudaStreamSynchronize(ctx.task_fence()); + } + else + { + ctx.task(exec_place::host, ldata.read()).set_symbol("transfer_host")->*[&](cudaStream_t stream, auto data) { + cuda_safe_call(cudaStreamSynchronize(stream)); + out = data(0); + }; + } + + return out; +} + +double +deltasq(context& ctx, logical_data> lnewarr, logical_data> loldarr, int m, int n) +{ + auto ldsq = ctx.logical_data(shape_of>({1})).set_symbol("tmp_accumulator"); + + // + // for (i = 1; i <= m; i++) { + // for (j = 1; j <= n; j++) { + // double tmp = newarr[i * (m + 2) + j] - oldarr[i * (m + 2) + j]; + // dsq += tmp * tmp; + // } + // } + + auto spec = con(con<128>(hw_scope::thread)); + ctx.launch(spec, ldsq.write(), lnewarr.read(), loldarr.read()).set_symbol("deltasq")->* + [m, n] __device__(auto th, auto dsq, auto newarr, auto oldarr) { + if (th.rank() == 0) + { + dsq(0) = 0.0; + } + th.sync(); + + // Each thread computes the sum of elements assigned to it + double local_sum = 0.0; + for (auto [i, j] : + th.apply_partition(box<2>({1, m + 1}, {1, n + 1}), std::tuple())) + { + double tmp = newarr(i, j) - oldarr(i, j); + 
local_sum += tmp * tmp; + } + + auto ti = th.inner(); + + __shared__ double block_sum[th.static_width(1)]; + block_sum[ti.rank()] = local_sum; + + for (size_t s = ti.size() / 2; s > 0; s /= 2) + { + if (ti.rank() < s) + { + block_sum[ti.rank()] += block_sum[ti.rank() + s]; + } + ti.sync(); + } + + if (ti.rank() == 0) + { + atomicAdd(&dsq(0), block_sum[0]); + } + }; + + return transfer_host(ctx, ldsq); +} + +void boundarypsi(context& ctx, logical_data> lpsi, int m, int /*n*/, int b, int h, int w) +{ + // BCs on bottom edge + ctx.parallel_for(box({b + 1, b + w}), lpsi.rw()).set_symbol("boundary_bottom")->*[=] __device__(size_t i, auto psi) { + psi(i, 0) = double(i - b); + }; + + ctx.parallel_for(box<1>({b + w, m + 1}), lpsi.rw()).set_symbol("boundary_bottom")->*[=] __device__(size_t i, auto psi) { + psi(i, 0) = double(w); + }; + + // BCS on RHS + ctx.parallel_for(box({1, h + 1}), lpsi.rw()).set_symbol("boundary_right")->*[=] __device__(size_t j, auto psi) { + psi(m + 1, j) = double(w); + }; + + ctx.parallel_for(box({h + 1, h + w}), lpsi.rw()).set_symbol("boundary_right")->*[=] __device__(size_t j, auto psi) { + psi(m + 1, j) = (double) (w - j + h); + }; +} + +void boundaryzet(context& ctx, logical_data> lzet, logical_data> lpsi, int m, int n) +{ + // set top/bottom BCs: + ctx.parallel_for(box({1, m + 1}), lzet.rw(), lpsi.read()).set_symbol("boundary_topbottom") + ->*[=] __device__(size_t i, auto zet, auto psi) { + zet(i, 0) = 2.0 * (psi(i, 1) - psi(i, 0)); + zet(i, n + 1) = 2.0 * (psi(i, n) - psi(i, n + 1)); + }; + + // set left and right BCs: + ctx.parallel_for(box({1, n + 1}), lzet.rw(), lpsi.read()).set_symbol("boundary_leftright") + ->*[=] __device__(size_t j, auto zet, auto psi) { + zet(0, j) = 2.0 * (psi(1, j) - psi(0, j)); + zet(m + 1, j) = 2.0 * (psi(m, j) - psi(m + 1, j)); + }; +} + +int main(int argc, char** argv) +{ + context ctx; + + int printfreq = 10; // output frequency + double error = -1.0; + double tolerance = 0.0001; //-1.0; // 0.0001; //tolerance for convergence. <=0 means do not check + + // command line arguments + int scalefactor = 1, numiter = 10; + + double re = -1.0; // Reynold's number - must be less than 3.7 + + // simulation sizes + int bbase = 10; + int hbase = 15; + int wbase = 5; + int mbase = 32; + int nbase = 32; + + int irrotational = 1, checkerr = 0; + + // do we stop because of tolerance? 
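+  // checkerr enables the periodic convergence test (every printfreq iterations) in the main loop below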
+ if (tolerance > 0) + { + checkerr = 1; + } + + // check command line parameters and parse them + + if (argc > 5) + { + printf("Usage: cfd [reynolds] [use_graphs]\n"); + return 0; + } + + if (argc > 1) + { + scalefactor = atoi(argv[1]); + } + + if (argc > 2) + { + numiter = atoi(argv[2]); + } + + if (argc > 3) + { + re = atof(argv[3]); + irrotational = 0; + } + + // Use a CUDA graph backend + if (argc > 4) + { + if (atoi(argv[4]) == 1) + { + ctx = graph_ctx(); + } + fprintf(stderr, "Using %s backend.\n", ctx.to_string().c_str()); + } + + // if (!checkerr) { + // printf("Scale Factor = %i, iterations = %i\n", scalefactor, numiter); + // } else { + // printf("Scale Factor = %i, iterations = %i, tolerance= %g\n", scalefactor, numiter, tolerance); + // } + + // if (irrotational) { + // printf("Irrotational flow\n"); + // } else { + // printf("Reynolds number = %f\n", re); + // } + + tolerance /= scalefactor; + + // Calculate b, h & w and m & n + int b = bbase * scalefactor; + int h = hbase * scalefactor; + int w = wbase * scalefactor; + int m = mbase * scalefactor; + int n = nbase * scalefactor; + + re /= scalefactor; + + // printf("Running CFD on %d x %d grid in serial\n", m, n); + + // main arrays and their temporary versions + logical_data> lzet, lzettmp, lpsi, lpsitmp; + + // allocate arrays + lpsi = ctx.logical_data(shape_of>(m + 2, n + 2)).set_symbol("psi"); + lpsitmp = ctx.logical_data(lpsi.shape()).set_symbol("psi_tmp"); + + // zero the psi array + ctx.parallel_for(lpsi.shape(), lpsi.write()).set_symbol("InitPsi")->*[] __device__(size_t i, size_t j, auto psi) { + psi(i, j) = 0.0; + }; + + if (!irrotational) + { + lzet = ctx.logical_data(lpsi.shape()).set_symbol("zet"); + lzettmp = ctx.logical_data(lpsi.shape()).set_symbol("zet_tmp"); + + // zero the zeta array + ctx.parallel_for(lzet.shape(), lzet.write()).set_symbol("InitZet")->*[] __device__(size_t i, size_t j, auto zet) { + zet(i, j) = 0.0; + }; + } + + // set the psi boundary conditions + boundarypsi(ctx, lpsi, m, n, b, h, w); + + // compute normalisation factor for error + auto lbnorm = ctx.logical_data(shape_of>({1})).set_symbol("bnorm"); + + nvtxRangePush("Compute_Normalization"); + + // bnorm += psi * psi + auto spec = con(con<32>()); + ctx.launch(spec, lbnorm.write(), lpsi.read()).set_symbol("Compute_Normalization") + ->*[] __device__(auto th, auto bnorm, auto psi) { + if (th.rank() == 0) + { + bnorm(0) = 0.0; + } + th.sync(); + // Each thread computes the sum of elements assigned to it + double local_sum = 0.0; + for (auto [i, j] : th.apply_partition(shape(psi))) + { + local_sum += psi(i, j) * psi(i, j); + } + + auto ti = th.inner(); + + __shared__ double block_sum[th.static_width(1)]; + block_sum[ti.rank()] = local_sum; + + for (size_t s = ti.size() / 2; s > 0; s /= 2) + { + if (ti.rank() < s) + { + block_sum[ti.rank()] += block_sum[ti.rank() + s]; + } + ti.sync(); + } + + if (ti.rank() == 0) + { + atomicAdd(&bnorm(0), block_sum[0]); + } + }; + + if (!irrotational) + { + // update zeta BCs that depend on psi + boundaryzet(ctx, lzet, lpsi, m, n); + + // update normalisation + ctx.launch(spec, lbnorm.rw(), lzet.read()).set_symbol("Compute_Normalization") + ->*[] __device__(auto th, auto bnorm, auto zet) { + // Each thread computes the sum of elements assigned to it + double local_sum = 0.0; + for (auto [i, j] : th.apply_partition(shape(zet))) + { + local_sum += zet(i, j) * zet(i, j); + } + + auto ti = th.inner(); + + __shared__ double block_sum[th.static_width(1)]; + block_sum[ti.rank()] = local_sum; + + for (size_t s = 
ti.size() / 2; s > 0; s /= 2) + { + if (ti.rank() < s) + { + block_sum[ti.rank()] += block_sum[ti.rank() + s]; + } + ti.sync(); + } + + if (ti.rank() == 0) + { + atomicAdd(&bnorm(0), block_sum[0]); + } + }; + } + + double bnorm = transfer_host(ctx, lbnorm); + bnorm = sqrt(bnorm); + + // begin iterative Jacobi loop + + // printf("\nStarting main loop...\n\n"); + + double tstart = gettime(); + nvtxRangePush("Overall_Iteration"); + + int iter = 1; + for (; iter <= numiter; iter++) + { + // calculate psi for next iteration + if (irrotational) + { + jacobistep(ctx, lpsitmp, lpsi, m, n); + } + else + { + jacobistepvort(ctx, lzettmp, lpsitmp, lzet, lpsi, m, n, re); + } + + // calculate current error if required + bool compute_error = (iter == numiter) || (checkerr && (iter % printfreq == 0)); + if (compute_error) + { + error = deltasq(ctx, lpsitmp, lpsi, m, n); + + if (!irrotational) + { + error += deltasq(ctx, lzettmp, lzet, m, n); + } + + error = sqrt(error); + error = error / bnorm; + + if (checkerr && (error < tolerance)) + { + // printf("Converged on iteration %d\n", iter); + break; + } + } + + // copy back + ctx.parallel_for(box<2>({1, m + 1}, {1, n + 1}), lpsi.rw(), lpsitmp.read()).set_symbol("SwitchPsi") + ->*[] __device__(size_t i, size_t j, auto psi, auto psitmp) { + psi(i, j) = psitmp(i, j); + }; + + if (!irrotational) + { + ctx.parallel_for(box<2>({1, m + 1}, {1, n + 1}), lzet.rw(), lzettmp.read()).set_symbol("SwitchZet") + ->*[] __device__(size_t i, size_t j, auto zet, auto zettmp) { + zet(i, j) = zettmp(i, j); + }; + } + + if (!irrotational) + { + // update zeta BCs that depend on psi + boundaryzet(ctx, lzet, lpsi, m, n); + } + + // if (iter % printfreq == 0) { + // if (!checkerr) { + // printf("Completed iteration %d\n", iter); + // } else { + // printf("Completed iteration %d, error = %g\n", iter, error); + // } + // } + } + nvtxRangePop(); // pop + + if (iter > numiter) + { + iter = numiter; + } + + double tstop = gettime(); + + double ttot = tstop - tstart; + double titer = ttot / (double) iter; + + // output results + + writedatafiles(ctx, lpsi, m, n, scalefactor); + + ctx.finalize(); + // print out some stats + + // printf("\n... finished\n"); + printf("After %d iterations, the error is %g\n", iter, error); + printf("Time for %d iterations was %g seconds\n", iter, ttot); + printf("Each iteration took %g seconds\n", titer); + + // printf("... finished\n"); + + return 0; +} diff --git a/cudax/examples/stf/custom_data_interface.cu b/cudax/examples/stf/custom_data_interface.cu new file mode 100644 index 00000000000..acb5bc9b159 --- /dev/null +++ b/cudax/examples/stf/custom_data_interface.cu @@ -0,0 +1,325 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief This example illustrates how to create a custom data interface and use them in tasks + * + */ + +#include + +using namespace cuda::experimental::stf; + +/** + * @brief A simple class describing a contiguous matrix of size (m, n) + */ +template +class matrix +{ +public: + matrix(size_t m, size_t n, T* base) + : m(m) + , n(n) + , base(base) + {} + + __host__ __device__ T& operator()(size_t i, size_t j) + { + return base[i + j * m]; + } + + __host__ __device__ const T& operator()(size_t i, size_t j) const + { + return base[i + j * m]; + } + + size_t m, n; + T* base; +}; + +/** + * @brief defines the shape of a matrix + * + * Note that we specialize cuda::experimental::stf::shape_of to avoid ambiguous specialization + * + * @extends shape_of + */ +template +class cuda::experimental::stf::shape_of> +{ +public: + /** + * @brief The default constructor. + * + * All `shape_of` specializations must define this constructor. + */ + shape_of() = default; + + explicit shape_of(size_t m, size_t n) + : m(m) + , n(n) + {} + + /** + * @name Copies a shape. + * + * All `shape_of` specializations must define this constructor. + */ + shape_of(const shape_of&) = default; + + /** + * @brief Extracts the shape from a matrix + * + * @param M matrix to get the shape from + * + * All `shape_of` specializations must define this constructor. + */ + shape_of(const matrix& M) + : shape_of>(M.m, M.n) + {} + + /// Mandatory method : defined the total number of elements in the shape + size_t size() const + { + return m * n; + } + + using coords_t = ::std::tuple; + + // This transforms a tuple of (shape, 1D index) into a coordinate + _CCCL_HOST_DEVICE coords_t index_to_coords(size_t index) const + { + return coords_t(index % m, index / m); + } + + size_t m; + size_t n; +}; + +/** + * @brief Data interface to manipulate a matrix in the CUDA stream backend + */ +template +class matrix_stream_interface : public stream_data_interface_simple> +{ +public: + using base = stream_data_interface_simple>; + using typename base::shape_t; + + /// Initialize from an existing matrix + matrix_stream_interface(matrix m) + : base(std::move(m)) + {} + + /// Initialize from a shape of matrix + matrix_stream_interface(typename base::shape_t s) + : base(s) + {} + + /// Copy the content of an instance to another instance + /// + /// This implementation assumes that we have registered memory if one of the data place is the host + void stream_data_copy( + const data_place& dst_memory_node, + instance_id_t dst_instance_id, + const data_place& src_memory_node, + instance_id_t src_instance_id, + cudaStream_t stream) override + { + assert(src_memory_node != dst_memory_node); + + cudaMemcpyKind kind = cudaMemcpyDeviceToDevice; + if (src_memory_node == data_place::host) + { + kind = cudaMemcpyHostToDevice; + } + + if (dst_memory_node == data_place::host) + { + kind = cudaMemcpyDeviceToHost; + } + + const matrix& src_instance = this->instance(src_instance_id); + const matrix& dst_instance = this->instance(dst_instance_id); + + size_t sz = src_instance.m * src_instance.n * sizeof(T); + + cuda_safe_call(cudaMemcpyAsync((void*) dst_instance.base, (void*) src_instance.base, sz, kind, stream)); + } + + /// allocate an instance on a specific data place + /// + /// setting *s to a negative value informs CUDASTF that the allocation + /// failed, and that a memory reclaiming mechanism need to be performed. 
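+  /// On success, *s is set to the (positive) number of bytes that were allocated.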
+ void stream_data_allocate( + backend_ctx_untyped& /*unused*/, + const data_place& memory_node, + instance_id_t instance_id, + ::std::ptrdiff_t& s, + void** /*unused*/, + cudaStream_t stream) override + { + matrix& instance = this->instance(instance_id); + size_t sz = instance.m * instance.n * sizeof(T); + + T* base_ptr; + + if (memory_node == data_place::host) + { + // Fallback to a synchronous method as there is no asynchronous host allocation API + cuda_safe_call(cudaStreamSynchronize(stream)); + cuda_safe_call(cudaHostAlloc(&base_ptr, sz, cudaHostAllocMapped)); + } + else + { + cuda_safe_call(cudaMallocAsync(&base_ptr, sz, stream)); + } + + // By filling a positive number, we notify that the allocation was succesful + s = sz; + + instance.base = base_ptr; + } + + /// deallocate an instance + void stream_data_deallocate( + backend_ctx_untyped& /*unused*/, + const data_place& memory_node, + instance_id_t instance_id, + void* /*unused*/, + cudaStream_t stream) override + { + matrix& instance = this->instance(instance_id); + if (memory_node == data_place::host) + { + // Fallback to a synchronous method as there is no asynchronous host deallocation API + cuda_safe_call(cudaStreamSynchronize(stream)); + cuda_safe_call(cudaFreeHost(instance.base)); + } + else + { + cuda_safe_call(cudaFreeAsync(instance.base, stream)); + } + } + + /// Register the host memory associated to an instance of matrix + /// + /// Note that this pin_host_memory method is not mandatory, but then it is + /// the responsability of the user to only passed memory that is already + /// registered, and the allocation method on the host must allocate + /// registered memory too. Otherwise, copy methods need to be synchronous. + bool pin_host_memory(instance_id_t instance_id) override + { + matrix& instance = this->instance(instance_id); + if (!instance.base) + { + return false; + } + + cuda_safe_call(pin_memory(instance.base, instance.m * instance.n * sizeof(T))); + + return true; + } + + /// Unregister memory pinned by pin_host_memory + void unpin_host_memory(instance_id_t instance_id) override + { + matrix& instance = this->instance(instance_id); + unpin_memory(instance.base); + } +}; + +/** + * @brief Define how the CUDA stream backend must manipulate a matrix + * + * Note that we specialize cuda::experimental::stf::shape_of to avoid ambiguous specialization + * + * @extends streamed_interface_of + */ +template +struct cuda::experimental::stf::streamed_interface_of> +{ + using type = matrix_stream_interface; +}; + +/** + * @brief A hash of the matrix + */ +template +struct cuda::experimental::stf::hash> +{ + std::size_t operator()(matrix const& m) const noexcept + { + // Combine hashes from the base address and sizes + return cuda::experimental::stf::hash_all(m.m, m.n, m.base); + } +}; + +template +__global__ void kernel(matrix M) +{ + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads_x = gridDim.x * blockDim.x; + + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + int nthreads_y = gridDim.y * blockDim.y; + + for (int x = tid_x; x < M.m; x += nthreads_x) + { + for (int y = tid_y; y < M.n; y += nthreads_y) + { + M(x, y) += -x + 7 * y; + } + } +} + +int main() +{ + stream_ctx ctx; + + const size_t m = 8; + const size_t n = 10; + std::vector v(m * n); + + matrix M(m, n, &v[0]); + + // M(i,j) = 17 * i + 23 * j + for (size_t j = 0; j < n; j++) + { + for (size_t i = 0; i < m; i++) + { + M(i, j) = 17 * i + 23 * j; + } + } + + auto lM = ctx.logical_data(M); + + // M(i,j) += -i + 7*i + 
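+  // The kernel below adds -i + 7*j to each entry of M using a 2D grid-stride loop.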
ctx.task(lM.rw())->*[](cudaStream_t s, auto dM) { + kernel<<>>(dM); + }; + + // M(i,j) += 2*i + 6*j + ctx.parallel_for(lM.shape(), lM.rw())->*[] _CCCL_DEVICE(size_t i, size_t j, auto dM) { + dM(i, j) += 2 * i + 6 * j; + }; + + ctx.finalize(); + + for (size_t j = 0; j < n; j++) + { + for (size_t i = 0; i < m; i++) + { + assert(M(i, j) == (17 * i + 23 * j) + (-i + 7 * j) + (2 * i + 6 * j)); + } + } +} diff --git a/cudax/examples/stf/explicit_data_places.cu b/cudax/examples/stf/explicit_data_places.cu new file mode 100644 index 00000000000..144c3b0225e --- /dev/null +++ b/cudax/examples/stf/explicit_data_places.cu @@ -0,0 +1,87 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief An AXPY kernel implemented with a task of the CUDA stream backend + * where the task accesses host memory from the device + * + */ + +#include + +#include + +using namespace cuda::experimental::stf; + +__global__ void axpy(double a, slice x, slice y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < x.size(); i += nthreads) + { + y(i) += a * x(i); + } +} + +double X0(int i) +{ + return sin((double) i); +} + +double Y0(int i) +{ + return cos((double) i); +} + +int main() +{ + // Verify whether this device can access memory concurrently from CPU and GPU. + int dev; + cuda_safe_call(cudaGetDevice(&dev)); + assert(dev >= 0); + cudaDeviceProp prop; + cuda_safe_call(cudaGetDeviceProperties(&prop, dev)); + if (!prop.concurrentManagedAccess) + { + fprintf(stderr, "Concurrent CPU/GPU access not supported, skipping test.\n"); + return 0; + } + + stream_ctx ctx; + const size_t N = 16; + double X[N], Y[N]; + + for (size_t i = 0; i < N; i++) + { + X[i] = X0(i); + Y[i] = Y0(i); + } + + double alpha = 3.14; + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + /* Compute Y = Y + alpha X, but leave X on the host and access it with mapped memory */ + ctx.task(lX.read(data_place::host), lY.rw())->*[&](cudaStream_t s, auto dX, auto dY) { + axpy<<<16, 128, 0, s>>>(alpha, dX, dY); + }; + + ctx.finalize(); + + for (size_t i = 0; i < N; i++) + { + assert(fabs(Y[i] - (Y0(i) + alpha * X0(i))) < 0.0001); + assert(fabs(X[i] - X0(i)) < 0.0001); + } +} diff --git a/cudax/examples/stf/fdtd_mgpu.cu b/cudax/examples/stf/fdtd_mgpu.cu new file mode 100644 index 00000000000..a9a54d1f993 --- /dev/null +++ b/cudax/examples/stf/fdtd_mgpu.cu @@ -0,0 +1,296 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief An example solving Maxwell equations in 3D using FDTD on multiple devices + */ + +#include + +#include + +using namespace cuda::experimental::stf; + +// FIXME : MSVC has trouble with box constructors +#if !defined(_CCCL_COMPILER_MSVC) +void write_vtk_2D(const std::string& filename, slice Ez, double dx, double dy, double /*unused*/) +{ + FILE* f = fopen(filename.c_str(), "w"); + + const size_t pos_z = Ez.extent(2) / 2; + const size_t nx = Ez.extent(0); + + const size_t size = Ez.extent(0) * Ez.extent(1); + + fprintf(f, "# vtk DataFile Version 3.0\n"); + fprintf(f, "vtk output\n"); + fprintf(f, "ASCII\n"); + fprintf(f, "DATASET UNSTRUCTURED_GRID\n"); + fprintf(f, "POINTS %ld float\n", 4 * size); + + for (size_t y = 0; y < Ez.extent(1); y++) + { + for (size_t x = 0; x < Ez.extent(0); x++) + { + fprintf(f, "%lf %lf 0.0\n", dx * static_cast(x + 0), dy * static_cast(y + 0)); + fprintf(f, "%lf %lf 0.0\n", dx * static_cast(x + 1), dy * static_cast(y + 0)); + fprintf(f, "%lf %lf 0.0\n", dx * static_cast(x + 1), dy * static_cast(y + 1)); + fprintf(f, "%lf %lf 0.0\n", dx * static_cast(x + 0), dy * static_cast(y + 1)); + } + } + + fprintf(f, "CELLS %ld %ld\n", size, 5 * size); + + size_t cell_id = 0; + for (size_t y = 0; y < Ez.extent(1); y++) + { + for (size_t x = 0; x < Ez.extent(0); x++) + { + const size_t point_offset = cell_id * 4; + fprintf(f, + "4 %d %d %d %d\n", + (int) (point_offset + 0), + (int) (point_offset + 1), + (int) (point_offset + 2), + (int) (point_offset + 3)); + + cell_id++; + } + } + + fprintf(f, "CELL_TYPES %ld\n", size); + + for (size_t ii = 0; ii < size; ii++) + { + fprintf(f, "5\n"); + } + + fprintf(f, "CELL_DATA %ld\n", size); + fprintf(f, "SCALARS Ez double 1\n"); + fprintf(f, "LOOKUP_TABLE default\n"); + + for (size_t y = 0; y < Ez.extent(1); y++) + { + for (size_t x = 0; x < Ez.extent(0); x++) + { + fprintf(f, "%lf\n", Ez(x, y, pos_z)); + } + } + + fclose(f); +} + +// Define the source function +_CCCL_DEVICE double Source(double t, double x, double y, double z) +{ + constexpr double pi = 3.14159265358979323846; + constexpr double freq = 1e9; + constexpr double omega = (2 * pi * freq); + constexpr double wavelength = 3e8 / freq; + constexpr double k = 2 * pi / wavelength; + return sin(k * x - omega * t); +} +#endif // !defined(_CCCL_COMPILER_MSVC) + +int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) +{ +#if !defined(_CCCL_COMPILER_MSVC) + context ctx; + + // Initialize the time loop + size_t timesteps = 10; + if (argc > 1) + { + timesteps = (size_t) atol(argv[1]); + } + + // No output by default + int output_freq = -1; + if (argc > 2) + { + output_freq = atoi(argv[2]); + } + + // Default value : grid of all devices + exec_place where = exec_place::all_devices(); + + if (argc > 3) + { + switch (atoi(argv[3])) + { + case 0: + where = exec_place::host; + break; + case 1: + where = exec_place::current_device(); + break; + case 2: + where = exec_place::all_devices(); + break; + case 3: + where = exec_place::repeat(exec_place::current_device(), 8); + break; + default: + fprintf(stderr, "Invalid exec place argument\n"); + abort(); + } + + fprintf(stderr, "Running on %s\n", where.to_string().c_str()); + } + + if (argc > 4) + { + int use_graph = atoi(argv[4]); + if (use_graph) + { + ctx = graph_ctx(); + } + + fprintf(stderr, "Use %s backend.\n", use_graph ? 
"graph" : "stream"); + } + + // Domain dimensions + const size_t SIZE_X = 100; + const size_t SIZE_Y = 100; + const size_t SIZE_Z = 100; + + // Grid spacing + const double DX = 0.01; + const double DY = 0.01; + const double DZ = 0.01; + + // Define the electric and magnetic fields + auto data_shape = shape_of>(SIZE_X, SIZE_Y, SIZE_Z); + auto lEx = ctx.logical_data(data_shape); + auto lEy = ctx.logical_data(data_shape); + auto lEz = ctx.logical_data(data_shape); + auto lHx = ctx.logical_data(data_shape); + auto lHy = ctx.logical_data(data_shape); + auto lHz = ctx.logical_data(data_shape); + + // Define the permittivity and permeability of the medium + auto lepsilon = ctx.logical_data(data_shape); + auto lmu = ctx.logical_data(data_shape); + + const double EPSILON = 8.85e-12; // Permittivity of free space + const double MU = 1.256e-6; // Permeability of free space + + // CFL condition DT <= min(DX, DY, DZ) * sqrt(epsilon_max * mu_max) + double DT = 0.25 * min(min(DX, DY), DZ) * sqrt(EPSILON * MU); + + // Initialize E + ctx.parallel_for(blocked_partition(), where, data_shape, lEx.write(), lEy.write(), lEz.write()) + ->*[] _CCCL_DEVICE(size_t i, size_t j, size_t k, auto Ex, auto Ey, auto Ez) { + Ex(i, j, k) = 0.0; + Ey(i, j, k) = 0.0; + Ez(i, j, k) = 0.0; + }; + + // Initialize H + ctx.parallel_for(blocked_partition(), where, data_shape, lHx.write(), lHy.write(), lHz.write()) + ->*[] _CCCL_DEVICE(size_t i, size_t j, size_t k, auto Hx, auto Hy, auto Hz) { + Hx(i, j, k) = 0.0; + Hy(i, j, k) = 0.0; + Hz(i, j, k) = 0.0; + }; + + // Initialize permittivity and permeability fields + ctx.parallel_for(blocked_partition(), where, data_shape, lepsilon.write(), lmu.write()) + ->*[=] _CCCL_DEVICE(size_t i, size_t j, size_t k, auto epsilon, auto mu) { + epsilon(i, j, k) = EPSILON; + mu(i, j, k) = MU; + }; + + // Set the source function at the center of the grid + + const size_t center_x = SIZE_X / 2; + const size_t center_y = SIZE_Y / 2; + const size_t center_z = SIZE_Z / 2; + + /* Index shapes for Electric fields, Magnetic fields, and the indices where there is a source */ + box Es({1ul, SIZE_X - 1}, {1ul, SIZE_Y - 1}, {1ul, SIZE_Z - 1}); + box Hs({0ul, SIZE_X - 1}, {0ul, SIZE_Y - 1}, {0ul, SIZE_Z - 1}); + box source_s({center_x, center_x + 1}, {center_y, center_y + 1}, {center_z, center_z + 1}); + + ctx.repeat(timesteps)->*[&](context ctx, size_t n) { + // Update the electric fields + + // Update Ex + ctx.parallel_for(blocked_partition(), where, Es, lEx.rw(), lHy.read(), lHz.read(), lepsilon.read()) + ->*[=] + _CCCL_DEVICE(size_t i, size_t j, size_t k, auto Ex, auto Hy, auto Hz, auto epsilon) { + Ex(i, j, k) = Ex(i, j, k) + + (DT / (epsilon(i, j, k) * DX)) * (Hz(i, j, k) - Hz(i, j - 1, k) - Hy(i, j, k) + Hy(i, j, k - 1)); + }; + + // Update Ey + ctx.parallel_for(blocked_partition(), where, Es, lEy.rw(), lHx.read(), lHz.read(), lepsilon.read()) + ->*[=] + _CCCL_DEVICE(size_t i, size_t j, size_t k, auto Ey, auto Hx, auto Hz, auto epsilon) { + Ey(i, j, k) = Ey(i, j, k) + + (DT / (epsilon(i, j, k) * DY)) * (Hx(i, j, k) - Hx(i, j, k - 1) - Hz(i, j, k) + Hz(i - 1, j, k)); + }; + + // Update Ez + ctx.parallel_for(blocked_partition(), where, Es, lEz.rw(), lHx.read(), lHy.read(), lepsilon.read()) + ->*[=] + _CCCL_DEVICE(size_t i, size_t j, size_t k, auto Ez, auto Hx, auto Hy, auto epsilon) { + Ez(i, j, k) = Ez(i, j, k) + + (DT / (epsilon(i, j, k) * DZ)) * (Hy(i, j, k) - Hy(i - 1, j, k) - Hx(i, j, k) + Hx(i, j - 1, k)); + }; + + // Add the source function at the center of the grid + 
ctx.parallel_for(blocked_partition(), where, source_s, lEz.rw()) + ->*[=] _CCCL_DEVICE(size_t i, size_t j, size_t k, auto Ez) { + Ez(i, j, k) = Ez(i, j, k) + Source(n * DT, i * DX, j * DY, k * DZ); + }; + + // Update the magnetic fields + + // Update Hx + ctx.parallel_for(blocked_partition(), where, Hs, lHx.rw(), lEy.read(), lEz.read(), lmu.read()) + ->*[=] _CCCL_DEVICE(size_t i, size_t j, size_t k, auto Hx, auto Ey, auto Ez, auto mu) { + Hx(i, j, k) = Hx(i, j, k) + - (DT / (mu(i, j, k) * DY)) * (Ez(i, j + 1, k) - Ez(i, j, k) - Ey(i, j, k + 1) + Ey(i, j, k)); + }; + + // Update Hy + ctx.parallel_for(blocked_partition(), where, Hs, lHy.rw(), lEx.read(), lEz.read(), lmu.read()) + ->*[=] _CCCL_DEVICE(size_t i, size_t j, size_t k, auto Hy, auto Ex, auto Ez, auto mu) { + Hy(i, j, k) = Hy(i, j, k) + - (DT / (mu(i, j, k) * DZ)) * (Ex(i, j, k + 1) - Ex(i, j, k) - Ez(i + 1, j, k) + Ez(i, j, k)); + }; + + // Update Hz + ctx.parallel_for(blocked_partition(), where, Hs, lHz.rw(), lEx.read(), lEy.read(), lmu.read()) + ->*[=] _CCCL_DEVICE(size_t i, size_t j, size_t k, auto Hz, auto Ex, auto Ey, auto mu) { + Hz(i, j, k) = Hz(i, j, k) + - (DT / (mu(i, j, k) * DX)) * (Ey(i + 1, j, k) - Ey(i, j, k) - Ex(i, j + 1, k) + Ex(i, j, k)); + }; + + if (output_freq > 0 && n % output_freq == 0) + { + ctx.host_launch(lEz.read())->*[=](auto Ez) { + // Output the electric field at the center of the grid + fprintf(stderr, "%ld\t%le\n", n, Ez(center_x, center_y, center_z)); + + std::string filename = "Ez" + std::to_string(n) + ".vtk"; + + // Dump a 2D slice of Ez in VTK + write_vtk_2D(filename, Ez, DX, DY, DZ); + }; + } + }; + + ctx.finalize(); +#endif // !defined(_CCCL_COMPILER_MSVC) +} diff --git a/cudax/examples/stf/frozen_data_init.cu b/cudax/examples/stf/frozen_data_init.cu new file mode 100644 index 00000000000..70a5c3943cd --- /dev/null +++ b/cudax/examples/stf/frozen_data_init.cu @@ -0,0 +1,53 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Illustrate how we can use frozen data to initialize constant data + * + */ + +#include + +using namespace cuda::experimental::stf; + +int main() +{ + context ctx; + + /* Create a piece of data that can be use many times without further synchronizations */ + auto buffer = ctx.logical_data(shape_of>(128, 64)).set_symbol("buffer"); + ctx.parallel_for(buffer.shape(), buffer.write())->*[] __device__(size_t i, size_t j, auto b) { + b(i, j) = sin(-1.0 * i) + cos(2.0 * j); + }; + + auto frozen_buffer = ctx.freeze(buffer); + + auto h_buf = frozen_buffer.get(data_place::host).first; + auto d_buf = frozen_buffer.get(data_place::current_device()).first; + + cuda_safe_call(cudaStreamSynchronize(ctx.task_fence())); + + auto lX = ctx.logical_data(buffer.shape()).set_symbol("X"); + ctx.parallel_for(lX.shape(), lX.write()).set_symbol("X=buf")->*[d_buf] __device__(size_t i, size_t j, auto x) { + x(i, j) = d_buf(i, j); + }; + + ctx.parallel_for(exec_place::host, lX.shape(), lX.read()).set_symbol("check buf") + ->*[h_buf](size_t i, size_t j, auto x) { + EXPECT(fabs(x(i, j) - h_buf(i, j)) < 0.0001); + }; + + // Make sure all tasks are done before unfreezing + frozen_buffer.unfreeze(ctx.task_fence()); + + ctx.finalize(); +} diff --git a/cudax/examples/stf/graph_algorithms/degree_centrality.cu b/cudax/examples/stf/graph_algorithms/degree_centrality.cu new file mode 100644 index 00000000000..58fc3a61358 --- /dev/null +++ b/cudax/examples/stf/graph_algorithms/degree_centrality.cu @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Computes the Degree Centrality for each vertex within a graph + * + */ + +#include + +#include + +using namespace cuda::experimental::stf; + +/** + * @brief Computes the Degree Centrality for each vertex. + * + * @param idx The index of the vertex for which Degree Centrality is being calculated. + * @param d_offsets Slice containing the offset vector of the CSR representation. + * @return The degree of each vertex. 
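 *
 * For example, with the CSR offsets vector {0, 4, 11, 12, ...} used in
 * main() below, vertex 0 has degree offsets[1] - offsets[0] = 4 and
 * vertex 1 has degree offsets[2] - offsets[1] = 7.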
+ */ +__device__ int degree_centrality(int idx, slice loffsets) +{ + return loffsets[idx + 1] - loffsets[idx]; +} + +int main() +{ + stream_ctx ctx; + + // row offsets in CSR format + std::vector offsets = {0, 4, 11, 12, 14, 15, 16, 18, 19, 20}; + // edges in CSR format + std::vector nonzeros = {1, 2, 3, 6, 0, 3, 4, 5, 6, 7, 8, 0, 0, 1, 1, 1, 0, 1, 1, 1}; + // output degrees for each vertex + int num_vertices = offsets.size() - 1; + std::vector degrees(num_vertices, 0); + + auto loffsets = ctx.logical_data(&offsets[0], offsets.size()); + auto lnonzeros = ctx.logical_data(&nonzeros[0], nonzeros.size()); + auto ldegrees = ctx.logical_data(°rees[0], degrees.size()); + + ctx.parallel_for(box(num_vertices), loffsets.read(), ldegrees.rw()) + ->*[] __device__(size_t idx, auto loffsets, auto ldegrees) { + ldegrees[idx] = degree_centrality(idx, loffsets); + }; + + ctx.finalize(); + + // for (int i = 0; i < num_vertices; ++i) { + // printf("Vertex %d: Degree Centrality = %d\n", i, degrees[i]); + // } + + return 0; +} diff --git a/cudax/examples/stf/graph_algorithms/jaccard.cu b/cudax/examples/stf/graph_algorithms/jaccard.cu new file mode 100644 index 00000000000..86a59c2ab5c --- /dev/null +++ b/cudax/examples/stf/graph_algorithms/jaccard.cu @@ -0,0 +1,137 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Computes the Jaccard Similarity for each vertex within a graph + * + */ + +#include + +#include + +using namespace cuda::experimental::stf; + +// Performs Binary Search on a given array with start/end bounds and a lookup element +__device__ int binary_search(slice arr, int start, int end, int lookup) +{ + while (start <= end) + { + int mid = start + (end - start) / 2; + if (arr[mid] == lookup) + { + return mid; + } + else if (arr[mid] < lookup) + { + start = mid + 1; + } + else + { + end = mid - 1; + } + } + return -1; +} + +/** + * @brief Computes the intersection size of neighbors of two vertices. + * + * @param loffsets Slice containing the offset vector of the CSR representation. + * @param lnonzeros Slice containing the non-zero elements (neighbors) vector of the CSR representation. + * @param u Index of the first vertex. + * @param v Index of the second vertex. + * @return The number of common neighbors (intersection size) of vertices u and v. + */ +__device__ int calculate_intersection_size(slice loffsets, slice lnonzeros, int u, int v) +{ + int count = 0; + for (int i = loffsets[u]; i < loffsets[u + 1]; i++) + { + if (binary_search(lnonzeros, loffsets[v], loffsets[v + 1] - 1, lnonzeros[i]) != -1) + { + count++; + } + } + return count; +} + +/** + * @brief Computes the union size of neighbors of two vertices. + * + * @param loffsets Slice containing the offset vector of the CSR representation. + * @param lnonzeros Slice containing the non-zero elements (neighbors) vector of the CSR representation. + * @param u Index of the first vertex. + * @param v Index of the second vertex. + * @return The number of unique neighbors (union size) of vertices u and v. 
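 *
 * The implementation below relies on inclusion-exclusion:
 * |N(u) ∪ N(v)| = deg(u) + deg(v) - |N(u) ∩ N(v)|,
 * starting the count at deg(u) + deg(v) and decrementing it once for every
 * neighbor of u that is also found (by binary search) in the adjacency
 * list of v.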
+ */ +__device__ int calculate_union_size(slice loffsets, slice lnonzeros, int u, int v) +{ + int count = (loffsets[u + 1] - loffsets[u]) + (loffsets[v + 1] - loffsets[v]); + for (int i = loffsets[u]; i < loffsets[u + 1]; i++) + { + if (binary_search(lnonzeros, loffsets[v], loffsets[v + 1] - 1, lnonzeros[i]) != -1) + { + count--; + } + } + return count; +} + +int main() +{ + stream_ctx ctx; + + // row offsets in CSR format + std::vector offsets = {0, 4, 11, 12, 14, 15, 16, 18, 19, 20}; + // edges in CSR format + std::vector nonzeros = {1, 2, 3, 6, 0, 3, 4, 5, 6, 7, 8, 0, 0, 1, 1, 1, 0, 1, 1, 1}; + // output jaccard similarities for each vertex + int num_vertices = offsets.size() - 1; + std::vector jaccard_similarities(num_vertices * num_vertices, 0.0f); + + auto loffsets = ctx.logical_data(&offsets[0], offsets.size()); + auto lnonzeros = ctx.logical_data(&nonzeros[0], nonzeros.size()); + auto ljaccard_similarities = ctx.logical_data(&jaccard_similarities[0], jaccard_similarities.size()); + + ctx.parallel_for(box(num_vertices), loffsets.read(), lnonzeros.read(), ljaccard_similarities.rw()) + ->*[] __device__(size_t idx, auto loffsets, auto lnonzeros, auto ljaccard_similarities) { + for (int j = 0; j < loffsets.size() - 1; j++) + { + if (idx != j) + { + int intersection = calculate_intersection_size(loffsets, lnonzeros, idx, j); + int uni = calculate_union_size(loffsets, lnonzeros, idx, j); + if (uni > 0) + { + ljaccard_similarities[idx * (loffsets.size() - 1) + j] = static_cast(intersection) / uni; + } + } + } + }; + + ctx.finalize(); + + for (int u = 0; u < num_vertices; u++) + { + for (int v = 0; v < num_vertices; v++) + { + if (u != v) + { + printf( + "Jaccard similarity between vertex %d and vertex %d: %f\n", u, v, jaccard_similarities[u * num_vertices + v]); + } + } + } + + return 0; +} diff --git a/cudax/examples/stf/graph_algorithms/pagerank.cu b/cudax/examples/stf/graph_algorithms/pagerank.cu new file mode 100644 index 00000000000..e6353d38a5e --- /dev/null +++ b/cudax/examples/stf/graph_algorithms/pagerank.cu @@ -0,0 +1,146 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Computes the PageRank for vertices within a graph + * + */ + +#include + +#include + +using namespace cuda::experimental::stf; + +/** + * @brief Performs an atomic maximum operation on floating-point numbers by reinterpreting them as integers. + * + * @param address Pointer to the float value that will be updated. + * @param val The float value to compare and possibly set at the address. + * @return The old value at the address (reinterpreted as a float). + */ +__device__ float atomicMaxFloat(float* address, float val) +{ + int* address_as_int = (int*) address; + int old = *address_as_int; + int new_val = __float_as_int(val); + atomicMax(address_as_int, new_val); + return __int_as_float(old); +} + +/** + * @brief Calculates the PageRank for a given vertex. + * + * @param idx The index of the vertex for which PageRank is being calculated. + * @param loffsets Slice containing the offset vector of the CSR representation. 
+ * @param lnonzeros Slice containing the non-zero elements (neighbors) vector of the CSR representation. + * @param lpage_rank Slice containing current PageRank values for each vertex. + * @param lnew_page_rank Slice containing where new PageRank values will be stored. + * @param init_rank The initial PageRank value to be used in the calculation. + */ +__device__ void calculating_pagerank( + int idx, + slice loffsets, + slice lnonzeros, + slice lpage_rank, + slice lnew_page_rank, + float init_rank) +{ + float rank_sum = 0.0; + for (int i = loffsets[idx]; i < loffsets[idx + 1]; i++) + { + int neighbor = lnonzeros[i]; + int out_degree = loffsets[neighbor + 1] - loffsets[neighbor]; + rank_sum += lpage_rank[neighbor] / out_degree; + } + lnew_page_rank[idx] = 0.85 * rank_sum + (1.0 - 0.85) * init_rank; +} + +int main() +{ + stream_ctx ctx; + + // row offsets in CSR format + std::vector offsets = {0, 4, 11, 12, 14, 15, 16, 18, 19, 20}; + // edges in CSR format + std::vector nonzeros = {1, 2, 3, 6, 0, 3, 4, 5, 6, 7, 8, 0, 0, 1, 1, 1, 0, 1, 1, 1}; + + int num_vertices = offsets.size() - 1; + float init_rank = 1.0f / num_vertices; + float tolerance = 1e-6f; + float max_diff = 0.0f; + int NITER = 100; + + // output pageranks for each vertex + std::vector page_rank(num_vertices, init_rank); + std::vector new_page_rank(num_vertices); + + auto loffsets = ctx.logical_data(&offsets[0], offsets.size()); + auto lnonzeros = ctx.logical_data(&nonzeros[0], nonzeros.size()); + auto lpage_rank = ctx.logical_data(&page_rank[0], page_rank.size()); + auto lnew_page_rank = ctx.logical_data(&new_page_rank[0], new_page_rank.size()); + auto lmax_diff = ctx.logical_data(&max_diff, {1}); + + for (int iter = 0; iter < NITER; ++iter) + { + // Calculate Current Iteration PageRank + ctx.parallel_for(box(num_vertices), loffsets.read(), lnonzeros.read(), lpage_rank.rw(), lnew_page_rank.rw()) + ->*[init_rank] __device__(size_t idx, auto loffsets, auto lnonzeros, auto lpage_rank, auto lnew_page_rank) { + calculating_pagerank(idx, loffsets, lnonzeros, lpage_rank, lnew_page_rank, init_rank); + }; + + // Calculate Current Iteration Error + ctx.parallel_for(box(1), lmax_diff.write())->*[] __device__(size_t, auto lmax_diff) { + lmax_diff(0) = 0.0f; + }; + + // Calculate Current Iteration Error + ctx.parallel_for(box(num_vertices), lpage_rank.read(), lnew_page_rank.read(), lmax_diff.rw()) + ->*[] __device__(size_t idx, auto lpage_rank, auto lnew_page_rank, auto lmax_diff) { + atomicMaxFloat(lmax_diff.data_handle(), fabs(lnew_page_rank[idx] - lpage_rank[idx])); + }; + + // Reduce Error and Check for Convergence + bool converged; + ctx.task(exec_place::host, lmax_diff.read())->*[tolerance, &converged](cudaStream_t s, auto max_diff) { + cuda_safe_call(cudaStreamSynchronize(s)); + converged = (max_diff(0) < tolerance); + }; + + if (converged) + { + break; + } + + // Update New PageRank Values + std::swap(lpage_rank, lnew_page_rank); + } + + ctx.finalize(); + + /* CHECKING FOR ANSWER CORRECTNESS */ + // sum of all page ranks should equal 1 + double sum_pageranks = 0.0; + for (int64_t i = 0; i < num_vertices; i++) + { + sum_pageranks += page_rank[i]; + } + printf("Page rank answer is %s.\n", abs(sum_pageranks - 1.0) < 0.001 ? 
"correct" : "not correct"); + + printf("PageRank Results:\n"); + for (size_t i = 0; i < page_rank.size(); ++i) + { + printf("Vertex %zu: %f\n", i, page_rank[i]); + } + + return 0; +} diff --git a/cudax/examples/stf/graph_algorithms/tricount.cu b/cudax/examples/stf/graph_algorithms/tricount.cu new file mode 100644 index 00000000000..69cb775dc72 --- /dev/null +++ b/cudax/examples/stf/graph_algorithms/tricount.cu @@ -0,0 +1,99 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Computes the total number of triangles within a graph + * + */ + +#include + +#include + +using namespace cuda::experimental::stf; + +// Performs Binary Search on a given array with start/end bounds and a lookup element +__device__ int binary_search(slice arr, int start, int end, int lookup) +{ + while (start <= end) + { + int mid = start + (end - start) / 2; + if (arr[mid] == lookup) + { + return mid; + } + else if (arr[mid] < lookup) + { + start = mid + 1; + } + else + { + end = mid - 1; + } + } + return -1; +} + +/** + * @brief Computes the Triangle Counting for each vertex. + * + * @param idx The index of the vertex for which Triangle Counting is being calculated. + * @param loffsets Slice containing the offset vector of the CSR representation. + * @param lnonzeros Slice containing the non-zero elements (neighbors) vector of the CSR representation. + * @return The local triangle count for the vertex. + */ +__device__ unsigned long long int triangle_count(int idx, slice loffsets, slice lnonzeros) +{ + int lcount = 0; + for (int i = loffsets[idx]; i < loffsets[idx + 1]; i++) + { + int v = lnonzeros[i]; + for (int j = loffsets[idx]; j < loffsets[idx + 1]; j++) + { + int w = lnonzeros[j]; + if (binary_search(lnonzeros, loffsets[v], loffsets[v + 1] - 1, w) != -1) + { + lcount++; + } + } + } + return lcount; +} + +int main() +{ + stream_ctx ctx; + + // row offsets in CSR format + std::vector offsets = {0, 0, 1, 2, 4, 5, 6, 8, 9, 10}; + // edges in CSR format + std::vector nonzeros = {0, 0, 0, 1, 1, 1, 0, 1, 1, 1}; + + int num_vertices = offsets.size() - 1; + unsigned long long int total_count = 0; + + auto loffsets = ctx.logical_data(&offsets[0], offsets.size()); + auto lnonzeros = ctx.logical_data(&nonzeros[0], nonzeros.size()); + auto ltotal_count = ctx.logical_data(&total_count, {1}); + + ctx.parallel_for(box(num_vertices), loffsets.read(), lnonzeros.read(), ltotal_count.rw()) + ->*[] __device__(size_t idx, auto loffsets, auto lnonzeros, auto ltotal_count) { + unsigned long long int count = triangle_count(idx, loffsets, lnonzeros); + atomicAdd(ltotal_count.data_handle(), count); + }; + + ctx.finalize(); + + printf("Number of triangles: %lld\n", total_count); + + return 0; +} diff --git a/cudax/examples/stf/heat.cu b/cudax/examples/stf/heat.cu new file mode 100644 index 00000000000..2af2cda6422 --- /dev/null +++ b/cudax/examples/stf/heat.cu @@ -0,0 +1,126 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief An example solving heat equation with finite differences using the + * parallel_for construct. + * + * A multi-gpu version is shown in the heat_mgpu.cu example. + * + * This example also illustrate how to annotate resources with set_symbol + */ + +#include + +using namespace cuda::experimental::stf; + +void dump_iter(slice sUn, int iter) +{ + /* Create a binary file in the PPM format */ + char name[64]; + snprintf(name, 64, "heat_%06d.ppm", iter); + FILE* f = fopen(name, "wb"); + fprintf(f, "P6\n%zu %zu\n255\n", sUn.extent(0), sUn.extent(1)); + for (size_t j = 0; j < sUn.extent(1); j++) + { + for (size_t i = 0; i < sUn.extent(0); i++) + { + int v = (int) (255.0 * sUn(i, j) / 100.0); + // we assume values between 0.0 and 100.0 : max value is in red, + // min is in blue + unsigned char color[3]; + color[0] = static_cast(v); /* red */ + color[1] = static_cast(0); /* green */ + color[2] = static_cast(255 - v); /* blue */ + fwrite(color, 1, 3, f); + } + } + fclose(f); +} + +int main() +{ + context ctx; + + const size_t N = 800; + + auto lU = ctx.logical_data(shape_of>(N, N)); + auto lU1 = ctx.logical_data(lU.shape()); + + // Initialize the Un field with boundary conditions, and a disk at a lower + // temperature in the middle. + ctx.parallel_for(lU.shape(), lU.write())->*[=] _CCCL_DEVICE(size_t i, size_t j, auto U) { + double rad = U.extent(0) / 8.0; + double dx = (double) i - U.extent(0) / 2; + double dy = (double) j - U.extent(1) / 2; + + U(i, j) = (dx * dx + dy * dy < rad * rad) ? 100.0 : 0.0; + + /* Set up boundary conditions */ + if (j == 0.0) + { + U(i, j) = 100.0; + } + if (j == U.extent(1) - 1) + { + U(i, j) = 0.0; + } + if (i == 0.0) + { + U(i, j) = 0.0; + } + if (i == U.extent(0) - 1) + { + U(i, j) = 0.0; + } + }; + + // diffusion constant + double a = 0.5; + + double dx = 0.1; + double dy = 0.1; + double dx2 = dx * dx; + double dy2 = dy * dy; + + // time step + double dt = dx2 * dy2 / (2.0 * a * (dx2 + dy2)); + + double c = a * dt; + + int nsteps = 1000; + int image_freq = -1; + + for (int iter = 0; iter < nsteps; iter++) + { + if (image_freq > 0 && iter % image_freq == 0) + { + // Dump Un in a PPM file + ctx.host_launch(lU.read())->*[=](auto U) { + dump_iter(U, iter); + }; + } + + // Update Un using Un1 value with a finite difference scheme + ctx.parallel_for(inner<1>(lU.shape()), lU.read(), lU1.write()) + ->*[=] + _CCCL_DEVICE(size_t i, size_t j, auto U, auto U1) { + U1(i, j) = + U(i, j) + + c * ((U(i - 1, j) - 2 * U(i, j) + U(i + 1, j)) / dx2 + (U(i, j - 1) - 2 * U(i, j) + U(i, j + 1)) / dy2); + }; + + std::swap(lU, lU1); + } + + ctx.finalize(); +} diff --git a/cudax/examples/stf/heat_mgpu.cu b/cudax/examples/stf/heat_mgpu.cu new file mode 100644 index 00000000000..a4befc2a05b --- /dev/null +++ b/cudax/examples/stf/heat_mgpu.cu @@ -0,0 +1,163 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief An example solving heat equation with finite differences on multiple devices + * + * This example also illustrate how to annotate resources with set_symbol + */ + +#include + +using namespace cuda::experimental::stf; + +void dump_iter(slice sUn, int iter) +{ + /* Create a binary file in the PPM format */ + char name[64]; + snprintf(name, 64, "heat_%06d.ppm", iter); + FILE* f = fopen(name, "wb"); + fprintf(f, "P6\n%zu %zu\n255\n", sUn.extent(0), sUn.extent(1)); + for (size_t j = 0; j < sUn.extent(1); j++) + { + for (size_t i = 0; i < sUn.extent(0); i++) + { + int v = (int) (255.0 * sUn(i, j) / 100.0); + // we assume values between 0.0 and 100.0 : max value is in red, + // min is in blue + unsigned char color[3]; + color[0] = static_cast(v); /* red */ + color[1] = static_cast(0); /* green */ + color[2] = static_cast(255 - v); /* blue */ + fwrite(color, 1, 3, f); + } + } + fclose(f); +} + +int main(int argc, char** argv) +{ + context ctx; + + size_t N = 1000; + int nsteps = 100; + int image_freq = -1; + + if (argc > 1) + { + N = atol(argv[1]); + } + + if (argc > 2) + { + nsteps = atoi(argv[2]); + } + + if (argc > 3) + { + image_freq = atoi(argv[3]); + } + + if (argc > 4) + { + int use_graphs = atoi(argv[4]); + if (use_graphs != 0) + { + ctx = graph_ctx(); + } + } + + auto lU = ctx.logical_data(shape_of>(N, N)); + auto lU1 = ctx.logical_data(lU.shape()); + + lU.set_symbol("U"); + lU1.set_symbol("U1"); + + auto all_devs = exec_place::all_devices(); + + // Initialize the Un field with boundary conditions, and a disk at a lower + // temperature in the middle. + ctx.parallel_for(blocked_partition(), all_devs, lU.shape(), lU.write()).set_symbol("init")->* + [=] _CCCL_DEVICE(size_t i, size_t j, auto U) { + double rad = U.extent(0) / 8.0; + double dx = (double) i - U.extent(0) / 2; + double dy = (double) j - U.extent(1) / 2; + + U(i, j) = (dx * dx + dy * dy < rad * rad) ? 
100.0 : 0.0; + + /* Set up boundary conditions */ + if (j == 0.0) + { + U(i, j) = 100.0; + } + if (j == U.extent(1) - 1) + { + U(i, j) = 0.0; + } + if (i == 0.0) + { + U(i, j) = 0.0; + } + if (i == U.extent(0) - 1) + { + U(i, j) = 0.0; + } + }; + + // diffusion constant + double a = 0.5; + + double dx = 0.1; + double dy = 0.1; + double dx2 = dx * dx; + double dy2 = dy * dy; + + // time step + double dt = dx2 * dy2 / (2.0 * a * (dx2 + dy2)); + + double c = a * dt; + + cudaEvent_t start, stop; + + cuda_safe_call(cudaEventCreate(&start)); + cuda_safe_call(cudaEventCreate(&stop)); + + cuda_safe_call(cudaEventRecord(start, ctx.task_fence())); + + ctx.repeat(nsteps)->*[&](context ctx, size_t iter) { + if (image_freq > 0 && iter % image_freq == 0) + { + // Dump Un in a PPM file + ctx.host_launch(lU.read()).set_symbol("dump")->*[=](auto U) { + dump_iter(U, static_cast(iter)); + }; + } + + // Update Un using Un1 value with a finite difference scheme + ctx.parallel_for(blocked_partition(), all_devs, inner<1>(lU.shape()), lU.read(), lU1.write()).set_symbol("step")->* + [=] _CCCL_DEVICE(size_t i, size_t j, auto U, auto U1) { + U1(i, j) = + U(i, j) + + c * ((U(i - 1, j) - 2 * U(i, j) + U(i + 1, j)) / dx2 + (U(i, j - 1) - 2 * U(i, j) + U(i, j + 1)) / dy2); + }; + + std::swap(lU, lU1); + }; + + cuda_safe_call(cudaEventRecord(stop, ctx.task_fence())); + + ctx.finalize(); + + float elapsedTime; + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Elapsed time: %f ms\n", elapsedTime); +} diff --git a/cudax/examples/stf/jacobi.cu b/cudax/examples/stf/jacobi.cu new file mode 100644 index 00000000000..099a16344b5 --- /dev/null +++ b/cudax/examples/stf/jacobi.cu @@ -0,0 +1,153 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Jacobi method with launch + * + */ + +#include + +#include + +using namespace cuda::experimental::stf; + +/* Implement atomicMax with a compare and swap */ +_CCCL_DEVICE double atomicMax(double* address, double val) +{ + unsigned long long int* address_as_ull = (unsigned long long int*) address; + unsigned long long int old = *address_as_ull, assumed; + + do + { + assumed = old; + old = atomicCAS(address_as_ull, assumed, __double_as_longlong(fmax(val, __longlong_as_double(assumed)))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __longlong_as_double(old); +} + +template +_CCCL_DEVICE double reduce_max(thread_hierarchy_t& t, double local_max) +{ + auto ti = t.inner(); + slice error = t.template storage(0); + + error(0) = 0.0; + t.sync(); + + // Note we do not use t.static_width(1) because t is a runtime variable so it + // cannot be used directly to statically evaluate the size. 
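  // (The size below therefore comes from the static member
  // thread_hierarchy_t::static_width(1), which matches the con<64>() inner
  // level of the spec built in main().)
  //
  // What follows is a two-level max-reduction: each block tree-reduces its 64
  // per-thread values in shared memory (strides 32, 16, 8, 4, 2, 1), and
  // thread 0 of every block then folds the block maximum into the scratch
  // slot error(0) (zeroed above) using the CAS-based atomicMax defined at the
  // top of this file.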
+ __shared__ double block_max[thread_hierarchy_t::static_width(1)]; + block_max[ti.rank()] = local_max; + for (size_t s = ti.size() / 2; s > 0; s /= 2) + { + if (ti.rank() < s) + { + block_max[ti.rank()] = fmax(block_max[ti.rank() + s], block_max[ti.rank()]); + } + ti.sync(); + } + + if (ti.rank() == 0) + { + atomicMax(&error(0), block_max[0]); + } + t.sync(); + + return error(0); +} + +int main(int argc, char** argv) +{ + context ctx; + + size_t n = 4096; + size_t m = 4096; + size_t iter_max = 100; + double tol = 0.0000001; + + if (argc > 2) + { + n = atol(argv[1]); + m = atol(argv[2]); + } + + if (argc > 3) + { + iter_max = atoi(argv[3]); + } + + if (argc > 4) + { + tol = atof(argv[4]); + } + + auto lA = ctx.logical_data(shape_of>(m, n)); + auto lAnew = ctx.logical_data(lA.shape()); + + auto all_devs = exec_place::all_devices(); + + ctx.parallel_for(blocked_partition(), all_devs, lA.shape(), lA.write(), lAnew.write()).set_symbol("init")->* + [=] _CCCL_DEVICE(size_t i, size_t j, auto A, auto Anew) { + A(i, j) = (i == j) ? 10.0 : -1.0; + }; + + cudaEvent_t start, stop; + + cuda_safe_call(cudaEventCreate(&start)); + cuda_safe_call(cudaEventCreate(&stop)); + + cuda_safe_call(cudaEventRecord(start, ctx.task_fence())); + + auto spec = con(con<64>(), mem(sizeof(double))); + + ctx.launch(spec, all_devs, lA.rw(), lAnew.write())->*[iter_max, tol, n, m] _CCCL_DEVICE(auto t, auto A, auto Anew) { + auto ti = t.inner(); + for (size_t iter = 0; iter < iter_max; iter++) + { + // thread-local maximum error + double local_error = 0.0; + + for (auto [i, j] : t.apply_partition(inner<1>(shape(A)))) + { + Anew(i, j) = 0.25 * (A(i - 1, j) + A(i + 1, j) + A(i, j - 1) + A(i, j + 1)); + + local_error = fmax(local_error, fabs(A(i, j) - Anew(i, j))); + } + + // compute the overall maximum error + double error = reduce_max(t, local_error); + + /* Fill A with the new values */ + for (auto [i, j] : t.apply_partition(shape(A))) + { + A(i, j) = Anew(i, j); + } + + if (iter % 25 == 0 && t.rank() == 0) + { + printf("iter %zu : error %e (tol %e)\n", iter, error, tol); + } + } + }; + + cuda_safe_call(cudaEventRecord(stop, ctx.task_fence())); + + ctx.finalize(); + + float elapsedTime; + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Elapsed time: %f ms\n", elapsedTime); +} diff --git a/cudax/examples/stf/launch_histogram.cu b/cudax/examples/stf/launch_histogram.cu new file mode 100644 index 00000000000..5b231167431 --- /dev/null +++ b/cudax/examples/stf/launch_histogram.cu @@ -0,0 +1,170 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief A naive parallel histogram algorithm written with launch + * + */ + +#include + +using namespace cuda::experimental::stf; + +__host__ __device__ double X0(int i) +{ + return sin((double) i); +} + +int main(int argc, char** argv) +{ + stream_ctx ctx; + + double lower_level = -1.0; + double upper_level = 1.0; + constexpr size_t num_levels = 21; + + size_t N = 128 * 1024UL; + if (argc > 1) + { + N = size_t(atoll(argv[1])); + } + + int check = 1; + if (argc > 2) + { + check = atoi(argv[2]); + } + + // fprintf(stderr, "SIZE %s\n", pretty_print_bytes(N * sizeof(double)).c_str()); + + std::vector X(N); + std::vector histo(num_levels - 1); + + for (size_t i = 0; i < N; i++) + { + X[i] = X0(i); + } + + // If we were to register each part one by one, there could be pages which + // cross multiple parts, and the pinning operation would fail. + cuda_safe_call(cudaHostRegister(&X[0], N * sizeof(double), cudaHostRegisterPortable)); + + auto lX = ctx.logical_data(&X[0], N); + lX.set_symbol("X"); + + auto lhisto = ctx.logical_data(&histo[0], num_levels - 1); + lhisto.set_symbol("histogram"); + + cuda_safe_call(cudaStreamSynchronize(ctx.task_fence())); + + cudaEvent_t start, stop; + cuda_safe_call(cudaEventCreate(&start)); + cuda_safe_call(cudaEventCreate(&stop)); + cuda_safe_call(cudaEventRecord(start, ctx.task_fence())); + + constexpr size_t BLOCK_THREADS = 128; + + // size_t NDEVS = 1; + // auto where = exec_place::repeat(exec_place::current_device(), NDEVS); + auto where = exec_place::current_device(); + + auto spec = con<8>(con(BLOCK_THREADS, mem((num_levels - 1) * sizeof(size_t)))); + + ctx.launch(spec, where, lX.read(), lhisto.write())->*[=] _CCCL_DEVICE(auto th, auto x, auto histo) { + size_t block_id = th.rank(0); + + slice smem_hist = th.template storage(1); + assert(smem_hist.size() == (num_levels - 1)); + + /* Thread local histogram */ + size_t local_hist[num_levels - 1]; + for (size_t k = 0; k < num_levels - 1; k++) + { + local_hist[k] = 0; + smem_hist[k] = 0; + } + + if (th.rank() == 0) + { + for (size_t k = 0; k < num_levels - 1; k++) + { + histo[k] = 0; + } + } + + for (size_t i = th.rank(); i < x.size(); i += th.size()) + { + double xi = x(i); + if (xi >= lower_level && xi < upper_level) + { + size_t bin = size_t(((num_levels - 1) * (xi - lower_level)) / (upper_level - lower_level)); + local_hist[bin]++; + } + } + + // smem was zero'ed + th.inner().sync(); + + /* Each thread contributes to an histogram in shared memory */ + for (size_t k = 0; k < num_levels - 1; k++) + { + atomicAdd((unsigned long long*) &smem_hist[k], local_hist[k]); + } + + // histo was zero'ed + th.sync(); + + if (th.inner().rank() == 0) + { + for (size_t k = 0; k < num_levels - 1; k++) + { + atomicAdd((unsigned long long*) &histo[k], smem_hist[k]); + } + } + }; + + cuda_safe_call(cudaEventRecord(stop, ctx.task_fence())); + + ctx.finalize(); + + float ms = 0; + cuda_safe_call(cudaEventElapsedTime(&ms, start, stop)); + + // fprintf(stdout, "%zu %f ms\n", N / 1024 / 1024, ms); + + if (check) + { + // fprintf(stderr, "Checking result...\n"); + size_t refhist[num_levels - 1]; + for (size_t i = 0; i < num_levels - 1; i++) + { + refhist[i] = 0; + } + + for (size_t i = 0; i < N; i++) + { + double xi = X[i]; + if (xi >= lower_level && xi < upper_level) + { + size_t bin = size_t(((num_levels - 1) * (xi - lower_level)) / (upper_level - lower_level)); + refhist[bin]++; + } + } + + // double dlevel = (upper_level - 
lower_level) / (num_levels - 1); + for (size_t i = 0; i < num_levels - 1; i++) + { + EXPECT(refhist[i] == histo[i]); + // fprintf(stderr, "[%lf:%lf[ %ld\n", lower_level + i * dlevel, lower_level + (i + 1) * dlevel, histo[i]); + } + } +} diff --git a/cudax/examples/stf/launch_scan.cu b/cudax/examples/stf/launch_scan.cu new file mode 100644 index 00000000000..780d500c07d --- /dev/null +++ b/cudax/examples/stf/launch_scan.cu @@ -0,0 +1,162 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief A parallel scan algorithm + * + */ + +#include // or equivalently + +#include + +using namespace cuda::experimental::stf; + +__host__ __device__ double X0(int) +{ + // return sin((double) i); + return 1.0; +} + +int main(int argc, char** argv) +{ + stream_ctx ctx; + // graph_ctx ctx; + + size_t N = 128 * 1024UL * 1024UL; + if (argc > 1) + { + N = size_t(atoll(argv[1])); + } + + int check = 0; + if (argc > 2) + { + check = atoi(argv[2]); + } + + std::vector X(N); + + for (size_t i = 0; i < N; i++) + { + X[i] = X0(i); + } + + auto lX = ctx.logical_data(&X[0], N); + + // No need to move this back to the host if we do not check the result + if (!check) + { + lX.set_write_back(false); + } + + cuda_safe_call(cudaStreamSynchronize(ctx.task_fence())); + + cudaEvent_t start, stop; + cuda_safe_call(cudaEventCreate(&start)); + cuda_safe_call(cudaEventCreate(&stop)); + cuda_safe_call(cudaEventRecord(start, ctx.task_fence())); + + constexpr size_t BLOCK_THREADS = 128; + constexpr size_t NBLOCKS = 8; + + auto spec = con(con(), mem(NBLOCKS * sizeof(double))); + + // auto where = exec_place::repeat(exec_place::current_device(), NDEVS); + auto where = exec_place::current_device(); + + ctx.launch(spec, where, lX.rw())->*[=] _CCCL_DEVICE(auto th, auto x) { + const size_t block_id = th.rank(0); + const size_t tid = th.inner().rank(); + // const size_t tid = th.rank(1, 0); + + // Block-wide partials using static allocation + __shared__ double block_partial_sum[th.static_width(1)]; + + // Device-wide partial sums + slice dev_partial_sum = th.template storage(0); + + /* Thread local prefix-sum */ + const box<1> b = th.apply_partition(shape(x), std::tuple()); + for (size_t i = b.get_begin(0) + 1; i < b.get_end(0); i++) + { + x(i) += x(i - 1); + } + block_partial_sum[tid] = x(b.get_end(0) - 1); + + th.inner().sync(); + + /* Block level : get partials sum accross the different threads */ + if (tid == 0) + { // rank in scope block is 0 + // Prefix sum on partial sums + for (size_t i = 1; i < BLOCK_THREADS; i++) + { + block_partial_sum[i] += block_partial_sum[i - 1]; + } + dev_partial_sum[block_id] = block_partial_sum[BLOCK_THREADS - 1]; + } + + /* Reduce partial sums at device level : get sum accross all blocks */ + th.sync(); + + if (block_id == 0 && tid == 0) + { // rank in scope 0 + for (size_t i = 1; i < NBLOCKS; i++) + { + dev_partial_sum[i] += dev_partial_sum[i - 1]; + // printf("SUMMED dev_partial_sum[%ld] = %f\n", i, dev_partial_sum[i]); + } + } + + th.sync(); + + for (size_t i = b.get_begin(0); i < b.get_end(0); i++) + { + if (tid > 0) + { + x(i) += 
block_partial_sum[tid - 1]; + } + + if (block_id > 0) + { + x(i) += dev_partial_sum[block_id - 1]; + } + } + }; + + cuda_safe_call(cudaEventRecord(stop, ctx.task_fence())); + + ctx.finalize(); + + float ms = 0; + cuda_safe_call(cudaEventElapsedTime(&ms, start, stop)); + + printf("%s in %f ms (%g GB/s)\n", + pretty_print_bytes(N * sizeof(double)).c_str(), + ms, + double(N * sizeof(double) / 1024 / 1024) / ms); + + if (check) + { + fprintf(stderr, "Checking result...\n"); + EXPECT(fabs(X[0] - X0(0)) < 0.00001); + for (size_t i = 0; i < N; i++) + { + if (fabs(X[i] - X[i - 1] - X0(i)) > 0.00001) + { + fprintf(stderr, "I %zu X[i] %f (X[i] - X[i-1]) %f expect %f\n", i, X[i], (X[i] - X[i - 1]), X0(i)); + } + EXPECT(fabs(X[i] - X[i - 1] - X0(i)) < 0.00001); + } + } +} diff --git a/cudax/examples/stf/launch_sum.cu b/cudax/examples/stf/launch_sum.cu new file mode 100644 index 00000000000..1cdd2cf79ba --- /dev/null +++ b/cudax/examples/stf/launch_sum.cu @@ -0,0 +1,81 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief A reduction kernel written using launch + */ + +#include + +using namespace cuda::experimental::stf; + +double X0(int i) +{ + return sin((double) i); +} + +int main() +{ + context ctx; + + const size_t N = 128 * 1024 * 1024; + + std::vector X(N); + double sum = 0.0; + + double ref_sum = 0.0; + + for (size_t ind = 0; ind < N; ind++) + { + X[ind] = sin((double) ind); + ref_sum += X[ind]; + } + + auto lX = ctx.logical_data(&X[0], {N}); + auto lsum = ctx.logical_data(&sum, {1}); + + auto number_devices = 1; // + auto where = exec_place::repeat(exec_place::device(0), number_devices); + + auto spec = par<16>(con<32>()); + + ctx.launch(spec, where, lX.read(), lsum.rw())->*[] _CCCL_DEVICE(auto th, auto x, auto sum) { + // Each thread computes the sum of elements assigned to it + double local_sum = 0.0; + for (size_t i = th.rank(); i < x.size(); i += th.size()) + { + local_sum += x(i); + } + + auto ti = th.inner(); + + __shared__ double block_sum[th.static_width(1)]; + block_sum[ti.rank()] = local_sum; + + for (size_t s = ti.size() / 2; s > 0; s /= 2) + { + ti.sync(); + if (ti.rank() < s) + { + block_sum[ti.rank()] += block_sum[ti.rank() + s]; + } + } + + if (ti.rank() == 0) + { + atomicAdd(&sum(0), block_sum[0]); + } + }; + + ctx.finalize(); + + EXPECT(fabs(sum - ref_sum) < 0.0001); +} diff --git a/cudax/examples/stf/launch_sum_cub.cu b/cudax/examples/stf/launch_sum_cub.cu new file mode 100644 index 00000000000..a9e64dbb27f --- /dev/null +++ b/cudax/examples/stf/launch_sum_cub.cu @@ -0,0 +1,73 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief A reduction kernel written using launch and CUB + */ + +#include + +#include + +using namespace cuda::experimental::stf; + +double X0(int i) +{ + return sin((double) i); +} + +int main() +{ + context ctx; + + const size_t N = 128 * 1024 * 1024; + + std::vector X(N); + double sum = 0.0; + + double ref_sum = 0.0; + + for (size_t ind = 0; ind < N; ind++) + { + X[ind] = sin((double) ind); + ref_sum += X[ind]; + } + + auto lX = ctx.logical_data(&X[0], {N}); + auto lsum = ctx.logical_data(&sum, {1}); + + auto number_devices = 2; + auto where = exec_place::repeat(exec_place::device(0), number_devices); + + auto spec = par<32>(con<128>()); + + ctx.launch(spec, where, lX.read(), lsum.rw())->*[] _CCCL_DEVICE(auto th, auto x, auto sum) { + // Each thread computes the sum of elements assigned to it + double local_sum = 0.0; + for (auto ind : th.apply_partition(shape(x))) + { + local_sum += x(ind); + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + double block_sum = BlockReduce(temp_storage).Sum(local_sum); + if (th.inner().rank() == 0) + { + atomicAdd(&sum(0), block_sum); + } + }; + + ctx.finalize(); + + EXPECT(fabs(sum - ref_sum) < 0.0001); +} diff --git a/cudax/examples/stf/linear_algebra/06-pdgemm.cu b/cudax/examples/stf/linear_algebra/06-pdgemm.cu new file mode 100644 index 00000000000..07835093b71 --- /dev/null +++ b/cudax/examples/stf/linear_algebra/06-pdgemm.cu @@ -0,0 +1,401 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief An example that implements a tiled matrix product over multiple devices using CUBLAS + * + * This also illustrates how the same code base can be used both with a + * stream_ctx and a graph_ctx backend. 
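 *
 * Each N x N matrix is cut into NB x NB tiles, and (in the OP_N/OP_N case
 * exercised in main) every output tile is accumulated as
 * C(m,n) = beta * C(m,n) + alpha * sum_k A(m,k) * B(k,n),
 * with one cublasDgemm task per term. Tasks are placed on the device returned
 * by get_preferred_devid(m, n), i.e. the tiles of C are distributed cyclically
 * over the p x q device grid computed in the matrix constructor.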
+ */ + +#include + +#include + +#define TILED + +using namespace cuda::experimental::stf; + +static std::unordered_map cublas_handles; + +/* Get a CUBLAS handle valid on the current device, or initialize it lazily */ +cublasHandle_t get_cublas_handle() +{ + int dev; + cuda_safe_call(cudaGetDevice(&dev)); + + auto& result = cublas_handles[dev]; + if (result == cublasHandle_t()) + { // not found, default value inserted + // Lazy initialization, and save the handle for future use + cuda_safe_call(cublasCreate(&result)); + } + return result; +} + +template +class matrix +{ +public: + matrix(stream_ctx& ctx, + size_t NROWS, + size_t NCOLS, + size_t BLOCKSIZE_ROWS, + size_t BLOCKSIZE_COLS, + const char* _symbol = "matrix") + { + symbol = _symbol; + + m = NROWS; + mb = BLOCKSIZE_ROWS; + + n = NCOLS; + nb = BLOCKSIZE_COLS; + + assert(m % mb == 0); + assert(n % nb == 0); + + size_t s = ((size_t) m) * ((size_t) n) * sizeof(T); + // cuda_safe_call(cudaMallocHost(&h_array, m*n*sizeof(T))); + // fprintf(stderr, "Allocating %ld x %ld x %ld = %ld bytes (%f GB) on host for %s\n", m, n, sizeof(T), s, + // s / (1024.0 * 1024.0 * 1024.0), _symbol); + h_array = (T*) malloc(s); + assert(h_array); + cuda_safe_call(cudaHostRegister(h_array, s, cudaHostRegisterPortable)); + + // Compute the number of blocks + mt = m / mb; + nt = n / nb; + + handles.resize(mt * nt); + + for (size_t colb = 0; colb < nt; colb++) + { + for (size_t rowb = 0; rowb < mt; rowb++) + { + T* addr_h = get_block_h(rowb, colb); + +#ifdef TILED + // tiles are stored contiguously + const size_t ld = mb; +#else + const size_t ld = m; +#endif + + std::ignore = ld; // avoid warning #177-D: variable "ld" was declared but never referenced + auto s = make_slice(addr_h, std::tuple{mb, nb}, ld); + auto tile = ctx.logical_data(s); + tile.set_write_back(false); + + tile.set_symbol(std::string(symbol) + "_" + std::to_string(rowb) + "_" + std::to_string(colb)); + + handles[rowb + colb * mt] = std::move(tile); + } + } + + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + for (int a = 1; a * a <= ndevs; a++) + { + if (ndevs % a == 0) + { + grid_p = a; + grid_q = ndevs / a; + } + } + + assert(grid_p * grid_q == ndevs); + + // std::cout << "FOUND " << ndevs << " DEVICES " + // << "p=" << grid_p << " q=" << grid_q << std::endl; + } + + int get_preferred_devid(int row, int col) + { + return (row % grid_p) + (col % grid_q) * grid_p; + } + + logical_data>& get_handle(int row, int col) + { + return handles[row + col * mt]; + } + + size_t get_index(size_t row, size_t col) + { +#ifdef TILED + // Find which tile contains this element + int tile_row = row / mb; + int tile_col = col / nb; + + size_t tile_size = mb * nb; + + // Look for the index of the begining of the tile + size_t tile_start = (tile_row + mt * tile_col) * tile_size; + + // Offset within the tile + size_t offset = (row % mb) + (col % nb) * mb; + + return tile_start + offset; +#else + return row + col * m; +#endif + } + + T* get_block_h(int brow, int bcol) + { + size_t index = get_index(brow * mb, bcol * nb); + return &h_array[index]; + } + + // Fill with func(Matrix*,row, col) + template + void fill(stream_ctx& ctx, Fun&& fun) + { + nvtxRangePushA("FILL"); + // Fill blocks by blocks + for (int colb = 0; colb < nt; colb++) + { + for (int rowb = 0; rowb < mt; rowb++) + { + // Each task fills a block + auto& h = get_handle(rowb, colb); + int devid = get_preferred_devid(rowb, colb); + + ctx.parallel_for(exec_place::device(devid), h.shape(), h.write()).set_symbol("INIT")->* + [=] _CCCL_DEVICE(size_t lrow, size_t 
lcol, auto sA) { + size_t row = lrow + rowb * sA.extent(0); + size_t col = lcol + colb * sA.extent(1); + sA(lrow, lcol) = fun(row, col); + }; + } + } + nvtxRangePop(); + } + + T* h_array; + size_t m; // nrows + size_t n; // ncols + + size_t mb; // block size (rows) + size_t nb; // block size (cols) + + size_t mt; // numter of column blocks + size_t nt; // numter of row blocks + + // abstract data handles + std::vector>> handles; + + const char* symbol; + + // for the mapping + int ndevs; + int grid_p, grid_q; +}; + +void DGEMM( + stream_ctx& ctx, + cublasOperation_t transa, + cublasOperation_t transb, + double alpha, + matrix& A, + int A_row, + int A_col, + matrix& B, + int B_row, + int B_col, + double beta, + matrix& C, + int C_row, + int C_col) +{ + auto dev = exec_place::device(C.get_preferred_devid(C_row, C_col)); + + auto t = ctx.task( + dev, A.get_handle(A_row, A_col).read(), B.get_handle(B_row, B_col).read(), C.get_handle(C_row, C_col).rw()); + t.set_symbol("DGEMM"); + + t->*[&](cudaStream_t stream, auto tA, auto tB, auto tC) { + cuda_safe_call(cublasSetStream(get_cublas_handle(), stream)); + int k = tA.extent(transa == CUBLAS_OP_N ? 1 : 0); + cuda_safe_call(cublasDgemm( + get_cublas_handle(), + transa, + transb, + tC.extent(0), + tC.extent(1), + k, + &alpha, + tA.data_handle(), + tA.stride(1), + tB.data_handle(), + tB.stride(1), + &beta, + tC.data_handle(), + tC.stride(1))); + }; +} + +void PDGEMM(stream_ctx& ctx, + cublasOperation_t transa, + cublasOperation_t transb, + double alpha, + matrix& A, + matrix& B, + double beta, + matrix& C) +{ + for (int m = 0; m < C.mt; m++) + { + for (int n = 0; n < C.nt; n++) + { + //========================================= + // alpha*A*B does not contribute; scale C + //========================================= + int inner_k = transa == CUBLAS_OP_N ? A.n : A.m; + if (alpha == 0.0 || inner_k == 0) + { + DGEMM(ctx, transa, transb, alpha, A, 0, 0, B, 0, 0, beta, C, m, n); + } + else if (transa == CUBLAS_OP_N) + { + //================================ + // CUBLAS_OP_N / CUBLAS_OP_N + //================================ + if (transb == CUBLAS_OP_N) + { + assert(A.nt == B.mt); + for (int k = 0; k < A.nt; k++) + { + double zbeta = k == 0 ? beta : 1.0; + DGEMM(ctx, transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); + } + } + //===================================== + // CUBLAS_OP_N / CUBLAS_OP_T + //===================================== + else + { + for (int k = 0; k < A.nt; k++) + { + double zbeta = k == 0 ? beta : 1.0; + DGEMM(ctx, transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); + } + } + } + else + { + //===================================== + // CUBLAS_OP_T / CUBLAS_OP_N + //===================================== + if (transb == CUBLAS_OP_N) + { + for (int k = 0; k < A.mt; k++) + { + double zbeta = k == 0 ? beta : 1.0; + DGEMM(ctx, transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); + } + } + //========================================== + // CUBLAS_OP_T / CUBLAS_OP_T + //========================================== + else + { + for (int k = 0; k < A.mt; k++) + { + double zbeta = k == 0 ? 
beta : 1.0; + DGEMM(ctx, transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); + } + } + } + } + } +} + +void run(stream_ctx& ctx, size_t N, size_t NB) +{ + auto fixed_alloc = block_allocator(ctx, NB * NB * sizeof(double)); + ctx.set_allocator(fixed_alloc); + + // Set up CUBLAS and CUSOLVER + int ndevs; + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + + /* Warm up allocators */ + for (size_t d = 0; d < ndevs; d++) + { + auto lX = ctx.logical_data(shape_of>(1)); + ctx.parallel_for(exec_place::device(d), lX.shape(), lX.write())->*[] _CCCL_DEVICE(size_t, auto) {}; + } + + /* Initializes CUBLAS on all devices */ + for (size_t d = 0; d < ndevs; d++) + { + cuda_safe_call(cudaSetDevice(d)); + get_cublas_handle(); + } + + matrix A(ctx, N, N, NB, NB, "A"); + matrix B(ctx, N, N, NB, NB, "B"); + matrix C(ctx, N, N, NB, NB, "C"); + + // (Hilbert matrix + 2*N*Id) + auto hilbert = [=] _CCCL_HOST_DEVICE(size_t row, size_t col) { + return 1.0 / (col + row + 1.0) + 2.0 * N * (col == row); + }; + + A.fill(ctx, hilbert); + B.fill(ctx, hilbert); + C.fill(ctx, hilbert); + + cudaEvent_t startEvent, stopEvent; + + cuda_safe_call(cudaEventCreate(&startEvent)); + cuda_safe_call(cudaEventCreate(&stopEvent)); + + cuda_safe_call(cudaEventRecord(startEvent, ctx.task_fence())); + + PDGEMM(ctx, CUBLAS_OP_N, CUBLAS_OP_N, 1.0, A, B, -2.0, C); + + cuda_safe_call(cudaEventRecord(stopEvent, ctx.task_fence())); + + ctx.finalize(); + + float milliseconds; + cuda_safe_call(cudaEventElapsedTime(&milliseconds, startEvent, stopEvent)); + + double gflops_pdgemm = 2.0 * ((double) N * (double) N * (double) N) / (1000000000.0); + std::cout + << "[PDDGEMM] ELAPSED: " << milliseconds << " ms, GFLOPS: " << gflops_pdgemm / (milliseconds / 1000.0) << '\n'; +} + +int main(int argc, char** argv) +{ + size_t N = 4096; + size_t NB = 512; + + if (argc > 1) + { + N = atoi(argv[1]); + } + + if (argc > 2) + { + NB = atoi(argv[2]); + } + + assert(N % NB == 0); + + stream_ctx ctx; + run(ctx, N, NB); + + // // Also run using a graph context. + // ctx = graph_ctx(); + // run(ctx, N, NB); +} diff --git a/cudax/examples/stf/linear_algebra/07-cholesky.cu b/cudax/examples/stf/linear_algebra/07-cholesky.cu new file mode 100644 index 00000000000..144721ff184 --- /dev/null +++ b/cudax/examples/stf/linear_algebra/07-cholesky.cu @@ -0,0 +1,746 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief This example implements a Cholesky decomposition over multiple devices using CUBLAS and CUSOLVER + * + * It also illustrates how we can use CUDASTF to allocate temporary data for CUSOLVER in CUDASTF tasks + */ + +#include + +#include +#include + +#define TILED + +using namespace cuda::experimental::stf; + +// Global for the sake of simplicity ! 
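// (The ctx declared just below is reused by all the task-submission helpers
// in this file.)
//
// For reference, PDPOTRF further down implements the classic right-looking
// blocked Cholesky factorization A = L*L^T of a symmetric positive-definite
// matrix, tile by tile. For every diagonal index K it performs, on the tiles
// of the lower triangle,
//
//   A(K,K)  = chol(A(K,K))             // DPOTRF on the diagonal tile
//   A(i,K)  = A(i,K) * A(K,K)^-T       // DTRSM, for every i > K
//   A(i,j) -= A(i,K) * A(j,K)^T        // DGEMM, for K < j < i
//   A(i,i) -= A(i,K) * A(i,K)^T        // DSYRK, for every i > K
//
// and each tile operation is submitted as a task on the device that owns its
// output tile (get_preferred_devid).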
+stream_ctx ctx; + +/* Get a CUBLAS handle valid on the current device, or initialize it lazily */ +cublasHandle_t& get_cublas_handle() +{ + int dev; + cuda_safe_call(cudaGetDevice(&dev)); + + static std::unordered_map cublas_handles; + auto& result = cublas_handles[dev]; + if (result == cublasHandle_t()) + { // not found, default value inserted + // Lazy initialization, and save the handle for future use + cuda_safe_call(cublasCreate(&result)); + } + return result; +} + +/* Get a CUSOLVER handle valid on the current device, or initialize it lazily */ +cusolverDnHandle_t& get_cusolver_handle() +{ + int dev; + cuda_safe_call(cudaGetDevice(&dev)); + + static std::unordered_map cusolver_handles; + auto& result = cusolver_handles[dev]; + if (result == cusolverDnHandle_t()) + { // not found, default value inserted + // Lazy initialization, and save the handle for future use + cuda_safe_call(cusolverDnCreate(&result)); + } + return result; +} + +template +class matrix +{ +public: + matrix(int NROWS, int NCOLS, int BLOCKSIZE_ROWS, int BLOCKSIZE_COLS, bool is_sym, const char* _symbol = "matrix") + { + symbol = _symbol; + + sym_matrix = is_sym; + + m = NROWS; + mb = BLOCKSIZE_ROWS; + + n = NCOLS; + nb = BLOCKSIZE_COLS; + + assert(m % mb == 0); + assert(n % nb == 0); + + // cuda_safe_call(cudaMallocHost(&h_array, m*n*sizeof(T))); + // fprintf(stderr, "Allocating %ld x %ld x %ld = %ld bytes (%f GB) on host for %s\n", m, n, sizeof(T), s, + // s / (1024.0 * 1024.0 * 1024.0), _symbol); + h_array.resize(m * n); + cuda_safe_call(cudaHostRegister(&h_array[0], h_array.size() * sizeof(T), cudaHostRegisterPortable)); + + // Compute the number of blocks + mt = m / mb; + nt = n / nb; + + handles.resize(mt * nt); + + for (int colb = 0; colb < nt; colb++) + { + int low_rowb = sym_matrix ? 
colb : 0; + for (int rowb = low_rowb; rowb < mt; rowb++) + { + T* addr_h = get_block_h(rowb, colb); + auto& h = handle(rowb, colb); + +#ifdef TILED + // tiles are stored contiguously + size_t ld = mb; +#else + size_t ld = m; +#endif + std::ignore = ld; // work around bug in compiler + h = ctx.logical_data(make_slice(addr_h, std::tuple{mb, nb}, ld)); + h.set_symbol(std::string(symbol) + "_" + std::to_string(rowb) + "_" + std::to_string(colb)); + h.set_write_back(false); + } + } + + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + for (int a = 1; a * a <= ndevs; a++) + { + if (ndevs % a == 0) + { + grid_p = a; + grid_q = ndevs / a; + } + } + + assert(grid_p * grid_q == ndevs); + + // std::cout << "FOUND " << ndevs << " DEVICES " + // << "p=" << grid_p << " q=" << grid_q << std::endl; + } + + int get_preferred_devid(int row, int col) + { + return (row % grid_p) + (col % grid_q) * grid_p; + } + + auto& handle(int row, int col) + { + return handles[row + col * mt]; + } + + size_t get_index(size_t row, size_t col) + { +#ifdef TILED + // Find which tile contains this element + int tile_row = row / mb; + int tile_col = col / nb; + + size_t tile_size = mb * nb; + + // Look for the index of the begining of the tile + size_t tile_start = (tile_row + mt * tile_col) * tile_size; + + // Offset within the tile + size_t offset = (row % mb) + (col % nb) * mb; + + return tile_start + offset; +#else + return row + col * m; +#endif + } + + T* get_block_h(int brow, int bcol) + { + size_t index = get_index(brow * mb, bcol * nb); + return &h_array[index]; + } + + // Fill with func(Matrix*,row, col) + template + void fill(Fun&& fun) + { + nvtxRangePushA("FILL"); + // Fill blocks by blocks + for (int colb = 0; colb < nt; colb++) + { + int low_rowb = sym_matrix ? colb : 0; + for (int rowb = low_rowb; rowb < mt; rowb++) + { + // Each task fills a block + auto& h = handle(rowb, colb); + int devid = get_preferred_devid(rowb, colb); + + ctx.parallel_for(exec_place::device(devid), h.shape(), h.write()).set_symbol("INIT")->* + [=] _CCCL_DEVICE(size_t lrow, size_t lcol, auto sA) { + size_t row = lrow + rowb * sA.extent(0); + size_t col = lcol + colb * sA.extent(1); + sA(lrow, lcol) = fun(row, col); + }; + } + } + nvtxRangePop(); + } + + std::vector h_array; + size_t m; // nrows + size_t n; // ncols + + // Is this a sym matrix ? (lower assumed) + bool sym_matrix; + + size_t mb; // block size (rows) + size_t nb; // block size (cols) + + size_t mt; // number of column blocks + size_t nt; // number of row blocks + + // abstract data handles + std::vector>> handles; + + const char* symbol; + + // for the mapping + int ndevs; + int grid_p, grid_q; +}; + +void DPOTRF(cublasFillMode_t uplo, class matrix& A, int A_row, int A_col) +{ + auto& Akk = A.handle(A_row, A_col); + size_t m_akk = Akk.shape().extent(0); + // Note that the handle may be different from the actual handle... 
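  // This is where the file-level comment about allocating temporary data for
  // CUSOLVER through CUDASTF applies: the bufferSize query below runs on the
  // host to size the workspace for this tile, a scratch logical_data is then
  // created from a shape only (no user-provided buffer) together with a
  // devInfo logical_data for the status word, and both are passed to the task
  // as write() dependencies. Their backing memory is managed by the library
  // and simply goes out of scope at the end of this function, no explicit
  // synchronization or cudaFree is needed here.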
+ int Lwork_expected; + cuda_safe_call(cusolverDnDpotrf_bufferSize(get_cusolver_handle(), uplo, m_akk, nullptr, 0, &Lwork_expected)); + + auto potrf_buffer = ctx.logical_data(size_t(Lwork_expected)); + potrf_buffer.set_allocator(ctx.get_default_allocator()); + + auto devInfo = ctx.logical_data(shape_of>(1)); + + auto t = + ctx.task(exec_place::device(A.get_preferred_devid(A_row, A_col)), Akk.rw(), potrf_buffer.write(), devInfo.write()); + t.set_symbol("DPOTRF"); + t->*[uplo](cudaStream_t s, auto sAkk, auto buffer, auto info) { + auto& h = get_cusolver_handle(); + cuda_safe_call(cusolverDnSetStream(h, s)); + + cuda_safe_call(cusolverDnDpotrf( + h, + uplo, + sAkk.extent(0), + sAkk.data_handle(), + sAkk.stride(1), + buffer.data_handle(), + buffer.extent(0), + info.data_handle())); + }; +} + +void DGEMM( + cublasOperation_t transa, + cublasOperation_t transb, + double alpha, + class matrix& A, + int A_row, + int A_col, + class matrix& B, + int B_row, + int B_col, + double beta, + class matrix& C, + int C_row, + int C_col) +{ + auto t = ctx.task(exec_place::device(A.get_preferred_devid(C_row, C_col)), + A.handle(A_row, A_col).read(), + B.handle(B_row, B_col).read(), + C.handle(C_row, C_col).rw()); + t.set_symbol("DGEMM"); + t->*[transa, transb, alpha, beta](cudaStream_t s, auto sA, auto sB, auto sC) { + auto& h = get_cublas_handle(); + cuda_safe_call(cublasSetStream(h, s)); + + auto k = (transa == CUBLAS_OP_N) ? sA.extent(1) : sA.extent(0); + cuda_safe_call(cublasDgemm( + h, + transa, + transb, + sC.extent(0), + sC.extent(1), + k, + &alpha, + sA.data_handle(), + sA.stride(1), + sB.data_handle(), + sB.stride(1), + &beta, + sC.data_handle(), + sC.stride(1))); + }; +} + +void DSYRK( + cublasFillMode_t uplo, + cublasOperation_t trans, + double alpha, + class matrix& A, + int A_row, + int A_col, + double beta, + class matrix& C, + int C_row, + int C_col) +{ + auto t = ctx.task(exec_place::device(A.get_preferred_devid(C_row, C_col)), + A.handle(A_row, A_col).read(), + C.handle(C_row, C_col).rw()); + t.set_symbol("DSYRK"); + t->*[uplo, trans, alpha, beta](cudaStream_t s, auto sA, auto sC) { + auto& h = get_cublas_handle(); + cuda_safe_call(cublasSetStream(h, s)); + + // number of rows of matrix op(A) and C + auto n = sC.extent(0); + + // number of columns of matrix op(A) + auto k = (trans == CUBLAS_OP_N) ? 
sA.extent(1) : sA.extent(0); + + cuda_safe_call( + cublasDsyrk(h, uplo, trans, n, k, &alpha, sA.data_handle(), sA.stride(1), &beta, sC.data_handle(), sC.stride(1))); + }; +} + +void DTRSM( + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t transa, + cublasDiagType_t diag, + double alpha, + class matrix& A, + int A_row, + int A_col, + class matrix& B, + int B_row, + int B_col) +{ + auto t = ctx.task(exec_place::device(A.get_preferred_devid(B_row, B_col)), + A.handle(A_row, A_col).read(), + B.handle(B_row, B_col).rw()); + t.set_symbol("DTRSM"); + t->*[side, uplo, transa, diag, alpha](cudaStream_t s, auto sA, auto sB) { + auto& h = get_cublas_handle(); + cuda_safe_call(cublasSetStream(h, s)); + + cuda_safe_call(cublasDtrsm( + h, + side, + uplo, + transa, + diag, + sB.extent(0), + sB.extent(1), + &alpha, + sA.data_handle(), + sA.stride(1), + sB.data_handle(), + sB.stride(1))); + }; +} + +void PDNRM2_HOST(matrix* A, double* result) +{ +#ifdef HAVE_DOT + reserved::dot::set_current_color("red"); +#endif + + for (int rowb = 0; rowb < A->mt; rowb++) + { + for (int colb = 0; colb < A->nt; colb++) + { + ctx.host_launch(A->handle(rowb, colb).read())->*[=](auto sA) { + double res2 = 0.0; + for (size_t col = 0; col < sA.extent(1); col++) + { + for (size_t row = 0; row < sA.extent(0); row++) + { + double v = sA(row, col); + res2 += v * v; + } + } + *result += res2; + }; + } + } +} + +void PDPOTRF(matrix& A) +{ +#ifdef HAVE_DOT + reserved::dot::set_current_color("yellow"); +#endif + + assert(A.m == A.n); + assert(A.mt == A.nt); + + int NBLOCKS = A.mt; + assert(A.mb == A.nb); + + cuda_safe_call(cudaSetDevice(0)); + + nvtxRangePushA("SUBMIT_PDPOTRF"); + for (int K = 0; K < NBLOCKS; K++) + { + int dev_akk = A.get_preferred_devid(K, K); + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(K, K))); + DPOTRF(CUBLAS_FILL_MODE_LOWER, A, K, K); + + for (int row = K + 1; row < NBLOCKS; row++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, K))); + DTRSM(CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, 1.0, A, K, K, A, row, K); + + for (int col = K + 1; col < row; col++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col))); + DGEMM(CUBLAS_OP_N, CUBLAS_OP_T, -1.0, A, row, K, A, col, K, 1.0, A, row, col); + } + + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, row))); + DSYRK(CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, -1.0, A, row, K, 1.0, A, row, row); + } + } + cuda_safe_call(cudaSetDevice(0)); + + nvtxRangePop(); +} + +// Algorithm from PLASMA +void PDTRSM(cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + double alpha, + class matrix& A, + class matrix& B) +{ + // std::cout << "[PDTRSM] START B MT " << B.mt << " NT " << B.nt << std::endl; + + if (side == CUBLAS_SIDE_LEFT) + { + if (uplo == CUBLAS_FILL_MODE_UPPER) + { + // TODO + assert(0); + abort(); + } + else + { + //=========================================== + // CUBLAS_SIDE_LEFT / CUBLAS_FILL_MODE_LOWER / CUBLAS_OP_N + //=========================================== + if (trans == CUBLAS_OP_N) + { + for (int k = 0; k < B.mt; k++) + { + double lalpha = k == 0 ? 
alpha : 1.0; + for (int n = 0; n < B.nt; n++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(k, k))); + DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n); + } + for (int m = k + 1; m < B.mt; m++) + { + for (int n = 0; n < B.nt; n++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(m, k))); + DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n); + } + } + } + } + //================================================ + // CUBLAS_SIDE_LEFT / CUBLAS_FILL_MODE_LOWER / CUBLAS_OP_[C|T] + //================================================ + else + { + for (int k = 0; k < B.mt; k++) + { + double lalpha = k == 0 ? alpha : 1.0; + for (int n = 0; n < B.nt; n++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1))); + DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n); + } + for (int m = k + 1; m < B.mt; m++) + { + for (int n = 0; n < B.nt; n++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m))); + DGEMM( + trans, CUBLAS_OP_N, -1.0, A, B.mt - k - 1, B.mt - 1 - m, B, B.mt - k - 1, n, lalpha, B, B.mt - 1 - m, n); + } + } + } + } + } + } + else + { + // TODO + abort(); + } + cuda_safe_call(cudaSetDevice(0)); + // std::cout << "[PDTRSM] END" << std::endl; +} + +void PDPOTRS(matrix& A, class matrix& B, cublasFillMode_t uplo) +{ +#ifdef HAVE_DOT + reserved::dot::set_current_color("green"); +#endif + + // std::cout << "[PDPOTRS] START" << std::endl; + // Call the parallel functions. + PDTRSM( + CUBLAS_SIDE_LEFT, uplo, uplo == CUBLAS_FILL_MODE_UPPER ? CUBLAS_OP_T : CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, 1.0, A, B); + +#ifdef HAVE_DOT + reserved::dot::set_current_color("darkgreen"); +#endif + + PDTRSM( + CUBLAS_SIDE_LEFT, uplo, uplo == CUBLAS_FILL_MODE_UPPER ? CUBLAS_OP_N : CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, 1.0, A, B); + // std::cout << "[PDPOTRS] END" << std::endl; +} + +/***************************************************************************** + * Parallel tile matrix-matrix + *multiplication. + * @see plasma_omp_dgemm + ******************************************************************************/ +void PDGEMM(cublasOperation_t transa, + cublasOperation_t transb, + double alpha, + class matrix& A, + class matrix& B, + double beta, + class matrix& C) +{ +#ifdef HAVE_DOT + reserved::dot::set_current_color("blue"); +#endif + + for (int m = 0; m < C.mt; m++) + { + for (int n = 0; n < C.nt; n++) + { + //========================================= + // alpha*A*B does not contribute; scale C + //========================================= + int inner_k = transa == CUBLAS_OP_N ? A.n : A.m; + if (alpha == 0.0 || inner_k == 0) + { + DGEMM(transa, transb, alpha, A, 0, 0, B, 0, 0, beta, C, m, n); + } + else if (transa == CUBLAS_OP_N) + { + //================================ + // CUBLAS_OP_N / CUBLAS_OP_N + //================================ + if (transb == CUBLAS_OP_N) + { + for (int k = 0; k < A.nt; k++) + { + double zbeta = k == 0 ? beta : 1.0; + DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); + } + } + //===================================== + // CUBLAS_OP_N / CUBLAS_OP_T + //===================================== + else + { + for (int k = 0; k < A.nt; k++) + { + double zbeta = k == 0 ? 
beta : 1.0; + DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); + } + } + } + else + { + //===================================== + // CUBLAS_OP_T / CUBLAS_OP_N + //===================================== + if (transb == CUBLAS_OP_N) + { + for (int k = 0; k < A.mt; k++) + { + double zbeta = k == 0 ? beta : 1.0; + DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); + } + } + //========================================== + // CUBLAS_OP_T / CUBLAS_OP_T + //========================================== + else + { + for (int k = 0; k < A.mt; k++) + { + double zbeta = k == 0 ? beta : 1.0; + DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); + } + } + } + } + } +} + +int main(int argc, char** argv) +{ + int N = 1024; + int NB = 128; + + if (argc > 1) + { + N = atoi(argv[1]); + } + + if (argc > 2) + { + NB = atoi(argv[2]); + } + + int check_result = 1; + if (getenv("CHECK_RESULT")) + { + check_result = atoi(getenv("CHECK_RESULT")); + } + + assert(N % NB == 0); + + // Use pools of preallocated blocks + auto fixed_alloc = block_allocator(ctx, NB * NB * sizeof(double)); + ctx.set_allocator(fixed_alloc); + + // Set up CUBLAS and CUSOLVER + int ndevs; + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + + for (size_t d = 0; d < ndevs; d++) + { + auto lX = ctx.logical_data(shape_of>(1)); + ctx.parallel_for(exec_place::device(d), lX.shape(), lX.write())->*[] _CCCL_DEVICE(size_t, auto) {}; + cuda_safe_call(cudaSetDevice(d)); + get_cublas_handle(); + get_cusolver_handle(); + } + + cuda_safe_call(cudaSetDevice(0)); + + matrix A(N, N, NB, NB, true, "A"); + matrix Aref(N, N, NB, NB, false, "Aref"); + + // (Hilbert matrix + 2*N*Id) to have a diagonal dominant matrix + auto hilbert = [=] _CCCL_HOST_DEVICE(size_t row, size_t col) { + return 1.0 / (col + row + 1.0) + 2.0 * N * (col == row); + }; + + if (check_result) + { + Aref.fill(hilbert); + } + + A.fill(hilbert); + + /* Right-hand side */ + matrix B_potrs(N, 1, NB, 1, false, "B"); + matrix Bref_potrs(N, 1, NB, 1, false, "Bref"); + + if (check_result) + { + auto rhs_vals = [] _CCCL_HOST_DEVICE(size_t row, size_t /*unused*/) { + return 1.0 * (row + 1); + }; + B_potrs.fill(rhs_vals); + Bref_potrs.fill(rhs_vals); + } + + // // Compute ||Bref|| + double Bref_nrm2 = 0.0; + double res_nrm2 = 0.0; + + if (check_result) + { + PDNRM2_HOST(&Bref_potrs, &Bref_nrm2); + } + + cudaEvent_t startEvent_pdpotrf, stopEvent_pdpotrf; + float milliseconds_pdpotrf = 0; + + // for (int row = 0; row < A.mt; row++) + // { + // for (int col = 0; col <= row; col++) + // { + // cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col))); + // NOOP(A, row, col); + // } + // } + + cuda_safe_call(cudaStreamSynchronize(ctx.task_fence())); + + cuda_safe_call(cudaEventCreate(&startEvent_pdpotrf)); + cuda_safe_call(cudaEventCreate(&stopEvent_pdpotrf)); + + cuda_safe_call(cudaEventRecord(startEvent_pdpotrf, ctx.task_fence())); + + PDPOTRF(A); + + cuda_safe_call(cudaEventRecord(stopEvent_pdpotrf, ctx.task_fence())); + + /* + * POTRS + */ + + if (check_result) + { + // Solve AX = B and put the result in B + PDPOTRS(A, B_potrs, CUBLAS_FILL_MODE_LOWER); + + // Compute (AX - B) + // Bref = (Aref*B - Bref) + PDGEMM(CUBLAS_OP_N, CUBLAS_OP_N, 1.0, Aref, B_potrs, -1.0, Bref_potrs); + + // Compute ||AX - B|| = ||Bref|| + PDNRM2_HOST(&Bref_potrs, &res_nrm2); + } + + ctx.finalize(); + + cuda_safe_call(cudaEventElapsedTime(&milliseconds_pdpotrf, startEvent_pdpotrf, stopEvent_pdpotrf)); + + double gflops_pdpotrf = 1.0 / 3.0 * ((double) N * (double) N * (double) N) / 
(1000000000.0); + std::cout << "[PDPOTRF] ELAPSED: " << milliseconds_pdpotrf + << " ms, GFLOPS: " << gflops_pdpotrf / (milliseconds_pdpotrf / 1000.0) << std::endl; + + if (check_result) + { + if (double residual = sqrt(res_nrm2) / sqrt(Bref_nrm2); residual >= 0.01) + { + std::cerr << "[POTRS] ||AX - B|| : " << sqrt(res_nrm2) << std::endl; + std::cerr << "[POTRS] ||B|| : " << sqrt(Bref_nrm2) << std::endl; + std::cerr << "[POTRS] RESIDUAL (||AX - B||/||B||) : " << residual << std::endl; + assert(!"Algorithm did not converge."); + } + } +} diff --git a/cudax/examples/stf/linear_algebra/07-potri.cu b/cudax/examples/stf/linear_algebra/07-potri.cu new file mode 100644 index 00000000000..e3fb3dd55b7 --- /dev/null +++ b/cudax/examples/stf/linear_algebra/07-potri.cu @@ -0,0 +1,1689 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief This example implements the POTRI matrix inversion algorithm over multiple devices + * + * + */ +#include + +#include +#include +#include + +#define TILED + +using namespace cuda::experimental::stf; + +stream_ctx ctx; + +static std::unordered_map cublas_handles; +static std::unordered_map cusolver_handles; + +/* Get a CUBLAS handle valid on the current device, or initialize it lazily */ +cublasHandle_t& get_cublas_handle() +{ + int dev = cuda_try(); + + auto& result = cublas_handles[dev]; + if (result == cublasHandle_t()) + { // not found, default value inserted + // Lazy initialization, and save the handle for future use + cuda_try(cublasCreate(&result)); + } + return result; +} + +/* Get a CUSOLVER handle valid on the current device, or initialize it lazily */ +cusolverDnHandle_t& get_cusolver_handle() +{ + int dev; + cuda_try(cudaGetDevice(&dev)); + + auto& result = cusolver_handles[dev]; + if (result == cusolverDnHandle_t()) + { // not found, default value inserted + // Lazy initialization, and save the handle for future use + cuda_try(cusolverDnCreate(&result)); + } + return result; +} + +template +class matrix +{ +public: + matrix(int NROWS, int NCOLS, int BLOCKSIZE_ROWS, int BLOCKSIZE_COLS, bool is_sym, const char* _symbol = "matrix") + { + symbol = _symbol; + + sym_matrix = is_sym; + + m = NROWS; + mb = BLOCKSIZE_ROWS; + + n = NCOLS; + nb = BLOCKSIZE_COLS; + + assert(m % mb == 0); + assert(n % nb == 0); + + size_t s = m * n * sizeof(T); + // cuda_try(cudaMallocHost(&h_array, m*n*sizeof(T))); + // fprintf(stderr, "Allocating %ld x %ld x %ld = %ld bytes (%f GB) on host for %s\n", m, n, sizeof(T), s, + // s / (1024.0 * 1024.0 * 1024.0), _symbol); + h_array = (T*) malloc(s); + assert(h_array); + cuda_try(cudaHostRegister(h_array, s, cudaHostRegisterPortable)); + // cuda_try(cudaMalloc(&d_array, m*n*sizeof(T))); + + // Compute the number of blocks + mt = m / mb; + nt = n / nb; + + handles.resize(mt * nt); + + for (int colb = 0; colb < nt; colb++) + { + int low_rowb = sym_matrix ? 
colb : 0; + for (int rowb = low_rowb; rowb < mt; rowb++) + { + T* addr_h = get_block_h(rowb, colb); + auto& h = get_handle(rowb, colb); + +#ifdef TILED + // tiles are stored contiguously + size_t ld = mb; +#else + size_t ld = m; +#endif + std::ignore = ld; // work around compiler bug + h = ctx.logical_data(make_slice(addr_h, std::tuple{mb, nb}, ld)); + h.set_symbol(std::string(symbol) + "_" + std::to_string(rowb) + "_" + std::to_string(colb)); + h.set_write_back(false); + } + } + + cuda_try(cudaGetDeviceCount(&ndevs)); + for (int a = 1; a * a <= ndevs; a++) + { + if (ndevs % a == 0) + { + grid_p = a; + grid_q = ndevs / a; + } + } + + assert(grid_p * grid_q == ndevs); + + // std::cout << "FOUND " << ndevs << " DEVICES " + // << "p=" << grid_p << " q=" << grid_q << std::endl; + } + + int get_preferred_devid(int row, int col) + { + return (row % grid_p) + (col % grid_q) * grid_p; + } + + auto& get_handle(int row, int col) + { + return handles[row + col * mt]; + } + + size_t get_index(size_t row, size_t col) + { +#ifdef TILED + // Find which tile contains this element + int tile_row = row / mb; + int tile_col = col / nb; + + size_t tile_size = mb * nb; + + // Look for the index of the begining of the tile + size_t tile_start = (tile_row + mt * tile_col) * tile_size; + + // Offset within the tile + size_t offset = (row % mb) + (col % nb) * mb; + + return tile_start + offset; +#else + return row + col * m; +#endif + } + + T* get_block_h(int brow, int bcol) + { + size_t index = get_index(brow * mb, bcol * nb); + return &h_array[index]; + } + + // Fill with func(Matrix*,row, col) + template + void fill(Fun&& fun) + { + nvtxRangePushA("FILL"); + // Fill blocks by blocks + for (int colb = 0; colb < nt; colb++) + { + int low_rowb = sym_matrix ? colb : 0; + for (int rowb = low_rowb; rowb < mt; rowb++) + { + // Each task fills a block + auto& h = get_handle(rowb, colb); + int devid = get_preferred_devid(rowb, colb); + + ctx.parallel_for(exec_place::device(devid), h.shape(), h.write()).set_symbol("INIT")->* + [=] _CCCL_DEVICE(size_t lrow, size_t lcol, auto sA) { + size_t row = lrow + rowb * sA.extent(0); + size_t col = lcol + colb * sA.extent(1); + sA(lrow, lcol) = fun(row, col); + }; + } + } + nvtxRangePop(); + } + + // Print blocks + void print() + { + // print blocks by blocks + for (int colb = 0; colb < nt; colb++) + { + int low_rowb = sym_matrix ? colb : 0; + for (int rowb = low_rowb; rowb < mt; rowb++) + { + // Each task fills a block + ctx.host_launch(get_handle(rowb, colb).read())->*[=](auto sA) { + for (int lcol = 0; lcol < sA.extent(1); lcol++) + { + size_t col = lcol + colb * sA.extent(1); + for (int lrow = 0; lrow < sA.extent(0); lrow++) + { + size_t row = lrow + rowb * sA.extent(0); + + fprintf(stderr, "%d,%d : %le\n", row, col, sA(lrow, lcol)); + } + } + }; + } + } + } + + T* h_array; + T* d_array; + size_t m; // nrows + size_t n; // ncols + + // Is this a sym matrix ? (lower assumed) + bool sym_matrix; + + size_t mb; // block size (rows) + size_t nb; // block size (cols) + + size_t mt; // number of column blocks + size_t nt; // number of row blocks + + // abstract data handles + std::vector>> handles; + + const char* symbol; + + // for the mapping + int ndevs; + int grid_p, grid_q; +}; + +void DPOTRF(cublasFillMode_t uplo, matrix& A, int A_row, int A_col) +{ + auto& Akk = A.get_handle(A_row, A_col); + size_t m_akk = Akk.shape().extent(0); + // Note that the handle may be different from the actual handle... 
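+  // The task created below declares its accesses explicitly: Akk.rw() for the
+  // tile being factored, and write() for the scratch buffer and devInfo.
+  // CUDASTF orders tasks from these declared accesses, so the tiled kernels
+  // need no explicit stream or event synchronization between them.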
+ int Lwork_expected; + cuda_safe_call(cusolverDnDpotrf_bufferSize(get_cusolver_handle(), uplo, m_akk, nullptr, 0, &Lwork_expected)); + + auto potrf_buffer = ctx.logical_data(shape_of>(Lwork_expected)); + potrf_buffer.set_allocator(ctx.get_default_allocator()); + + auto devInfo = ctx.logical_data(shape_of>(1)); + + auto t = ctx.task(Akk.rw(), potrf_buffer.write(), devInfo.write()); + t.set_symbol("DPOTRF"); + t->*[&](auto s, auto sAkk, auto buffer, auto info) { + auto& h = get_cusolver_handle(); + cuda_try(cusolverDnSetStream(h, s)); + + cuda_try(cusolverDnDpotrf( + h, + uplo, + sAkk.extent(0), + sAkk.data_handle(), + sAkk.stride(1), + buffer.data_handle(), + buffer.extent(0), + info.data_handle())); + }; +} + +void DTRTRI(cublasFillMode_t uplo, cublasDiagType_t diag, matrix& A, int A_row, int A_col) +{ + // Preallocate a buffer used by CUSOLVER + size_t workspaceInBytesOnDevice, workspaceInBytesOnHost; + int64_t m_a00 = A.mb; + assert(A.mb == A.nb); + + cuda_try(cusolverDnXtrtri_bufferSize( + get_cusolver_handle(), + uplo, + diag, + m_a00, + CUDA_R_64F /* DTRTRI */, + nullptr, + m_a00, + &workspaceInBytesOnDevice, + &workspaceInBytesOnHost)); + + // We don't support allocating buffers of 0 bytes ... XXX + if (workspaceInBytesOnHost == 0) + { + workspaceInBytesOnHost = 8; + } + + auto d_buffer = ctx.logical_data(shape_of>(workspaceInBytesOnDevice)); + auto h_buffer = ctx.logical_data(shape_of>(workspaceInBytesOnHost)); + d_buffer.set_allocator(ctx.get_default_allocator()); + h_buffer.set_allocator(ctx.get_default_allocator()); + + auto devInfo = ctx.logical_data(shape_of>(1)); + + auto t = + ctx.task(A.get_handle(A_row, A_col).rw(), d_buffer.write(), h_buffer.write(data_place::managed), devInfo.write()); + t.set_symbol("DTRTRI"); + t->*[&](auto s, auto sA, auto dbuffer, auto hbuffer, auto info) { + auto& h = get_cusolver_handle(); + cuda_try(cusolverDnSetStream(h, s)); + + // DTRTRI(...) + cuda_try(cusolverDnXtrtri( + h, + uplo, + diag, + sA.extent(0), + CUDA_R_64F /* DTRTRI */, + sA.data_handle(), + sA.stride(1), + (double*) dbuffer.data_handle(), + workspaceInBytesOnDevice, + (double*) hbuffer.data_handle(), + workspaceInBytesOnHost, + info.data_handle())); + }; +} + +/* + * Note: this code was taken from CUSOLVER + * + * SLACPY copies all or part of a two-dimensional matrix A to another matrix B. + * + * up up_and_lo + * 1 0 upper triangle, including diagonal + * 0 0 lower triangle, including diagonal + * ? 1 whole matrix + * + * configuration: + * dim3 grids( m/VEC, m/BY ) + * dim3 threads(VEC,BY) + */ +template +__global__ void __launch_bounds__(1 << (VEC_LOG + BY_LOG)) + lacpy_kernel(int m, int n, const T_ELEM_SRC* A, size_t lda, T_ELEM_DST* B, size_t ldb, int up, int up_and_lo) +{ + const int VEC = (1 << VEC_LOG); + const int BY = (1 << BY_LOG); + + const int inx = threadIdx.x; + const int iny = threadIdx.y; + + const int ibx = blockIdx.x * VEC; + const int iby = blockIdx.y * BY; + + const int i = ibx + inx; + const int j = iby + iny; + + if (ibx >= m) + { + return; + } + if (iby >= n) + { + return; + } + + T_ELEM_SRC Areg = T_ELEM_SRC(0); + + if (up_and_lo) + { + /* + * copy whole matrix + DO 60 J = 1, N + DO 50 I = 1, M + B( I, J ) = A( I, J ) + 50 CONTINUE + 60 CONTINUE + */ + if ((i < m) && (j < n)) + { + Areg = A[i + j * lda]; + B[i + j * ldb] = T_ELEM_DST(Areg); + } + return; + } + + // only lower or upper triangle is copied. 
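+  // In the two branches below, up == 1 keeps elements with i <= j (upper
+  // triangle, diagonal included) and up == 0 keeps elements with j <= i
+  // (lower triangle, diagonal included), matching the caller's uplo flag.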
+ if (up) + { + /* + * copy upper triangle, including diagonal + DO 20 J = 1, N + DO 10 I = 1, MIN( J, M ) + B( I, J ) = A( I, J ) + 10 CONTINUE + 20 CONTINUE + */ + if ((i <= min(j, m - 1)) && (j < n)) + { + Areg = A[i + j * lda]; + B[i + j * ldb] = T_ELEM_DST(Areg); + } + } + else + { + /* + * copy lower triangle, including diagonal + DO 40 J = 1, N + DO 30 I = J, M + B( I, J ) = A( I, J ) + 30 CONTINUE + 40 CONTINUE + + */ + if (((j <= i) && (i < m)) && (j < n)) + { + Areg = A[i + j * lda]; + B[i + j * ldb] = T_ELEM_DST(Areg); + } + } +} + +/* + * SLACPY copies all or part of a two-dimensional matrix A to another + * matrix B. + * + * Input + * ------- + * UPLO is CHARACTER*1 + * Specifies the part of the matrix A to be copied to B. + * = 'U': Upper triangular part + * = 'L': Lower triangular part + * Otherwise: All of the matrix A + * + * M is INTEGER + * The number of rows of the matrix A. + * M >= 0. + * + * N is INTEGER + * The number of columns of the matrix A. + * N >= 0. + * + * A is REAL array, dimension (LDA,N) + * The m by n matrix A. If UPLO = 'U', only the upper triangle + * or trapezoid is accessed; if UPLO = 'L', only the lower + * triangle or trapezoid is accessed. + * + * LDA is INTEGER + * The first dimension of the array A. LDA >= max(1,M). + * + * B is REAL array, dimension (LDB,N) + * On exit, B = A in the locations specified by UPLO. + * + * LDB is INTEGER + * The leading dimension of the array B. LDB >= max(1,M). + * + */ +template +cusolverStatus_t cusolverDnXlacpy( + cublasFillMode_t uplo, // "UPPER", B = upper(A) + // "LOWER", B = lower(A) + // otherwise, B = A + int m, + int n, + const T_ELEM_SRC* A, + int lda, + T_ELEM_DST* B, + int ldb, + cudaStream_t stream) +{ + cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS; + cudaError_t cudaStat1 = cudaSuccess; + + int up = 0; + int up_and_lo = 0; + + // Quick return if possible + if ((0 >= m) || (0 >= n)) + { + return status; + } + + /* + * up up_and_lo + * 1 0 upper triangle, including diagonal + * 0 0 lower triangle, including diagonal + * ? 
1 whole matrix + */ + if (CUBLAS_FILL_MODE_LOWER == uplo) + { + // Lower triangular part + up = 0; + } + else if (CUBLAS_FILL_MODE_UPPER == uplo) + { + // upper triangular part + up = 1; + } + else + { + up_and_lo = 1; // Otherwise: All of the matrix A + } + + const int VEC_LOG = 5; + const int BY_LOG = 3; + const int VEC = (1 << VEC_LOG); + const int BY = (1 << BY_LOG); + dim3 grids((m + VEC - 1) / VEC, (n + BY - 1) / BY); + dim3 threads(VEC, BY); + + lacpy_kernel + <<>>(m, n, A, (size_t) lda, B, (size_t) ldb, up, up_and_lo); + + cudaStat1 = cudaGetLastError(); /* launch error */ + if (cudaSuccess != cudaStat1) + { + fprintf(stderr, "Error (lacpy): %d\n", cudaStat1); + status = CUSOLVER_STATUS_EXECUTION_FAILED; + } + + return status; +} + +cusolverStatus_t cusolverDnDlacpy( + cublasFillMode_t uplo, // "UPPER", B = upper(A) + // "LOWER", B = lower(A) + // otherwise, B = A + int m, + int n, + const double* A, + int lda, + double* B, + int ldb, + cudaStream_t stream) +{ + return cusolverDnXlacpy(uplo, m, n, A, lda, B, ldb, stream); +} + +// Pretend there is a CUBLAS interface for DLAAUM +void cublasDnDlaaum_bufferSize(cublasHandle_t /*unused*/, int m, int n, size_t* Workspace_size) +{ + assert(Workspace_size); + *Workspace_size = m * n * sizeof(double); +} + +// Pretend there is a CUBLAS interface for DLAAUM +// A triangular +// Lower : A = A^T * A +// Upper : A = A A^T +void cublasDnDlaaum( + cublasHandle_t cublas_handle, + cublasFillMode_t uplo, + int m, + int n, + double* A, + int ldA, + double* Workspace_d, + size_t Workspace_size) +{ + cudaStream_t stream; + cuda_safe_call(cublasGetStream(cublas_handle, &stream)); + + // "Hand coded" + // We use a full copy of A ! + // fprintf(stderr, "GOT Workspace_size %ld ... expected %d\n", Workspace_size, m * n * sizeof(double)); + std::ignore = Workspace_size; + assert(Workspace_size >= m * n * sizeof(double)); + + double* B = Workspace_d; + int ldB = m; + + // Blank the buffer + cuda_safe_call(cudaMemsetAsync(B, 0, m * n * sizeof(double), stream)); + + // Copy A (with upper or lower 0 untouched) + cusolverDnDlacpy(uplo, m, n, A, ldA, B, ldB, stream); + + cublasDiagType_t diag = CUBLAS_DIAG_NON_UNIT; + const double one = 1.0; + + auto side = (uplo == CUBLAS_FILL_MODE_LOWER) ? 
CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; + + // LOWER: TRMM(A,B) : B = op(A) * B = A^T * B with A triangular (B = C in CUBLAS), CUBLAS_OP_T, CUBLAS_SIDE_RIGHT + // UPPER: TRMM(A,B) : B = B * op(A) = B A^T with A triangular (B = C in CUBLAS), CUBLAS_OP_T, CUBLAS_SIDE_RIGHT + cuda_safe_call(cublasDtrmm(cublas_handle, side, uplo, CUBLAS_OP_T, diag, m, n, &one, A, ldA, B, ldB, B, ldB)); + + // Copy B=AA^T back into A (with upper or lower 0 untouched) + cusolverDnDlacpy(uplo, m, n, B, ldB, A, ldA, stream); +} + +void DLAAUM(cublasFillMode_t uplo, matrix& A, int A_row, int A_col) +{ + int NB = A.mb; + size_t Lwork; + cublasDnDlaaum_bufferSize(get_cublas_handle(), NB, NB, &Lwork); + + auto d_buffer = ctx.logical_data(shape_of>(Lwork)); + + auto t = ctx.task(A.get_handle(A_row, A_col).rw(), d_buffer.write()); + t.set_symbol("DLAAUM"); + t->*[&](auto s, auto sA, auto buffer) { + auto& h = get_cublas_handle(); + cuda_try(cublasSetStream(h, s)); + + cublasDnDlaaum( + h, uplo, sA.extent(0), sA.extent(1), sA.data_handle(), sA.stride(1), (double*) buffer.data_handle(), Lwork); + }; +} + +void DGEMM( + cublasOperation_t transa, + cublasOperation_t transb, + double alpha, + matrix& A, + int A_row, + int A_col, + matrix& B, + int B_row, + int B_col, + double beta, + matrix& C, + int C_row, + int C_col) +{ + auto ignored = get_cublas_handle(); + auto t = + ctx.task(A.get_handle(A_row, A_col).read(), B.get_handle(B_row, B_col).read(), C.get_handle(C_row, C_col).rw()); + t.set_symbol("DGEMM"); + t->*[&](auto s, auto sA, auto sB, auto sC) { + auto& h = get_cublas_handle(); + cuda_try(cublasSetStream(h, s)); + + int k = (transa == CUBLAS_OP_N) ? sA.extent(1) : sA.extent(0); + cuda_try(cublasDgemm( + h, + transa, + transb, + sC.extent(0), + sC.extent(1), + k, + &alpha, + sA.data_handle(), + sA.stride(1), + sB.data_handle(), + sB.stride(1), + &beta, + sC.data_handle(), + sC.stride(1))); + }; +} + +void DSYMM( + cublasSideMode_t side, + cublasFillMode_t uplo, + double alpha, + matrix& A, + int A_row, + int A_col, + matrix& B, + int B_row, + int B_col, + double beta, + matrix& C, + int C_row, + int C_col) +{ + auto ignored = get_cublas_handle(); + auto t = + ctx.task(A.get_handle(A_row, A_col).read(), B.get_handle(B_row, B_col).read(), C.get_handle(C_row, C_col).rw()); + t.set_symbol("DSYMM"); + t->*[&](auto s, auto sA, auto sB, auto sC) { + auto& h = get_cublas_handle(); + cuda_try(cublasSetStream(h, s)); + + cuda_try(cublasDsymm( + h, + side, + uplo, + sC.extent(0), + sC.extent(1), + &alpha, + sA.data_handle(), + sA.stride(1), + sB.data_handle(), + sB.stride(1), + &beta, + sC.data_handle(), + sC.stride(1))); + }; +} + +void DSYRK( + cublasFillMode_t uplo, + cublasOperation_t trans, + double alpha, + matrix& A, + int A_row, + int A_col, + double beta, + matrix& C, + int C_row, + int C_col) +{ + auto ignored = get_cublas_handle(); + auto t = ctx.task(A.get_handle(A_row, A_col).read(), C.get_handle(C_row, C_col).rw()); + t.set_symbol("DSYRK"); + t->*[&](auto s, auto sA, auto sC) { + auto& h = get_cublas_handle(); + cuda_try(cublasSetStream(h, s)); + + // number of rows of matrix op(A) and C + int n = sC.extent(0); + + // number of columns of matrix op(A) + int k = (trans == CUBLAS_OP_N) ? 
sA.extent(1) : sA.extent(0); + + cuda_try( + cublasDsyrk(h, uplo, trans, n, k, &alpha, sA.data_handle(), sA.stride(1), &beta, sC.data_handle(), sC.stride(1))); + }; +} + +void DTRSM( + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t transa, + cublasDiagType_t diag, + double alpha, + matrix& A, + int A_row, + int A_col, + matrix& B, + int B_row, + int B_col) +{ + auto ignored = get_cublas_handle(); + auto t = ctx.task(A.get_handle(A_row, A_col).read(), B.get_handle(B_row, B_col).rw()); + t.set_symbol("DTRSM"); + t->*[&](auto s, auto sA, auto sB) { + auto& h = get_cublas_handle(); + cuda_try(cublasSetStream(h, s)); + + cuda_try(cublasDtrsm( + h, + side, + uplo, + transa, + diag, + sB.extent(0), + sB.extent(1), + &alpha, + sA.data_handle(), + sA.stride(1), + sB.data_handle(), + sB.stride(1))); + }; +} + +void DTRMM( + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t transa, + cublasDiagType_t diag, + double alpha, + matrix& A, + int A_row, + int A_col, + matrix& B, + int B_row, + int B_col) +{ + auto ignored = get_cublas_handle(); + auto t = ctx.task(A.get_handle(A_row, A_col).read(), B.get_handle(B_row, B_col).rw()); + t.set_symbol("DTRMM"); + t->*[&](auto s, auto sA, auto sB) { + auto& h = get_cublas_handle(); + cuda_try(cublasSetStream(h, s)); + + // Note : CUBLAS DTRMM implementation is out of place but supports in place by using the same buffer B and C + cuda_try(cublasDtrmm( + get_cublas_handle(), + side, + uplo, + transa, + diag, + sB.extent(0), + sB.extent(1), + &alpha, + sA.data_handle(), + sA.stride(1), + sB.data_handle(), + sB.stride(1), + sB.data_handle(), + sB.stride(1) /* same as B*/)); + }; +} + +void PDNRM2_HOST(matrix* A, double* result) +{ +#ifdef HAVE_DOT + ctx.get_dot()->set_current_color("red"); +#endif + + for (int rowb = 0; rowb < A->mt; rowb++) + { + for (int colb = 0; colb < A->nt; colb++) + { + ctx.host_launch(A->get_handle(rowb, colb).read())->*[=](auto sA) { + double res2 = 0.0; + for (size_t col = 0; col < sA.extent(1); col++) + { + for (size_t row = 0; row < sA.extent(0); row++) + { + double v = sA(row, col); + res2 += v * v; + } + } + *result += res2; + }; + } + } +} + +void PDPOTRF(matrix& A) +{ +#ifdef HAVE_DOT + ctx.get_dot()->set_current_color("yellow"); +#endif + + assert(A.m == A.n); + assert(A.mt == A.nt); + + int NBLOCKS = A.mt; + assert(A.mb == A.nb); + + nvtxRangePushA("SUBMIT_PDPOTRF"); + for (int K = 0; K < NBLOCKS; K++) + { + cuda_try(cudaSetDevice(A.get_preferred_devid(K, K))); + DPOTRF(CUBLAS_FILL_MODE_LOWER, A, K, K); + + for (int row = K + 1; row < NBLOCKS; row++) + { + cuda_try(cudaSetDevice(A.get_preferred_devid(row, K))); + DTRSM(CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, 1.0, A, K, K, A, row, K); + + for (int col = K + 1; col < row; col++) + { + cuda_try(cudaSetDevice(A.get_preferred_devid(row, col))); + DGEMM(CUBLAS_OP_N, CUBLAS_OP_T, -1.0, A, row, K, A, col, K, 1.0, A, row, col); + } + + cuda_try(cudaSetDevice(A.get_preferred_devid(row, row))); + DSYRK(CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, -1.0, A, row, K, 1.0, A, row, row); + } + } + + nvtxRangePop(); +} + +// Algorithm from PLASMA +void PDTRSM(cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + double alpha, + matrix& A, + matrix& B) +{ + // std::cout << "[PDTRSM] START B MT " << B.mt << " NT " << B.nt << std::endl; + + nvtxRangePushA("SUBMIT_PDTRSM"); + + if (side == CUBLAS_SIDE_LEFT) + { + if (uplo == CUBLAS_FILL_MODE_UPPER) + { + // TODO + abort(); + } + else + { 
+ //=========================================== + // CUBLAS_SIDE_LEFT / CUBLAS_FILL_MODE_LOWER / CUBLAS_OP_N + //=========================================== + if (trans == CUBLAS_OP_N) + { + for (int k = 0; k < B.mt; k++) + { + double lalpha = k == 0 ? alpha : 1.0; + for (int n = 0; n < B.nt; n++) + { + cuda_try(cudaSetDevice(A.get_preferred_devid(k, k))); + DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n); + } + for (int m = k + 1; m < B.mt; m++) + { + for (int n = 0; n < B.nt; n++) + { + cuda_try(cudaSetDevice(A.get_preferred_devid(m, k))); + DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n); + } + } + } + } + //================================================ + // CUBLAS_SIDE_LEFT / CUBLAS_FILL_MODE_LOWER / CUBLAS_OP_[C|T] + //================================================ + else + { + for (int k = 0; k < B.mt; k++) + { + double lalpha = k == 0 ? alpha : 1.0; + for (int n = 0; n < B.nt; n++) + { + cuda_try(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1))); + DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n); + } + for (int m = k + 1; m < B.mt; m++) + { + for (int n = 0; n < B.nt; n++) + { + cuda_try(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m))); + DGEMM( + trans, CUBLAS_OP_N, -1.0, A, B.mt - k - 1, B.mt - 1 - m, B, B.mt - k - 1, n, lalpha, B, B.mt - 1 - m, n); + } + } + } + } + } + } + else + { + // TODO + abort(); + } + // std::cout << "[PDTRSM] END" << std::endl; + + nvtxRangePop(); +} + +void PDPOTRS(matrix& A, matrix& B, cublasFillMode_t uplo) +{ + nvtxRangePushA("SUBMIT_PDPOTRS"); + +#ifdef HAVE_DOT + ctx.get_dot()->set_current_color("green"); +#endif + + // std::cout << "[PDPOTRS] START" << std::endl; + // Call the parallel functions. + PDTRSM( + CUBLAS_SIDE_LEFT, uplo, uplo == CUBLAS_FILL_MODE_UPPER ? CUBLAS_OP_T : CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, 1.0, A, B); + +#ifdef HAVE_DOT + ctx.get_dot()->set_current_color("darkgreen"); +#endif + + PDTRSM( + CUBLAS_SIDE_LEFT, uplo, uplo == CUBLAS_FILL_MODE_UPPER ? CUBLAS_OP_N : CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, 1.0, A, B); + // std::cout << "[PDPOTRS] END" << std::endl; + + nvtxRangePop(); +} + +/***************************************************************************/ /** + * Parallel tile matrix-matrix + *multiplication. + * @see plasma_omp_dgemm + ******************************************************************************/ +void PDGEMM(cublasOperation_t transa, + cublasOperation_t transb, + double alpha, + matrix& A, + matrix& B, + double beta, + matrix& C) +{ +#ifdef HAVE_DOT + reserved::dot::set_current_color("blue"); +#endif + + for (int m = 0; m < C.mt; m++) + { + for (int n = 0; n < C.nt; n++) + { + cuda_try(cudaSetDevice(C.get_preferred_devid(m, n))); + + //========================================= + // alpha*A*B does not contribute; scale C + //========================================= + int inner_k = transa == CUBLAS_OP_N ? A.n : A.m; + if (alpha == 0.0 || inner_k == 0) + { + DGEMM(transa, transb, alpha, A, 0, 0, B, 0, 0, beta, C, m, n); + } + else if (transa == CUBLAS_OP_N) + { + //================================ + // CUBLAS_OP_N / CUBLAS_OP_N + //================================ + if (transb == CUBLAS_OP_N) + { + assert(A.nt == B.mt); + for (int k = 0; k < A.nt; k++) + { + double zbeta = k == 0 ? 
beta : 1.0; + DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); + } + } + //===================================== + // CUBLAS_OP_N / CUBLAS_OP_T + //===================================== + else + { + for (int k = 0; k < A.nt; k++) + { + double zbeta = k == 0 ? beta : 1.0; + DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); + } + } + } + else + { + //===================================== + // CUBLAS_OP_T / CUBLAS_OP_N + //===================================== + if (transb == CUBLAS_OP_N) + { + for (int k = 0; k < A.mt; k++) + { + double zbeta = k == 0 ? beta : 1.0; + DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); + } + } + //========================================== + // CUBLAS_OP_T / CUBLAS_OP_T + //========================================== + else + { + for (int k = 0; k < A.mt; k++) + { + double zbeta = k == 0 ? beta : 1.0; + DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); + } + } + } + } + } +} + +/* + * Algorithm taken from the PLASMA library + */ +// We assume a lower triangular matrix (uplo == CUBLAS_FILL_MODE_LOWER) +void PDTRTRI(matrix& A, cublasFillMode_t uplo, cublasDiagType_t diag) +{ + assert(uplo == CUBLAS_FILL_MODE_LOWER); + + nvtxRangePushA("SUBMIT_PDTRTRI"); + + for (int k = 0; k < A.nt; k++) + { + for (int m = k + 1; m < A.mt; m++) + { + cuda_try(cudaSetDevice(A.get_preferred_devid(m, k))); + DTRSM(CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, diag, -1.0, A, k, k, A, m, k); + } + for (int m = k + 1; m < A.mt; m++) + { + for (int n = 0; n < k; n++) + { + cuda_try(cudaSetDevice(A.get_preferred_devid(m, n))); + DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, 1.0, A, m, k, A, k, n, 1.0, A, m, n); + } + } + for (int n = 0; n < k; n++) + { + cuda_try(cudaSetDevice(A.get_preferred_devid(k, n))); + DTRSM(CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, diag, 1.0, A, k, k, A, k, n); + } + + // DTRTRI(...) + cuda_try(cudaSetDevice(A.get_preferred_devid(k, k))); + DTRTRI(uplo, diag, A, k, k); + } + + nvtxRangePop(); +} + +/* + * Algorithm taken from the PLASMA library + */ +// We assume a lower triangular matrix (uplo == CUBLAS_FILL_MODE_LOWER) +void PDLAUUM(matrix& A, cublasFillMode_t uplo) +{ + assert(uplo == CUBLAS_FILL_MODE_LOWER); + + nvtxRangePushA("SUBMIT_PDLAUUM"); + + for (int k = 0; k < A.mt; k++) + { + for (int n = 0; n < k; n++) + { + cuda_try(cudaSetDevice(A.get_preferred_devid(n, n))); + DSYRK(CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, 1.0, A, k, n, 1.0, A, n, n); + + for (int m = n + 1; m < k; m++) + { + cuda_try(cudaSetDevice(A.get_preferred_devid(m, n))); + DGEMM(CUBLAS_OP_T, CUBLAS_OP_N, 1.0, A, k, m, A, k, n, 1.0, A, m, n); + } + } + for (int n = 0; n < k; n++) + { + cuda_try(cudaSetDevice(A.get_preferred_devid(k, n))); + DTRMM(CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, 1.0, A, k, k, A, k, n); + } + + // LAAUM (Akk RW) (compute Akk^T * Akk) + cuda_try(cudaSetDevice(A.get_preferred_devid(k, k))); + DLAAUM(uplo, A, k, k); + } + + nvtxRangePop(); +} + +void PDSYMM(cublasSideMode_t side, + cublasFillMode_t uplo, + double alpha, + matrix& A, + matrix& B, + double beta, + matrix& C) +{ + int k, m, n; + double zbeta; + double zone = (double) 1.0; + + for (m = 0; m < C.mt; m++) + { + for (n = 0; n < C.nt; n++) + { + cuda_try(cudaSetDevice(C.get_preferred_devid(m, n))); + /* + * CUBLAS_SIDE_LEFT / CUBLAS_FILL_MODE_LOWER + */ + if (side == CUBLAS_SIDE_LEFT) + { + if (uplo == CUBLAS_FILL_MODE_LOWER) + { + for (k = 0; k < C.mt; k++) + { + zbeta = k == 0 ? 
beta : zone; + if (k < m) + { + DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, alpha, A, m, k, B, k, n, zbeta, C, m, n); + } + else + { + if (k == m) + { + DSYMM(side, uplo, alpha, A, k, k, B, k, n, zbeta, C, m, n); + } + else + { + DGEMM(CUBLAS_OP_T, CUBLAS_OP_N, alpha, A, k, m, B, k, n, zbeta, C, m, n); + } + } + } + } + /* + * CUBLAS_SIDE_LEFT / CUBLAS_FILL_MODE_UPPER + */ + else + { + for (k = 0; k < C.mt; k++) + { + zbeta = k == 0 ? beta : zone; + if (k < m) + { + DGEMM(CUBLAS_OP_T, CUBLAS_OP_N, alpha, A, k, m, B, k, n, zbeta, C, m, n); + } + else + { + if (k == m) + { + DSYMM(side, uplo, alpha, A, k, k, B, k, n, zbeta, C, m, n); + } + else + { + DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, alpha, A, m, k, B, k, n, zbeta, C, m, n); + } + } + } + } + } + /* + * CUBLAS_SIDE_RIGHT / CUBLAS_FILL_MODE_LOWER + */ + else + { + if (uplo == CUBLAS_FILL_MODE_LOWER) + { + for (k = 0; k < C.nt; k++) + { + zbeta = k == 0 ? beta : zone; + if (k < n) + { + DGEMM(CUBLAS_OP_N, CUBLAS_OP_T, alpha, B, m, k, A, n, k, zbeta, C, m, n); + } + else + { + if (k == n) + { + DSYMM(side, uplo, alpha, A, k, k, B, m, k, zbeta, C, m, n); + } + else + { + DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, alpha, B, m, k, A, k, n, zbeta, C, m, n); + } + } + } + } + /* + * CUBLAS_SIDE_RIGHT / CUBLAS_FILL_MODE_UPPER + */ + else + { + for (k = 0; k < C.nt; k++) + { + zbeta = k == 0 ? beta : zone; + if (k < n) + { + DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, alpha, B, m, k, A, k, n, zbeta, C, m, n); + } + else + { + if (k == n) + { + DSYMM(side, uplo, alpha, A, k, k, B, m, k, zbeta, C, m, n); + } + else + { + DGEMM(CUBLAS_OP_N, CUBLAS_OP_T, alpha, B, m, k, A, n, k, zbeta, C, m, n); + } + } + } + } + } + } + } +} + +void PDTRMM(cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + double alpha, + matrix& A, + matrix& B) +{ + if (side == CUBLAS_SIDE_LEFT) + { + if (uplo == CUBLAS_FILL_MODE_UPPER) + { + //=========================================== + // CUBLAS_SIDE_LEFT / CUBLAS_FILL_MODE_UPPER / CUBLAS_OP_N + //=========================================== + if (trans == CUBLAS_OP_N) + { + for (int m = 0; m < B.mt; m++) + { + for (int n = 0; n < B.nt; n++) + { + cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); + + DTRMM(side, uplo, trans, diag, alpha, A, m, m, B, m, n); + + for (int k = m + 1; k < A.mt; k++) + { + DGEMM(trans, CUBLAS_OP_N, alpha, A, m, k, B, k, n, 1.0, B, m, n); + } + } + } + } + //================================================ + // CUBLAS_SIDE_LEFT / CUBLAS_FILL_MODE_UPPER / CUBLAS_OP_T + //================================================ + else + { + for (int m = B.mt - 1; m > -1; m--) + { + for (int n = 0; n < B.nt; n++) + { + cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); + + DTRMM(side, uplo, trans, diag, alpha, A, m, m, B, m, n); + + for (int k = 0; k < m; k++) + { + DGEMM(trans, CUBLAS_OP_N, alpha, A, k, m, B, k, n, 1.0, B, m, n); + } + } + } + } + } + else + { + //=========================================== + // CUBLAS_SIDE_LEFT / CUBLAS_FILL_MODE_LOWER / CUBLAS_OP_N + //=========================================== + if (trans == CUBLAS_OP_N) + { + for (int m = B.mt - 1; m > -1; m--) + { + for (int n = 0; n < B.nt; n++) + { + cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); + + DTRMM(side, uplo, trans, diag, alpha, A, m, m, B, m, n); + + for (int k = 0; k < m; k++) + { + DGEMM(trans, CUBLAS_OP_N, alpha, A, m, k, B, k, n, 1.0, B, m, n); + } + } + } + } + //================================================ + // CUBLAS_SIDE_LEFT / CUBLAS_FILL_MODE_LOWER / CUBLAS_OP_T + 
//================================================ + else + { + for (int m = 0; m < B.mt; m++) + { + for (int n = 0; n < B.nt; n++) + { + DTRMM(side, uplo, trans, diag, alpha, A, m, m, B, m, n); + + for (int k = m + 1; k < A.mt; k++) + { + DGEMM(trans, CUBLAS_OP_N, alpha, A, k, m, B, k, n, 1.0, B, m, n); + } + } + } + } + } + } + else + { + if (uplo == CUBLAS_FILL_MODE_UPPER) + { + //============================================ + // CUBLAS_SIDE_RIGHT / CUBLAS_FILL_MODE_UPPER / CUBLAS_OP_N + //============================================ + if (trans == CUBLAS_OP_N) + { + for (int n = B.nt - 1; n > -1; n--) + { + for (int m = 0; m < B.mt; m++) + { + cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); + + DTRMM(side, uplo, trans, diag, alpha, A, n, n, B, m, n); + + for (int k = 0; k < n; k++) + { + DGEMM(CUBLAS_OP_N, trans, alpha, B, m, k, A, k, n, 1.0, B, m, n); + } + } + } + } + //================================================= + // CUBLAS_SIDE_RIGHT / CUBLAS_FILL_MODE_UPPER / Plasma[_Conj]Trans + //================================================= + else + { + for (int n = 0; n < B.nt; n++) + { + for (int m = 0; m < B.mt; m++) + { + cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); + + DTRMM(side, uplo, trans, diag, alpha, A, n, n, B, m, n); + + for (int k = n + 1; k < A.mt; k++) + { + DGEMM(CUBLAS_OP_N, trans, alpha, B, m, k, A, n, k, 1.0, B, m, n); + } + } + } + } + } + else + { + //============================================ + // CUBLAS_SIDE_RIGHT / CUBLAS_FILL_MODE_LOWER / CUBLAS_OP_N + //============================================ + if (trans == CUBLAS_OP_N) + { + for (int n = 0; n < B.nt; n++) + { + for (int m = 0; m < B.mt; m++) + { + cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); + + DTRMM(side, uplo, trans, diag, alpha, A, n, n, B, m, n); + + for (int k = n + 1; k < A.mt; k++) + { + DGEMM(CUBLAS_OP_N, trans, alpha, B, m, k, A, k, n, 1.0, B, m, n); + } + } + } + } + //================================================= + // CUBLAS_SIDE_RIGHT / CUBLAS_FILL_MODE_LOWER / Plasma[_Conj]Trans + //================================================= + else + { + for (int n = B.nt - 1; n > -1; n--) + { + for (int m = 0; m < B.mt; m++) + { + cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); + + DTRMM(side, uplo, trans, diag, alpha, A, n, n, B, m, n); + + for (int k = 0; k < n; k++) + { + DGEMM(CUBLAS_OP_N, trans, alpha, B, m, k, A, n, k, 1.0, B, m, n); + } + } + } + } + } + } +} + +// Taken from Chameleon (INRIA) +// All the formula are reported in the LAPACK Lawn 41: +// http://www.netlib.org/lapack/lawns/lawn41.ps +#define FMULS_POTRI(__n) ((double) (__n) * ((2. / 3.) + (double) (__n) * ((1. / 3.) * (double) (__n) + 1.))) +#define FADDS_POTRI(__n) ((double) (__n) * ((1. / 6.) + (double) (__n) * ((1. / 3.) 
* (double) (__n) - 0.5))) +double flops_dpotri(double __n) +{ + double flops = (FMULS_POTRI((__n)) + FADDS_POTRI((__n))); + return flops; +} + +void run(int N, int NB) +{ + // Use pools of preallocated blocks + auto fixed_alloc = block_allocator(ctx, NB * NB * sizeof(double)); + ctx.set_allocator(fixed_alloc); + + // Set up CUBLAS and CUSOLVER + int ndevs; + cuda_try(cudaGetDeviceCount(&ndevs)); + + for (size_t d = 0; d < ndevs; d++) + { + auto ldummy = ctx.logical_data(shape_of>(1)); + ctx.task(exec_place::device(d), ldummy.write())->*[](cudaStream_t, auto) { + get_cublas_handle(); + get_cusolver_handle(); + }; + + ctx.task(exec_place::host, ldummy.write(data_place::managed))->*[](cudaStream_t, auto) {}; + } + + cuda_try(cudaSetDevice(0)); + + cudaStream_t timing_stream; + cuda_try(cudaStreamCreate(&timing_stream)); + + matrix A(N, N, NB, NB, true, "A"); + matrix Aref(N, N, NB, NB, false, "Aref"); + + // (Hilbert matrix + 2*N*Id) to have a diagonal dominant matrix + auto hilbert = [=] _CCCL_HOST_DEVICE(size_t row, size_t col) { + return 1.0 / (col + row + 1.0) + 2.0 * N * (col == row); + }; + + Aref.fill(hilbert); + A.fill(hilbert); + + /* Right-hand side */ + matrix B_potrs(N, 1, NB, 1, false, "B"); + matrix Bref_potrs(N, 1, NB, 1, false, "Bref"); + + auto rhs_vals = [] _CCCL_HOST_DEVICE(size_t row, size_t /*unused*/) { + return 1.0 * (row + 1); + }; + + B_potrs.fill(rhs_vals); + Bref_potrs.fill(rhs_vals); + + int check_result = 1; + if (getenv("CHECK_RESULT")) + { + check_result = atoi(getenv("CHECK_RESULT")); + } + + int check_result_potrs = check_result; + if (getenv("CHECK_RESULT_POTRS")) + { + check_result_potrs = atoi(getenv("CHECK_RESULT_POTRS")); + } + + // // Compute ||Bref|| + double Bref_nrm2 = 0.0; + double res_nrm2 = 0.0; + + if (check_result_potrs) + { + PDNRM2_HOST(&Bref_potrs, &Bref_nrm2); + } + + cudaEvent_t startEvent, stopEvent; + + cuda_safe_call(cudaSetDevice(0)); + cuda_safe_call(cudaStreamSynchronize(ctx.task_fence())); + cuda_safe_call(cudaEventCreate(&startEvent)); + cuda_safe_call(cudaEventCreate(&stopEvent)); + cuda_safe_call(cudaEventRecord(startEvent, ctx.task_fence())); + + ctx.get_dot()->set_current_color("green"); + PDPOTRF(A); + ctx.get_dot()->set_current_color("white"); + + /* + * POTRS + */ + + if (check_result_potrs) + { + // Solve AX = B and put the result in B + PDPOTRS(A, B_potrs, CUBLAS_FILL_MODE_LOWER); + + // Compute (AX - B) + // Bref = (Aref*B - Bref) + PDGEMM(CUBLAS_OP_N, CUBLAS_OP_N, 1.0, Aref, B_potrs, -1.0, Bref_potrs); + + // Compute ||AX - B|| = ||Bref|| + PDNRM2_HOST(&Bref_potrs, &res_nrm2); + } + + /* + * POTRI + */ + /* PDPOTRI = PDTRTRI + PDLAUUM */ + + // PDTRTRI : La^-1 (invert A) + // fprintf(stderr, "A=La before POTRI\n"); + // A.print(); + + ctx.get_dot()->set_current_color("yellow"); + PDTRTRI(A, CUBLAS_FILL_MODE_LOWER, CUBLAS_DIAG_NON_UNIT); + ctx.get_dot()->set_current_color("white"); + + // fprintf(stderr, "A=La^-1 after POTRI\n"); + // A.print(); + + // Computes the lower part of A^tA (La^-t La^-1) + ctx.get_dot()->set_current_color("blue"); + PDLAUUM(A, CUBLAS_FILL_MODE_LOWER); + ctx.get_dot()->set_current_color("white"); + + double b_nrm2_potri = 0.0; + double res_nrm2_potri = 0.0; + + if (check_result) + { + /* Right-hand side */ + matrix B_potri(N, 1, NB, 1, false, "B_potri"); + matrix Bref_potri(N, 1, NB, 1, false, "Bref_potri"); + + // auto rhs_vals = [](matrix& mat, int row, int col) { return 1.0 * (row + 1); }; + B_potri.fill(rhs_vals); + Bref_potri.fill(rhs_vals); + + // AX = B, X = A^-1 B + // LLt X = B, X = 
(LLt)^-1 B = L^-t L^-1 B + // Compute Bref_potri = (A^-1 B - B) + PDNRM2_HOST(&Bref_potri, &b_nrm2_potri); + + // B = (A^-1)*B (A triangular lower, B_potri full) + // fprintf(stderr, "B_potri before PDTRMM\n"); + // B_potri.print(); + // + // fprintf(stderr, "A before PDTRMM\n"); + // A.print(); + + // B_tmp = 0 (to avoid NaN*0.0) + matrix B_tmp(N, 1, NB, 1, false, "B_tmp"); + auto zero_vals = [] _CCCL_HOST_DEVICE(size_t /* unused */, size_t /*unused*/) { + return 0.0; + }; + B_tmp.fill(zero_vals); + + // B_tmp = A * B_potri + 0*B_tmp + PDSYMM(CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, 1.0, A, B_potri, 0.0, B_tmp); + + // fprintf(stderr, "B_potri after PDTRMM\n"); + // B_potri.print(); + + // res = A X - B + PDGEMM(CUBLAS_OP_N, CUBLAS_OP_N, 1.0, Aref, B_tmp, -1.0, Bref_potri); + + // fprintf(stderr, "Bref_potri after PDGEMM\n"); + // Bref_potri.print(); + + // Compute residual + PDNRM2_HOST(&Bref_potri, &res_nrm2_potri); + } + + cuda_safe_call(cudaSetDevice(0)); + cuda_safe_call(cudaEventRecord(stopEvent, ctx.task_fence())); + + ctx.finalize(); + + if (check_result_potrs) + { + double residual = sqrt(res_nrm2) / sqrt(Bref_nrm2); + // std::cout << "[POTRS] ||AX - B|| : " << sqrt(res_nrm2) << std::endl; + // std::cout << "[POTRS] ||B|| : " << sqrt(Bref_nrm2) << std::endl; + // std::cout << "[POTRS] RESIDUAL (||AX - B||/||B||) : " << residual << std::endl; + EXPECT(residual < 0.01); + } + + if (check_result) + { + double residual_potri = sqrt(res_nrm2_potri) / sqrt(b_nrm2_potri); + // std::cout << "[POTRI] RESIDUAL ||A * ((A^-1)B) - B|| : " << sqrt(res_nrm2_potri) << std::endl; + // std::cout << "[POTRI] RESIDUAL ||B|| : " << sqrt(b_nrm2_potri) << std::endl; + // std::cout << "[POTRI] RESIDUAL (||A * ((A^-1)B) - B||/||B||) : " << residual_potri << std::endl; + EXPECT(residual_potri < 0.0001); + } + + // // Compute Aref * A^-1 in Aref (A^-1 is lower triangular) + // PDTRMM(CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, 1.0, A, Aref); + + // // This should be almost identity + // Aref.print(); + +#if 0 + + std::cout << "Print A^-1 after PDLAUUM : " << std::endl; + A.print(); + + std::cout << "RES after AX - B POTRI : " << std::endl; + Bref_potri.print(); + + // This should be almost identity + Aref.print(); +#endif + + float milliseconds; + cuda_safe_call(cudaEventElapsedTime(&milliseconds, startEvent, stopEvent)); + + double gflops = flops_dpotri((double) N) / (1000000000.0); + std::cout + << "[PDPOTRI] ELAPSED: " << milliseconds << " ms, GFLOPS: " << gflops / (milliseconds / 1000.0) << std::endl; +} + +int main(int argc, char** argv) +{ + int N = 1024; + int NB = 128; + + if (argc > 1) + { + N = atoi(argv[1]); + } + + if (argc > 2) + { + NB = atoi(argv[2]); + } + + assert(N % NB == 0); + + run(N, NB); +} diff --git a/cudax/examples/stf/linear_algebra/cg_csr.cu b/cudax/examples/stf/linear_algebra/cg_csr.cu new file mode 100644 index 00000000000..6d6acaa8245 --- /dev/null +++ b/cudax/examples/stf/linear_algebra/cg_csr.cu @@ -0,0 +1,357 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Sparse conjugate gradient algorithm + */ + +#include + +using namespace cuda::experimental::stf; + +stream_ctx ctx; + +class csr_matrix +{ +public: + csr_matrix(size_t num_rows, size_t num_nonzeros, double* values, size_t* row_offsets, size_t* column_indices) + { + val_handle = ctx.logical_data(make_slice(values, num_nonzeros)); + col_handle = ctx.logical_data(make_slice(column_indices, num_nonzeros)); + row_handle = ctx.logical_data(make_slice(row_offsets, num_rows + 1)); + } + + /* Description of the CSR */ + logical_data> val_handle; + logical_data> row_handle; + logical_data> col_handle; +}; + +class vector +{ +public: + vector(size_t N) + { + handle = ctx.logical_data(shape_of>(N)); + } + + static void copy_vector(const vector& from, vector& to) + { + ctx.parallel_for(to.handle.shape(), to.handle.write(), from.handle.read()).set_symbol("copy_vector") + ->*[] _CCCL_DEVICE(size_t i, slice dto, slice dfrom) { + dto(i) = dfrom(i); + }; + } + + // Copy constructor + vector(const vector& a) + { + handle = ctx.logical_data(a.handle.shape()); + copy_vector(a, *this); + } + + // this -= rhs + vector& operator-=(const vector& rhs) + { + ctx.parallel_for(handle.shape(), handle.rw(), rhs.handle.read()).set_symbol("-= vector") + ->*[] _CCCL_DEVICE(size_t i, auto dthis, auto drhs) { + dthis(i) -= drhs(i); + }; + + return *this; + } + + size_t size() const + { + return handle.shape().size(); + } + + logical_data> handle; +}; + +class scalar +{ +public: + scalar() + { + handle = ctx.logical_data(shape_of>(1)); + } + + static void copy_scalar(const scalar& from, scalar& to) + { + ctx.parallel_for(to.handle.shape(), to.handle.write(), from.handle.read()) + ->*[] _CCCL_DEVICE(size_t i, slice dto, slice dfrom) { + dto(i) = dfrom(i); + }; + } + + // Copy constructor + scalar(const scalar& a) + { + handle = ctx.logical_data(shape_of>(1)); + copy_scalar(a, *this); + } + + scalar operator/(scalar const& rhs) const + { + // Submit a task that computes this/rhs + scalar res; + ctx.parallel_for(handle.shape(), handle.read(), rhs.handle.read(), res.handle.write()) + ->*[] _CCCL_DEVICE(size_t i, auto dthis, auto drhs, auto dres) { + dres(i) = dthis(i) / drhs(i); + }; + + return res; + } + + scalar operator-() const + { + // Submit a task that computes -s + scalar res; + ctx.parallel_for(handle.shape(), handle.read(), res.handle.write()) + ->*[] _CCCL_DEVICE(size_t i, auto dthis, auto dres) { + dres(i) = -dthis(i); + }; + + return res; + } + + /* This methods reads the content of the logical data in a synchronous manner, and returns its value. 
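+   * Because the task body synchronizes the CUDA stream before reading the
+   * value, get() blocks the submitting thread until the tasks that produced
+   * this scalar have completed.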
*/ + double get() const + { + double ret; + ctx.task(exec_place::host, handle.read())->*[&](cudaStream_t stream, auto dval) { + cuda_safe_call(cudaStreamSynchronize(stream)); + ret = dval(0); + }; + return ret; + } + + logical_data> handle; +}; + +scalar DOT(vector& a, vector& b) +{ + scalar res; + + /* We initialize the result with a trivial kernel so that we do not need a + * cooperative kernel launch for the reduction */ + ctx.parallel_for(box(1), res.handle.write())->*[] _CCCL_DEVICE(size_t, auto dres) { + dres(0) = 0.0; + }; + + auto spec = par<128>(con<32>()); + ctx.launch(spec, exec_place::current_device(), a.handle.read(), b.handle.read(), res.handle.rw()) + ->*[] _CCCL_DEVICE(auto th, auto da, auto db, auto dres) { + // Each thread computes the dot product of the elements assigned to it + double local_sum = 0.0; + for (auto i : th.apply_partition(shape(da), std::tuple())) + { + local_sum += da(i) * db(i); + } + + auto ti = th.inner(); + __shared__ double block_sum[th.static_width(1)]; + block_sum[ti.rank()] = local_sum; + + /* Reduce within blocks */ + for (size_t s = ti.size() / 2; s > 0; s /= 2) + { + ti.sync(); + if (ti.rank() < s) + { + block_sum[ti.rank()] += block_sum[ti.rank() + s]; + } + } + + /* Every first thread of a block writes its contribution */ + if (ti.rank() == 0) + { + atomicAdd(&dres(0), block_sum[0]); + } + }; + + return res; +}; + +// Y = Y + alpha * X +void AXPY(const scalar& alpha, vector& x, vector& y) +{ + ctx.parallel_for(x.handle.shape(), alpha.handle.read(), x.handle.read(), y.handle.rw()) + ->*[] _CCCL_DEVICE(size_t i, auto dalpha, auto dx, auto dy) { + dy(i) += dalpha(0) * dx(i); + }; +}; + +// Y = alpha*Y + X +void SCALE_AXPY(const scalar& alpha, const vector& x, vector& y) +{ + ctx.parallel_for(x.handle.shape(), alpha.handle.read(), x.handle.read(), y.handle.rw()) + ->*[] _CCCL_DEVICE(size_t i, auto dalpha, auto dx, auto dy) { + dy(i) = dalpha(0) * dy(i) + dx(i); + }; +}; + +void SPMV(csr_matrix& a, vector& x, vector& y) +{ + ctx.parallel_for( + y.handle.shape(), a.val_handle.read(), a.col_handle.read(), a.row_handle.read(), x.handle.read(), y.handle.write()) + ->*[] _CCCL_DEVICE(size_t row, auto da_val, auto da_col, auto da_row, auto dx, auto dy) { + int row_start = da_row(row); + int row_end = da_row(row + 1); + + double sum = 0.0; + for (int elt = row_start; elt < row_end; elt++) + { + sum += da_val(elt) * dx(da_col(elt)); + } + + dy(row) = sum; + }; +} + +void cg(csr_matrix& A, vector& X, vector& B) +{ + size_t N = B.size(); + vector R = B; + + // R = R - A*X + vector Ax(N); + SPMV(A, X, Ax); + R -= Ax; + + vector P = R; + + // RSOLD = R'*R + scalar rsold = DOT(R, R); + + const int MAXITER = N; + for (int k = 0; k < MAXITER; k++) + { + vector Ap(N); + + // Ap = A*P + SPMV(A, P, Ap); + + // alpha = rsold / (p' * Ap); + scalar alpha = rsold / DOT(P, Ap); + + // x = x + alpha * p; + AXPY(alpha, P, X); + + // r = r - alpha * Ap; + AXPY(-alpha, Ap, R); + + // rsnew = r' * r; + scalar rsnew = DOT(R, R); + + // Read the residual on the CPU, and halt the iterative process if we have converged + // (note that this will block the submission of tasks) + double err = sqrt(rsnew.get()); + if (err < 1e-10) + { + // We have converged + fprintf(stderr, "Successfully converged (err = %le)\n", err); + break; + } + + // p = r + (rsnew / rsold) * p; + SCALE_AXPY(rsnew / rsold, R, P); + + rsold = mv(rsnew); + } +} + +/* genTridiag: generate a random tridiagonal symmetric matrix + from : + 
https://github.com/NVIDIA/cuda-samples/blob/master/Samples/4_CUDA_Libraries/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu + */ +void genTridiag(size_t* I, size_t* J, double* val, size_t N, size_t nz) +{ + const double d = 2.0; + + I[0] = 0, J[0] = 0, J[1] = 1; + val[0] = drand48() + d; + val[1] = drand48(); + int start; + + for (size_t i = 1; i < N; i++) + { + if (i > 1) + { + I[i] = I[i - 1] + 3; + } + else + { + I[1] = 2; + } + + start = (i - 1) * 3 + 2; + J[start] = i - 1; + J[start + 1] = i; + + if (i < N - 1) + { + J[start + 2] = i + 1; + } + + val[start] = val[start - 1]; + val[start + 1] = drand48() + d; + + if (i < N - 1) + { + val[start + 2] = drand48(); + } + } + + I[N] = nz; +} + +int main(int argc, char** argv) +{ + size_t N = 10485760; + + if (argc > 1) + { + N = atoi(argv[1]); + fprintf(stderr, "N = %zu\n", N); + } + + size_t nz = (N - 2) * 3 + 4; + + size_t* row_offsets; + size_t* column_indices; + double* values; + cuda_safe_call(cudaHostAlloc(&row_offsets, (N + 1) * sizeof(size_t), cudaHostAllocMapped)); + cuda_safe_call(cudaHostAlloc(&column_indices, nz * sizeof(size_t), cudaHostAllocMapped)); + cuda_safe_call(cudaHostAlloc(&values, nz * sizeof(double), cudaHostAllocMapped)); + + // Generate a random matrix that is supposed to be invertible + genTridiag(row_offsets, column_indices, values, N, nz); + + csr_matrix A(N, nz, values, row_offsets, column_indices); + + vector X(N), B(N); + + // RHS + ctx.parallel_for(B.handle.shape(), B.handle.write())->*[] _CCCL_DEVICE(size_t i, auto dB) { + dB(i) = 1.0; + }; + + // Initial guess + ctx.parallel_for(X.handle.shape(), X.handle.write())->*[] _CCCL_DEVICE(size_t i, auto dX) { + dX(i) = 1.0; + }; + + cg(A, X, B); + + ctx.finalize(); +} diff --git a/cudax/examples/stf/linear_algebra/cg_dense_2D.cu b/cudax/examples/stf/linear_algebra/cg_dense_2D.cu new file mode 100644 index 00000000000..e98f737e524 --- /dev/null +++ b/cudax/examples/stf/linear_algebra/cg_dense_2D.cu @@ -0,0 +1,460 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Conjugate gradient for a tiled dense matrix + */ + +#include + +using namespace cuda::experimental::stf; + +static cublasHandle_t cublas_handle; + +stream_ctx ctx; + +class matrix +{ +public: + matrix(size_t N) + : N(N) + { + h_addr.reset(new double[N * N]); + cuda_safe_call(cudaHostRegister(h_addr.get(), N * N * sizeof(double), cudaHostRegisterPortable)); + handle = to_shared(ctx.logical_data(make_slice(h_addr.get(), std::tuple{N, N}, N))); + } + + void fill(const std::function& f) + { + ctx.task(exec_place::host, handle->write())->*[&f](cudaStream_t stream, auto ds) { + cuda_safe_call(cudaStreamSynchronize(stream)); + + for (size_t col = 0; col < ds.extent(1); col++) + { + for (size_t row = 0; row < ds.extent(0); row++) + { + ds(row, col) = f(row, col); + } + } + }; + } + + size_t N; + std::unique_ptr h_addr; + std::shared_ptr>> handle; +}; + +class vector +{ +public: + vector(size_t N, size_t _block_size, bool is_tmp = false) + : N(N) + , block_size(_block_size) + , nblocks((N + block_size - 1) / block_size) + { + handles.resize(nblocks); + + if (is_tmp) + { + // There is no physical backing for this temporary vector + for (int b = 0; b < nblocks; b++) + { + size_t bs = std::min(N - block_size * b, block_size); + handles[b] = to_shared(ctx.logical_data(shape_of>(bs))); + } + } + else + { + h_addr.reset(new double[N]); + cuda_safe_call(cudaHostRegister(h_addr.get(), N * sizeof(double), cudaHostRegisterPortable)); + for (size_t b = 0; b < nblocks; b++) + { + size_t bs = std::min(N - block_size * b, block_size); + handles[b] = to_shared(ctx.logical_data(make_slice(&h_addr[block_size * b], bs))); + } + } + } + + // Copy constructor + vector(const vector& a) + : N(a.N) + , block_size(a.block_size) + , nblocks(a.nblocks) + { + handles.resize(nblocks); + + for (int b = 0; b < nblocks; b++) + { + size_t bs = std::min(N - block_size * b, block_size); + handles[b] = to_shared(ctx.logical_data(shape_of>(bs))); + + ctx.task(handles[b]->write(), a.handles[b]->read())->*[bs](cudaStream_t stream, auto dthis, auto da) { + // There are likely much more efficient ways. 
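+        // (a dedicated copy kernel or cublasDcopy would also work), but a plain
+        // device-to-device memcpy keeps the example simple.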
+ cuda_safe_call(cudaMemcpyAsync( + dthis.data_handle(), da.data_handle(), bs * sizeof(double), cudaMemcpyDeviceToDevice, stream)); + }; + } + } + + void fill(const std::function& f) + { + size_t bs = block_size; + for (int b = 0; b < nblocks; b++) + { + ctx.task(exec_place::host, handles[b]->write())->*[&f, b, bs](cudaStream_t stream, auto ds) { + cuda_safe_call(cudaStreamSynchronize(stream)); + + for (int local_row = 0; local_row < ds.extent(0); local_row++) + { + ds(local_row) = f(local_row + b * bs); + } + }; + } + } + + size_t N; + size_t block_size; + size_t nblocks; + + mutable std::vector>>> handles; + std::unique_ptr h_addr; +}; + +__global__ void scalar_div(const double* a, const double* b, double* c) +{ + *c = *a / *b; +} + +// A += B +__global__ void scalar_add(double* a, const double* b) +{ + *a = *a + *b; +} + +__global__ void scalar_minus(const double* a, double* res) +{ + *res = -(*a); +} + +class scalar +{ +public: + scalar(bool is_tmp = false) + { + size_t s = sizeof(double); + + if (is_tmp) + { + // There is no physical backing for this temporary vector + handle = to_shared(ctx.logical_data(shape_of>(1))); + } + else + { + h_addr.reset(new double); + cuda_safe_call(cudaHostRegister(h_addr.get(), s, cudaHostRegisterPortable)); + handle = to_shared(ctx.logical_data(make_slice(h_addr.get(), 1))); + } + } + + scalar(scalar&&) = default; + scalar& operator=(scalar&&) = default; + + // Copy constructor + scalar(const scalar& a) + { + handle = to_shared(ctx.logical_data(shape_of>(1))); + + ctx.task(handle->write(), a.handle->read())->*[](cudaStream_t stream, auto dthis, auto da) { + // There are likely much more efficient ways. + cuda_safe_call( + cudaMemcpyAsync(dthis.data_handle(), da.data_handle(), sizeof(double), cudaMemcpyDeviceToDevice, stream)); + }; + } + + scalar operator/(scalar const& rhs) const + { + // Submit a task that computes this/rhs + scalar res(true); + ctx.task(handle->read(), rhs.handle->read(), res.handle->write()) + ->*[](cudaStream_t stream, auto da, auto db, auto dres) { + scalar_div<<<1, 1, 0, stream>>>(da.data_handle(), db.data_handle(), dres.data_handle()); + }; + + return res; + } + + // this += rhs + scalar& operator+=(const scalar& rhs) + { + ctx.task(handle->rw(), rhs.handle->read())->*[](cudaStream_t stream, auto dthis, auto drhs) { + scalar_add<<<1, 1, 0, stream>>>(dthis.data_handle(), drhs.data_handle()); + }; + + return *this; + } + + scalar operator-() const + { + // Submit a task that computes -s + scalar res(true); + ctx.task(handle->read(), res.handle->write())->*[](cudaStream_t stream, auto dthis, auto dres) { + scalar_minus<<<1, 1, 0, stream>>>(dthis.data_handle(), dres.data_handle()); + }; + + return res; + } + + // Get value on the host + double get_value() + { + double val; + ctx.task(exec_place::host, handle->read())->*[&val](cudaStream_t stream, auto ds) { + cuda_safe_call(cudaStreamSynchronize(stream)); + val = ds(0); + }; + + return val; + } + + mutable std::shared_ptr>> handle; + std::unique_ptr h_addr; +}; + +class scalar DOT(vector& a, class vector& b) +{ + assert(a.nblocks == b.nblocks); + scalar global_res(true); + + // Loop over all blocks, + for (int bid = 0; bid < a.nblocks; bid++) + { + scalar res(true); + + // Note that it works even if a.handle == b.handle because they have the same acces mode + ctx.task(a.handles[bid]->read(), b.handles[bid]->read(), res.handle->write()) + ->*[](cudaStream_t stream, auto da, auto db, auto dres) { + cuda_safe_call(cublasSetStream(cublas_handle, stream)); + 
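+            // cublasDdot writes its result directly into the device buffer of `res`,
+            // hence the device pointer mode below.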
cuda_safe_call(cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE)); + cuda_safe_call( + cublasDdot(cublas_handle, da.extent(0), da.data_handle(), 1, db.data_handle(), 1, dres.data_handle())); + }; + + if (bid == 0) + { + // First access requires an assigment because it was not initialized + global_res = std::move(res); + } + else + { + global_res += res; + } + } + + return global_res; +}; + +// Y = Y + alpha * X +void AXPY(const class scalar& alpha, class vector& x, class vector& y) +{ + assert(x.N == y.N); + assert(x.nblocks == y.nblocks); + + for (int b = 0; b < x.nblocks; b++) + { + ctx.task(alpha.handle->read(), x.handles[b]->read(), y.handles[b]->rw()) + ->* + [](cudaStream_t stream, auto dalpha, auto dx, auto dy) { + auto nx = dx.extent(0); + cuda_safe_call(cublasSetStream(cublas_handle, stream)); + cuda_safe_call(cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE)); + cuda_safe_call(cublasDaxpy(cublas_handle, nx, dalpha.data_handle(), dx.data_handle(), 1, dy.data_handle(), 1)); + }; + } +}; + +// Y = alpha*Y + X +void SCALE_AXPY(const scalar& alpha, const class vector& x, class vector& y) +{ + assert(x.N == y.N); + assert(x.nblocks == y.nblocks); + + for (int b = 0; b < x.nblocks; b++) + { + ctx.task(alpha.handle->read(), x.handles[b]->read(), y.handles[b]->rw()) + ->*[](cudaStream_t stream, auto dalpha, auto dx, auto dy) { + cuda_safe_call(cublasSetStream(cublas_handle, stream)); + + auto nx = dx.extent(0); + + // Y = alpha Y + cuda_safe_call(cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE)); + cuda_safe_call(cublasDscal(cublas_handle, nx, dalpha.data_handle(), dy.data_handle(), 1)); + + // Y = Y + X + const double one = 1.0; + cuda_safe_call(cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST)); + cuda_safe_call(cublasDaxpy(cublas_handle, nx, &one, dx.data_handle(), 1, dy.data_handle(), 1)); + }; + } +}; + +// y = alpha Ax + beta y +void GEMV(double alpha, class matrix& a, class vector& x, double beta, class vector& y) +{ + assert(a.N == x.N); + assert(x.N == y.N); + + size_t block_size = x.block_size; + assert(block_size == y.block_size); + + for (int row_y = 0; row_y < y.nblocks; row_y++) + { + for (int row_x = 0; row_x < x.nblocks; row_x++) + { + double local_beta = (row_x == 0) ? beta : 1.0; + + // If beta is null, then this is a write only mode + auto y_mode = local_beta == 0.0 ? 
access_mode::write : access_mode::rw; + + ctx.task(a.handle->read(), x.handles[row_x]->read(), task_dep>(*(y.handles[row_y].get()), y_mode)) + ->*[alpha, local_beta, row_x, row_y, block_size](cudaStream_t stream, auto da, auto dx, auto dy) { + auto nx = dx.extent(0); + auto ny = dy.extent(0); + auto ldA = da.stride(1); + const double* Ablock = &da(row_y * block_size, row_x * block_size); + + cuda_safe_call(cublasSetStream(cublas_handle, stream)); + cuda_safe_call(cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST)); + cuda_safe_call(cublasDgemv( + cublas_handle, + CUBLAS_OP_N, + ny, + nx, + &alpha, + Ablock, + ldA, + dx.data_handle(), + 1, + &local_beta, + dy.data_handle(), + 1)); + }; + } + } +} + +void cg(matrix& A, vector& X, vector& B) +{ + int N = A.N; + + assert(N == X.N); + assert(N == B.N); + + vector R = B; + + // R = R - A*X + GEMV(-1.0, A, X, 1.0, R); + + vector P = R; + + // RSOLD = R'*R + scalar rsold = DOT(R, R); + + int MAXITER = N; + + if (getenv("MAXITER")) + { + MAXITER = atoi(getenv("MAXITER")); + } + + for (int k = 0; k < MAXITER; k++) + { + vector Ap(N, P.block_size, true); + + // Ap = A*P + GEMV(1.0, A, P, 0.0, Ap); + + // alpha = rsold / (p' * Ap); + scalar alpha = rsold / DOT(P, Ap); + + // x = x + alpha * p; + AXPY(alpha, P, X); + + // r = r - alpha * Ap; + AXPY(-alpha, Ap, R); + + // rsnew = r' * r; + scalar rsnew = DOT(R, R); + + // Read the residual on the CPU, and halt the iterative process if we have converged + { + double err; + ctx.task(exec_place::host, rsnew.handle->read())->*[&err](cudaStream_t stream, auto dres) { + cuda_safe_call(cudaStreamSynchronize(stream)); + err = sqrt(dres(0)); + }; + + if (err < 1e-10) + { + // We have converged + // fprintf(stderr, "Successfully converged (err = %le)\n", err); + break; + } + } + + // p = r + (rsnew / rsold) * p; + SCALE_AXPY(rsnew / rsold, R, P); + + rsold = std::move(rsnew); + } +} + +int main(int argc, char** argv) +{ + size_t N = 1024; + + if (argc > 1) + { + N = atoi(argv[1]); + fprintf(stderr, "N = %zu\n", N); + } + + size_t block_size = N / 4; + + if (argc > 2) + { + block_size = atoi(argv[2]); + fprintf(stderr, "block_size = %zu\n", block_size); + } + + // Do this lazily ? + cuda_safe_call(cublasCreate(&cublas_handle)); + + matrix A(N); + A.fill([&](int row, int col) { + return (1.0 / (row + col + 1) + (row == col ? 0.1 : 0.0)); + }); + + vector B(N, block_size); + vector X(N, block_size); + + B.fill([&](int /*unused*/) { + return 1.0; + }); + + X.fill([&](int /*unused*/) { + return 0.0; + }); + + cg(A, X, B); + + ctx.finalize(); +} diff --git a/cudax/examples/stf/linear_algebra/strassen.cu b/cudax/examples/stf/linear_algebra/strassen.cu new file mode 100644 index 00000000000..0b00bd41f48 --- /dev/null +++ b/cudax/examples/stf/linear_algebra/strassen.cu @@ -0,0 +1,489 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Strassen matrix multiplication algorithm + * + * This demonstrates how CUDASTF helps combining many interdependant tasks and + * deal with temporary data. 
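+ * All sub-matrices and the intermediate products M1..M7 are temporary logical
+ * data created from a shape, with no user-provided backing store.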
+ */ + +#include + +static const size_t BLOCKSIZE = 1024; + +using namespace cuda::experimental::stf; + +using logical_matrix = logical_data>; + +inline size_t get_m(logical_matrix& s) +{ + return s.shape().extent(0); +} + +inline size_t get_n(logical_matrix& s) +{ + return s.shape().extent(1); +} + +// XXX global for the sake of simplicity, yet ... +static std::vector cublas_handle; + +cublasHandle_t get_cublas_handle() +{ + int dev; + cuda_safe_call(cudaGetDevice(&dev)); + return cublas_handle[dev]; +} + +// C = AB +void MULT_CLASSIC(context& ctx, logical_matrix& A, logical_matrix& B, logical_matrix& C) +{ + ctx.task(A.read(), B.read(), C.write()).set_symbol("MULT")->*[](cudaStream_t s, auto a, auto b, auto c) { + cuda_safe_call(cublasSetStream(get_cublas_handle(), s)); + + size_t N = a.extent(0); + + const double zero = 0.0; + const double one = 1.0; + cuda_safe_call(cublasDgemm( + get_cublas_handle(), + CUBLAS_OP_N, + CUBLAS_OP_N, + N, + N, + N, + &one, + a.data_handle(), + a.stride(1), + b.data_handle(), + b.stride(1), + &zero, + c.data_handle(), + c.stride(1))); + }; +} + +// A = A + alpha B +template +__global__ void add_kernel(int m, int n, T* A, int ld_A, T alpha, const T* B, int ld_B) +{ + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < n; idx += blockDim.x * gridDim.x) + { + for (int idy = threadIdx.y + blockIdx.y * blockDim.y; idy < m; idy += blockDim.y * gridDim.y) + { + A[idy + idx * ld_A] += alpha * B[idy + idx * ld_B]; + } + } +} + +// Compute A = A + B +template +void ADD(context& ctx, logical_matrix& A, T alpha, logical_matrix& B) +{ + ctx.task(A.rw(), B.read()).set_symbol("ADD")->*[&](cudaStream_t s, auto a, auto b) { + int m_A = a.extent(0); + int n_A = a.extent(1); + + int ld_A = a.stride(1); + int ld_B = b.stride(1); + + T* addr_A = a.data_handle(); + const T* addr_B = b.data_handle(); + + add_kernel<<<16, 16, 0, s>>>(m_A, n_A, addr_A, ld_A, alpha, addr_B, ld_B); + }; +} + +template +__global__ void copy_kernel(int m, int n, const T* src, int ld_src, T* dst, int ld_dst) +{ + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < n; idx += blockDim.x * gridDim.x) + { + for (int idy = threadIdx.y + blockIdx.y * blockDim.y; idy < m; idy += blockDim.y * gridDim.y) + { + dst[idy + idx * ld_dst] = src[idy + idx * ld_src]; + } + } +} + +// row and col = 0 or 1 +template +void COPY_TO_SUBMATRIX(context& ctx, logical_data>& A, logical_data>& subA, int row, int col) +{ + // To copy to a subset, this is a write only access, so that we did not need a valid copy for subA before ... + ctx.task(A.read(), subA.write()).set_symbol("COPY_TO")->*[&](cudaStream_t s, auto a, auto subA) { + int ld_A = a.stride(1); + int ld_subA = subA.stride(1); + int m_subA = subA.extent(0); + int n_subA = subA.extent(1); + T* addr_subA = subA.data_handle(); + const T* addr_A_base = a.data_handle(); + const T* addr_A = addr_A_base + row * m_subA + col * n_subA * ld_A; + + // subA = A_row,col + copy_kernel<<<16, 16, 0, s>>>(m_subA, n_subA, addr_A, ld_A, addr_subA, ld_subA); + }; +} + +template +void COPY_FROM_SUBMATRICES(context& ctx, logical_data>& A, logical_data> subA[2][2]) +{ + // To copy to a subset, this is a write only access, so that we did not need a valid copy for subA before ... 
+ // When copying from a subset to the whole matrix, we need a RW because we only modify a part of the matrix + ctx.task(A.write(), subA[0][0].read(), subA[0][1].read(), subA[1][0].read(), subA[1][1].read()).set_symbol("COPY_FROM") + ->*[&](cudaStream_t s, auto a, auto a00, auto a01, auto a10, auto a11) { + int ld_A = a.stride(1); + T* addr_A_base = a.data_handle(); + + for (int col = 0; col < 2; col++) + { + for (int row = 0; row < 2; row++) + { + auto& subA = col == 0 ? (row == 0 ? a00 : a10) : (row == 0 ? a01 : a11); + int m_subA = subA.extent(0); + int n_subA = subA.extent(1); + int ld_subA = subA.stride(1); + const T* addr_subA = subA.data_handle(); + T* addr_A = addr_A_base + row * m_subA + col * n_subA * ld_A; + + // A_row,col= subA + copy_kernel<<<16, 16, 0, s>>>(m_subA, n_subA, addr_subA, ld_subA, addr_A, ld_A); + } + } + }; +} + +template +void COPY_MATRIX(context& ctx, logical_data>& dst, logical_data>& src) +{ + // This is a write only access, so that we did not need a valid copy for subA before ... + ctx.task(dst.write(), src.read()).set_symbol("COPY")->*[&](cudaStream_t s, auto d_dst, auto d_src) { + int ld_src = d_dst.stride(1); + int ld_dst = d_src.stride(1); + + auto m = d_src.extent(0); + assert(m == d_dst.extent(0)); + + auto n = d_src.extent(1); + assert(n == d_dst.extent(1)); + + const T* addr_src = d_src.data_handle(); + T* addr_dst = d_dst.data_handle(); + + copy_kernel<<<16, 16, 0, s>>>(m, n, addr_src, ld_src, addr_dst, ld_dst); + }; +} + +void MULT(context& ctx, logical_matrix& A, logical_matrix& B, logical_matrix& C); + +void MULT_REC_NAIVE(context& ctx, logical_matrix& A, logical_matrix& B, logical_matrix& C) +{ + logical_matrix subA[2][2], subB[2][2], subC[2][2]; + + size_t N = get_m(A); + + assert(get_m(A) == get_n(A)); + assert(get_m(B) == get_n(B)); + assert(get_m(C) == get_n(C)); + + assert(N % 2 == 0); + + size_t half_N = N / 2; + + // These are TMP data which don't have a valid copy yet + for (int col = 0; col < 2; col++) + { + for (int row = 0; row < 2; row++) + { + subA[row][col] = ctx.logical_data(shape_of>(half_N, half_N)); + subB[row][col] = ctx.logical_data(shape_of>(half_N, half_N)); + subC[row][col] = ctx.logical_data(shape_of>(half_N, half_N)); + + COPY_TO_SUBMATRIX(ctx, A, subA[row][col], row, col); + COPY_TO_SUBMATRIX(ctx, B, subB[row][col], row, col); + } + } + + for (int col = 0; col < 2; col++) + { + for (int row = 0; row < 2; row++) + { + for (int k = 0; k < 2; k++) + { + auto Ck = ctx.logical_data(shape_of>(half_N, half_N)); + MULT(ctx, subA[row][k], subB[k][col], Ck); + + ADD(ctx, subC[row][col], 1.0, Ck); + } + + // C_row,col = subC[row][col] + COPY_FROM_SUBMATRICES(ctx, C, subC); + } + } +} + +void MULT_STRASSEN(context& ctx, logical_matrix& A, logical_matrix& B, logical_matrix& C) +{ + /* + * STRASSEN ALGORITHM + * + * M1 = (A00 + A11)(B00 + B11) + * M2 = (A10 + A11)B00 + * M3 = A00(B01 - B11) + * M4 = A11(B10 - B00) + * M5 = (A00 + A01)B11 + * M6 = (A10 - A00)(B00 + B01) + * M7 = (A01 - A11)(B10 + B11) + * + * C00 = M1 + M4 - M5 + M7 + * C01 = M3 + M5 + * C10 = M2 + M4 + * C11 = M1 - M2 + M3 + M6 + * + */ + size_t N = get_m(A); + assert(N % 2 == 0); + size_t half_N = N / 2; + + logical_matrix subA[2][2], subB[2][2], subC[2][2]; + auto M1 = ctx.logical_data(shape_of>(half_N, half_N)); + auto M2 = ctx.logical_data(shape_of>(half_N, half_N)); + auto M3 = ctx.logical_data(shape_of>(half_N, half_N)); + auto M4 = ctx.logical_data(shape_of>(half_N, half_N)); + auto M5 = ctx.logical_data(shape_of>(half_N, half_N)); + auto M6 = 
ctx.logical_data(shape_of>(half_N, half_N)); + auto M7 = ctx.logical_data(shape_of>(half_N, half_N)); + + assert(get_m(A) == get_n(A)); + assert(get_m(B) == get_n(B)); + assert(get_m(C) == get_n(C)); + + // These are TMP data which don't have a valid copy yet + for (int col = 0; col < 2; col++) + { + for (int row = 0; row < 2; row++) + { + subA[row][col] = ctx.logical_data(shape_of>(half_N, half_N)); + subB[row][col] = ctx.logical_data(shape_of>(half_N, half_N)); + subC[row][col] = ctx.logical_data(shape_of>(half_N, half_N)); + + COPY_TO_SUBMATRIX(ctx, A, subA[row][col], row, col); + COPY_TO_SUBMATRIX(ctx, B, subB[row][col], row, col); + } + } + + // M1 = (A00 + A11)(B00 + B11) + { + auto left = ctx.logical_data(shape_of>(half_N, half_N)), + right = ctx.logical_data(shape_of>(half_N, half_N)); + + COPY_MATRIX(ctx, left, subA[0][0]); + ADD(ctx, left, 1.0, subA[1][1]); + + COPY_MATRIX(ctx, right, subB[0][0]); + ADD(ctx, right, 1.0, subB[1][1]); + + MULT(ctx, left, right, M1); + } + + // M2 = (A10 + A11)B00 + { + auto left = ctx.logical_data(shape_of>(half_N, half_N)); + + COPY_MATRIX(ctx, left, subA[1][0]); + ADD(ctx, left, 1.0, subA[1][1]); + + MULT(ctx, left, subB[0][0], M2); + } + + // M3 = A00(B01 - B11) + { + auto right = ctx.logical_data(shape_of>(half_N, half_N)); + + COPY_MATRIX(ctx, right, subB[0][1]); + ADD(ctx, right, -1.0, subB[1][1]); + + MULT(ctx, subA[0][0], right, M3); + } + + // M4 = A11(B10 - B00) + { + auto right = ctx.logical_data(shape_of>(half_N, half_N)); + + COPY_MATRIX(ctx, right, subB[1][0]); + ADD(ctx, right, -1.0, subB[0][0]); + + MULT(ctx, subA[1][1], right, M4); + } + + // M5 = (A00 + A01)B11 + { + auto left = ctx.logical_data(shape_of>(half_N, half_N)); + + COPY_MATRIX(ctx, left, subA[0][0]); + ADD(ctx, left, 1.0, subA[0][1]); + + MULT(ctx, left, subB[1][1], M5); + } + + // M6 = (A10 - A00)(B00 + B01) + { + auto left = ctx.logical_data(shape_of>(half_N, half_N)), + right = ctx.logical_data(shape_of>(half_N, half_N)); + + COPY_MATRIX(ctx, left, subA[1][0]); + ADD(ctx, left, -1.0, subA[1][1]); + + COPY_MATRIX(ctx, right, subB[0][0]); + ADD(ctx, right, 1.0, subB[0][1]); + + MULT(ctx, left, right, M6); + } + + // M7 = (A01 - A11)(B10 + B11) + { + auto left = ctx.logical_data(shape_of>(half_N, half_N)); + auto right = ctx.logical_data(shape_of>(half_N, half_N)); + + COPY_MATRIX(ctx, left, subA[0][1]); + ADD(ctx, left, -1.0, subA[1][1]); + + COPY_MATRIX(ctx, right, subB[1][0]); + ADD(ctx, right, 1.0, subB[1][1]); + + MULT(ctx, left, right, M7); + } + + // C00 = M1 + M4 - M5 + M7 + COPY_MATRIX(ctx, subC[0][0], M1); + ADD(ctx, subC[0][0], 1.0, M4); + ADD(ctx, subC[0][0], -1.0, M5); + ADD(ctx, subC[0][0], -1.0, M5); + ADD(ctx, subC[0][0], 1.0, M7); + + // C01 = M3 + M5 + COPY_MATRIX(ctx, subC[0][1], M3); + ADD(ctx, subC[0][1], 1.0, M5); + + // C10 = M2 + M4 + COPY_MATRIX(ctx, subC[1][0], M2); + ADD(ctx, subC[1][0], 1.0, M4); + + // C11 = M1 - M2 + M3 + M6 + COPY_MATRIX(ctx, subC[1][1], M1); + ADD(ctx, subC[1][1], -1.0, M2); + ADD(ctx, subC[1][1], 1.0, M3); + ADD(ctx, subC[1][1], 1.0, M6); + + // Write back subsets of C to C + COPY_FROM_SUBMATRICES(ctx, C, subC); +} + +void MULT(context& ctx, logical_matrix& A, logical_matrix& B, logical_matrix& C) +{ + size_t N = get_m(A); + + if (N <= BLOCKSIZE) + { + MULT_CLASSIC(ctx, A, B, C); + } + else + { + // MULT_REC_NAIVE(ctx, A, B, C); + MULT_STRASSEN(ctx, A, B, C); + } +} + +void strassen_test(context& ctx, size_t N) +{ + double* A = new double[N * N]; + double* B = new double[N * N]; + double* C = new double[N * N]; + + int 
ldA = N; + int ldB = N; + int ldC = N; + + cuda_safe_call(cudaHostRegister(A, N * N * sizeof(double), cudaHostRegisterPortable)); + cuda_safe_call(cudaHostRegister(B, N * N * sizeof(double), cudaHostRegisterPortable)); + cuda_safe_call(cudaHostRegister(C, N * N * sizeof(double), cudaHostRegisterPortable)); + + for (int col = 0; col < N; col++) + { + for (int row = 0; row < N; row++) + { + A[row + N * col] = 1.0; + B[row + N * col] = -1.0; + C[row + N * col] = 0.0; + } + } + + auto descA = ctx.logical_data(make_slice(A, std::tuple{N, N}, ldA)), + descB = ctx.logical_data(make_slice(B, std::tuple{N, N}, ldB)), + descC = ctx.logical_data(make_slice(C, std::tuple{N, N}, ldC)); + descA.set_symbol("A"); + descB.set_symbol("B"); + descC.set_symbol("C"); + + std::chrono::steady_clock::time_point start, stop; + + ctx.host_launch(descC.read())->*[&](auto /* ignored */) { + start = std::chrono::steady_clock::now(); + }; + + MULT(ctx, descA, descB, descC); + + ctx.host_launch(descC.read())->*[&](auto /* ignored */) { + stop = std::chrono::steady_clock::now(); + }; + + ctx.finalize(); + + std::chrono::duration duration = stop - start; + fprintf(stderr, "Elapsed: %.2lf ms\n", duration.count() * 1000.0); +} + +int main(int argc, char** argv) +{ + long N = 2 * BLOCKSIZE; + + if (argc > 1) + { + N = atoi(argv[1]); + } + + bool use_graphs = false; + if (argc > 2) + { + use_graphs = (atoi(argv[2]) > 0); + } + + // Set up CUBLAS + int ndevs; + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + cublas_handle.resize(ndevs); + for (int d = 0; d < ndevs; d++) + { + cuda_safe_call(cudaSetDevice(d)); + cuda_safe_call(cublasCreate(&cublas_handle[d])); + } + + cuda_safe_call(cudaSetDevice(0)); + + context ctx; + if (use_graphs) + { + ctx = graph_ctx(); + } + + strassen_test(ctx, N); +} diff --git a/cudax/examples/stf/logical_gates_composition.cu b/cudax/examples/stf/logical_gates_composition.cu new file mode 100644 index 00000000000..1b58836c926 --- /dev/null +++ b/cudax/examples/stf/logical_gates_composition.cu @@ -0,0 +1,73 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Composition of boolean operations applied on logical data + */ + +#include + +using namespace cuda::experimental::stf; + +// z = AND(x,y) +logical_data> AND(context& ctx, logical_data> x, logical_data> y) +{ + assert(x.shape().size() == y.shape().size()); + + auto z = ctx.logical_data(x.shape()); + + std::string symbol = "(" + x.get_symbol() + " & " + y.get_symbol() + ")"; + z.set_symbol(symbol); + + ctx.parallel_for(z.shape(), x.read(), y.read(), z.write()).set_symbol("AND")->* + [] __device__(size_t i, auto dx, auto dy, auto dz) { + dz(i) = dx(i) & dy(i); + }; + + return z; +} + +// y = NOT(x) +logical_data> NOT(context& ctx, logical_data> x) +{ + auto y = ctx.logical_data(x.shape()); + + std::string symbol = "( !" 
+ x.get_symbol() + ")"; + y.set_symbol(symbol); + + ctx.parallel_for(y.shape(), x.read(), y.write()).set_symbol("NOT")->*[] __device__(size_t i, auto dx, auto dy) { + dy(i) = ~dx(i); + }; + + return y; +} + +int main() +{ + const size_t n = 12; + + int X[n], Y[n], Z[n]; + + context ctx; + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + auto lZ = ctx.logical_data(Z); + + lX.set_symbol("X"); + lY.set_symbol("Y"); + lZ.set_symbol("Z"); + + auto lB = AND(ctx, AND(ctx, lX, lY), AND(ctx, lX, lZ)); + auto lC = AND(ctx, NOT(ctx, AND(ctx, lB, NOT(ctx, lY))), lX); + + ctx.finalize(); +} diff --git a/cudax/examples/stf/mandelbrot.cu b/cudax/examples/stf/mandelbrot.cu new file mode 100644 index 00000000000..da56ba2d52a --- /dev/null +++ b/cudax/examples/stf/mandelbrot.cu @@ -0,0 +1,129 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief A transparent multi-GPU implementation of Mandelbrot fractal using parallel_for + */ + +#include + +#include +#include + +using namespace cuda::experimental::stf; + +int main(int argc, char** argv) +{ + context ctx; + + // Image dimensions + size_t width = 2000; + size_t height = 1000; + + // Complex plane boundaries + double xMin = -2.0; + double xMax = 1.0; + double yMin = -1.5; + double yMax = 1.5; + + // Maximum number of iterations + int maxIterations = 256; + + // Describe a 2D array of integers of size (width x heigth) + auto lbuffer = ctx.logical_data(shape_of>(width, height)); + + cudaEvent_t start, stop; + cuda_safe_call(cudaEventCreate(&start)); + cuda_safe_call(cudaEventCreate(&stop)); + + cuda_safe_call(cudaEventRecord(start, ctx.task_fence())); + + // Compute each pixel + ctx.parallel_for(blocked_partition(), exec_place::all_devices(), lbuffer.shape(), lbuffer.write()) + ->*[=] _CCCL_DEVICE(size_t x, size_t y, auto buffer) { + // Map pixel coordinates to complex plane + // c = cr + i ci + double cr = x * (xMax - xMin) / width + xMin; + double ci = y * (yMax - yMin) / height + yMin; + + // z = zr + i zi + double zr = 0.0; + double zi = 0.0; + + int iterations = 0; + + // Evaluate depth + while (zr * zr + zi * zi < 4 && iterations < maxIterations) + { + // compute : z = z * z + c; + // + // z = (zr + i zi) (zr + i zi) + cr + i ci + // z = zr zr - zi zi + 2 i zi zr + cr + i ci + // zr = (zr zr - zi zi + cr) + // zi = (2 zi zr + ci) + double zr_prev = zr; + double zi_prev = zi; + zr = zr_prev * zr_prev - zi_prev * zi_prev + cr; + zi = 2.0 * zr_prev * zi_prev + ci; + + iterations++; + } + + buffer(x, y) = iterations; + }; + + cuda_safe_call(cudaEventRecord(stop, ctx.task_fence())); + + if (argc > 1) + { + auto fileName = std::string(argv[1]); + + // Generate a PPM file from the buffer + ctx.host_launch(lbuffer.read())->*[&](auto buffer) { + std::ofstream imageFile(fileName, std::ios::binary); + if (!imageFile) + { + std::cerr << "Failed to create image file: " << fileName << std::endl; + return; + } + + imageFile << "P6\n"; + imageFile << width << " " << height << "\n"; + imageFile << "255\n"; + + for (size_t y = 0; y < height; y++) + { + for (size_t x = 0; x < width; x++) + { + int 
iterations = buffer(x, y); + // Convert iterations to RGB values + unsigned char r = (iterations % 8) * 32; + unsigned char g = (iterations % 16) * 16; + unsigned char b = (iterations % 32) * 8; + + // Write pixel data to file + imageFile << r << g << b; + } + } + + imageFile.close(); + std::cout << "Mandelbrot image generated and saved as " << fileName << std::endl; + }; + } + + ctx.finalize(); + + // Must call this first, see e.g. + // https://stackoverflow.com/questions/6551121/cuda-cudaeventelapsedtime-returns-device-not-ready-error + cuda_safe_call(cudaEventSynchronize(stop)); + + fprintf(stderr, "Mandelbrot took %.2f ms\n", cuda_try(start, stop)); +} diff --git a/cudax/examples/stf/parallel_for_2D.cu b/cudax/examples/stf/parallel_for_2D.cu new file mode 100644 index 00000000000..42cc2ccead0 --- /dev/null +++ b/cudax/examples/stf/parallel_for_2D.cu @@ -0,0 +1,73 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief An example using parallel_for on shapes with different dimensions + */ + +#include + +using namespace cuda::experimental::stf; + +__host__ __device__ double x0(size_t i, size_t j) +{ + return sin((double) (i - j)); +} + +__host__ __device__ double y0(size_t i, size_t j) +{ + return cos((double) (i + j)); +} + +int main() +{ + context ctx; + + const size_t N = 16; + double X[2 * N * 2 * N]; + double Y[N * N]; + + auto lx = ctx.logical_data(make_slice(&X[0], std::tuple{2 * N, 2 * N}, 2 * N)); + auto ly = ctx.logical_data(make_slice(&Y[0], std::tuple{N, N}, N)); + + ctx.parallel_for(lx.shape(), lx.write())->*[=] _CCCL_DEVICE(size_t i, size_t j, auto sx) { + sx(i, j) = x0(i, j); + }; + + ctx.parallel_for(ly.shape(), lx.read(), ly.write())->*[=] _CCCL_DEVICE(size_t i, size_t j, auto sx, auto sy) { + sy(i, j) = y0(i, j); + for (size_t ii = 0; ii < 2; ii++) + { + for (size_t jj = 0; jj < 2; jj++) + { + sy(i, j) += sx(2 * i + ii, 2 * j + jj); + } + } + }; + + ctx.parallel_for(exec_place::host, ly.shape(), ly.read())->*[=] __host__(size_t i, size_t j, slice sy) { + double expected = y0(i, j); + for (size_t ii = 0; ii < 2; ii++) + { + for (size_t jj = 0; jj < 2; jj++) + { + expected += x0(2 * i + ii, 2 * j + jj); + } + } + + if (fabs(sy(i, j) - expected) > 0.001) + { + printf("sy(%zu, %zu) %f expect %f\n", i, j, sy(i, j), expected); + } + }; + + ctx.finalize(); +} diff --git a/cudax/examples/stf/scan.cu b/cudax/examples/stf/scan.cu new file mode 100644 index 00000000000..46e9182bf97 --- /dev/null +++ b/cudax/examples/stf/scan.cu @@ -0,0 +1,204 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief A parallel scan algorithm using CUB kernels + * + */ + +#include // or equivalently + +#include + +using namespace cuda::experimental::stf; + +__host__ __device__ double X0(int i) +{ + return sin((double) i); +} + +/** + * @brief Performs an inclusive scan on a logical data slice using CUB. + * + * This function determines the temporary device storage requirements for a scan, allocates + * temporary storage, and then performs the scan using the CUB library. The scan is performed + * in place, modifying the input `logical_data` slice. + * + * @tparam Ctx The context type for data management and task execution. + * @tparam T The data type of the elements in the `logical_data` slice. + * + * @param ctx Reference to the context object. + * @param ld Reference to the `logical_data` object containing the data slice. + * @param dp The `data_place` enum specifying where the data should reside (e.g., CPU, GPU). + */ +template +void scan(Ctx& ctx, logical_data>& ld, data_place dp) +{ + // Determine temporary device storage requirements + auto num_items = int(ld.shape().size()); + size_t tmp_size = 0; + cub::DeviceScan::InclusiveSum(nullptr, tmp_size, (T*) nullptr, (T*) nullptr, num_items); + + // fprintf(stderr, "SCAN %ld items TMP = %ld bytes\n", num_items, tmp_size); + + logical_data> ltmp = ctx.logical_data(shape_of>(tmp_size)).set_symbol("tmp"); + + ctx.task(ld.rw(mv(dp)), ltmp.write()).set_symbol("scan " + ld.get_symbol()) + ->*[=](cudaStream_t stream, auto d, auto tmp) mutable { + T* buffer = d.data_handle(); + cub::DeviceScan::InclusiveSum(tmp.data_handle(), tmp_size, buffer, buffer, num_items, stream); + }; +} + +int main(int argc, char** argv) +{ + stream_ctx ctx; + // graph_ctx ctx; + + // const size_t N = 128ULL*1024ULL*1024ULL; + size_t nmb = 128; + if (argc > 1) + { + nmb = atoi(argv[1]); + } + + int check = 0; + if (argc > 2) + { + check = atoi(argv[2]); + } + + const size_t N = nmb * 1024ULL * 1024ULL; + + const int ndevs = cuda_try(); + const size_t NBLOCKS = 2 * ndevs; + + size_t BLOCK_SIZE = (N + NBLOCKS - 1) / NBLOCKS; + + auto fixed_alloc = block_allocator(ctx, BLOCK_SIZE * sizeof(double)); + ctx.set_allocator(fixed_alloc); + + // dummy task to initialize the allocator XXX + { + auto ldummy = ctx.logical_data(shape_of>(NBLOCKS)).set_symbol("dummy"); + ctx.task(ldummy.write(data_place::managed))->*[](cudaStream_t, auto) {}; + } + + std::vector X(N); + std::vector>> lX(NBLOCKS); + logical_data> laux; + + // If we were to register each part one by one, there could be pages which + // cross multiple parts, and the pinning operation would fail. 
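+  // Registering the whole array once avoids that problem, and every per-block
+  // logical data created below then refers to pinned host memory.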
+ cuda_safe_call(cudaHostRegister(&X[0], N * sizeof(double), cudaHostRegisterPortable)); + + for (size_t b = 0; b < NBLOCKS; b++) + { + size_t start = b * BLOCK_SIZE; + size_t end = std::min(start + BLOCK_SIZE, N); + lX[b] = ctx.logical_data(&X[start], {end - start}).set_symbol("X_" + std::to_string(b)); + + // No need to move this back to the host if we do not check the result + if (!check) + { + lX[b].set_write_back(false); + } + } + + for (size_t b = 0; b < NBLOCKS; b++) + { + cuda_safe_call(cudaSetDevice(b % ndevs)); + size_t start = b * BLOCK_SIZE; + ctx.parallel_for(lX[b].shape(), lX[b].write())->*[=] _CCCL_DEVICE(size_t i, auto lx) { + lx(i) = X0(i + start); + }; + } + + cuda_safe_call(cudaStreamSynchronize(ctx.task_fence())); + + cudaEvent_t start, stop; + cuda_safe_call(cudaEventCreate(&start)); + cuda_safe_call(cudaEventCreate(&stop)); + cuda_safe_call(cudaEventRecord(start, ctx.task_fence())); + + for (size_t k = 0; k < 100; k++) + { + // Create an auxiliary temporary buffer and blank it + laux = ctx.logical_data(shape_of>(NBLOCKS)).set_symbol("aux"); + ctx.parallel_for(laux.shape(), laux.write(data_place::managed)).set_symbol("init_aux") + ->*[] _CCCL_DEVICE(size_t i, auto aux) { + aux(i) = 0.0; + }; + + // Scan each block + for (size_t b = 0; b < NBLOCKS; b++) + { + cuda_safe_call(cudaSetDevice(b % ndevs)); + scan(ctx, lX[b], data_place::device(b % ndevs)); + } + + for (size_t b = 0; b < NBLOCKS; b++) + { + // cuda_safe_call(cudaSetDevice(b % ndevs)); + + ctx.parallel_for(exec_place::device(0), + box({b, b + 1}), + lX[b].read(data_place::device(b % ndevs)), + laux.rw(data_place::managed)) + .set_symbol("store sum X_" + std::to_string(b)) + ->*[] _CCCL_DEVICE(size_t ind, auto Xb, auto aux) { + aux(ind) = Xb(Xb.extent(0) - 1); + }; + } + + // Prefix sum of the per-block sums + scan(ctx, laux, data_place::managed); + + // Add partial sum of Xi to X(i+1) + for (size_t b = 1; b < NBLOCKS; b++) + { + cuda_safe_call(cudaSetDevice(b % ndevs)); + ctx.parallel_for(lX[b].shape(), lX[b].rw(), laux.read(data_place::managed)) + .set_symbol("add X_" + std::to_string(b)) + ->*[=] _CCCL_DEVICE(size_t i, auto Xb, auto aux) { + Xb(i) += aux(b - 1); + }; + } + } + + cuda_safe_call(cudaEventRecord(stop, ctx.task_fence())); + + ctx.finalize(); + + float ms = 0; + cuda_safe_call(cudaEventElapsedTime(&ms, start, stop)); + + fprintf(stdout, "%zu %f ms\n", N / 1024 / 1024, ms); + + if (check) + { +#if 0 + for (size_t i = 0; i < N; i++) { + EXPECT(fabs(X[i] - expected_result[i]) < 0.00001); + } +#endif + +#if 1 + fprintf(stderr, "Checking result ...\n"); + EXPECT(fabs(X[0] - X0(0)) < 0.00001); + for (size_t i = 0; i < N; i++) + { + EXPECT(fabs(X[i] - X[i - 1] - X0(i)) < 0.00001); + } + } +#endif +} diff --git a/cudax/examples/stf/standalone-launches.cu b/cudax/examples/stf/standalone-launches.cu new file mode 100644 index 00000000000..7717ab3925c --- /dev/null +++ b/cudax/examples/stf/standalone-launches.cu @@ -0,0 +1,77 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief This test illustrates how we can use multiple reserved::launch in a single task on different pieces of data + */ + +#include + +using namespace cuda::experimental::stf; + +int X0(int i) +{ + return i * i + 12; +} + +int main() +{ + stream_ctx ctx; + + const int N = 16; + int X[N], Y[N], Z[N]; + + for (size_t ind = 0; ind < N; ind++) + { + X[ind] = X0(ind); + Y[ind] = 0; + Z[ind] = 0; + } + + auto handle_X = ctx.logical_data(X, {N}); + auto handle_Y = ctx.logical_data(Y, {N}); + auto handle_Z = ctx.logical_data(Z, {N}); + + ctx.task(handle_X.read(), handle_Y.write(), handle_Z.write()) + ->*[](cudaStream_t s, slice x, slice y, slice z) { + std::vector streams; + streams.push_back(s); + auto spec = par(1024); + reserved::launch(spec, exec_place::current_device(), streams, std::tuple{x, y}) + ->*[] _CCCL_DEVICE(auto t, slice x, slice y) { + size_t tid = t.rank(); + size_t nthreads = t.size(); + for (size_t ind = tid; ind < N; ind += nthreads) + { + y(ind) = 2 * x(ind); + } + }; + + reserved::launch(spec, exec_place::current_device(), streams, std::tuple{y, z}) + ->*[] _CCCL_DEVICE(auto t, slice y, slice z) { + size_t tid = t.rank(); + size_t nthreads = t.size(); + for (size_t ind = tid; ind < N; ind += nthreads) + { + z(ind) = 3 * y(ind); + } + }; + }; + + ctx.finalize(); + + for (size_t ind = 0; ind < N; ind++) + { + assert(Y[ind] == 2 * X[ind]); + assert(Z[ind] == 3 * Y[ind]); + } +} diff --git a/cudax/examples/stf/thrust_zip_iterator.cu b/cudax/examples/stf/thrust_zip_iterator.cu new file mode 100644 index 00000000000..14fa5deed3a --- /dev/null +++ b/cudax/examples/stf/thrust_zip_iterator.cu @@ -0,0 +1,135 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief This example illustrates how we can convert Thrust iterators to + * logical data, and how to create thrust iterators from data instances in a + * task. 
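+ * Only the raw pointers wrapped by the zip iterator are handed to CUDASTF;
+ * the zip iterator itself never crosses the task boundary.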
+ */ + +#include +#include +#include +#include +#include + +#include + +#include + +using namespace cuda::experimental::stf; + +// Functor to apply the transformation +struct my_transform_functor +{ + __host__ __device__ int operator()(const thrust::tuple& t) const + { + int a = thrust::get<0>(t); + char b = thrust::get<1>(t); + return a + static_cast(b); // Example operation + } +}; + +/* + * How to use CUDASTF to manipulate data originally created using Thrust + */ +template +void thrust_algorithm(context& ctx, ZippedIt& first, ZippedIt& last, OutIt& output, data_place data_location) +{ + /* + * Interpret Thrust data structures as logical data + */ + size_t num_elements = thrust::distance(first, last); + + // Extract underlying iterators from the zip iterator + auto itA = thrust::get<0>(first.get_iterator_tuple()); + int* A = thrust::raw_pointer_cast(&(*itA)); + + auto itB = thrust::get<1>(first.get_iterator_tuple()); + char* B = thrust::raw_pointer_cast(&(*itB)); + + int* C = thrust::raw_pointer_cast(output.data()); + + auto lA = ctx.logical_data(make_slice(A, num_elements), data_location); + auto lB = ctx.logical_data(make_slice(B, num_elements), data_location); + auto lC = ctx.logical_data(make_slice(C, num_elements), data_location); + + /* Important : result C will only be valid once we finalize the context or introduce a task fence ! */ + ctx.task(lA.read(), lB.read(), lC.write())->*[](cudaStream_t stream, auto dA, auto dB, auto dC) { + // Reconstruct a zipped iterator from the data instances passed to the lambda function + size_t num_elements = dA.size(); + auto dfirst = thrust::make_zip_iterator(thrust::make_tuple(dA.data_handle(), dB.data_handle())); + auto dlast = dfirst + num_elements; + + // Create a device pointer from the raw pointer + thrust::device_ptr dout = thrust::device_pointer_cast(dC.data_handle()); + + thrust::transform(thrust::cuda::par_nosync.on(stream), dfirst, dlast, dout, my_transform_functor()); + }; +} + +int main() +{ + context ctx; + + /* + * First create device vectors and zipped them + */ + thrust::device_vector A(3); + thrust::device_vector B(3); + thrust::device_vector C(3); + + A[0] = 10; + A[1] = 20; + A[2] = 30; + B[0] = 'x'; + B[1] = 'y'; + B[2] = 'z'; + + auto first = thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin())); + auto last = thrust::make_zip_iterator(thrust::make_tuple(A.end(), B.end())); + + thrust_algorithm(ctx, first, last, C, data_place::current_device()); + + /* + * Use host data, and rely on CUDASTF for transfers + */ + + thrust::host_vector hA(3); + thrust::host_vector hB(3); + thrust::host_vector hC(3); + + hA[0] = 10; + hA[1] = 20; + hA[2] = 30; + hB[0] = 'x'; + hB[1] = 'y'; + hB[2] = 'z'; + + auto hfirst = thrust::make_zip_iterator(thrust::make_tuple(hA.begin(), hB.begin())); + auto hlast = thrust::make_zip_iterator(thrust::make_tuple(hA.end(), hB.end())); + + thrust_algorithm(ctx, hfirst, hlast, hC, data_place::host); + + /* Before this, we cannot assume that the Thrust algorithms have been + * performed and/or that the results have been written back to their original + * location. 
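+   * Finalizing the context waits for all pending work and writes each logical
+   * data back to its original location (host or device).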
*/ + ctx.finalize(); + + // Check results + for (size_t i = 0; i < 3; i++) + { + EXPECT(C[i] == (A[i] + static_cast(B[i]))); + EXPECT(hC[i] == (hA[i] + static_cast(hB[i]))); + } + + return 0; +} diff --git a/cudax/examples/stf/void_data_interface.cu b/cudax/examples/stf/void_data_interface.cu new file mode 100644 index 00000000000..72ac76e6fe1 --- /dev/null +++ b/cudax/examples/stf/void_data_interface.cu @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Illustrate how to use the void data interface + * + */ + +#include + +using namespace cuda::experimental::stf; + +int main() +{ + context ctx; + + auto ltask_res = ctx.logical_data(shape_of()); + ctx.task(ltask_res.write())->*[](cudaStream_t, auto) { + + }; + + void_interface sync; + auto ltask2_res = ctx.logical_data(sync); + ctx.task(ltask2_res.write(), ltask_res.read())->*[](cudaStream_t, auto, auto) { + + }; + + ctx.finalize(); +} diff --git a/cudax/examples/stf/word_count.cu b/cudax/examples/stf/word_count.cu new file mode 100644 index 00000000000..0e1128e76af --- /dev/null +++ b/cudax/examples/stf/word_count.cu @@ -0,0 +1,96 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Counting words in a text using a launch kernel + */ + +#include + +using namespace cuda::experimental::stf; + +// determines whether the character is alphabetical +__host__ __device__ bool is_alpha(const char c) +{ + return (c >= 'A' && c <= 'z'); +} + +int main() +{ + // Paragraph from 'The Raven' by Edgar Allan Poe + // http://en.wikipedia.org/wiki/The_Raven + const char raw_input[] = + " But the raven, sitting lonely on the placid bust, spoke only,\n" + " That one word, as if his soul in that one word he did outpour.\n" + " Nothing further then he uttered - not a feather then he fluttered -\n" + " Till I scarcely more than muttered `Other friends have flown before -\n" + " On the morrow he will leave me, as my hopes have flown before.'\n" + " Then the bird said, `Nevermore.'\n"; + + // std::cout << "Text sample:" << std::endl; + // std::cout << raw_input << std::endl; + + // fprintf(stderr, "TEXT size : %ld\n", sizeof(raw_input)); + + context ctx; + + auto ltext = ctx.logical_data(const_cast(&raw_input[0]), {sizeof(raw_input)}); + + int cnt = 0; + auto lcnt = ctx.logical_data(&cnt, {1}); + + auto number_devices = 2; + auto all_devs = exec_place::repeat(exec_place::device(0), number_devices); + + auto spec = par(con(128)); + + ctx.launch(spec, all_devs, ltext.read(), lcnt.rw())->*[] _CCCL_DEVICE(auto th, auto text, auto cnt) { + int local_cnt = 0; + for (size_t i = th.rank(); i < text.size() - 1; i += th.size()) + { + /* If the thread encounters the beginning of a new word, increment + * its local counter */ + if (!is_alpha(text(i)) && is_alpha(text(i + 1))) + { + local_cnt++; + } + } + + // Get a piece of shared memory, and zero it + __shared__ int block_cnt; + block_cnt = 0; + th.inner().sync(); + + // In every block, partial sums are gathered, and added to the result + // by the first thread of the block. + atomicAdd(&block_cnt, local_cnt); + th.inner().sync(); + + if (th.inner().rank() == 0) + { + atomicAdd(&cnt(0), block_cnt); + } + }; + + ctx.finalize(); + + int ref_cnt = 0; + for (size_t i = 0; i < sizeof(raw_input) - 1; i++) + { + if (!is_alpha(raw_input[i]) && is_alpha(raw_input[i + 1])) + { + ref_cnt++; + } + } + + // fprintf(stderr, "Result : found %d words (expected %d)\n", cnt, ref_cnt); + EXPECT(cnt == ref_cnt); +} diff --git a/cudax/include/cuda/experimental/__stf/allocators/adapters.cuh b/cudax/include/cuda/experimental/__stf/allocators/adapters.cuh new file mode 100644 index 00000000000..fce719de5e6 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/allocators/adapters.cuh @@ -0,0 +1,223 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** @file + * + * @brief Methods to adapt allocators to rely on a third-party allocator + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief Allocator which defers allocations to the CUDA asynchronous memory + * allocations APIs (cudaMallocAsync, cudaFreeAsync) + * + * This can be used as an alternative in CUDA graphs to avoid creating CUDA + * graphs with a large memory footprints. + */ +class stream_adapter +{ + /** + * @brief Description of an allocated buffer + */ + struct raw_buffer + { + raw_buffer(void* ptr_, size_t sz_, data_place memory_node_) + : ptr(ptr_) + , sz(sz_) + , memory_node(memory_node_) + {} + + void* ptr; + size_t sz; + data_place memory_node; + }; + + /** + * @brief allocator interface created within the stream_adapter + */ + class adapter_allocator : public block_allocator_interface + { + public: + adapter_allocator(cudaStream_t stream_, stream_adapter* sa_) + : stream(stream_) + , sa(sa_) + {} + + // these are events from a graph_ctx + void* allocate( + backend_ctx_untyped&, const data_place& memory_node, ::std::ptrdiff_t& s, event_list& /* prereqs */) override + { + // prereqs are unchanged + + void* result; + EXPECT(!memory_node.is_composite()); + + if (memory_node == data_place::host) + { + cuda_safe_call(cudaMallocHost(&result, s)); + } + else if (memory_node == data_place::managed) + { + cuda_safe_call(cudaMallocManaged(&result, s)); + } + else + { + const int prev_dev_id = cuda_try(); + // (Note device_ordinal works with green contexts as well) + const int target_dev_id = device_ordinal(memory_node); + + if (memory_node.is_green_ctx()) + { + fprintf(stderr, + "Pretend we use cudaMallocAsync on green context (using device %d in reality)\n", + device_ordinal(memory_node)); + } + + if (prev_dev_id != target_dev_id) + { + cuda_safe_call(cudaSetDevice(target_dev_id)); + } + + SCOPE(exit) + { + if (target_dev_id != prev_dev_id) + { + cuda_safe_call(cudaSetDevice(prev_dev_id)); + } + }; + + cuda_safe_call(cudaMallocAsync(&result, s, stream)); + } + + return result; + } + + void deallocate( + backend_ctx_untyped&, const data_place& memory_node, event_list& /* prereqs */, void* ptr, size_t sz) override + { + // Do not deallocate buffers, this is done later after + sa->to_free.emplace_back(ptr, sz, memory_node); + // Prereqs are unchanged + } + + event_list deinit(backend_ctx_untyped&) override + { + // no op + return event_list(); + } + + ::std::string to_string() const override + { + return "stream_adapter"; + } + + private: + cudaStream_t stream; + stream_adapter* sa; + }; + +public: + template + stream_adapter(context_t& ctx, cudaStream_t stream_ /*, block_allocator_untyped root_allocator_*/) + : stream(stream_) /*, root_allocator(mv(root_allocator_))*/ + { + alloc = block_allocator(ctx, stream, this); + } + + /** + * @brief Free resources allocated by the stream_adapter object + */ + void clear() + { + // We avoid changing device around every CUDA API call, so we will only + // change it when necessary, and restore the current device at the end + // of the loop. 
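+    // Host and managed buffers are freed after waiting for the stream once;
+    // device buffers are returned with cudaFreeAsync on that same stream.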
+ const int prev_dev_id = cuda_try(); + int current_dev_id = prev_dev_id; + + // No need to wait for the stream multiple times + bool stream_was_synchronized = false; + + for (auto& b : to_free) + { + if (b.memory_node == data_place::host) + { + if (!stream_was_synchronized) + { + cuda_safe_call(cudaStreamSynchronize(stream)); + stream_was_synchronized = true; + } + cuda_safe_call(cudaFreeHost(b.ptr)); + } + else if (b.memory_node == data_place::managed) + { + if (!stream_was_synchronized) + { + cuda_safe_call(cudaStreamSynchronize(stream)); + stream_was_synchronized = true; + } + cuda_safe_call(cudaFree(b.ptr)); + } + else + { + // (Note device_ordinal works with green contexts as well) + int target_dev_id = device_ordinal(b.memory_node); + if (current_dev_id != target_dev_id) + { + cuda_safe_call(cudaSetDevice(target_dev_id)); + current_dev_id = target_dev_id; + } + + cuda_safe_call(cudaFreeAsync(b.ptr, stream)); + } + } + + if (current_dev_id != prev_dev_id) + { + cuda_safe_call(cudaSetDevice(prev_dev_id)); + } + + to_free.clear(); + } + + /** + * @brief Get the underlying block allocator so that we can set the + * allocator of a context, or a logical data + */ + auto& allocator() + { + return alloc; + } + +private: + cudaStream_t stream; + + block_allocator_untyped alloc; + + ::std::vector to_free; +}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/allocators/block_allocator.cuh b/cudax/include/cuda/experimental/__stf/allocators/block_allocator.cuh new file mode 100644 index 00000000000..3eaf9090f9a --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/allocators/block_allocator.cuh @@ -0,0 +1,186 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** @file + * + * @brief Facilities to define specific allocators + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include + +namespace cuda::experimental::stf +{ + +class backend_ctx_untyped; + +/** + * @brief Interface for block allocator + * + * This class provides an interface for a block allocator. It defines methods for allocation, deallocation, + * initialization, and printing of information. + */ +class block_allocator_interface +{ +public: + /** + * @brief Default constructor + */ + block_allocator_interface() = default; + + /** + * @brief Virtual destructor + */ + virtual ~block_allocator_interface() = default; + + /** + * @brief Asynchronous allocation of memory + * + * This method asynchronously allocates `*s` bytes on the specified memory node. + * Upon success, `*ptr` will contain the address of the allocated buffer. + * If the allocation fails, `*s` will contain a negative value. 
+ * + * @param ctx Untyped backend context + * @param memory_node Memory node where the allocation should occur + * @param s Pointer to the size of the allocation + * @param prereqs List of events that should finish before the allocation starts + * @return void* Pointer to the allocated memory + */ + virtual void* + allocate(backend_ctx_untyped&, const data_place& memory_node, ::std::ptrdiff_t& s, event_list& prereqs) = 0; + + /** + * @brief Deallocation of memory + * + * This method deallocates memory previously allocated by the allocate method. + * + * @param ctx Untyped backend context + * @param memory_node Memory node where the deallocation should occur + * @param prereqs List of events that should finish before the deallocation starts + * @param ptr Pointer to the memory to be deallocated + * @param sz Size of the memory to be deallocated + */ + virtual void + deallocate(backend_ctx_untyped&, const data_place& memory_node, event_list& prereqs, void* ptr, size_t sz) = 0; + + /** + * @brief Deinitialization of the allocator + * @return List of events indicating that this was deinitialized + */ + virtual event_list deinit(backend_ctx_untyped&) = 0; + + /** + * @brief String representation of the allocator + * + * This method returns a string representation of the allocator. + * + * @return std::string String representation of the allocator + */ + virtual ::std::string to_string() const = 0; + + /** + * @brief Print information about the allocator + * + * This method prints information about the allocator to stderr. + */ + virtual void print_info() const + { + const auto s = to_string(); + fprintf(stderr, "No additional info for allocator of kind \"%.*s\".\n", static_cast(s.size()), s.data()); + } +}; + +/** + * @brief Type-erased counterpart to block_allocator + */ +class block_allocator_untyped +{ +public: + template + friend class block_allocator; + + block_allocator_untyped() = default; + + block_allocator_untyped(::std::shared_ptr ptr) + : pimpl(mv(ptr)) + {} + + template + block_allocator_untyped(ctx_t& ctx, ::std::shared_ptr ptr) + : pimpl(mv(ptr)) + { + ctx.attach_allocator(pimpl); + } + + void* allocate(backend_ctx_untyped& bctx, const data_place& memory_node, ::std::ptrdiff_t& s, event_list& prereqs) + { + return pimpl->allocate(bctx, memory_node, s, prereqs); + } + + void deallocate(backend_ctx_untyped& bctx, const data_place& memory_node, event_list& prereqs, void* ptr, size_t sz) + { + return pimpl->deallocate(bctx, memory_node, prereqs, ptr, sz); + } + + event_list deinit(backend_ctx_untyped& bctx) + { + return pimpl->deinit(bctx); + } + + ::std::string to_string() const + { + return pimpl->to_string(); + } + + explicit operator bool() const + { + return pimpl != nullptr; + } + +private: + ::std::shared_ptr pimpl; +}; + +/** + * @brief Handle to an allocator that is attached to a context + * + * When a block_allocator is constructed, it is attached to the context passed + * as the first argument. The allocator will be detached when the context is + * finalized (ie. deinit will be called). + */ +template +class block_allocator : public block_allocator_untyped +{ +public: + template + block_allocator(ctx_t& ctx, Args... 
args) + : block_allocator_untyped(ctx, ::std::make_shared(::std::forward(args)...)) + {} + + block_allocator(::std::shared_ptr ptr) + : block_allocator_untyped(mv(ptr)) + {} +}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/allocators/buddy_allocator.cuh b/cudax/include/cuda/experimental/__stf/allocators/buddy_allocator.cuh new file mode 100644 index 00000000000..4978bdb01bc --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/allocators/buddy_allocator.cuh @@ -0,0 +1,366 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** @file + * + * @brief Buddy allocator implementation + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include + +namespace cuda::experimental::stf +{ + +namespace reserved +{ + +/** + * @brief Buddy allocator for one data place + * + * It does not manipulate memory at all, but returns offsets within some memory space of size "size" + * + * We (currently) assume that the size is a power of 2 + */ +class buddy_allocator_metadata +{ +private: + /** + * @brief Describes an available piece of memory + */ + struct avail_block + { + // Copy prereqs + avail_block(size_t index_, event_list prereqs_) + : index(index_) + , prereqs(mv(prereqs_)) + {} + + size_t index = 0; // location of the block + event_list prereqs; // dependencies to use that block + }; + +public: + buddy_allocator_metadata(size_t size, event_list init_prereqs) + { + total_size_ = next_power_of_two(size); + assert(total_size_ == size); + + max_level_ = int_log2(total_size_); + + free_lists_.resize(max_level_ + 1); + + // Initially, the whole memory is free, but depends on init_prereqs + free_lists_[max_level_].emplace_back(0, init_prereqs); + } + + ::std::ptrdiff_t allocate(size_t size, event_list& prereqs) + { + size = next_power_of_two(size); + size_t level = int_log2(size); + + if (level > max_level_) + { + fprintf(stderr, "Level %zu > max level %zu\n", level, max_level_); + return -1; + } + + ::std::ptrdiff_t alloc_index = find_free_block(level, prereqs); + if (alloc_index == -1) + { + fprintf(stderr, "No free block available for size %zu\n", size); + return -1; + } + + return alloc_index; + } + + void deallocate(::std::ptrdiff_t index, size_t size, event_list& prereqs) + { + size = next_power_of_two(size); + size_t level = int_log2(size); + + // Deallocated blocks will depend on these, and we will merge the + // previous dependencies when merging buddies + event_list block_prereqs(prereqs); + + while (level < max_level_) + { + size_t buddy_index = get_buddy_index(index, level); + auto& buddy_list = free_lists_[level]; + auto it = ::std::find_if(buddy_list.begin(), buddy_list.end(), [buddy_index](const avail_block& block) { + return block.index == buddy_index; + }); + + if (it == buddy_list.end()) + { + // No buddy available to merge, stop here + 
break; + } + // Merge with buddy + block_prereqs.merge(it->prereqs); + + buddy_list.erase(it); + index = ::std::min(index, ::std::ptrdiff_t(buddy_index)); + level++; + } + + free_lists_[level].emplace_back(index, prereqs); + } + + void deinit(event_list& prereqs) + { + for (auto& level : free_lists_) + { + for (auto& block : level) + { + prereqs.merge(block.prereqs); + block.prereqs.clear(); + } + } + } + + void debug_print() const + { + size_t power = 1; + for (size_t i = 0; i <= max_level_; ++i, power *= 2) + { + if (!free_lists_[i].empty()) + { + fprintf(stderr, "Level %zu : %s bytes : ", i, pretty_print_bytes(power).c_str()); + for (const auto& b : free_lists_[i]) + { + fprintf(stderr, "[%zu, %zu[ ", b.index, b.index + power); + } + fprintf(stderr, "\n"); + } + } + } + +private: + static size_t next_power_of_two(size_t size) + { + if (size == 0) + { + return 1; + } + size_t power = 1; + while (power < size) + { + power *= 2; + } + return power; + } + + static size_t int_log2(size_t n) + { + assert(n > 0); + + size_t log = 0; + while (n >>= 1) + { // Right shift until n becomes 0 + log++; + } + return log; + } + + ::std::ptrdiff_t find_free_block(size_t level, event_list& prereqs) + { + for (size_t current_level : each(level, max_level_ + 1)) + { + if (free_lists_[current_level].empty()) + { + continue; + } + auto& b = free_lists_[current_level].back(); + size_t block_index = b.index; + event_list b_prereqs = mv(b.prereqs); + free_lists_[current_level].pop_back(); + + // Dependencies to reuse that block + prereqs.merge(b_prereqs); + + // If we are not at the requested level, split blocks + while (current_level > level) + { + current_level--; + size_t buddy_index = block_index + (1ull << current_level); + // split blocks depend on the previous dependencies of the whole unsplit block + free_lists_[current_level].emplace_back(buddy_index, b_prereqs); + } + return block_index; + } + + return -1; // No block available + } + + size_t get_buddy_index(size_t index, size_t level) + { + assert(level <= 63); + return index ^ (1ull << level); // XOR to find the buddy block + } + + ::std::vector<::std::vector> free_lists_; + size_t total_size_ = 0; + size_t max_level_ = 0; +}; + +} // end namespace reserved + +/** + * @brief Buddy allocator policy which relies on a root allocator to create + * large buffers which are then suballocated with a buddy allocation algorithm + */ +class buddy_allocator : public block_allocator_interface +{ +public: + buddy_allocator() = default; + buddy_allocator(block_allocator_untyped root_allocator_) + : root_allocator(mv(root_allocator_)) + {} + +private: + // Per data place buffer and its corresponding metadata + struct per_place + { + per_place(void* base_, size_t size, event_list& prereqs) + : base(base_) + , buffer_size(size) + { + metadata = ::std::make_shared(buffer_size, prereqs); + } + + void* base = nullptr; + size_t buffer_size = 0; + ::std::shared_ptr metadata; + }; + +public: + void* + allocate(backend_ctx_untyped& ctx, const data_place& memory_node, ::std::ptrdiff_t& s, event_list& prereqs) override + { + auto it = map.find(memory_node); + if (it == map.end()) + { + // There is currently no buffer associated to this place, create one lazily + // 1. create memory on that place + ::std::ptrdiff_t sz = 128 * 1024 * 1024; // TODO + auto& a = root_allocator ? root_allocator : ctx.get_uncached_allocator(); + void* base = a.allocate(ctx, memory_node, sz, prereqs); + + // 2. creates meta data for that buffer, and 3. 
associate it to the data place + it = map.emplace(memory_node, ::std::make_shared(base, sz, prereqs)).first; + } + + // There should be exactly one entry in the map + assert(map.count(memory_node) == 1); + auto& m = it->second; + + ::std::ptrdiff_t offset = m->metadata->allocate(s, prereqs); + assert(offset != -1); + return static_cast(m->base) + offset; + } + + void + deallocate(backend_ctx_untyped&, const data_place& memory_node, event_list& prereqs, void* ptr, size_t sz) override + { + // There should be exactly one entry in the map + assert(map.count(memory_node) == 1); + auto& m = map[memory_node]; + + size_t offset = static_cast(ptr) - static_cast(m->base); + + m->metadata->deallocate(offset, sz, prereqs); + } + + event_list deinit(backend_ctx_untyped& ctx) override + { + event_list result; + // For every place in the map + for (auto& [memory_node, pp] : map) + { + event_list local_prereqs; + + // Deinitialize the metadata of the buddy allocator for this place + pp->metadata->deinit(local_prereqs); + + // Deallocate the underlying buffer for this buddy allocator + auto& a = root_allocator ? root_allocator : ctx.get_uncached_allocator(); + a.deallocate(ctx, memory_node, local_prereqs, pp->base, pp->buffer_size); + + result.merge(local_prereqs); + } + return result; + } + + ::std::string to_string() const override + { + return "buddy allocator"; + } + +private: + ::std::unordered_map, hash> map; + + block_allocator_untyped root_allocator; +}; + +#ifdef UNITTESTED_FILE + +UNITTEST("buddy_allocator is movable") +{ + static_assert(std::is_move_constructible::value, "buddy_allocator must be move constructible"); + static_assert(std::is_move_assignable::value, "buddy_allocator must be move assignable"); +}; + +UNITTEST("buddy allocator meta data") +{ + event_list prereqs; // starts empty + + reserved::buddy_allocator_metadata allocator(1024, prereqs); + + // ::std::cout << "Initial state:" << ::std::endl; + // allocator.debug_print(); + + event_list dummy; + + ::std::ptrdiff_t ptr1 = allocator.allocate(200, dummy); // Allocate 200 bytes + // ::std::cout << "\nAfter allocating 200 bytes:" << ::std::endl; + // allocator.debug_print(); + + ::std::ptrdiff_t ptr2 = allocator.allocate(300, dummy); // Allocate 300 bytes + // ::std::cout << "\nAfter allocating 300 bytes:" << ::std::endl; + // allocator.debug_print(); + + allocator.deallocate(ptr1, 200, dummy); // Free the 200 bytes + // ::std::cout << "\nAfter freeing 200 bytes:" << ::std::endl; + // allocator.debug_print(); + + allocator.deallocate(ptr2, 300, dummy); // Free the 300 bytes + // ::std::cout << "\nAfter freeing 300 bytes:" << ::std::endl; + // allocator.debug_print(); +}; + +#endif // UNITTESTED_FILE + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/allocators/cached_allocator.cuh b/cudax/include/cuda/experimental/__stf/allocators/cached_allocator.cuh new file mode 100644 index 00000000000..8031707dfb5 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/allocators/cached_allocator.cuh @@ -0,0 +1,183 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** @file + * + * @brief Cached allocation strategy to reuse previously free'd buffers + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief Cached block allocator that implements block_allocator_interface. + * + * This allocator uses an internal cache to reuse previously allocated memory blocks. + * The aim is to reduce the overhead of memory allocation. + */ +class cached_block_allocator : public block_allocator_interface +{ +public: + /** + * @brief This constructor takes a root allocator which performs + * allocations when there is a cache miss + */ + cached_block_allocator(block_allocator_untyped root_allocator_) + : root_allocator(mv(root_allocator_)) + {} + + /** + * @brief Allocates a memory block. + * + * Attempts to allocate a memory block from the internal cache if available. + * + * @param ctx The backend context (unused in this implementation). + * @param memory_node Memory location where the allocation should happen. + * @param s Pointer to the size of memory required. + * @param prereqs List of events that this allocation depends on. + * @return void* A pointer to the allocated memory block or nullptr if allocation fails. + */ + void* + allocate(backend_ctx_untyped& ctx, const data_place& memory_node, ::std::ptrdiff_t& s, event_list& prereqs) override + { + EXPECT(s > 0); + + ::std::lock_guard<::std::mutex> g(allocator_mutex); + if (auto it = free_cache.find(memory_node); it != free_cache.end()) + { + per_place_map_t& m = it->second; + if (auto it2 = m.find(s); it2 != m.end()) + { + alloc_cache_entry& e = it2->second; + prereqs.merge(mv(e.prereq)); + void* result = e.ptr; + m.erase(it2); + return result; + } + } + + // That is a miss, we need to allocate data using the root allocator + return root_allocator.allocate(ctx, memory_node, s, prereqs); + } + + /** + * @brief Deallocates a memory block. + * + * Puts the deallocated block back into the internal cache for future reuse. + * + * @param ctx The backend context (unused in this implementation). + * @param memory_node Memory location where the deallocation should happen. + * @param prereqs List of events that this deallocation depends on. + * @param ptr Pointer to the memory block to be deallocated. + * @param sz Size of the memory block. + */ + void + deallocate(backend_ctx_untyped&, const data_place& memory_node, event_list& prereqs, void* ptr, size_t sz) override + { + ::std::lock_guard<::std::mutex> g(allocator_mutex); + // We do not call the deallocate method of the root allocator, we discard buffers instead + free_cache[memory_node].emplace(sz, alloc_cache_entry{ptr, prereqs}); + } + + /** + * @brief De-initializes the allocator. 
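+   *
+   * All blocks still held in the cache are handed back to the root allocator,
+   * and the events produced by these deallocations are merged into the
+   * returned list.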
+ * + */ + event_list deinit(backend_ctx_untyped& ctx) override + { + ::std::unordered_map> free_cache_janitor; + { + ::std::lock_guard<::std::mutex> g(allocator_mutex); + free_cache.swap(free_cache_janitor); + } + + event_list result; + for (auto& entry : free_cache_janitor) + { + auto& where = entry.first; + per_place_map_t& m = entry.second; + for (auto& entry2 : m) + { + size_t sz = entry2.first; + alloc_cache_entry& ace = entry2.second; + + root_allocator.deallocate(ctx, where, ace.prereq, ace.ptr, sz); + + // Move all events + result.merge(mv(ace.prereq)); + } + } + return result; + } + + /** + * @brief Returns a string representation of the allocator. + * + * @return std::string The string "cached_block_allocator". + */ + ::std::string to_string() const override + { + return "cached_block_allocator"; + } + + /** + * @brief Prints additional information about the allocator. + * + * This function currently prints no additional information. + */ + void print_info() const override + { + const auto s = to_string(); + fprintf(stderr, "No additional info for allocator of kind \"%.*s\".\n", static_cast(s.size()), s.data()); + } + +protected: + /** + * @brief Underlying root allocator for base buffers + */ + block_allocator_untyped root_allocator; + + /** + * @brief Struct representing an entry in the cache. + * + * Holds the address of the allocated memory and a list of prerequisites for its reuse. + */ + struct alloc_cache_entry + { + void* ptr; ///< Pointer to the allocated memory block. + event_list prereq; ///< Prerequisites for reusing this block. + }; + + /// Maps sizes to cache entries for a given data_place. + using per_place_map_t = ::std::unordered_multimap; + + /// Top-level cache map mapping data_place to per_place_map_t. + ::std::unordered_map> free_cache; + + ::std::mutex allocator_mutex; +}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/allocators/pooled_allocator.cuh b/cudax/include/cuda/experimental/__stf/allocators/pooled_allocator.cuh new file mode 100644 index 00000000000..9db361cd201 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/allocators/pooled_allocator.cuh @@ -0,0 +1,530 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** @file + * + * @brief Allocation strategy which suballocate entries from large pieces of memory + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +namespace cuda::experimental::stf +{ + +namespace reserved +{ + +class block_data_pool_set; + +/** + * @brief A set of blocks (of the same size) of predetermined size located on a + * specific data place. It is possible to have multiple of these per data + * place. 
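+ *
+ * The pool carves a single buffer obtained from the root allocator into
+ * fixed-size blocks: get_entry() hands out a free block together with the
+ * events that must complete before it may be reused, and release_entry()
+ * returns a block to the pool along with the events of its last users.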
+ */ +class block_data_pool +{ + /// One block that can be used for an allocation + struct metadata + { + event_list prereqs; + bool used = false; + }; + +public: + block_data_pool( + backend_ctx_untyped& ctx, + block_allocator_untyped root_allocator_, + event_list& prereqs, + data_place place, + const size_t block_size, + size_t nentries, + double max_ratio) + : place(mv(place)) + , block_size(block_size) + , root_allocator(mv(root_allocator_)) + { + if (this->place == data_place::host || this->place == data_place::managed) + { + /* Pinned memory is not cheap, so we currently only allocate 4 blocks (arbitrarily) */ + nentries = 4; + } + else + { + const int dev = device_ordinal(this->place); + assert(dev >= 0); + cudaDeviceProp prop; + cuda_safe_call(cudaGetDeviceProperties(&prop, dev)); + + size_t max_mem = prop.totalGlobalMem; + + // We cap the memory at a certain fraction of total memory + if (getenv("USER_ALLOC_POOLS_MEM_CAP")) + { + max_ratio = atof(getenv("USER_ALLOC_POOLS_MEM_CAP")); + } + + if (nentries * block_size > (size_t) (max_ratio * max_mem)) + { + nentries = size_t((max_ratio * max_mem) / block_size); + fprintf(stderr, + "Capping pool size at %f %%: nentries = %zu of %s...\n", + max_ratio * 100.0, + nentries, + pretty_print_bytes(block_size).c_str()); + } + } + + ::std::ptrdiff_t sz = nentries * block_size; + base = root_allocator.allocate(ctx, place, sz, prereqs); + assert(sz > 0); + assert(base); + + allocated_size = sz; + + entries.resize(nentries); + for (auto& e : entries) + { + // Every entry depends on the allocation of the underlying buffer + e.prereqs.merge(prereqs); + } + + available_cnt = nentries; + } + + /// Find one entry that is not used, sets the input/output event + /// dependencies, and give a pointer to the corresponding chunk of memory + void* get_entry(event_list& prereqs) + { + metadata* e = find_avail_pool_entry(); + + if (!e) + { + assert(available_cnt == 0); + return nullptr; + } + + // We do not rely on a specific stream here so we do not join but + // simply use existing deps to avoid useless hw dependencies + prereqs.merge(e->prereqs); + + // This list will be used again when deallocating the entry, but we + // have transferred existing dependencies to the allocate method + e->prereqs.clear(); + + assert(available_cnt > 0); + available_cnt--; + + // Compute and return the pointer to data + const ::std::ptrdiff_t offset = e - &entries[0]; + assert(offset >= 0 && offset < ::std::ptrdiff_t(entries.size())); + return static_cast(base) + offset * block_size; + } + + void release_entry(const event_list& prereqs, void* ptr) + { + // Find the index of the entry given the offset between ptr and base addresses + const ptrdiff_t offset = static_cast(ptr) - static_cast(base); + EXPECT(offset >= 0); + const size_t index = size_t(offset) / block_size; + auto& e = entries[index]; + + EXPECT(index < entries.size()); + EXPECT(e.used); + EXPECT(e.prereqs.size() == 0); + + // To reuse this buffer, we need to wait for all users to finish using it + e.prereqs.merge(prereqs); + e.used = false; + + available_cnt++; + + // Set the last found to this, it will be next available + // last_found = index; + } + + // This indicates whether a pointer corresponds to an entry in this pool + bool is_inside(void* ptr) const + { + const ptrdiff_t offset = static_cast(ptr) - static_cast(base); + // Add an assertion that offset % block_size == 0 ? 
+ return (offset >= 0 && offset < static_cast(entries.size() * block_size)); + } + + // Free the memory allocated for this block of allocations + void unpopulate(backend_ctx_untyped& ctx, event_list& prereqs) + { + root_allocator.deallocate(ctx, place, prereqs, base, allocated_size); + } + +private: + // Find one entry that is not used + metadata* find_avail_pool_entry() + { + for (auto i : each(0, entries.size())) + { + // Start from the item after the last found : the rationale is that + // we want to avoid using a buffer that was recently discarded to + // prevent implicit dependencies between presumably independant + // tasks using the same memory buffers + size_t j = 1 + last_found + i; + if (j >= entries.size()) + { + j -= entries.size(); + } + assert(j < entries.size()); + if (!entries[j].used) + { + last_found = j; + entries[j].used = true; + return &entries[j]; + } + } + return nullptr; + } + + // Cache the last entry found + size_t last_found = 0; + + // Number of unallocated blocks + size_t available_cnt = 0; + + // Start of memory hunk + void* base = nullptr; + size_t allocated_size = 0; + + // Memory belongs here + data_place place; + // Each block in the hunk has this size + size_t block_size = 0; + // One entry per block + ::std::vector entries; + + // The allocator used to create the base blocks + block_allocator_untyped root_allocator; + + // To enable access to entries + friend class block_data_pool_set; +}; + +} // end namespace reserved + +/** + * @brief Optional configurations to tune pooled allocators + */ +struct pooled_allocator_config +{ + // Maximum number of allocations per data place + ::std::optional max_entries_per_place; + + // Maximum amount of memory allocated per data place (as a ratio with the + // total amount of memory) + ::std::optional max_ratio; + + // Maximum number of bytes allocated per data place + ::std::optional max_footprint_per_place; + + size_t get_max_entries_per_place() const + { + return max_entries_per_place.has_value() ? max_entries_per_place.value() : ::std::numeric_limits::max(); + }; + + double get_max_ratio() const + { + return max_ratio.has_value() ? 
max_ratio.value() : 0.9; + }; +}; + +namespace reserved +{ + +/// This class implements a set of blocks of allocations +class block_data_pool_set +{ +public: + block_data_pool_set() = default; + // Disable copy constructor and assignment operator + block_data_pool_set(const block_data_pool_set&) = delete; + block_data_pool_set& operator=(const block_data_pool_set&) = delete; + + void* get_pool_entry(backend_ctx_untyped& ctx, + block_allocator_untyped& root_allocator, + const data_place& memory_node, + size_t block_size, + event_list& prereqs) + { + // Get the pool or create it lazily + block_data_pool* pool = get_pool(ctx, root_allocator, prereqs, memory_node, block_size); + if (!pool) + { + return nullptr; + } + // Select an entry from it + return pool->get_entry(prereqs); + } + + void release_pool_entry(const data_place& memory_node, size_t block_size, const event_list& prereqs, void* ptr) + { + const size_t padded_sz = next_power_of_two(block_size); + const auto key = ::std::make_pair(to_index(memory_node), padded_sz); + + auto range = map.equal_range(key); + + /* Try to find an existing allocation that has at least an available slot */ + for (auto it = range.first; it != range.second; ++it) + { + if (it->second.is_inside(ptr)) + { + it->second.release_entry(prereqs, ptr); + return; + } + } + + // Should not be reached + fprintf(stderr, "Error: pointer %p was released, but does not belong to a known pool.\n", ptr); + abort(); + } + + event_list deinit_pools(backend_ctx_untyped& ctx) + { + event_list res; + for (auto& entry : map) + { + event_list per_place_res; + + block_data_pool& p = entry.second; + // fprintf(stderr, "UNPOPULATE pool %p for size %zu on node %zu\n", &entry.second, entry.first.first, + // entry.first.second); + for (auto& m : p.entries) + { + per_place_res.merge(mv(m.prereqs)); + } + + p.unpopulate(ctx, per_place_res); + + res.merge(per_place_res); + } + + return res; + } + + void set_max_ratio(double r) + { + assert(r >= 0.0 && r <= 1.0); + config.max_ratio = r; + } + + void set_pool_size(size_t s) + { + assert(s > 0); + config.max_entries_per_place = s; + } + + void set_config(pooled_allocator_config _config) + { + config = mv(_config); + } + +private: + /* Compute the power of 2 next to n (inclusive) */ + static size_t next_power_of_two(size_t n) + { + if (n <= 1) + { + return 1; + } + + // Multiply by 2 until we reach a number greater or equal than n + size_t powerOfTwo = 2; + while (powerOfTwo < n) + { + powerOfTwo *= 2; + } + + return powerOfTwo; + } + + block_data_pool* get_pool( + backend_ctx_untyped& ctx, + block_allocator_untyped& root_allocator, + event_list& prereqs, + const data_place& memory_node, + size_t sz) + { + const size_t padded_sz = next_power_of_two(sz); + const auto key = ::std::make_pair(to_index(memory_node), padded_sz); + + /* Try to find an existing allocation that has at least an available slot */ + auto range = map.equal_range(key); + for (auto it : each(range.first, range.second)) + { + if (it->second.available_cnt > 0) + { + return &it->second; + } + } + + /* There is no entry with an available slot so we need a new one */ + // Default number of entries per block of data + return &map + .emplace(key, + block_data_pool( + ctx, + root_allocator, + prereqs, + memory_node, + padded_sz, + config.get_max_entries_per_place(), + config.get_max_ratio())) + ->second; + } + + // For each memory node, a map of size_t to block_data_pool + ::std::unordered_multimap<::std::pair, + block_data_pool, + ::cuda::experimental::stf::hash<::std::pair>> + map; 
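+  // The key combines the data place and the padded (power-of-two) block size;
+  // several pools may coexist for the same key once a pool has no free entry left.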
+ +private: + pooled_allocator_config config = { + .max_entries_per_place = 1024, .max_ratio = 0.2, .max_footprint_per_place = ::std::nullopt}; +}; + +} // end namespace reserved + +/** + * @brief A pooled allocation strategy, where each memory node has a map, each + * map associates a size_t with a pool of blocks of this size. + * + * This non static version uses different pools for the different instances of the allocator + */ +class pooled_allocator : public block_allocator_interface +{ +public: + // TODO pass two with std::optional or a structure pool_allocator_config with optionals ? + pooled_allocator(block_allocator_untyped root_allocator_, double max_ratio = 0.2) + : root_allocator(mv(root_allocator_)) + { + pool_set.set_max_ratio(max_ratio); + } + pooled_allocator(double max_ratio = 0.2) + { + pool_set.set_max_ratio(max_ratio); + } + + pooled_allocator(pooled_allocator_config config) + { + pool_set.set_config(mv(config)); + }; + + void* + allocate(backend_ctx_untyped& ctx, const data_place& memory_node, ::std::ptrdiff_t& s, event_list& prereqs) override + { + auto* res = pool_set.get_pool_entry( + ctx, root_allocator ? root_allocator : ctx.get_uncached_allocator(), memory_node, s, prereqs); + if (res == nullptr) + { + s = -s; + } + return res; + } + + void deallocate( + backend_ctx_untyped&, const data_place& memory_node, event_list& prereqs, void* ptr, size_t block_size) override + { + pool_set.release_pool_entry(memory_node, block_size, prereqs, ptr); + } + + event_list deinit(backend_ctx_untyped& ctx) override + { + return pool_set.deinit_pools(ctx); + } + + ::std::string to_string() const override + { + return "pooled"; + } + +private: + reserved::block_data_pool_set pool_set; + block_allocator_untyped root_allocator; +}; + +/** + * @brief An allocator where all (preallocated) blocks have the same size + * + */ +class fixed_size_allocator : public block_allocator_interface +{ +public: + fixed_size_allocator(size_t block_size) + : block_size(block_size) + {} + + fixed_size_allocator(size_t block_size, pooled_allocator_config config) + : block_size(block_size) + { + pool_set.set_config(mv(config)); + } + + fixed_size_allocator() = delete; + + void* + allocate(backend_ctx_untyped& ctx, const data_place& memory_node, ::std::ptrdiff_t& s, event_list& prereqs) override + { + EXPECT(s <= block_size); + auto* res = pool_set.get_pool_entry(ctx, ctx.get_uncached_allocator(), memory_node, block_size, prereqs); + if (res == nullptr) + { + s = -s; + } + return res; + } + + void + deallocate(backend_ctx_untyped&, const data_place& memory_node, event_list& prereqs, void* ptr, size_t sz) override + { + EXPECT(sz <= block_size); + pool_set.release_pool_entry(memory_node, block_size, prereqs, ptr); + } + + event_list deinit(backend_ctx_untyped& ctx) override + { + return pool_set.deinit_pools(ctx); + } + + ::std::string to_string() const override + { + return "fixed size "; + } + +private: + reserved::block_data_pool_set pool_set; + + // The (fixed) size of all blocks + const size_t block_size; +}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/allocators/uncached_allocator.cuh b/cudax/include/cuda/experimental/__stf/allocators/uncached_allocator.cuh new file mode 100644 index 00000000000..f318e499a85 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/allocators/uncached_allocator.cuh @@ -0,0 +1,75 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// 
under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** @file + * + * @brief Uncached allocation strategy where every allocation/deallocation results in a direct API call + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief Uncached block allocator where allocations and deallocations are + * directly performed as CUDA API calls (cudaMallocAsync, ...) + * + * This is actually a wrapper on top of the uncached allocator automatically + * created within each context backend, so deinit is a no-op. The intent of + * this wrapper is to make it possible to create an allocator policy using this + * internal allocator. + */ +class uncached_block_allocator : public block_allocator_interface +{ + void* + allocate(backend_ctx_untyped& ctx, const data_place& memory_node, ::std::ptrdiff_t& s, event_list& prereqs) override + { + auto& uncached = ctx.get_uncached_allocator(); + return uncached.allocate(ctx, memory_node, s, prereqs); + } + + void deallocate( + backend_ctx_untyped& ctx, const data_place& memory_node, event_list& prereqs, void* ptr, size_t sz) override + { + auto& uncached = ctx.get_uncached_allocator(); + uncached.deallocate(ctx, memory_node, prereqs, ptr, sz); + } + + /* Nothing is done here because this is just a wrapper on top of some + * existing allocator automatically created during backend initialization + * */ + event_list deinit(backend_ctx_untyped&) override + { + return event_list(); + } + + ::std::string to_string() const override + { + return "uncached block allocator (wrapper)"; + } +}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh new file mode 100644 index 00000000000..cbbd32a2c8e --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh @@ -0,0 +1,1034 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Implements the graph_ctx backend that executes tasks using the CUDA graph API + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include // for unit test! 
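+
+// A minimal usage sketch (illustration only, mirroring the unit tests at the
+// end of this file): work submitted through the context is recorded into a
+// CUDA graph, which is instantiated and launched when the context is
+// finalized.
+//
+//   graph_ctx ctx;
+//   double A[64];
+//   auto lA = ctx.logical_data(A);
+//   ctx.parallel_for(blocked_partition(), exec_place::current_device(), lA.shape(), lA.rw())
+//       ->*[] _CCCL_HOST_DEVICE(size_t i, slice<double> A) {
+//             A(i) = 2.0 * i;
+//           };
+//   ctx.finalize();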
+ +namespace cuda::experimental::stf +{ + +namespace reserved +{ + +// For counters +class graph_tag +{ +public: + class launch + {}; + class instantiate + {}; + class update + { + public: + class success + {}; + class failure + {}; + }; +}; + +} // end namespace reserved + +/** + * @brief Uncached allocator (used as a base for other allocators) + * + * Any allocation/deallocation results in an actual underlying CUDA API call + * (e.g. `cudaGraphAddMemAllocNode`). This allocator should generally not be used + * directly, but used within an allocator with a more advanced strategy + * (caching, heap allocation, ...) + */ +class uncached_graph_allocator : public block_allocator_interface +{ +public: + uncached_graph_allocator() = default; + + void* + allocate(backend_ctx_untyped& bctx, const data_place& memory_node, ::std::ptrdiff_t& s, event_list& prereqs) override + { + // This is not implemented yet + EXPECT(!memory_node.is_composite(), "Composite data places are not implemented yet."); + + void* result = nullptr; + + const size_t graph_epoch = bctx.epoch(); + const cudaGraph_t graph = bctx.graph(); + const ::std::vector nodes = reserved::join_with_graph_nodes(prereqs, graph_epoch); + cudaGraphNode_t out = nullptr; + + if (memory_node == data_place::host) + { + cuda_try(cudaMallocHost(&result, s)); + SCOPE(fail) + { + cudaFreeHost(&result); + }; + cuda_try(cudaGraphAddEmptyNode(&out, graph, nodes.data(), nodes.size())); + } + else + { + if (getenv("USE_CUDA_MALLOC")) + { + cuda_safe_call(cudaGraphAddEmptyNode(&out, graph, nodes.data(), nodes.size())); + cuda_try(cudaMalloc(&result, s)); + } + else + { + result = add_mem_alloc_node(memory_node, out, graph, s, nodes); + } + assert(s > 0); + } + + reserved::fork_from_graph_node(bctx, out, graph_epoch, prereqs, "alloc"); + return result; + } + + void deallocate( + backend_ctx_untyped& bctx, const data_place& memory_node, event_list& prereqs, void* ptr, size_t /*sz*/) override + { + const cudaGraph_t graph = bctx.graph(); + const size_t graph_epoch = bctx.epoch(); + cudaGraphNode_t out = nullptr; + const ::std::vector nodes = reserved::join_with_graph_nodes(prereqs, graph_epoch); + if (memory_node == data_place::host) + { + // fprintf(stderr, "TODO deallocate host memory (graph_ctx)\n"); + cuda_safe_call(cudaGraphAddEmptyNode(&out, graph, nodes.data(), nodes.size())); + } + else + { + cuda_safe_call(cudaGraphAddMemFreeNode(&out, graph, nodes.data(), nodes.size(), ptr)); + } + reserved::fork_from_graph_node(bctx, out, graph_epoch, prereqs, "dealloc"); + } + + ::std::string to_string() const override + { + return "uncached graph allocator"; + } + + // Nothing is done as all deallocation are done immediately + event_list deinit(backend_ctx_untyped& /* ctx */) override + { + return event_list(); + } + +private: + void* add_mem_alloc_node( + const data_place& memory_node, + cudaGraphNode_t& out, + cudaGraph_t graph, + size_t s, + const ::std::vector& input_nodes) + { + static const int ndevices = cuda_try(); + // We need to declare who may access this buffer + ::std::vector desc(ndevices); + for (int peer : each(0, ndevices)) + { + desc[peer].location.type = cudaMemLocationTypeDevice; + desc[peer].location.id = peer; + desc[peer].flags = cudaMemAccessFlagsProtReadWrite; + } + + // Parameter structure for cudaGraphAddMemAllocNode - most values are constants. 
+ static cudaMemAllocNodeParams params = [] { + cudaMemAllocNodeParams result; + result.poolProps.allocType = cudaMemAllocationTypePinned; + result.poolProps.handleTypes = cudaMemHandleTypeNone; + result.poolProps.location = {.type = cudaMemLocationTypeDevice, .id = 0}; + result.poolProps.win32SecurityAttributes = nullptr; +#if CUDA_VERSION >= 12030 + result.poolProps.maxSize = 0; +#endif + result.accessDescs = nullptr; + result.accessDescCount = 0; + result.bytesize = 0; + result.dptr = nullptr; + return result; + }(); + + // Set only the variable parameters + params.poolProps.location.id = device_ordinal(memory_node); + params.accessDescs = desc.data(); + params.accessDescCount = size_t(ndevices); + params.bytesize = size_t(s); + + cuda_safe_call(cudaGraphAddMemAllocNode(&out, graph, input_nodes.data(), input_nodes.size(), ¶ms)); + + return params.dptr; + } +}; + +/** + * @brief A graph context, which is a CUDA graph that we can automatically built using tasks. + * + */ +class graph_ctx : public backend_ctx +{ + class impl : public backend_ctx::impl + { + public: + impl(async_resources_handle _async_resources = async_resources_handle(nullptr)) + : backend_ctx::impl(mv(_async_resources)) + , _graph(shared_cuda_graph()) + { + reserved::backend_ctx_setup_allocators(*this); + } + + // Note that graph contexts with an explicit graph passed by the user cannot use epochs + impl(cudaGraph_t g) + : _graph(wrap_cuda_graph(g)) + , explicit_graph(true) + { + reserved::backend_ctx_setup_allocators(*this); + } + + ~impl() override {} + + // Due to circular dependencies, we need to define it here, and not in backend_ctx_untyped + void update_uncached_allocator(block_allocator_untyped custom) override + { + reserved::backend_ctx_update_uncached_allocator(*this, mv(custom)); + } + + cudaGraph_t graph() const override + { + assert(_graph.get()); + return *_graph; + } + + size_t epoch() const override + { + return graph_epoch; + } + + /* Store a vector of previously instantiated graphs, with the number of + * nodes, number of edges, the executable graph, and the corresponding epoch. + * */ + ::std::vector<::std::tuple, size_t>> previous_exec_graphs; + cudaStream_t submitted_stream = nullptr; // stream used in submit + ::std::shared_ptr exec_graph = nullptr; + ::std::shared_ptr _graph = nullptr; + size_t graph_epoch = 0; + bool submitted = false; // did we submit ? + mutable bool explicit_graph = false; + + /* By default, the finalize operation is blocking, unless user provided + * a stream when creating the context */ + bool blocking_finalize = true; + }; + +public: + using task_type = graph_task<>; + + /** + * @brief Definition for the underlying implementation of `data_interface` + * + * @tparam T + */ + template + using data_interface = typename graphed_interface_of::type; + + /// @brief This type is copyable, assignable, and movable. Howeever, copies have reference semantics. + ///@{ + graph_ctx(async_resources_handle handle = async_resources_handle(nullptr)) + : backend_ctx(::std::make_shared(mv(handle))) + {} + + graph_ctx(cudaStream_t user_stream, async_resources_handle handle = async_resources_handle(nullptr)) + : backend_ctx(::std::make_shared(mv(handle))) + { + auto& state = this->state(); + + // Ensure that we use this stream to launch the graph(s) + state.submitted_stream = user_stream; + + state.blocking_finalize = false; + } + + /// @brief Constructor taking a user-provided graph. User code is not supposed to destroy the graph later. 
+ graph_ctx(cudaGraph_t g) + : backend_ctx(::std::make_shared(g)) + {} + ///@} + + ::std::string to_string() const + { + return "graph backend context"; + } + + using backend_ctx::task; + + /** + * @brief Creates a typed task on the specified execution place + */ + template + auto task(exec_place e_place, task_dep... deps) + { + auto dump_hooks = reserved::get_dump_hooks(this, deps...); + auto res = graph_task(*this, get_graph(), get_graph_epoch(), mv(e_place), mv(deps)...); + res.add_post_submission_hook(dump_hooks); + return res; + } + + // submit a new epoch : this will submit a graph in a stream that we return + cudaStream_t task_fence() + { + change_epoch(); + // Get the stream used to submit the graph of the epoch we have + // launched when changing the epoch + return this->state().submitted_stream; + } + + void finalize() + { + assert(get_phase() < backend_ctx_untyped::phase::finalized); + auto& state = this->state(); + if (!state.submitted) + { + // Not submitted yet + submit(); + } + assert(state.submitted_stream); + if (state.blocking_finalize) + { + cuda_try(cudaStreamSynchronize(state.submitted_stream)); + } + state.submitted_stream = nullptr; + state.cleanup(); + set_phase(backend_ctx_untyped::phase::finalized); + +#ifdef CUDASTF_DEBUG + const char* display_stats_env = getenv("CUDASTF_DISPLAY_STATS"); + if (!display_stats_env || atoi(display_stats_env) == 0) + { + return; + } + + fprintf( + stderr, "[STATS CUDA GRAPHS] instantiated=%lu\n", reserved::counter.load()); + fprintf(stderr, "[STATS CUDA GRAPHS] launched=%lu\n", reserved::counter.load()); + fprintf(stderr, + "[STATS CUDA GRAPHS] updated=%lu success=%ld failed=%ld\n", + reserved::counter.load(), + reserved::counter.load(), + reserved::counter.load()); +#endif + } + + void submit(cudaStream_t stream = nullptr) + { + assert(get_phase() < backend_ctx_untyped::phase::submitted); + auto& state = this->state(); + if (!state.submitted_stream) + { + if (stream) + { + // We could submit just the last iteration in that specific stream, and synchronize the user-provided + // stream with this one + EXPECT(state.graph_epoch == 0, "Cannot call submit with an explicit stream and change epoch."); + state.submitted_stream = stream; + } + else if (!state.submitted_stream) + { + state.submitted_stream = pick_stream(); + assert(state.submitted_stream != nullptr); + } + } + + // Only for the latest graph ... Will also do write-back and cleanup + instantiate(); + + // cuda_safe_call(cudaStreamSynchronize(state.submitted_stream)); + + cuda_try(cudaGraphLaunch(*state.exec_graph, state.submitted_stream)); + +#ifdef CUDASTF_DEBUG + reserved::counter.increment(); +#endif + + // Note that we comment this out for now, so that it is possible to use + // the print_to_dot method; but we may perhaps discard this graph to + // some dedicated member variable. + // Ensure nobody tries to use that graph again ... + // state.set_graph(nullptr); + + state.submitted = true; + set_phase(backend_ctx_untyped::phase::submitted); + } + + // Start a new epoch ! 
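+  //
+  // The graph recorded so far is submitted: an executable graph with a
+  // matching topology is updated and reused when possible, otherwise a new
+  // one is instantiated, and it is launched on the context's stream.
+  // Subsequent tasks are recorded into a fresh graph.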
+ void change_epoch() + { + auto& state = this->state(); + + EXPECT(!state.explicit_graph); + + // Put the graph of the current epoch on the vector of previous graphs + submit_one_epoch(*state._graph, state.graph_epoch); + + // We now use a new graph for that epoch + state._graph = shared_cuda_graph(); + + state.graph_epoch++; + + auto& dot = *get_dot(); + if (dot.is_tracing()) + { + dot.change_epoch(); + } + // fprintf(stderr, "Starting epoch %ld : previous graph %p new graph %p\n", state.graph_epoch, prev_graph, + // new_graph); + } + + /// @brief Get the "support" graph associated with the context + ::std::shared_ptr get_shared_graph() const + { + return state()._graph; + } + + cudaGraph_t& get_graph() const + { + assert(state()._graph); + return *(state()._graph); + } + + size_t get_graph_epoch() const + { + return state().graph_epoch; + } + + auto get_exec_graph() const + { + return state().exec_graph; + } + + /* Make sure all pending operations are finished, and release resources */ + ::std::shared_ptr finalize_as_graph() + { + // Write-back data and erase automatically created data instances + get_state().erase_all_logical_data(); + get_state().detach_allocators(*this); + + // Make sure all pending async ops are sync'ed with + task_fence_impl(); + + auto& state = this->state(); + + if (getenv("CUDASTF_DISPLAY_GRAPH_INFO")) + { + display_graph_info(get_graph()); + } + + return state._graph; + } + + // Execute the CUDA graph in the provided stream. + ::std::shared_ptr instantiate() + { + ::std::shared_ptr g = finalize_as_graph(); + + size_t nedges; + size_t nnodes; + + cuda_safe_call(cudaGraphGetNodes(*g, nullptr, &nnodes)); + cuda_safe_call(cudaGraphGetEdges(*g, nullptr, nullptr, &nedges)); + + auto& state = this->state(); + + for (const auto& [nnodes_cached, nedges_cached, prev_e_ptr] : async_resources().get_cached_graphs()) + { + // Skip the update early if needed + if (nnodes_cached != nnodes || nedges_cached != nedges) + { + continue; + } + cudaGraphExec_t& prev_e = *prev_e_ptr; + if (try_updating_executable_graph(prev_e, *g)) + { + // Return the updated graph + state.exec_graph = prev_e_ptr; + return prev_e_ptr; + } + } + + if (getenv("CUDASTF_DUMP_GRAPHS")) + { + static int instantiated_graph = 0; + print_to_dot("instantiated_graph" + ::std::to_string(instantiated_graph++) + ".dot"); + } + + state.exec_graph = graph_instantiate(*g); + + // Save this graph for later use + async_resources().put_graph_in_cache(nnodes, nedges, state.exec_graph); + + return state.exec_graph; + } + + void display_graph_info(cudaGraph_t g) + { + size_t numNodes; + cuda_safe_call(cudaGraphGetNodes(g, nullptr, &numNodes)); + + size_t numEdges; + cuda_safe_call(cudaGraphGetEdges(g, nullptr, nullptr, &numEdges)); + + cuuint64_t mem_attr; + cuda_safe_call(cudaDeviceGetGraphMemAttribute(0, cudaGraphMemAttrUsedMemHigh, &mem_attr)); + + // fprintf(stderr, "INSTANTIATING graph %p with %ld nodes %ld edges - MEM %ld\n", g, numNodes, numEdges, + // mem_attr); + } + + // flags = cudaGraphDebugDotFlagsVerbose to display all available debug information + void print_to_dot(const ::std::string& filename, enum cudaGraphDebugDotFlags flags = cudaGraphDebugDotFlags(0)) + { + cudaGraphDebugDotPrint(get_graph(), filename.c_str(), flags); + } + +private: + impl& state() + { + return dynamic_cast(get_state()); + }; + const impl& state() const + { + return dynamic_cast(get_state()); + }; + + /// @brief Inserts an empty node that depends on all pending operations + cudaStream_t task_fence_impl() + { + auto& state = 
this->state(); + + auto& cs = this->get_stack(); + auto prereq_fence = cs.insert_task_fence(*get_dot()); + + const size_t graph_epoch = state.graph_epoch; + ::std::vector nodes = reserved::join_with_graph_nodes(prereq_fence, graph_epoch); + + // Create an empty graph node + cudaGraphNode_t n; + cuda_safe_call(cudaGraphAddEmptyNode(&n, get_graph(), nodes.data(), nodes.size())); + + reserved::fork_from_graph_node(*this, n, graph_epoch, prereq_fence, "fence"); + + return nullptr; // for conformity with the stream version + } + + // Instantiate a CUDA graph + static ::std::shared_ptr graph_instantiate(cudaGraph_t g) + { + // Custom deleter specifically for cudaGraphExec_t + auto cudaGraphExecDeleter = [](cudaGraphExec_t* pGraphExec) { + cudaGraphExecDestroy(*pGraphExec); + }; + + ::std::shared_ptr res(new cudaGraphExec_t, cudaGraphExecDeleter); + + cuda_try(cudaGraphInstantiateWithFlags(res.get(), g, 0)); + +#ifdef CUDASTF_DEBUG + reserved::counter.increment(); +#endif + + return res; + } + + // Creates a new CUDA graph and wrap it into a shared_ptr + static ::std::shared_ptr shared_cuda_graph() + { + auto cudaGraphDeleter = [](cudaGraph_t* pGraph) { + cudaGraphDestroy(*pGraph); + }; + + ::std::shared_ptr res(new cudaGraph_t, cudaGraphDeleter); + + cuda_try(cudaGraphCreate(res.get(), 0)); + + return res; + } + + // Wrap an existing CUDA graph into a shared_ptr, the destruction of the graph is let to the application + static ::std::shared_ptr wrap_cuda_graph(cudaGraph_t g) + { + // Allocate memory for a new cudaGraph_t and copy the existing graph to it + cudaGraph_t* pGraph = new cudaGraph_t; + *pGraph = g; + + // There is no custom deleter : only the pointer itself will be destroyed + ::std::shared_ptr res(pGraph); + + return res; + } + + cudaStream_t submit_one_epoch(cudaGraph_t g, size_t epoch) + { + auto& state = this->state(); + // TODO rework to make sure we don't push a specific stream later ? 
+ if (!state.submitted_stream) + { + // fprintf(stderr, "Initializing stream support for ctx ...\n"); + state.submitted_stream = pick_stream(); + assert(state.submitted_stream != nullptr); + } + // cuda_safe_call(cudaStreamSynchronize(state.submitted_stream)); + + size_t nedges; + size_t nnodes; + + cuda_safe_call(cudaGraphGetNodes(g, nullptr, &nnodes)); + cuda_safe_call(cudaGraphGetEdges(g, nullptr, nullptr, &nedges)); + + cudaGraphExec_t local_exec_graph = nullptr; + + // fprintf(stderr, "Instantiate epoch %ld\n", epoch); + display_graph_info(g); + +#if 0 + ::std::string name = "epoch" + ::std::to_string(epoch) + ".dot"; + cudaGraphDebugDotPrint(g, name.c_str(), cudaGraphDebugDotFlagsVerbose); +#endif + + bool found = false; + + // Try to reuse previous instantiated graphs + for (auto& [prev_nnodes, prev_nedges, prev_e_ptr, last_epoch] : state.previous_exec_graphs) + { + // Early exit if the topology cannot match + if (prev_nnodes != nnodes || prev_nedges != nedges) + { + continue; + } + cudaGraphExec_t& prev_e = *prev_e_ptr; + if (try_updating_executable_graph(prev_e, g)) + { + local_exec_graph = prev_e; + + // Update epoch in the vector of pairs + last_epoch = epoch; + + found = true; + break; + } + } + + if (!found) + { + // We need to instantiate a new exec_graph + auto e_graph_ptr = graph_instantiate(g); + + // Save for future use + state.previous_exec_graphs.push_back(::std::make_tuple(nnodes, nedges, e_graph_ptr, epoch)); + + local_exec_graph = *e_graph_ptr; + } + + cuda_try(cudaGraphLaunch(local_exec_graph, state.submitted_stream)); + +#ifdef CUDASTF_DEBUG + reserved::counter.increment(); +#endif + + return state.submitted_stream; + } + +public: + // This tries to instantiate the graph by updating an existing executable graph + // the returned value indicates whether the update was successful or not + static bool try_updating_executable_graph(cudaGraphExec_t exec_graph, cudaGraph_t graph) + { +#if CUDA_VERSION < 12000 + cudaGraphNode_t errorNode; + cudaGraphExecUpdateResult updateResult; + cudaGraphExecUpdate(exec_graph, graph, &errorNode, &updateResult); +#else + cudaGraphExecUpdateResultInfo resultInfo; + cudaGraphExecUpdate(exec_graph, graph, &resultInfo); +#endif + + // Be sure to "erase" the last error + cudaError_t res = cudaGetLastError(); + +#ifdef CUDASTF_DEBUG + reserved::counter.increment(); + if (res == cudaSuccess) + { + reserved::counter.increment(); + } + else + { + reserved::counter.increment(); + } +#endif + + return (res == cudaSuccess); + } + + friend inline cudaGraph_t ctx_to_graph(backend_ctx_untyped& ctx) + { + return ctx.graph(); + } + + friend inline size_t ctx_to_graph_epoch(backend_ctx_untyped& ctx) + { + return ctx.epoch(); + } +}; + +#ifdef UNITTESTED_FILE + +UNITTEST("movable graph_ctx") +{ + graph_ctx ctx; + graph_ctx ctx2 = mv(ctx); +}; + +UNITTEST("copyable graph_task<>") +{ + graph_ctx ctx; + graph_task<> t = ctx.task(); + graph_task<> t_cpy = t; +}; + +UNITTEST("copyable graph_ctx") +{ + graph_ctx ctx; + graph_ctx ctx2 = ctx; + graph_task<> t = ctx.task(); + graph_task<> t2 = ctx2.task(); +}; + +UNITTEST("movable graph_task<>") +{ + graph_ctx ctx; + graph_task<> t = ctx.task(); + graph_task<> t_cpy = mv(t); +}; + +UNITTEST("set_symbol on graph_task and graph_task<>") +{ + graph_ctx ctx; + + double X[1024], Y[1024]; + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + graph_task<> t = ctx.task(); + t.add_deps(lX.rw(), lY.rw()); + t.set_symbol("graph_task<>"); + t.start(); + cudaGraphNode_t n; + 
cuda_safe_call(cudaGraphAddEmptyNode(&n, t.get_graph(), nullptr, 0)); + t.end(); + + graph_task, slice> t2 = ctx.task(lX.rw(), lY.rw()); + t2.set_symbol("graph_task"); + t2.start(); + cudaGraphNode_t n2; + cuda_safe_call(cudaGraphAddEmptyNode(&n2, t2.get_graph(), nullptr, 0)); + t2.end(); + + ctx.finalize(); +}; + +# ifdef __CUDACC__ +namespace reserved +{ + +inline void unit_test_graph_epoch() +{ + graph_ctx ctx; + + const size_t N = 8; + const size_t NITER = 10; + + double A[N]; + for (size_t i = 0; i < N; i++) + { + A[i] = 1.0 * i; + } + + auto lA = ctx.logical_data(A); + + for (size_t k = 0; k < NITER; k++) + { + ctx.parallel_for(blocked_partition(), exec_place::current_device(), lA.shape(), lA.rw()) + ->*[] _CCCL_HOST_DEVICE(size_t i, slice A) { + A(i) = cos(A(i)); + }; + + ctx.change_epoch(); + } + + ctx.finalize(); + + for (size_t i = 0; i < N; i++) + { + double Ai_ref = 1.0 * i; + for (size_t k = 0; k < NITER; k++) + { + Ai_ref = cos(Ai_ref); + } + + EXPECT(fabs(A[i] - Ai_ref) < 0.01); + } +} + +UNITTEST("graph with epoch") +{ + unit_test_graph_epoch(); +}; + +inline void unit_test_graph_empty_epoch() +{ + graph_ctx ctx; + + const size_t N = 8; + const size_t NITER = 10; + + double A[N]; + for (size_t i = 0; i < N; i++) + { + A[i] = 1.0 * i; + } + + auto lA = ctx.logical_data(A); + + for (size_t k = 0; k < NITER; k++) + { + ctx.parallel_for(blocked_partition(), exec_place::current_device(), lA.shape(), lA.rw()) + ->*[] _CCCL_HOST_DEVICE(size_t i, slice A) { + A(i) = cos(A(i)); + }; + + ctx.change_epoch(); + // Nothing in between + ctx.change_epoch(); + } + + ctx.finalize(); + + for (size_t i = 0; i < N; i++) + { + double Ai_ref = 1.0 * i; + for (size_t k = 0; k < NITER; k++) + { + Ai_ref = cos(Ai_ref); + } + + EXPECT(fabs(A[i] - Ai_ref) < 0.01); + } +} + +UNITTEST("graph with empty epoch") +{ + unit_test_graph_empty_epoch(); +}; + +inline void unit_test_graph_epoch_2() +{ + graph_ctx ctx; + + const size_t N = 8; + const size_t NITER = 10; + + double A[N]; + for (size_t i = 0; i < N; i++) + { + A[i] = 1.0 * i; + } + + auto lA = ctx.logical_data(A); + + for (size_t k = 0; k < NITER; k++) + { + if ((k % 2) == 0) + { + ctx.parallel_for(blocked_partition(), exec_place::current_device(), lA.shape(), lA.rw()) + ->*[] _CCCL_HOST_DEVICE(size_t i, slice A) { + A(i) = cos(A(i)); + }; + } + else + { + ctx.parallel_for(blocked_partition(), exec_place::current_device(), lA.shape(), lA.rw()) + ->*[] _CCCL_HOST_DEVICE(size_t i, slice A) { + A(i) = sin(A(i)); + }; + } + + ctx.change_epoch(); + } + + ctx.finalize(); + + for (size_t i = 0; i < N; i++) + { + double Ai_ref = 1.0 * i; + for (size_t k = 0; k < NITER; k++) + { + Ai_ref = ((k % 2) == 0) ? 
cos(Ai_ref) : sin(Ai_ref); + } + + EXPECT(fabs(A[i] - Ai_ref) < 0.01); + } +} + +UNITTEST("graph with epoch 2") +{ + unit_test_graph_epoch_2(); +}; + +inline void unit_test_graph_epoch_3() +{ + graph_ctx ctx; + + const size_t N = 8; + const size_t NITER = 10; + + double A[N]; + double B[N]; + for (size_t i = 0; i < N; i++) + { + A[i] = 1.0 * i; + B[i] = -1.0 * i; + } + + auto lA = ctx.logical_data(A); + auto lB = ctx.logical_data(B); + + for (size_t k = 0; k < NITER; k++) + { + if ((k % 2) == 0) + { + ctx.parallel_for(blocked_partition(), exec_place::current_device(), lA.shape(), lA.rw(), lB.read()) + ->*[] _CCCL_HOST_DEVICE(size_t i, slice A, slice B) { + A(i) = cos(B(i)); + }; + } + else + { + ctx.parallel_for(blocked_partition(), exec_place::current_device(), lA.shape(), lA.read(), lB.rw()) + ->*[] _CCCL_HOST_DEVICE(size_t i, slice A, slice B) { + B(i) = sin(A(i)); + }; + } + + ctx.change_epoch(); + } + + ctx.finalize(); + + for (size_t i = 0; i < N; i++) + { + double Ai_ref = 1.0 * i; + double Bi_ref = -1.0 * i; + for (size_t k = 0; k < NITER; k++) + { + if ((k % 2) == 0) + { + Ai_ref = cos(Bi_ref); + } + else + { + Bi_ref = sin(Ai_ref); + } + } + + EXPECT(fabs(A[i] - Ai_ref) < 0.01); + EXPECT(fabs(B[i] - Bi_ref) < 0.01); + } +} + +UNITTEST("graph with epoch 3") +{ + unit_test_graph_epoch_3(); +}; + +inline void unit_test_launch_graph() +{ + graph_ctx ctx; + SCOPE(exit) + { + ctx.finalize(); + }; + auto lA = ctx.logical_data(shape_of>(64)); + ctx.launch(lA.write())->*[] _CCCL_DEVICE(auto t, slice A) { + for (auto i : t.apply_partition(shape(A))) + { + A(i) = 2 * i; + } + }; + ctx.host_launch(lA.read())->*[](auto A) { + for (size_t i = 0; i < 64; i++) + { + EXPECT(A(i) == 2 * i); + } + }; +} + +UNITTEST("basic launch test (graph_ctx)") +{ + unit_test_launch_graph(); +}; + +inline void unit_test_launch_many_graph_ctx() +{ + // Stress the allocators and all ressources ! + for (size_t i = 0; i < 256; i++) + { + graph_ctx ctx; + auto lA = ctx.logical_data(shape_of>(64)); + ctx.launch(lA.write())->*[] _CCCL_DEVICE(auto t, slice A) { + for (auto i : t.apply_partition(shape(A))) + { + A(i) = 2 * i; + } + }; + ctx.finalize(); + } +} + +UNITTEST("create many graph ctxs") +{ + unit_test_launch_many_graph_ctx(); +}; + +} // end namespace reserved + +# endif // __CUDACC__ + +#endif // UNITTESTED_FILE +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_data_interface.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_data_interface.cuh new file mode 100644 index 00000000000..767473c3425 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/graph/graph_data_interface.cuh @@ -0,0 +1,89 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Implementation of the graph_data_interface class which makes it + * possible to define data interfaces in the CUDA graph backend + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief Data interface for the CUDA graph backend + */ +template +class graph_data_interface : public data_impl_base +{ +public: + using base = data_impl_base; + using shape_t = typename base::shape_t; + + graph_data_interface(T p) + : base(mv(p)) + {} + + graph_data_interface(shape_of s) + : base(mv(s)) + {} + + virtual cudaGraphNode_t graph_data_copy( + cudaMemcpyKind kind, + instance_id_t src_instance_id, + instance_id_t dst_instance_id, + cudaGraph_t graph, + const cudaGraphNode_t* input_nodes, + size_t input_cnt) = 0; + + // Returns prereq + void data_copy(backend_ctx_untyped& ctx_, + const data_place& dst_memory_node, + instance_id_t dst_instance_id, + const data_place& src_memory_node, + instance_id_t src_instance_id, + event_list& prereqs) override + { + ::std::ignore = src_memory_node; + ::std::ignore = dst_memory_node; + assert(src_memory_node != dst_memory_node); + + cudaGraph_t graph = ctx_.graph(); + size_t graph_epoch = ctx_.epoch(); + assert(graph && graph_epoch != size_t(-1)); + + const ::std::vector nodes = reserved::join_with_graph_nodes(prereqs, graph_epoch); + + // Let CUDA figure out from pointers + cudaMemcpyKind kind = cudaMemcpyDefault; + + cudaGraphNode_t out = graph_data_copy(kind, src_instance_id, dst_instance_id, graph, nodes.data(), nodes.size()); + + reserved::fork_from_graph_node(ctx_, out, graph_epoch, prereqs, "copy"); + } +}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh new file mode 100644 index 00000000000..7634af6828f --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh @@ -0,0 +1,534 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Implement tasks in the CUDA graph backend (graph_ctx) + * + * @see graph_ctx + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include // graph_task<> has-a backend_ctx_untyped +#include +#include + +namespace cuda::experimental::stf +{ + +template +class graph_task; + +/** @brief This describes an untyped task within a CUDA graph. + * + * A graph task is implemented as a child graph in the graph associated to the + * context. 
The body of the task is in the child graph, and CUDASTF introduces + * the dependencies from and to that child graph, in addition to extra nodes + * intended to implement data transfers, or allocations for example. + * + * For graph tasks generated automatically by CUDASTF which are only made of a + * single CUDA graph node, we may use the graph node directly rather than + * embedding it in a child graph. + */ +template <> +class graph_task<> : public task +{ +public: + // A cudaGraph_t is needed + graph_task() = delete; + + graph_task(backend_ctx_untyped ctx, cudaGraph_t g, size_t epoch, exec_place e_place = exec_place::current_device()) + : task(mv(e_place)) + , ctx_graph(EXPECT(g)) + , epoch(epoch) + , ctx(mv(ctx)) + { + this->ctx.increment_task_count(); + } + + graph_task(graph_task&&) = default; + graph_task& operator=(graph_task&&) = default; + + graph_task(graph_task&) = default; + graph_task(const graph_task&) = default; + graph_task& operator=(const graph_task&) = default; + + graph_task& start() + { + event_list prereqs = acquire(ctx); + + // The CUDA graph API does not like duplicate dependencies + prereqs.optimize(); + + // Reserve for better performance + ready_dependencies.reserve(prereqs.size()); + + for (auto& e : prereqs) + { + auto ge = reserved::graph_event(e, reserved::use_dynamic_cast); + if (ge->epoch == epoch) + { + ready_dependencies.push_back(ge->node); + } + } + + return *this; + } + + /* End the task, but do not clear its data structures yet */ + graph_task<>& end_uncleared() + { + cudaGraphNode_t n; + + auto done_prereqs = event_list(); + + // We either created independant task nodes, or a child graph. We need + // to inject input dependencies, and make the task completion depend on + // task nodes or the child graph. + if (task_nodes.size() > 0) + { + for (auto& node : task_nodes) + { +#ifndef NDEBUG + // Ensure the node does not have dependencies yet + size_t num_deps; + cuda_safe_call(cudaGraphNodeGetDependencies(node, nullptr, &num_deps)); + assert(num_deps == 0); + + // Ensure there are no output dependencies either (or we could not + // add input dependencies later) + size_t num_deps_out; + cuda_safe_call(cudaGraphNodeGetDependentNodes(node, nullptr, &num_deps_out)); + assert(num_deps_out == 0); +#endif + + // Repeat node as many times as there are input dependencies + ::std::vector out_array(ready_dependencies.size(), node); + cuda_safe_call( + cudaGraphAddDependencies(ctx_graph, ready_dependencies.data(), out_array.data(), ready_dependencies.size())); + auto gnp = reserved::graph_event(node, epoch); + gnp->set_symbol(ctx, "done " + get_symbol()); + /* This node is now the output dependency of the task */ + done_prereqs.add(mv(gnp)); + } + } + else + { + // Note that if nothing was done in the task, this will create a child + // graph too, which will be useful as a node to synchronize with anyway. 
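// Illustrative sketch (not part of this patch): as described above, a task body is
// spliced into the context graph as a child graph node whose input edges are the
// task's ready dependencies. With the plain runtime API this splicing looks as follows.
#include <cuda_runtime.h>

inline cudaGraphNode_t add_body_as_child(cudaGraph_t parent, const cudaGraphNode_t* deps, size_t ndeps)
{
  // Build a small child graph; an empty node stands in for the real task body.
  cudaGraph_t child = nullptr;
  cudaGraphCreate(&child, 0);
  cudaGraphNode_t body = nullptr;
  cudaGraphAddEmptyNode(&body, child, nullptr, 0);

  // The child graph is copied into the parent, behind the given dependencies.
  cudaGraphNode_t child_node = nullptr;
  cudaGraphAddChildGraphNode(&child_node, parent, deps, ndeps, child);

  // The parent owns its copy, so the local child graph can be destroyed.
  cudaGraphDestroy(child);
  return child_node;
}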
+ const cudaGraph_t childGraph = get_graph(); + + const cudaGraphNode_t* deps = ready_dependencies.data(); + + assert(ctx_graph); + /* This will duplicate the childGraph so we can destroy it after */ + cuda_safe_call(cudaGraphAddChildGraphNode(&n, ctx_graph, deps, ready_dependencies.size(), childGraph)); + + // Destroy the child graph unless we should not + if (must_destroy_child_graph) + { + cuda_safe_call(cudaGraphDestroy(childGraph)); + } + + auto gnp = reserved::graph_event(n, epoch); + gnp->set_symbol(ctx, "done " + get_symbol()); + /* This node is now the output dependency of the task */ + done_prereqs.add(mv(gnp)); + } + + release(ctx, done_prereqs); + + return *this; + } + + graph_task<>& end() + { + end_uncleared(); + clear(); + return *this; + } + + void populate_deps_scheduling_info() const + { + // Error checking copied from acquire() in acquire_release() + + int index = 0; + const auto& deps = get_task_deps(); + for (const auto& dep : deps) + { + if (!dep.get_data().is_initialized()) + { + fprintf(stderr, "Error: dependency number %d is an uninitialized logical data.\n", index); + abort(); + } + dep.set_symbol(dep.get_data().get_symbol()); + dep.set_data_footprint(dep.get_data().get_data_interface().data_footprint()); + index++; + } + } + + /** + * @brief Use the scheduler to assign a device to this task + * + * @return returns true if the task's time needs to be recorded + */ + bool schedule_task() + { + auto& dot = *ctx.get_dot(); + auto& statistics = reserved::task_statistics::instance(); + + const bool is_auto = get_exec_place().affine_data_place() == data_place::device_auto; + bool calibrate = false; + + // We need to know the data footprint if scheduling or calibrating tasks + if (is_auto || statistics.is_calibrating()) + { + populate_deps_scheduling_info(); + } + + if (is_auto) + { + auto [place, needs_calibration] = ctx.schedule_task(*this); + set_exec_place(place); + calibrate = needs_calibration; + } + + return dot.is_tracing() || (calibrate && statistics.is_calibrating()); + } + + /** + * @brief Invokes a lambda that takes either a `cudaStream_t` or a `cudaGraph_t`. Dependencies must be + * set with `add_deps` manually before this call. + * + * @tparam Fun Type of lambda to call, must accept either a `cudaStream_t` or a `cudaGraph_t` as sole argument + * @param f lambda function to call + */ + template + void operator->*(Fun&& f) + { + auto& dot = *ctx.get_dot(); + auto& statistics = reserved::task_statistics::instance(); + + // cudaEvent_t start_event, end_event; + + bool record_time = schedule_task(); + + if (statistics.is_calibrating_to_file()) + { + record_time = true; + } + + start(); + + if (record_time) + { + // Events must be created here to avoid issues with multi-gpu + // cuda_safe_call(cudaEventCreate(&start_event)); + // cuda_safe_call(cudaEventCreate(&end_event)); + // cuda_safe_call(cudaEventRecord(start_event)); + } + + SCOPE(exit) + { + end_uncleared(); + if (record_time) + { + // cuda_safe_call(cudaEventRecord(end_event)); + // cuda_safe_call(cudaEventSynchronize(end_event)); + + float milliseconds = 0; + // cuda_safe_call(cudaEventElapsedTime(&milliseconds, start_event, end_event)); + + if (dot.is_tracing()) + { + dot.template add_vertex_timing(*this, milliseconds); + } + + if (statistics.is_calibrating()) + { + statistics.log_task_time(*this, milliseconds); + } + } + clear(); + }; + + // Default for the first argument is a `cudaStream_t`. 
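// Illustrative sketch (not part of this patch): when the user lambda takes a
// cudaStream_t, the body below obtains the task's child graph by capturing the
// stream-ordered work. Stream capture in isolation looks like this; dummy_kernel
// is a placeholder.
#include <cuda_runtime.h>

__global__ void dummy_kernel(int* p)
{
  *p = 42;
}

inline cudaGraph_t capture_into_graph(cudaStream_t s, int* device_ptr)
{
  cudaGraph_t g = nullptr;
  cudaStreamBeginCapture(s, cudaStreamCaptureModeThreadLocal);
  // Work submitted to s between Begin/EndCapture becomes graph nodes instead of
  // executing immediately.
  dummy_kernel<<<1, 1, 0, s>>>(device_ptr);
  cudaStreamEndCapture(s, &g);
  return g; // the caller may embed it as a child graph or instantiate and launch it
}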
+ if constexpr (::std::is_invocable_v) + { + // + // CAPTURE the lambda + // + + // Get a stream from the pool associated to the execution place + cudaStream_t capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream; + + cudaGraph_t childGraph = nullptr; + cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal)); + + // Launch the user provided function + f(capture_stream); + + cuda_safe_call(cudaStreamEndCapture(capture_stream, &childGraph)); + + // This implements the child graph of the `graph_task<>`, we will later + // insert the proper dependencies around it + set_child_graph(childGraph); + } + else + { + // + // Give the lambda a child graph + // + + // Create a child graph in the `graph_task<>` + cudaGraph_t childGraph = get_graph(); + + // Launch the user provided function + f(childGraph); + } + } + + // Return the child graph, and create it if it does not exist yet + cudaGraph_t& get_graph() + { + // We either use a child graph or task nodes, not both + assert(task_nodes.empty()); + + // Lazy creation + if (child_graph == nullptr) + { + cuda_safe_call(cudaGraphCreate(&child_graph, 0)); + must_destroy_child_graph = true; + } + + return child_graph; + } + + // Create a node in the graph + cudaGraphNode_t& get_node() + { + // We either use a child graph or task nodes, not both + assert(!child_graph); + + // Create a new entry and return it + task_nodes.emplace_back(); + return task_nodes.back(); + } + + // Get the graph associated to the whole context (not the task) + cudaGraph_t& get_ctx_graph() + { + return ctx_graph; + } + + void set_current_place(pos4 p) + { + get_exec_place().as_grid().set_current_place(ctx, p); + } + + void unset_current_place() + { + get_exec_place().as_grid().unset_current_place(ctx); + } + + const exec_place& get_current_place() const + { + return get_exec_place().as_grid().get_current_place(); + } + +private: + // So that graph_ctx can access set_child_graph + template + friend class graph_task; + + // If the child graph was created using a capture mechanism, for example + void set_child_graph(cudaGraph_t explicit_g) + { + child_graph = explicit_g; + must_destroy_child_graph = false; + } + + /* The child graph associated to that `graph_task<>`, this was either created + * explicitly, or by the means of a capture mechanism. */ + cudaGraph_t child_graph = nullptr; + bool must_destroy_child_graph = false; + + /* If the task corresponds to independant graph nodes, we do not use a + * child graph, but add nodes directly */ + ::std::vector task_nodes; + + /* This is the support graph associated to the entire context */ + cudaGraph_t ctx_graph = nullptr; + size_t epoch = 0; + + ::std::vector ready_dependencies; + + backend_ctx_untyped ctx; +}; + +/** + * @brief Provides a graph scope object within which functions invoked are + * either captured in a graph or passed to a subgraph, depending on the + * function type. Invocation is carried by means of operator->*. Dependencies + * are typed appropriately. + */ +template +class graph_task : public graph_task<> +{ +public: + graph_task(backend_ctx_untyped ctx, cudaGraph_t g, size_t epoch, exec_place e_place, task_dep... 
deps) + : graph_task<>(mv(ctx), g, epoch, mv(e_place)) + { + static_assert(sizeof(*this) == sizeof(graph_task<>), "Cannot add state - it would be lost by slicing."); + add_deps(mv(deps)...); + } + + /** + * @brief Set the symbol object + * + * @param s + * @return graph_task& + */ + graph_task& set_symbol(::std::string s) & + { + graph_task<>::set_symbol(mv(s)); + return *this; + } + + graph_task&& set_symbol(::std::string s) && + { + graph_task<>::set_symbol(mv(s)); + return mv(*this); + } + +#if defined(_CCCL_COMPILER_MSVC) + // TODO (miscco): figure out why MSVC is complaining about unreachable code here + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_MSVC(4702) // unreachable code +#endif // _CCCL_COMPILER_MSVC + + template + void operator->*(Fun&& f) + { + auto& dot = *ctx.get_dot(); + auto& statistics = reserved::task_statistics::instance(); + + // cudaEvent_t start_event, end_event; + + bool record_time = schedule_task(); + + if (statistics.is_calibrating_to_file()) + { + record_time = true; + } + + start(); + + if (record_time) + { + // Events must be created here to avoid issues with multi-gpu + // cuda_safe_call(cudaEventCreate(&start_event)); + // cuda_safe_call(cudaEventCreate(&end_event)); + // cuda_safe_call(cudaEventRecord(start_event)); + } + + SCOPE(exit) + { + end_uncleared(); + if (record_time) + { + // cuda_safe_call(cudaEventRecord(end_event)); + // cuda_safe_call(cudaEventSynchronize(end_event)); + + float milliseconds = 0; + // cuda_safe_call(cudaEventElapsedTime(&milliseconds, start_event, end_event)); + + if (dot.is_tracing()) + { + dot.template add_vertex_timing(*this, milliseconds); + } + + if (statistics.is_calibrating()) + { + statistics.log_task_time(*this, milliseconds); + } + } + clear(); + }; + + if (dot.is_tracing()) + { + dot.template add_vertex(*this); + } + + // Default for the first argument is a `cudaStream_t`. + if constexpr (::std::is_invocable_v) + { + // + // CAPTURE the lambda + // + + // Get a stream from the pool associated to the execution place + cudaStream_t capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream; + + cudaGraph_t childGraph = nullptr; + cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal)); + + // Launch the user provided function + ::std::apply(f, tuple_prepend(mv(capture_stream), typed_deps())); + + cuda_safe_call(cudaStreamEndCapture(capture_stream, &childGraph)); + + // Save this child graph as the implementation of the + // graph_task<>. CUDASTF will then add all necessary + // dependencies, or data transfers, allocations etc. + // Since this was captured, we will not destroy that graph (should we ?) 
+ set_child_graph(childGraph); + } + else + { + static_assert(::std::is_invocable_v, "Incorrect lambda function signature."); + // + // Give the lambda a child graph + // + + // This lazily creates a childGraph which will be destroyed when the task ends + cudaGraph_t childGraph = get_graph(); + + // Launch the user provided function + ::std::apply(f, tuple_prepend(mv(childGraph), typed_deps())); + } + } +#if defined(_CCCL_COMPILER_MSVC) + _CCCL_DIAG_POP +#endif // _CCCL_COMPILER_MSVC + +private: + auto typed_deps() + { + return make_tuple_indexwise([&](auto i) { + return this->get<::std::tuple_element_t>>(i); + }); + } +}; + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh b/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh new file mode 100644 index 00000000000..da429a639e6 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh @@ -0,0 +1,309 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Implementation of the slice data interface in the `graph_ctx` backend + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief Slice data interface implementation over CUDA graphs. + * + * @tparam T Element type for the slice + * @tparam dimensions rank of the slice + * + * The type manipulated will be `slice`. 
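// Usage sketch (mirrors the unit tests earlier in this patch, not new library code):
// users never build this data interface directly; they create logical data from a
// slice shape and receive slice<T> views inside tasks.
graph_ctx ctx;
auto lA = ctx.logical_data(shape_of<slice<double>>(64));
ctx.launch(lA.write())->*[] _CCCL_DEVICE(auto t, slice<double> A) {
  for (auto i : t.apply_partition(shape(A)))
  {
    A(i) = 2 * i;
  }
};
ctx.host_launch(lA.read())->*[](auto A) {
  for (size_t i = 0; i < 64; i++)
  {
    EXPECT(A(i) == 2 * i);
  }
};
ctx.finalize();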
+ */ +template +class slice_graph_interface : public graph_data_interface> +{ +public: + /// @brief Alias for the base class + using base = graph_data_interface>; + /// @brief Alias for the shape type + using typename base::shape_t; + + using mutable_value_type = typename ::std::remove_const::type; + + /** + * @brief Constructor from slice + * + * @param s either a slice or the shape of a slice + */ + slice_graph_interface(slice s) + : base(mv(s)) + {} + + /** + * @brief Constructor from shape + * + * @param s either a slice or the shape of a slice + */ + slice_graph_interface(shape_of> s) + : base(mv(s)) + {} + + /// @brief Implementation of interface primitive + void data_allocate( + backend_ctx_untyped& bctx, + block_allocator_untyped& custom_allocator, + const data_place& memory_node, + instance_id_t instance_id, + ::std::ptrdiff_t& s, + void** extra_args, + event_list& prereqs) override + { + s = this->shape.size() * sizeof(T); + auto& local_desc = this->instance(instance_id); + + if (!memory_node.is_composite()) + { + void* base_ptr = custom_allocator.allocate(bctx, memory_node, s, prereqs); + local_desc = this->shape.create(static_cast(base_ptr)); + return; + } + + exec_place_grid grid = memory_node.get_grid(); + size_t total_size = this->shape.size(); + + // position (x,y,z,t) on (nx,ny,nz,nt) + // * index = x + nx*y + nx*ny*z + nx*ny*nz*t + // * index = x + nx(y + ny(z + nz*t)) + // So we can compute x, y, z, t from the index + // * x := index % nx; + // * index - x = nx(y + ny(z + nz*t)) + // * (index - x)/nx = y + ny(z + nz*t)) + // So, y := ( (index - x)/nx ) %ny + // index = x + nx(y + ny(z + nz*t)) + // (index - x)/nx = y + ny(z+nz*t) + // (index - x)/nx - y = ny(z+nz*t) + // ( (index - x)/nx - y)/ny = z+nz*t + // So, z := (( (index - x)/nx - y)/ny) % nz + // ( (index - x)/nx - y)/ny - z = nz*t + // ( ( (index - x)/nx - y)/ny - z ) / nz = t + auto delinearize = [&](size_t ind) { + static_assert(dimensions <= 4); + + size_t nx, ny, nz; + size_t x = 0, y = 0, z = 0, t = 0; + if constexpr (dimensions >= 1) + { + nx = this->shape.extent(0); + x = ind % nx; + } + if constexpr (dimensions >= 2) + { + ny = this->shape.extent(1); + y = ((ind - x) / nx) % ny; + } + if constexpr (dimensions >= 3) + { + nz = this->shape.extent(2); + z = (((ind - x) / nx - y) / ny) % nz; + } + + if constexpr (dimensions >= 4) + { + t = (((ind - x) / nx - y) / ny - z) / nz; + } + + return pos4(x, y, z, t); + }; + + // Get the extents stored as a dim4 + const dim4 data_dims = this->shape.get_data_dims(); + + auto array = bctx.get_composite_cache().get( + memory_node, memory_node.get_partitioner(), delinearize, total_size, sizeof(T), data_dims); + assert(array); + array->merge_into(prereqs); + T* base_ptr = static_cast(array->get_base_ptr()); + + // Store this localized array in the extra_args associated to the + // data instance so that we can use it later in the deallocation + // method. 
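// Standalone check (not part of this patch) of the delinearization used above:
// the lambda inverts ind = x + nx*(y + ny*(z + nz*t)), so the round trip must be exact.
#include <cassert>
#include <cstddef>

struct pos_xyzt
{
  size_t x, y, z, t;
};

inline pos_xyzt delinearize_check(size_t ind, size_t nx, size_t ny, size_t nz)
{
  pos_xyzt p{};
  p.x = ind % nx;
  p.y = ((ind - p.x) / nx) % ny;
  p.z = (((ind - p.x) / nx - p.y) / ny) % nz;
  p.t = (((ind - p.x) / nx - p.y) / ny - p.z) / nz;
  return p;
}

inline void test_delinearize()
{
  const size_t nx = 3, ny = 4, nz = 5, nt = 2;
  for (size_t ind = 0; ind < nx * ny * nz * nt; ind++)
  {
    const pos_xyzt p = delinearize_check(ind, nx, ny, nz);
    assert(ind == p.x + nx * (p.y + ny * (p.z + nz * p.t)));
  }
}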
+ *extra_args = array.release(); + local_desc = this->shape.create(base_ptr); + } + + /// @brief Implementation of interface primitive + void data_deallocate( + backend_ctx_untyped& bctx, + block_allocator_untyped& custom_allocator, + const data_place& memory_node, + instance_id_t instance_id, + void* extra_args, + event_list& prereqs) final + { + const size_t sz = this->shape.size() * sizeof(T); + auto& local_desc = this->instance(instance_id); + // We can deallocate a copy of a logical data even if it was only accessible in read only mode + auto ptr = const_cast(local_desc.data_handle()); + + // TODO find a way to erase this variable to facilitate debugging + // local_desc = slice(); // optional, helps with debugging + + if (!memory_node.is_composite()) + { + custom_allocator.deallocate(bctx, memory_node, prereqs, ptr, sz); + return; + } + + assert(extra_args); + + // To properly erase a composite data, we would need to synchronize the + // calling thread with all events because the current CUDA VMM facility + // does not have an asynchronous counterpart. Instead, we put the + // localized_array object into a cache, which will speedup the + // allocation of identical arrays, if any. + // This cached array is only usable once the prereqs of this deallocation are fulfilled. + auto* array = static_cast(extra_args); + bctx.get_composite_cache().put(::std::unique_ptr(array), prereqs); + } + + /// @brief Implementation of interface primitive + cudaGraphNode_t graph_data_copy( + cudaMemcpyKind kind, + instance_id_t src_instance_id, + instance_id_t dst_instance_id, + cudaGraph_t graph, + const cudaGraphNode_t* input_nodes, + size_t input_cnt) override + { + // static_assert(dimensions <= 2, "Unsupported yet."); + const auto& b = this->shape; + const auto& src_instance = this->instance(src_instance_id); + const auto& dst_instance = this->instance(dst_instance_id); + + cudaMemcpy3DParms cpy_params; + + /* We are copying so the destination will be changed, but from this + * might be a constant variable (when using a read-only access). Having a T + * type with a const qualifier is therefore possible, even if these API do not + * want const pointers. 
*/ + auto dst_ptr = const_cast(dst_instance.data_handle()); + + // make_cudaPitchedPtr wants non const pointer + auto src_ptr = const_cast(src_instance.data_handle()); + + if constexpr (dimensions == 0) + { + cpy_params = { + .srcArray = nullptr, + .srcPos = make_cudaPos(size_t(0), size_t(0), size_t(0)), + .srcPtr = make_cudaPitchedPtr(src_ptr, sizeof(T), 1, 1), + .dstArray = nullptr, + .dstPos = make_cudaPos(size_t(0), size_t(0), size_t(0)), + .dstPtr = make_cudaPitchedPtr(dst_ptr, sizeof(T), 1, 1), + .extent = make_cudaExtent(sizeof(T), 1, 1), + .kind = kind}; + } + else if constexpr (dimensions == 1) + { + cpy_params = { + .srcArray = nullptr, + .srcPos = make_cudaPos(size_t(0), size_t(0), size_t(0)), + .srcPtr = make_cudaPitchedPtr(src_ptr, b.extent(0) * sizeof(T), b.extent(0), 1), + .dstArray = nullptr, + .dstPos = make_cudaPos(size_t(0), size_t(0), size_t(0)), + .dstPtr = make_cudaPitchedPtr(dst_ptr, b.extent(0) * sizeof(T), b.extent(0), 1), + .extent = make_cudaExtent(b.extent(0) * sizeof(T), 1, 1), + .kind = kind}; + } + else if constexpr (dimensions == 2) + { + cpy_params = { + .srcArray = nullptr, + .srcPos = make_cudaPos(size_t(0), size_t(0), size_t(0)), + .srcPtr = make_cudaPitchedPtr(src_ptr, src_instance.stride(1) * sizeof(T), b.extent(0), b.extent(1)), + .dstArray = nullptr, + .dstPos = make_cudaPos(size_t(0), size_t(0), size_t(0)), + .dstPtr = make_cudaPitchedPtr(dst_ptr, dst_instance.stride(1) * sizeof(T), b.extent(0), b.extent(1)), + .extent = make_cudaExtent(b.extent(0) * sizeof(T), b.extent(1), 1), + .kind = kind}; + } + else + { + // For more than 2D + EXPECT(contiguous_dims(b) == dimensions, "Cannot handle non-contiguous multidimensional array above 2D."); + + cpy_params = { + .srcArray = nullptr, + .srcPos = make_cudaPos(size_t(0), size_t(0), size_t(0)), + .srcPtr = make_cudaPitchedPtr(src_ptr, b.size() * sizeof(T), b.size(), 1), + .dstArray = nullptr, + .dstPos = make_cudaPos(size_t(0), size_t(0), size_t(0)), + .dstPtr = make_cudaPitchedPtr(dst_ptr, b.size() * sizeof(T), b.size(), 1), + .extent = make_cudaExtent(b.size() * sizeof(T), 1, 1), + .kind = kind}; + } + + cudaGraphNode_t result; + cuda_safe_call(cudaGraphAddMemcpyNode(&result, graph, input_nodes, input_cnt, &cpy_params)); + + return result; + } + + /// @brief Implementation of interface primitive + bool pin_host_memory(instance_id_t instance_id) override + { + auto s = this->instance(instance_id); + return s.data_handle() != nullptr && pin(s); + } + + /// @brief Implementation of interface primitive + void unpin_host_memory(instance_id_t /*instance_id*/) override + { + // no-op ... we unfortunately cannot unpin memory safely yet because + // the graph may be executed a long time after this unpinning occurs. + /// assert(memory_node == data_place::host); + + /// const auto& common = this->common; + /// const auto& per_inst = this->instance(instance_id); + + /// slice s = slice(common, per_inst); + + /// unpin(s); + } +}; + +/** + * @brief Given a type, defines `type` as the corresponding data interface over graphs. 
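// Illustrative sketch (not part of this patch): even 1D and 2D transfers above are
// expressed through cudaMemcpy3DParms so that a single memcpy node type covers all
// ranks. A pitched 2D copy added as a graph node, with arbitrary sizes:
#include <cuda_runtime.h>

inline cudaGraphNode_t add_2d_copy_node(
  cudaGraph_t g, double* dst, const double* src, size_t nx, size_t ny, size_t pitch_in_elems)
{
  cudaMemcpy3DParms p = {}; // srcPos/dstPos default to (0, 0, 0)
  p.srcPtr = make_cudaPitchedPtr(const_cast<double*>(src), pitch_in_elems * sizeof(double), nx, ny);
  p.dstPtr = make_cudaPitchedPtr(dst, pitch_in_elems * sizeof(double), nx, ny);
  p.extent = make_cudaExtent(nx * sizeof(double), ny, 1); // width in bytes, height, depth
  p.kind   = cudaMemcpyDefault; // let CUDA deduce the direction from the pointers

  cudaGraphNode_t node = nullptr;
  cudaGraphAddMemcpyNode(&node, g, nullptr, 0, &p);
  return node;
}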
+ * + * @tparam T + */ +template +struct graphed_interface_of; + +template +struct graphed_interface_of> +{ + using type = slice_graph_interface::rank()>; +}; + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/graph/internal/event_types.cuh b/cudax/include/cuda/experimental/__stf/graph/internal/event_types.cuh new file mode 100644 index 00000000000..da3b75dcee2 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/graph/internal/event_types.cuh @@ -0,0 +1,96 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +namespace cuda::experimental::stf::reserved +{ + +/* A prereq corresponds to a node, which is always paired to its graph */ +class graph_event_impl : public event_impl +{ +protected: + graph_event_impl() = default; + graph_event_impl(const graph_event_impl&) = delete; + graph_event_impl(cudaGraphNode_t n, size_t epoch) + : node(n) + , epoch(epoch) + { + assert(node); + } + +public: + mutable cudaGraphNode_t node; + mutable size_t epoch; +}; + +using graph_event = reserved::handle; + +// This converts a prereqs and converts it to a vector of graph nodes. As a +// side-effect, it also remove duplicates from the prereqs list of events +inline ::std::vector join_with_graph_nodes(event_list& prereqs, size_t current_epoch) +{ + ::std::vector nodes; + + // CUDA Graph API does not want to have the same dependency passed multiple times + prereqs.optimize(); + + for (const auto& e : prereqs) + { + const auto ge = reserved::graph_event(e, reserved::use_dynamic_cast); + EXPECT(current_epoch >= ge->epoch); + + // If current_epoch > ge->epoch, then this was already implicitely + // synchronized as different epochs are submitted sequentially. + if (current_epoch == ge->epoch) + { + nodes.push_back(ge->node); + } + } + + return nodes; +} + +// This creates a new CUDASTF event list from a cudaGraphNode_t and sets the appropriate annotations in the DOT output. 
+/* previous_prereqs is only passed so that we can insert the proper DOT annotations */ +template +inline void fork_from_graph_node( + context_t& ctx, cudaGraphNode_t n, size_t epoch, event_list& previous_prereqs, ::std::string prereq_string) +{ + auto gnp = reserved::graph_event(n, epoch); + gnp->set_symbol(ctx, mv(prereq_string)); + + auto& dot = *ctx.get_dot(); + if (dot.is_tracing_prereqs()) + { + for (const auto& e : previous_prereqs) + { + dot.add_edge(e->unique_prereq_id, gnp->unique_prereq_id, 1); + } + } + + previous_prereqs = event_list(gnp); +} + +} // namespace cuda::experimental::stf::reserved diff --git a/cudax/include/cuda/experimental/__stf/internal/acquire_release.cuh b/cudax/include/cuda/experimental/__stf/internal/acquire_release.cuh new file mode 100644 index 00000000000..89965749008 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/acquire_release.cuh @@ -0,0 +1,329 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Enforce dependencies before and after task submission to implement + * the STF model + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief Acquires necessary resources and dependencies for a task to run. + * + * This function prepares a task for execution by setting up its execution context, + * sorting its dependencies to avoid deadlocks, and ensuring all necessary data + * dependencies are fulfilled. It handles both small and large tasks by checking + * the task size and adjusting its behavior accordingly. Dependencies are processed + * to mark data usage, allocate necessary resources, and update data instances for + * task execution. This function also handles the task's transition from the setup + * phase to the running phase. + * + * @param ctx The backend context in which the task is executed. This context contains + * the execution stack and other execution-related information. + * @param tsk The task to be prepared for execution. The task must be in the setup phase + * before calling this function. + * @return An event_list containing all the input events and any additional events + * generated during the acquisition of dependencies. This list represents the + * prerequisites for the task to start execution. + * + * @note The function `EXPECT`s the task to be in the setup phase and the execution place + * not to be `exec_place::device_auto`. + * @note Dependencies are sorted by logical data addresses to prevent deadlocks. + * @note For tasks with multiple dependencies on the same logical data, only one + * instance of the data is used, and its access mode is determined by combining + * the access modes of all dependencies on that data. 
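// Usage sketch (consistent with the acquire() documentation below, not new library
// code): a task may list the same logical data several times; the duplicates are
// merged and their access modes combined, so read + rw collapses to a single rw
// access on one data instance.
graph_ctx ctx;
double X[128];
auto lX = ctx.logical_data(X);
ctx.task(lX.read(), lX.rw())->*[](cudaStream_t s, auto Xread, auto Xrw) {
  // Both views refer to the single instance selected for this task.
};
ctx.finalize();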
+ */ +inline event_list task::acquire(backend_ctx_untyped& ctx) +{ + EXPECT(get_task_phase() == task::phase::setup); + + const auto eplace = get_exec_place(); + EXPECT(eplace != exec_place::device_auto); + // If there are any extra dependencies to fulfill + auto result = get_input_events(); + + // Automatically set the appropriate context (device, SM affinity, ...) + pimpl->saved_place_ctx = eplace.activate(ctx); + + auto& task_deps = pimpl->deps; + + for (auto index : each(task_deps.size())) + { + assert(task_deps[index].get_data().is_initialized()); + // Save index before reordering + task_deps[index].dependency_index = static_cast(index); + // Mark up data to avoid them being reclaimed while they are going to be used anyway + task_deps[index].get_data().add_ref(); + } + + // Sort deps by logical data addresses to avoid deadlocks when multiple + // threads are going to acquire the mutexes associated to the different + // logical data + ::std::sort(task_deps.begin(), task_deps.end()); + + // Process all dependencies one by one + for (auto it = task_deps.begin(); it != task_deps.end(); ++it) + { + logical_data_untyped d = it->get_data(); + access_mode mode = it->get_access_mode(); + + auto [frozen, frozen_mode] = d.is_frozen(); + if (frozen) + { + // If we have a frozen data, we can only access it if we are making + // a read only access, and if the data was frozen in read only mode. + // Otherwise it's a mistake as we would modify some data possibly + // being used outside the task + if (!(frozen_mode == access_mode::read && mode == access_mode::read)) + { + fprintf(stderr, "Error: illegal access on frozen logical data\n"); + abort(); + } + } + + // Make sure the context of the logical data and the context of the task match + // This is done by comparing the addresses of the context states + if (ctx != d.get_ctx()) + { + fprintf(stderr, "Error: mismatch between task context and logical data context\n"); + abort(); + } + + // We possibly "merge" multiple dependencies. If they have different modes, those are combined. + // Since logical data are ordered by addresses, "mergeable" deps will be + // stored contiguously, so we can stop as soon as there is another + // dependency + for (auto next = ::std::next(it); next != task_deps.end() && *it == *next; ++it, ++next) + { + mode |= next->get_access_mode(); + it->skipped = true; + } + + /* Get of this dependency which is not skipped, and save it in a vector. We also save the equivalent merged + * mode. */ + size_t d_index = it - task_deps.begin(); + pimpl->unskipped_indexes.push_back(::std::make_pair(d_index, mode)); + + /* Make sure the logical data is locked until the task is released */ + d.get_mutex().lock(); + + // The affine data place is set at the task level, it can be inherited + // from the execution place, or be some composite data place set up in + // a parallel_for construct, for example + const data_place& dplace = it->get_dplace() == data_place::affine ? get_affine_data_place() : it->get_dplace(); + + const instance_id_t instance_id = + mode == access_mode::redux ? d.find_unused_instance_id(dplace) : d.find_instance_id(dplace); + + if (mode == access_mode::redux) + { + d.get_data_instance(instance_id).set_redux_op(it->get_redux_op()); + } + + // We will need to remind this when we need to access that + // piece of data, or when we release it to manipulate the + // proper instance. 
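// Illustrative sketch (not part of this patch): sorting dependencies by logical data
// address is the classical deadlock-avoidance discipline; every task takes the
// per-data mutexes in one global order, so two tasks can never each hold one lock
// while waiting for the other's.
#include <algorithm>
#include <mutex>
#include <vector>

inline void lock_in_address_order(::std::vector<::std::mutex*>& locks)
{
  ::std::sort(locks.begin(), locks.end()); // global order: the mutex address
  for (auto* m : locks)
  {
    m->lock();
  }
}

inline void unlock_all(::std::vector<::std::mutex*>& locks)
{
  for (auto* m : locks)
  {
    m->unlock();
  }
}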
+ it->set_instance_id(instance_id); + + // This enforces dependencies with previous tasks, and ensures the data + // instances can be used (there can be extra prereqs for an instance to + // ensure it's properly allocated for example, or if there is a pending + // copy from that instance to another, so we need to keep track of both + // task dependencies (STF) and these instance-specific dependencies. + reserved::fetch_data(ctx, d, instance_id, *this, mode, eplace, dplace, result); + } + + // In the (rare case) where there is no data dependency for a task, the + // task would still depend on the entry events of the context, if any + if ((task_deps.size() == 0) && ctx.has_start_events()) + { + result.merge(ctx.get_start_events()); + } + + // @@@@ TODO@@@@ explain this algorithm, and why we need to go in reversed + // order because we skipped equivalent dependencies that were stored + // contiguously after sorting. + instance_id_t previous_instance_id = instance_id_t::invalid; + for (auto it : each(task_deps.rbegin(), task_deps.rend())) + { + if (!it->skipped) + { + previous_instance_id = it->get_instance_id(); + } + else + { + assert(previous_instance_id != instance_id_t::invalid); + // @@@@ TODO @@@@ make a unit test to make sure we have the same instance id for different acquired_data + // ? fprintf(stderr, "SETTING SKIPPED INSTANCE ID ... %d\n", previous_instance_id); + it->set_instance_id(previous_instance_id); + } + } + + for (const auto& e : task_deps) + { + logical_data_untyped d = e.get_data(); + + if (e.get_access_mode() == access_mode::redux) + { + // Save the last task accessing the instance in with a relaxed coherency mode + d.get_data_instance(e.get_instance_id()).set_last_task_relaxed(*this); + } + + // Remove temporary reference + d.remove_ref(); + } + + auto& dot = *ctx.get_dot(); + if (dot.is_tracing()) + { + // Declare that the node identified by unique_id depends on these prereqs + result.dot_declare_prereqs(dot, get_unique_id(), 1); + } + + // We consider the setup phase is over + pimpl->phase = task::phase::running; + + return result; +} + +/** + * @brief Releases resources associated with a task and transitions it to the finished phase. + * + * This function releases a task after it has completed its execution. It merges the + * list of prerequisites (events) that are marked as done, updates the dependencies for + * the task's logical data, resets the execution context to its original configuration, + * and marks the task as finished. + * + * After calling this function, the task is considered "over" and is transitioned + * from the `running` phase to the `finished` phase. All associated resources are unlocked + * and post-submission hooks (if any) are executed. + * + * @param ctx The context of the backend, which manages the execution environment. + * @param done_prereqs A list of events that must be marked as complete before the task can be released. + * + * @pre The task must be in the `running` phase. + * @pre The task's list of prerequisites (dependencies) must be empty at the time of calling. + * + * The function performs the following actions: + * - Merges the provided list of `done_prereqs` into the task's list of prerequisites. + * - Updates logical data dependencies based on the access mode (read or write). + * - Ensures proper synchronization by setting reader/writer prerequisites on the logical data. + * - Updates internal structures to reflect that the task has become a new "leaf task." + * - Resets the execution context (device, SM affinity, etc.) 
to its previous state. + * - Unlocks mutexes for the logical data that were locked during task execution. + * - Releases any references to logical data, preventing potential memory leaks. + * - Executes any post-submission hooks attached to the task. + * + * The function also interacts with tracing and debugging tools, marking + * the task's completion and declaring the task as a prerequisite for future tasks in the trace. + * + * @note After calling this function, the task is no longer in the `running` phase + * and cannot be modified. + * + * @warning The task must have completed all its work before calling this function. + * Failure to follow the task's lifecycle correctly may lead to undefined behavior. + */ +inline void task::release(backend_ctx_untyped& ctx, event_list& done_prereqs) +{ + // After release(), the task is over + assert(get_task_phase() == task::phase::running); + assert(get_done_prereqs().size() == 0); + + auto& task_deps = pimpl->deps; + auto& cs = ctx.get_stack(); + + // We copy the list of prereqs into the task + merge_event_list(done_prereqs); + + // Get the indices of logical data which were not skipped (redundant + // dependencies are merged). + for (auto& [ind, mode] : pimpl->unskipped_indexes) + { + auto& e = task_deps[ind]; + logical_data_untyped d = e.get_data(); + + auto&& data_instance = d.get_data_instance(e.get_instance_id()); + + if (mode == access_mode::read) + { + // If we have a read-only task, we only need to make sure that write accesses waits for this task + data_instance.add_write_prereq(done_prereqs); + } + else + { + data_instance.set_read_prereq(done_prereqs); + data_instance.clear_write_prereq(); + } + + // Update last reader/writer tasks + reserved::enforce_stf_deps_after(d, *this, mode); + } + + // Automatically reset the context to its original configuration (device, SM affinity, ...) + get_exec_place().deactivate(ctx, pimpl->saved_place_ctx); + + auto& dot = *ctx.get_dot(); + if (dot.is_tracing()) + { + // These prereqs depend on the task identified by unique_id + auto& done_prereqs_ = get_done_prereqs(); + done_prereqs_.dot_declare_prereqs_from(dot, get_unique_id(), 1); + } + + // This task becomes a new "leaf task" until another task depends on it + cs.add_leaf_task(*this); + + pimpl->phase = task::phase::finished; + + /* We unlock the mutex which were locked. We only locked each logical data + * once, so merged dependencies are ignored and we loop over unskipped indices. + */ + for (auto& [ind, _] : pimpl->unskipped_indexes) + { + logical_data_untyped d = task_deps[ind].get_data(); + d.get_mutex().unlock(); + } + + // Doing this shunts a circular dependency that would otherwise leak logical_data objects. + // Note that we do this for all dependencies, including redundant ones. + for (auto& dep : task_deps) + { + dep.reset_logical_data(); + } + + /* Execute hooks (if any) */ + for (auto& hook : pimpl->post_submission_hooks) + { + hook(); + } + + // This will, in particular, release shared_ptr to logical data captured in + // the dependencies. 
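// Usage sketch (not new library code) of what the read/write prerequisite updates
// above achieve: readers only block subsequent writers, so independent read tasks
// may run concurrently while a later rw task waits for all of them.
graph_ctx ctx;
double A[1024];
auto lA = ctx.logical_data(A);
// t1 writes lA
ctx.parallel_for(blocked_partition(), exec_place::current_device(), lA.shape(), lA.rw())
    ->*[] _CCCL_HOST_DEVICE(size_t i, slice<double> A) {
          A(i) = 1.0 * i;
        };
// t2 and t3 only read lA: both wait for t1 and may overlap with each other
ctx.parallel_for(blocked_partition(), exec_place::current_device(), lA.shape(), lA.read())
    ->*[] _CCCL_HOST_DEVICE(size_t i, slice<const double> A) {
          (void) A(i);
        };
ctx.parallel_for(blocked_partition(), exec_place::current_device(), lA.shape(), lA.read())
    ->*[] _CCCL_HOST_DEVICE(size_t i, slice<const double> A) {
          (void) A(i);
        };
// t4 writes again: it waits for both t2 and t3
ctx.parallel_for(blocked_partition(), exec_place::current_device(), lA.shape(), lA.rw())
    ->*[] _CCCL_HOST_DEVICE(size_t i, slice<double> A) {
          A(i) += 1.0;
        };
ctx.finalize();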
+ pimpl->post_submission_hooks.clear(); +} + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh b/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh new file mode 100644 index 00000000000..c9bc893d6f9 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh @@ -0,0 +1,490 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Implementation of the generic event class + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cuda::experimental::stf +{ + +class event_impl; +using event = reserved::handle; + +namespace reserved +{ +using unique_id_t = unique_id; + +using event_vector = small_vector; +static_assert(sizeof(event_vector) == 120); +} // namespace reserved + +class backend_ctx_untyped; +class event_list; + +/** + * @brief This is the base event type. It defines an abstract object which + * represents that an asynchronous event happened. + * + * It is possible to insert a dependency between two events provided the + * dependency follows the chronological order of creation: an event E2 created + * after an event E1 cannot depend on E1. + * + * Inserting a dependency between two events is supposed to be a non-blocking + * operation, but the actual event implementation is allowed to block + * temporarily. + */ +class event_impl +{ +public: + /** + * @brief Deleted copy constructor to prevent copying and ensure unique event identifiers. + */ + event_impl(const event_impl&) = delete; + + /** + * @brief Deleted copy assignment operator to prevent copying and ensure unique event identifiers. + */ + event_impl& operator=(const event_impl&) = delete; + + /** + * @brief Virtual destructor to support polymorphic deletion. + */ + virtual ~event_impl() = default; + + /** + * @brief Default constructor. + */ + event_impl() = default; + + /** + * @brief Compares this event with another event for equality based on their unique identifiers. + * @param e The event to compare with. + * @return True if the events have the same unique identifier, false otherwise. + */ + bool operator==(const event_impl& e) const + { + return unique_prereq_id == e.unique_prereq_id; + } + + /** + * @brief Compares this event with another event for ordering based on their unique identifiers. + * @param e The event to compare with. + * @return True if this event's unique identifier is less than the other event's identifier, false otherwise. + */ + bool operator<(const event_impl& e) const + { + return unique_prereq_id < e.unique_prereq_id; + } + + /** + * @brief Sets a symbolic name for the event, useful for debugging or tracing. + * @param s The symbolic name to associate with this event. 
+ */ + template + void set_symbol(context_t& ctx, ::std::string s) + { + symbol = mv(s); + auto& dot = *ctx.get_dot(); + if (dot.is_tracing()) + { + dot.add_prereq_vertex(symbol, unique_prereq_id); + } + } + + /** + * @brief Optionally simplifies the event vector to remove redundant entries. + * @param unused A vector of events potentially containing redundant entries. + * @return True if redundant entries were removed and further uniqueness processing is unnecessary, false otherwise. + * @note This function provides a hook for derived classes to implement optimization strategies. + */ + virtual bool factorize(reserved::event_vector& /*unused*/) + { + return false; + } + + // stream then depends on the list of events + virtual void sync_with_stream(backend_ctx_untyped&, event_list&, cudaStream_t) const + { + fprintf(stderr, "Unsupported synchronization with stream.\n"); + abort(); + } + + // return stream then depends on the list of events + virtual event_list from_stream(backend_ctx_untyped&, cudaStream_t) const; + + /** + * @brief Retrieves the symbolic name of the event. + * @return The symbolic name of the event. Generates a default name if none was set. + */ + const ::std::string& get_symbol() const + { + if (symbol.empty()) + { + symbol = "event " + ::std::to_string(unique_prereq_id); + } + return symbol; + } + + /// A unique identifier for the event, used to ensure proper event ordering. + mutable reserved::unique_id_t unique_prereq_id; + + ::std::atomic outbound_deps = 0; + +protected: + /// The symbolic name associated with the event, mutable to allow lazy initialization. + mutable ::std::string symbol; +}; + +/** + * @brief List of events. + * + * Many CUDASTF routines take event lists as asynchronous prerequisites, and return a list of prerequisites. + */ +class event_list +{ + using event_vector = reserved::event_vector; + +public: + event_list() = default; + event_list(const event_list&) = default; + event_list(event_list&&) = default; + event_list& operator=(const event_list&) = default; + event_list& operator=(event_list&&) = default; + + /// Create a list from a single event + event_list(event e) + : payload{mv(e)} + {} + + event_list(event_vector& payload_) + : payload(mv(payload_)) + , optimized(false) + {} + + /// Add an event to an existing list + void add(event e) + { + payload.push_back(mv(e)); + optimized = payload.size() == 1; + } + + /// Optimize the list to remove redundant entries which are either + /// identical events, or events which are implicit from other events in the + /// list. + void optimize() + { + // No need to remove duplicates on a list that was already sanitized, + // and that has not been modified since + if (optimized) + { + return; + } + + assert(!payload.empty()); + + // nvtx_range r("optimize"); + + // All items will have the same (derived) event type as the type of the front element. + // If the type of the event does not implement a factorize method, a + // false value is returned (eg. 
with cudaGraphs) + bool factorized = payload.front()->factorize(payload); + + if (!factorized) + { + // This is a list of shared_ptr to events, we want to sort by events + // ID, not by addresses of the ptr + ::std::sort(payload.begin(), payload.end(), [](auto& a, auto& b) { + return *a < *b; + }); + auto new_end = unstable_unique(payload.begin(), payload.end(), [](auto& a, auto& b) { + return *a == *b; + }); + // Erase the "undefined" elements at the end of the container + payload.erase(new_end, payload.end()); + } + + optimized = true; + } + + // Introduce a dependency between the event list and the CUDA stream, this + // relies on the sync_with_stream virtual function of the event list. + void sync_with_stream(backend_ctx_untyped& bctx, cudaStream_t stream) + { + if (payload.size() > 0) + { + payload.front()->sync_with_stream(bctx, *this, stream); + } + } + + // id_to can be the id of a task or another prereq + void dot_declare_prereqs(reserved::per_ctx_dot& dot, int id_to, int array_style = 0) + { + if (!dot.is_tracing_prereqs()) + { + return; + } + + for (auto& e : payload) + { + dot.add_edge(e->unique_prereq_id, id_to, array_style); + } + } + + // id_from can be the id of a task or another prereq + void dot_declare_prereqs_from(reserved::per_ctx_dot& dot, int id_from, int array_style = 0) const + { + if (!dot.is_tracing_prereqs()) + { + return; + } + + for (auto& e : payload) + { + dot.add_edge(id_from, e->unique_prereq_id, array_style); + } + } + + /// Concatenate the content of a list into another list + /// + /// This does not depend on the actual event type + template + void merge(Ts&&... events) + { + static_assert(sizeof...(events) > 0); + + // Special case move from a single event list + if constexpr (sizeof...(Ts) == 1) + { + using First = ::std::tuple_element_t<0, ::std::tuple>; + if constexpr (!::std::is_lvalue_reference_v) + { + if (payload.empty()) + { + // Disposable copy, move from the argument and we're all done + *this = mv(events...); + return; + } + } + } + + // This will be the new size of payload with the new events in tow + const size_t new_size = payload.size() + (... + events.size()); + + // Attempt to find enough capacity in one of the added events + each_in_pack( + [&](auto&& event) { + if constexpr (!::std::is_lvalue_reference_v) + { + if (event.payload.capacity() >= new_size && payload.capacity() < new_size) + { + // Awesome, we found enough capacity. + // Swapping is fine because all elements will be merged and order is not important. 
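// Illustrative sketch (plain ::std::vector rather than the small_vector used here,
// and not part of this patch): the capacity search above avoids a reallocation by
// adopting the storage of an rvalue donor list when that donor already has room for
// the union of both lists.
#include <cstddef>
#include <vector>

inline void merge_reusing_capacity(::std::vector<int>& dst, ::std::vector<int>&& src)
{
  const size_t new_size = dst.size() + src.size();
  if (src.capacity() >= new_size && dst.capacity() < new_size)
  {
    dst.swap(src); // order is irrelevant for a set of events, so swapping is safe
  }
  // Whichever buffer ended up on the "src" side still holds the remaining elements.
  dst.insert(dst.end(), src.begin(), src.end());
}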
+ event.payload.swap(payload); + } + } + }, + ::std::forward(events)...); + + if (payload.capacity() < new_size) + { + payload.reserve(::std::max(payload.capacity() * 2, new_size)); + } + + each_in_pack( + [&](auto&& event) { + if constexpr (::std::is_lvalue_reference_v) + { + // Simply append copies of elements + payload.insert(payload.end(), event.begin(), event.end()); + } + else + { + payload.insert(payload.end(), + ::std::make_move_iterator(event.payload.begin()), + ::std::make_move_iterator(event.payload.end())); + } + }, + ::std::forward(events)...); + + assert(payload.size() == new_size); + optimized = payload.size() <= 1; + } + + template + event_list& operator+=(T&& rhs) + { + merge(::std::forward(rhs)); + return *this; + } + + /// Empty a list + void clear() + { + payload.clear(); + optimized = true; + } + + /// Get the number of events in the list + size_t size() const + { + return payload.size(); + } + + // Display the content of the event list as a string + ::std::string to_string() const + { + ::std::string result = "LIST OF EVENTS: "; + if (payload.empty()) + { + result += "(empty)\n"; + return result; + } + + for (const auto& e : payload) + { + result += ::std::to_string(int(e->unique_prereq_id)); + result += '('; + result += e->get_symbol(); + result += ") "; + } + result += "\n"; + + return result; + } + + auto begin() + { + return payload.begin(); + } + auto end() + { + return payload.end(); + } + auto begin() const + { + return payload.begin(); + } + auto end() const + { + return payload.end(); + } + + // Computes the largest prereq id + int max_prereq_id() const + { + int res = 0; + for (const auto& e : payload) + { + res = ::std::max(res, int(e->unique_prereq_id)); + } + return res; + } + +private: + // ::std::vector payload; + event_vector payload; + + /// Indicates if some factorization or removal of duplicated entries was + /// already performed on that list + bool optimized = true; +}; + +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_MSVC(4702) // unreachable code +inline event_list event_impl::from_stream(backend_ctx_untyped&, cudaStream_t) const +{ + fprintf(stderr, "Unsupported synchronization with stream.\n"); + abort(); + return event_list(); +} +_CCCL_DIAG_POP + +namespace reserved +{ + +// For counters +class join_tag +{}; + +} // end namespace reserved + +/** + * @brief Introduce a dependency from all entries of an event list to an event. + + * Makes it so that `to` depends on all of `prereq_in`. 
+ */ +template +void join(context_t& ctx, some_event& to, event_list& prereq_in) +{ + bool typechecked = false; + for (auto& item : prereq_in) + { + assert(item); + some_event* from; + if (!typechecked) + { + from = dynamic_cast(item.operator->()); + if (!from) + { + fprintf(stderr, + "Internal error: cannot dynamically cast event \"%s\" from %s to %.*s.\n", + item->get_symbol().c_str(), + typeid(decltype(*item)).name(), // Change made here + static_cast(type_name.size()), + type_name.data()); + abort(); + } + typechecked = true; + } + else + { + from = static_cast(item.operator->()); + } + reserved::counter::increment(); + to.insert_dep(ctx.async_resources(), *from); + from->outbound_deps++; + } + + // If we are making a DOT output for prereqs, we create a local list (it's easier to get the) + auto& dot = *ctx.get_dot(); + if (dot.is_tracing_prereqs()) + { + prereq_in.dot_declare_prereqs(dot, to.unique_prereq_id, 1); + } +} + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh b/cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh new file mode 100644 index 00000000000..3d174731c00 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh @@ -0,0 +1,528 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Implement classes for reusable asynchronous resources such as async_resources_handle + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include // for ::std::hash<::std::pair<::std::ptrdiff_t, ::std::ptrdiff_t>> +#include +#include + +#include +#include +#include + +namespace cuda::experimental::stf +{ + +class green_context_helper; + +// Needed to set/get affinity +class exec_place; + +/** + * @brief A class to store a CUDA stream along with a few information to avoid CUDA queries + * + * It contains + * - the stream itself, + * - a unique id (proper to CUDASTF, and only valid for streams in our pool, or equal to -1), + * - the pool associated to the unique ID, when valid + * - the device index in which the stream is + */ +struct decorated_stream +{ + decorated_stream(cudaStream_t stream = nullptr, ::std::ptrdiff_t id = -1, int dev_id = -1) + : stream(stream) + , id(id) + , dev_id(dev_id) + {} + + cudaStream_t stream = nullptr; + // Unique ID (-1 if this is not part of our pool) + ::std::ptrdiff_t id = -1; + // Device in which this stream resides + int dev_id = -1; +}; + +class async_resources_handle; + +/** + * @brief A stream_pool object stores a set of streams associated to a specific CUDA context (device, green context, + * ...) + * + * Stream pools are populated lazily. 
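// Illustrative sketch (not part of this patch): the pool below creates its CUDA
// streams on first use and hands them out round-robin; this is the same pattern
// stripped of the device-switching and green-context handling.
#include <cuda_runtime.h>
#include <mutex>
#include <vector>

class lazy_stream_pool
{
public:
  explicit lazy_stream_pool(size_t n)
      : streams(n, nullptr)
  {}

  cudaStream_t next()
  {
    ::std::lock_guard<::std::mutex> guard(mtx);
    cudaStream_t& s = streams[index];
    if (s == nullptr)
    {
      cudaStreamCreate(&s); // lazily created the first time this slot is used
    }
    index = (index + 1) % streams.size();
    return s;
  }

private:
  ::std::mutex mtx;
  ::std::vector<cudaStream_t> streams;
  size_t index = 0;
};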
+ */ +struct stream_pool +{ + stream_pool() = default; + ~stream_pool() = default; + + /** + * @brief stream_pool constructor taking a number of slots, an optional CUDA context and a device id. + * + * Streams are initialized lazily when calling next(). + */ + stream_pool(size_t n, int dev_id, CUcontext cuctx = nullptr) + : payload(n, decorated_stream(nullptr, -1, dev_id)) + , dev_id(dev_id) + , primary_ctx(cuctx) + {} + + stream_pool(stream_pool&& rhs) + { + ::std::lock_guard<::std::mutex> locker(rhs.mtx); + // ATTENTION: add to these whenever adding members + payload = mv(rhs.payload); + rhs.payload.clear(); + index = mv(rhs.index); + dev_id = mv(rhs.dev_id); + } + + /** + * @brief Get the next stream in the pool + * + * The stream is wrapped into a decorated_stream class to embed information + * such as its position in the pool, or the device id. + */ + decorated_stream next() + { + ::std::lock_guard<::std::mutex> locker(mtx); + assert(index < payload.size()); + + auto& result = payload.at(index); + + assert(result.dev_id != -1); + + /* Note that we do not check if id is initialized to a unique id (!= + * -1) because we could create pools of unregistered streams. Such + * identifiers are useful to avoid some synchronizations, but are not + * mandatory. */ + + /* Streams are created lazily, so there may be no stream yet */ + if (!result.stream) + { + if (primary_ctx) + { + cuda_safe_call(cuCtxPushCurrent(primary_ctx)); + + /* Create a CUDA stream on that context */ + cuda_safe_call(cudaStreamCreate(&result.stream)); + + CUcontext dummy; + cuda_safe_call(cuCtxPopCurrent(&dummy)); + } + else + { + const auto old_dev_id = cuda_try(); + if (old_dev_id != dev_id) + { + cuda_safe_call(cudaSetDevice(dev_id)); + } + + /* Create a CUDA stream on that device */ + cuda_safe_call(cudaStreamCreate(&result.stream)); + + if (old_dev_id != dev_id) + { + cuda_safe_call(cudaSetDevice(old_dev_id)); + } + } + } + + if (++index >= payload.size()) + { + index = 0; + } + + return result; + } + + // To iterate over all entries of the pool + using iterator = ::std::vector::iterator; + iterator begin() + { + return payload.begin(); + } + iterator end() + { + return payload.end(); + } + + /** + * @brief Number of streams in the pool + * + * CUDA streams are initialized lazily, so this gives the number of slots + * available in the pool, not the number of streams initialized. + */ + size_t size() const + { + ::std::lock_guard<::std::mutex> locker(mtx); + return payload.size(); + } + + // ATTENTION: see move ctor + mutable ::std::mutex mtx; + ::std::vector payload; + size_t index = 0; + int dev_id = 1; + CUcontext primary_ctx = nullptr; +}; + +/** + * @brief A handle which stores resources useful for an efficient asynchronous + * execution. For example this will store the pools of CUDA streams. + * + * This class relies on a PIMPL idiom and can be passed by value. Creating a + * new object of this type does not initialize any resource, as these will be + * set lazily. 
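+ *
+ * A minimal usage sketch (device 0 and the computation pool are illustrative choices):
+ *
+ * @code
+ * async_resources_handle handle;                       // no resource is created yet
+ * auto& pool = handle.get_device_stream_pool(0, true); // computation pool of device 0
+ * cudaStream_t stream = pool.next().stream;            // stream created lazily on first use
+ * @endcode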
+ */ +class async_resources_handle +{ + // TODO: optimize based on measurements + +public: + static constexpr size_t pool_size = 4; + static constexpr size_t data_pool_size = 4; + +private: + /** + * @brief A helper class to maintain a set of available IDs, and attributes IDs + */ + class id_pool + { + public: + ~id_pool() + { + assert(released.load() == current.load()); + } + + ::std::ptrdiff_t get_unique_id(size_t cnt = 1) + { + // Use fetch_add to atomically increment current and return the previous value + return current.fetch_add(cnt); + } + + void release_unique_id(::std::ptrdiff_t /* id */, size_t cnt = 1) + { + // Use fetch_add to atomically increment released + released.fetch_add(cnt); + } + + private: + // next available ID + ::std::atomic<::std::ptrdiff_t> current{0}; + // Number of IDs released, for bookkeeping + ::std::atomic<::std::ptrdiff_t> released{0}; + }; + + /** + * @brief This class implements a matrix to keep track of the previous + * synchronization that occured between each pair of streams in our pools. + * + * We record the largest event ID used to synchronized between two streams + * because synchronizing these two streams with an older event (with a lower + * ID) is implied by the previous synchronization, so it can be skipped thanks + * to stream-ordering of operations. + * + * This is implemented as a hash table where keys are pairs of IDs. + */ + class last_event_per_stream + { + public: + // We are trying to insert a dependency from the event with id event_id + // located on stream "from" to stream "dst" (stream dst waits for the + // event) + // Returned value : boolean indicating if we can skip the synchronization + bool validate_sync_and_update(::std::ptrdiff_t dst, ::std::ptrdiff_t src, int event_id) + { + // If either of the streams is not from the pool, do not skip + if (dst == -1 || src == -1) + { + return false; + } + + const auto key = ::std::pair(src, dst); + + if (auto i = interactions.find(key); i != interactions.end()) + { + // If there is already an entry, potentially update it + int& last_event_id = i->second; + if (last_event_id >= event_id) + { + // We can skip this synchronization because it was already enforced + return true; + } + last_event_id = event_id; + return false; + } + // These two streams did not interact yet, we need to create a new entry + interactions[key] = event_id; + return false; + } + + private: + // For each pair of unique IDs, we keep the last event id + ::std::unordered_map<::std::pair<::std::ptrdiff_t, ::std::ptrdiff_t>, + int, + cuda::experimental::stf::hash<::std::pair<::std::ptrdiff_t, ::std::ptrdiff_t>>> + interactions; + + ::std::mutex mtx; + }; + + // We use a pimpl idiom + class impl + { + public: + impl() + { + // Save current device so that this goes unnoticed + const int ndevices = cuda_try(); + assert(ndevices > 0); + assert(pool_size > 0); + assert(data_pool_size > 0); + + pool.resize(ndevices); + per_device_gc_helper.resize(ndevices, nullptr); + /* For every device, we keep two pools, one dedicated to computation, + * the other for auxiliary methods such as data transfers. This is intended to + * improve overlapping of transfers and computation, for example. 
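+       *
+       * For example, a task would typically pick its stream from
+       * get_device_stream_pool(dev, true) while data transfers use
+       * get_device_stream_pool(dev, false), so kernels and copies end up in
+       * different CUDA streams and may overlap.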
*/ + for (auto d : each(ndevices)) + { + stream_pool_init_internal(pool[d].first, d, pool_size); + stream_pool_init_internal(pool[d].second, d, data_pool_size); + } + } + + ~impl() + { + // Release the resources for each device (for each device, there is a + // stream_pool class to store CUDA streams for computation and another + // for data transfers) + for (auto& p : pool) + { + stream_pool_cleanup_internal(p.first); + stream_pool_cleanup_internal(p.second); + } + } + + stream_pool& get_device_stream_pool(int dev_id, bool for_computation) + { + assert(dev_id < int(pool.size())); + return for_computation ? pool[dev_id].first : pool[dev_id].second; + } + + private: + // Initialize a stream pool for the dev_id CUDA device with n slots which will be populated lazily + void stream_pool_init_internal(stream_pool& p, int dev_id, size_t n) + { + assert(n > 0); + + // Do most of the work outside the critical section + ::std::vector new_payload; + new_payload.reserve(n); + for (auto i : each(n)) + { + ::std::ignore = i; + new_payload.emplace_back(nullptr, ids.get_unique_id(), dev_id); + } + + ::std::lock_guard<::std::mutex> locker(p.mtx); + p.dev_id = dev_id; + p.payload = std::move(new_payload); + } + + void stream_pool_cleanup_internal(stream_pool& p) + { + ::std::vector goner; + { + ::std::lock_guard<::std::mutex> locker(p.mtx); + p.payload.swap(goner); + } + // Clean up outside the critical section + for (auto& e : goner) + { + ids.release_unique_id(e.id); + if (e.stream) + { + cuda_safe_call(cudaStreamDestroy(e.stream)); + } + } + } + + public: + // These are constructed and destroyed in reversed order + id_pool ids; + + // This memorize what was the last event used to synchronize a pair of streams + last_event_per_stream cached_syncs; + + // For each device, a pair of stream_pool objects, each stream_pool objects + // stores a pool of streams on this device + ::std::vector<::std::pair> pool; + + /* Store a vector of previously instantiated graphs, with the number of + * nodes, number of edges, and the executable graph */ + ::std::vector<::std::tuple>> cached_graphs; + + ::std::vector<::std::shared_ptr> per_device_gc_helper; + + mutable exec_affinity affinity; + }; + + ::std::shared_ptr pimpl; + +public: + async_resources_handle() + : pimpl(::std::make_shared()) + {} + + explicit async_resources_handle(::std::nullptr_t) {} + + explicit operator bool() const + { + return pimpl != nullptr; + } + + stream_pool& get_device_stream_pool(int dev_id, bool for_computation) + { + assert(pimpl); + return pimpl->get_device_stream_pool(dev_id, for_computation); + } + + ::std::ptrdiff_t get_unique_id(size_t cnt = 1) + { + assert(pimpl); + return pimpl->ids.get_unique_id(cnt); + } + + void release_unique_id(::std::ptrdiff_t id, size_t cnt = 1) + { + assert(pimpl); + return pimpl->ids.release_unique_id(id, cnt); + } + + bool validate_sync_and_update(::std::ptrdiff_t dst, ::std::ptrdiff_t src, int event_id) + { + assert(pimpl); + return pimpl->cached_syncs.validate_sync_and_update(dst, src, event_id); + } + + void put_graph_in_cache(size_t nnodes, size_t nedges, ::std::shared_ptr exec_g) + { + assert(pimpl); + pimpl->cached_graphs.push_back(::std::make_tuple(nnodes, nedges, mv(exec_g))); + } + + auto& get_cached_graphs() + { + assert(pimpl); + return pimpl->cached_graphs; + } + + // Get the green context helper cached for this device (or let the user initialize it) + auto& gc_helper(int dev_id) + { + assert(pimpl); + assert(dev_id < int(pimpl->per_device_gc_helper.size())); + return 
pimpl->per_device_gc_helper[dev_id]; + } + + exec_affinity& get_affinity() + { + assert(pimpl); + return pimpl->affinity; + } + + const exec_affinity& get_affinity() const + { + assert(pimpl); + return pimpl->affinity; + } + + bool has_affinity() const + { + assert(pimpl); + return pimpl->affinity.has_affinity(); + } + + void push_affinity(::std::vector<::std::shared_ptr> p) const + { + assert(pimpl); + pimpl->affinity.push(mv(p)); + } + + void push_affinity(::std::shared_ptr p) const + { + assert(pimpl); + pimpl->affinity.push(mv(p)); + } + + void pop_affinity() const + { + assert(pimpl); + pimpl->affinity.pop(); + } + + const ::std::vector<::std::shared_ptr>& current_affinity() const + { + assert(pimpl); + return pimpl->affinity.top(); + } +}; + +/* Record a user-provided CUDA stream (optimization) */ +inline decorated_stream register_stream(async_resources_handle& async_resources, cudaStream_t user_stream) +{ + // Get a unique ID + const auto id = async_resources.get_unique_id(); + const int dev_id = get_device_from_stream(user_stream); + + return decorated_stream(user_stream, id, dev_id); +} + +inline void unregister_stream(async_resources_handle& async_resources, decorated_stream& dstream) +{ + async_resources.release_unique_id(dstream.id); + // reset the decorated stream + dstream.id = -1; +} + +#ifdef UNITTESTED_FILE +/* + * This test ensures that the async_resources_handle type is default + * constructible, which for example makes it possible to use it in a raft + * handle. + */ +UNITTEST("async_resources_handle is_default_constructible") +{ + static_assert(::std::is_default_constructible::value, + "async_resources_handle must be default constructible"); +}; +#endif + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/backend_allocator_setup.cuh b/cudax/include/cuda/experimental/__stf/internal/backend_allocator_setup.cuh new file mode 100644 index 00000000000..8244e4ddfdd --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/backend_allocator_setup.cuh @@ -0,0 +1,121 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Implements code that is common to both backends but which needs full definition of backend_ctx_untyped + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +namespace cuda::experimental::stf +{ +class block_allocator_untyped; +} + +namespace cuda::experimental::stf::reserved +{ + +template +auto allocators_create_and_attach(ctx_impl_t& i, Args&&... args) +{ + /* We cannot create a block_allocator directly because the context + * is not usable yet, this will not attach the allocator. 
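+   * Instead, the freshly created allocator is pushed into
+   * attached_allocators below, so that it is properly deinitialized when
+   * the context is finalized.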
*/ + auto res = block_allocator_untyped(::std::make_shared(::std::forward(args)...)); + + /* Ensure the allocator is cleaned up when the context ends */ + i.attached_allocators.push_back(res); + + return res; +}; + +template +void backend_ctx_set_default_allocator(ctx_impl_t& i, block_allocator_untyped& uncached) +{ + const char* default_alloc_env = getenv("CUDASTF_DEFAULT_ALLOCATOR"); + if (default_alloc_env) + { + ::std::string default_alloc_str(default_alloc_env); + if (default_alloc_str == "uncached") + { + i.default_allocator = uncached; + } + else if (default_alloc_str == "cached") + { + i.default_allocator = allocators_create_and_attach(i, uncached); + } + else if (default_alloc_str == "pooled") + { + i.default_allocator = allocators_create_and_attach(i); + } + else + { + fprintf(stderr, "Error: invalid CUDASTF_DEFAULT_ALLOCATOR value.\n"); + abort(); + } + } + else + { + // Default allocator = cached + i.default_allocator = allocators_create_and_attach(i, uncached); + } + + // Make it possible to customize the context-wide default allocator (use the default one for now) + i.custom_allocator = i.default_allocator; +} + +/** + * @brief This code selects the appropriate allocators when creating contexts, + * it was moved to a separate file because the same code is used in different + * context implementations. + * + * This method is a friend of the backend_ctx_untyped class so that it may + * modify its internal structures. + */ +template +void backend_ctx_setup_allocators(ctx_impl_t& i) +{ + i.uncached_allocator = allocators_create_and_attach(i); + + // Select the default allocator based on this uncached allocator + backend_ctx_set_default_allocator(i, i.uncached_allocator); +} + +/** + * @brief Select a different uncached allocator after the context was already + * created. We change the default allocators too to reflect this new uncached + * allocator. + * + * Note : this resets the current context allocator (set_allocator) to the + * default one too. + */ +template +void backend_ctx_update_uncached_allocator(ctx_impl_t& i, block_allocator_untyped uncached_allocator) +{ + // Update the uncached allocator + i.uncached_allocator = uncached_allocator; + + backend_ctx_set_default_allocator(i, uncached_allocator); +} + +} // namespace cuda::experimental::stf::reserved diff --git a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh new file mode 100644 index 00000000000..53968c64b2a --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh @@ -0,0 +1,1110 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Implements backend_ctx which is the base of all backends, such as `stream_ctx` or `graph_ctx` + * + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include // backend_ctx::launch() uses execution_policy +#include +#include +#include // backend_ctx_untyped::impl usese machine +#include // backend_ctx_untyped::impl uses reorderer +#include +#include // backend_ctx_untyped::impl uses scheduler +#include // backend_ctx uses shape_of +#include // backend_ctx_untyped::impl has-a ctx_stack +#include +#include + +// XXX there is currently a dependency on this header for places.h +// Until we find a solution we need to include this +#include + +#include +#include +#include +#include +#include + +namespace cuda::experimental::stf +{ + +template +class logical_data; + +template +class frozen_logical_data; + +class graph_ctx; + +class null_partition; + +namespace reserved +{ + +template +class parallel_for_scope; + +template +class launch_scope; + +/** + * @brief Result of `host_launch` (below) + * + * @tparam Deps Types of dependencies + * + * @see `host_launch` + */ +template +class host_launch_scope +{ +public: + host_launch_scope(Ctx& ctx, task_dep... deps) + : ctx(ctx) + , deps(mv(deps)...) + {} + + host_launch_scope(const host_launch_scope&) = delete; + host_launch_scope& operator=(const host_launch_scope&) = delete; + // move-constructible + host_launch_scope(host_launch_scope&&) = default; + + /** + * @brief Sets the symbol for this object. + * + * This method moves the provided string into the internal symbol member and returns a reference to the current + * object, allowing for method chaining. + * + * @param s The string to set as the symbol. + * @return A reference to the current object. + */ + auto& set_symbol(::std::string s) + { + symbol = mv(s); + return *this; + } + + /** + * @brief Takes a lambda function and executes it on the host in a graph callback node. 
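+   *
+   * A usage sketch (assumes a logical_data `lX` created beforehand; purely illustrative):
+   *
+   * @code
+   * ctx.host_launch(lX.read())->*[](auto x) {
+   *   // runs on the host once the instance of lX is available there
+   *   printf("first element: %f\n", x(0));
+   * };
+   * @endcode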
+ * + * @tparam Fun type of lambda function + * @param f Lambda function to execute + */ + template + void operator->*(Fun&& f) + { + auto t = ctx.task(exec_place::host); + t.add_deps(deps); + if (!symbol.empty()) + { + t.set_symbol(symbol); + } + + t.start(); + SCOPE(exit) + { + t.end(); + }; + + auto& dot = *ctx.get_dot(); + if (dot.is_tracing()) + { + dot.template add_vertex(t); + } + + auto payload = [&]() { + if constexpr (called_from_launch) + { + return tuple_prepend(thread_hierarchy<>(), deps.instance(t)); + } + else + { + return deps.instance(t); + } + }(); + auto* wrapper = new ::std::pair{::std::forward(f), mv(payload)}; + + auto callback = [](void* untyped_wrapper) { + auto w = static_cast(untyped_wrapper); + SCOPE(exit) + { + delete w; + }; + ::std::apply(::std::forward(w->first), mv(w->second)); + }; + + if constexpr (::std::is_same_v) + { + cudaHostNodeParams params = {.fn = callback, .userData = wrapper}; + // Put this host node into the child graph that implements the graph_task<> + cuda_safe_call(cudaGraphAddHostNode(&t.get_node(), t.get_ctx_graph(), nullptr, 0, ¶ms)); + } + else + { + cuda_safe_call(cudaLaunchHostFunc(t.get_stream(), callback, wrapper)); + } + } + +private: + ::std::string symbol; + Ctx& ctx; + task_dep_vector deps; +}; + +} // end namespace reserved + +/** + * @brief Description of a CUDA kernel + * + * This is used to describe kernels passed to the `ctx.cuda_kernel` and + * `ctx.cuda_kernel_chain` API calls. + */ +struct cuda_kernel_desc +{ + template + cuda_kernel_desc(Fun func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, Args... args) + : func((const void*) func) + , gridDim(gridDim_) + , blockDim(blockDim_) + , sharedMem(sharedMem_) + { + using TupleType = ::std::tuple<::std::decay_t...>; + + // We first copy all arguments into a tuple because the kernel + // implementation needs pointers to the argument, so we cannot use + // directly those passed in the pack of arguments + auto arg_tuple = ::std::make_shared(std::forward(args)...); + + // Ensure we are packing arguments of the proper types to call func + static_assert(::std::is_invocable_v); + + // Get the address of every tuple entry + ::std::apply( + [this](auto&... elems) { + // Push back the addresses of each tuple element into the args vector + ((args_ptr.push_back(static_cast(&elems))), ...); + }, + *arg_tuple); + + // Save the tuple in a typed erased value + arg_tuple_type_erased = mv(arg_tuple); + } + + /* __global__ function */ + const void* func; + dim3 gridDim; + dim3 blockDim; + size_t sharedMem; + + // Vector of pointers to the arg_tuple which saves arguments in a typed-erased way + ::std::vector args_ptr; + +private: + ::std::shared_ptr arg_tuple_type_erased; +}; + +namespace reserved +{ + +/** + * @brief Implementation of the CUDA kernel construct + * + * If the chained flag is set, we expect to have a chain of kernels, otherwise a single kernel + */ +template +class cuda_kernel_scope +{ +public: + cuda_kernel_scope(Ctx& ctx, task_dep... deps) + : ctx(ctx) + , deps(mv(deps)...) + {} + + // Provide an explicit execution place + cuda_kernel_scope(Ctx& ctx, exec_place e_place, task_dep... deps) + : ctx(ctx) + , deps(mv(deps)...) + , e_place(mv(e_place)) + {} + + cuda_kernel_scope(const cuda_kernel_scope&) = delete; + cuda_kernel_scope& operator=(const cuda_kernel_scope&) = delete; + // move-constructible + cuda_kernel_scope(cuda_kernel_scope&&) = default; + + /** + * @brief Sets the symbol for this object. 
+ * + * This method moves the provided string into the internal symbol member and returns a reference to the current + * object, allowing for method chaining. + * + * @param s The string to set as the symbol. + * @return A reference to the current object. + */ + auto& set_symbol(::std::string s) + { + symbol = mv(s); + return *this; + } + + /** + * @brief Takes a lambda function and executes it on the host in a graph callback node. + * + * @tparam Fun type of lambda function + * @param f Lambda function to execute + */ + template + void operator->*(Fun&& f) + { + // If a place is specified, use it + auto t = e_place ? ctx.task(e_place.value()) : ctx.task(); + + t.add_deps(deps); + if (!symbol.empty()) + { + t.set_symbol(symbol); + } + + t.start(); + SCOPE(exit) + { + t.end(); + }; + + auto& dot = *ctx.get_dot(); + if (dot.is_tracing()) + { + dot.template add_vertex(t); + } + + // When chained is enable, we expect a vector of kernel description which should be executed one after the other + if constexpr (chained) + { + ::std::vector res = ::std::apply(f, deps.instance(t)); + assert(!res.empty()); + + if constexpr (::std::is_same_v) + { + // We have two situations : either there is a single kernel and we put the kernel in the context's + // graph, or we rely on a child graph + if (res.size() == 1) + { + insert_one_kernel(res[0], t.get_node(), t.get_ctx_graph()); + } + else + { + // Get the (child) graph associated to the task + auto g = t.get_graph(); + + cudaGraphNode_t n = nullptr; + cudaGraphNode_t prev_n = nullptr; + + // Create a chain of kernels + for (size_t i = 0; i < res.size(); i++) + { + if (i > 0) + { + prev_n = n; + } + + insert_one_kernel(res[i], n, g); + if (i > 0) + { + cuda_safe_call(cudaGraphAddDependencies(g, &prev_n, &n, 1)); + } + } + } + } + else + { + // Rely on stream semantic to have a dependency between the kernels + for (auto& k : res) + { + cuda_safe_call( + cudaLaunchKernel(k.func, k.gridDim, k.blockDim, k.args_ptr.data(), k.sharedMem, t.get_stream())); + } + } + } + else + { + // We have an unchained cuda_kernel, which means there is a single + // CUDA kernel described, and the function should return a single + // descriptor, not a vector + static_assert(!chained); + + cuda_kernel_desc res = ::std::apply(f, deps.instance(t)); + + if constexpr (::std::is_same_v) + { + insert_one_kernel(res, t.get_node(), t.get_ctx_graph()); + } + else + { + cuda_safe_call( + cudaLaunchKernel(res.func, res.gridDim, res.blockDim, res.args_ptr.data(), res.sharedMem, t.get_stream())); + } + } + } + +private: + /* Add a kernel to a CUDA graph given its description */ + auto insert_one_kernel(cuda_kernel_desc& k, cudaGraphNode_t& n, cudaGraph_t& g) const + { + cudaKernelNodeParams kconfig; + kconfig.blockDim = k.blockDim; + kconfig.extra = nullptr; + kconfig.func = const_cast(k.func); + kconfig.gridDim = k.gridDim; + kconfig.kernelParams = k.args_ptr.data(); + kconfig.sharedMemBytes = k.sharedMem; + cuda_safe_call(cudaGraphAddKernelNode(&n, g, nullptr, 0, &kconfig)); + } + + ::std::string symbol; + Ctx& ctx; + task_dep_vector deps; + ::std::optional e_place; +}; + +} // end namespace reserved + +/** + * @brief Unified context!!! + * + */ +class backend_ctx_untyped +{ +public: + /** + * @brief current context status + * + * We keep track of the status of context so that we do not make API calls at an + * inappropriate time, such as synchronizing twice. 
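+   *
+   * For example, finalizing the context is expected to move it from
+   * `submitted` to `finalized`, and checking the phase lets us skip or
+   * reject a second synchronization attempt.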
+ */ + enum class phase + { + setup, // the context is getting initialized + submitted, // between acquire and release + finalized, // we have called finalize + }; + +#if 0 + static ::std::string phase_to_string(phase p) { + switch (p) { + case phase::setup: return ::std::string("setup"); + case phase::submitted: return ::std::string("submitted"); + case phase::finalized: return ::std::string("finalized"); + } + return ::std::string("error"); + } +#endif + +protected: + /** + * @brief This stores states attached to any context, which are not specific to a + * given backend. For example the stack of tasks. + */ + class impl + { + public: + friend class backend_ctx_untyped; + + impl(async_resources_handle async_resources = async_resources_handle()) + : auto_scheduler(reserved::scheduler::make(getenv("CUDASTF_SCHEDULE"))) + , auto_reorderer(reserved::reorderer::make(getenv("CUDASTF_TASK_ORDER"))) + , async_resources(async_resources ? mv(async_resources) : async_resources_handle()) + , is_tracing(reserved::dot::instance().is_tracing()) + , is_tracing_prereqs(reserved::dot::instance().is_tracing_prereqs()) + { + // Enable peer memory accesses (if not done already) + reserved::machine::instance().enable_peer_accesses(); + + // If CUDASTF_DISPLAY_STATS is set to a non 0 value, record stats + const char* record_stats_env = getenv("CUDASTF_DISPLAY_STATS"); + if (record_stats_env && atoi(record_stats_env) != 0) + { + is_recording_stats = true; + } + + // We generate symbols if we may use them +#ifdef CUDASTF_DEBUG + generate_event_symbols = true; +#else + generate_event_symbols = is_tracing_prereqs; +#endif + + // Initialize a structure to generate a visualization of the activity in this context + dot = ::std::make_shared(is_tracing, is_tracing_prereqs); + + // Record it in the list of all traced contexts + reserved::dot::instance().per_ctx.push_back(dot); + } + + virtual ~impl() + { + // We can't assert here because there may be tasks inside tasks + // assert(total_task_cnt == 0 && "You created some tasks but forgot to call finalize()."); + + if (!is_recording_stats) + { + return; + } + + display_transfers(); + + fprintf(stderr, "TOTAL SYNC COUNT: %lu\n", reserved::counter::load()); + } + + impl(const impl&) = delete; + impl& operator=(const impl&) = delete; + + // Due to circular dependencies, every context defines the same, but we + // cannot implement it here. 
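+    // (Each backend's override presumably forwards to
+    // reserved::backend_ctx_update_uncached_allocator, which needs the
+    // complete backend context type and therefore cannot be called from
+    // this base class.)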
+ virtual void update_uncached_allocator(block_allocator_untyped custom) = 0; + + virtual cudaGraph_t graph() const + { + return nullptr; + } + +#if defined(_CCCL_COMPILER_MSVC) + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_MSVC(4702) // unreachable code +#endif // _CCCL_COMPILER_MSVC + virtual event_list stream_to_event_list(cudaStream_t, ::std::string) const + { + fprintf(stderr, "Internal error.\n"); + abort(); + return event_list(); + } +#if defined(_CCCL_COMPILER_MSVC) + _CCCL_DIAG_POP +#endif // _CCCL_COMPILER_MSVC + + virtual size_t epoch() const + { + return size_t(-1); + } + + auto& get_default_allocator() + { + return default_allocator; + } + + auto& get_uncached_allocator() + { + return uncached_allocator; + } + + /** + * @brief Write-back data and erase automatically created data instances + * The implementation requires logical_data_untyped_impl to be complete + */ + void erase_all_logical_data(); + + /** + * @brief Add an allocator to the vector of allocators which will be + * deinitialized when the context is finalized + */ + void attach_allocator(block_allocator_untyped a) + { + attached_allocators.push_back(mv(a)); + } + + /** + * @brief Detach all allocators previously attached in this context to + * release ressources that might have been cached + */ + void detach_allocators(backend_ctx_untyped& bctx) + { + // Deinitialize all attached allocators in reversed order + for (auto it : each(attached_allocators.rbegin(), attached_allocators.rend())) + { + stack.add_dangling_events(it->deinit(bctx)); + } + + // Erase the vector of allocators now that they were deinitialized + attached_allocators.clear(); + + stack.add_dangling_events(composite_cache.deinit()); + } + + void display_transfers() const + { + // fprintf(stderr, "display_transfers() => transfers.size() %ld\n", transfers.size()); + // if (transfers.size() == 0) + // return; + + size_t total_cnt = 0; + size_t total_bytes = 0; + + fprintf(stderr, "CTX STATS\n"); + for (auto& e : transfers) + { + ::std::pair nodes = e.first; + ::std::pair res = e.second; + fprintf(stderr, "\t%d->%d : cnt %zu (%zu bytes)\n", nodes.first, nodes.second, res.first, res.second); + total_cnt += res.first; + total_bytes += res.second; + } + + fprintf(stderr, "TOTAL: %zu transfers (%zu bytes)\n", total_cnt, total_bytes); + } + + void cleanup() + { + // assert(!stack.hasCurrentTask()); + attached_allocators.clear(); + total_task_cnt.store(0); + // Leave custom_allocator, auto_scheduler, and auto_reordered as they were. + } + + /* Current context-wide allocator (same as default_allocator unless it is changed) */ + block_allocator_untyped custom_allocator; + block_allocator_untyped default_allocator; + block_allocator_untyped uncached_allocator; + reserved::ctx_stack stack; + + // A vector of all allocators used in this ctx, so that they are + // destroyed when calling finalize() + ::std::vector attached_allocators; + reserved::composite_slice_cache composite_cache; + + ::std::unique_ptr auto_scheduler; + ::std::unique_ptr auto_reorderer; + // Stats-related stuff + ::std::unordered_map<::std::pair, + ::std::pair, + cuda::experimental::stf::hash<::std::pair>> + transfers; + bool is_recording_stats = false; + // Keep track of the number of tasks generated in the context + ::std::atomic total_task_cnt; + + // This data structure contains all resources useful for an efficient + // asynchronous execution. This will for example contain pools of CUDA + // streams which are costly to create. 
+ // + // We use an optional to avoid instantiating it until we have initialized it + async_resources_handle async_resources; + + // Should we generate a DOT output ? Should it include prereqs too ? + mutable bool is_tracing = false; + mutable bool is_tracing_prereqs = false; + + // Do we need to generate symbols for events ? This is true when we are + // in debug mode (as we may inspect structures with a debugger, or when + // generating a dot output) + bool generate_event_symbols = false; + + ::std::shared_ptr& get_dot() + { + return dot; + } + + const ::std::shared_ptr& get_dot() const + { + return dot; + } + + auto get_phase() const + { + return ctx_phase; + } + + void set_phase(backend_ctx_untyped::phase p) + { + ctx_phase = p; + } + + bool has_start_events() const + { + return stack.has_start_events(); + } + + const event_list& get_start_events() const + { + return stack.get_start_events(); + } + + private: + // Used if we print the task graph using DOT + ::std::shared_ptr dot; + + backend_ctx_untyped::phase ctx_phase = backend_ctx_untyped::phase::setup; + }; + +public: + backend_ctx_untyped() = delete; + + backend_ctx_untyped(::std::shared_ptr impl) + : pimpl(mv(impl)) + { + assert(pimpl); + } + + explicit operator bool() const + { + return pimpl != nullptr; + } + + bool operator==(const backend_ctx_untyped& rhs) const + { + return pimpl == rhs.pimpl; + } + + bool operator!=(const backend_ctx_untyped& rhs) const + { + return !(*this == rhs); + } + + async_resources_handle& async_resources() const + { + assert(pimpl); + assert(pimpl->async_resources); + return pimpl->async_resources; + } + + auto& get_stack() + { + assert(pimpl); + return pimpl->stack; + } + + bool reordering_tasks() const + { + assert(pimpl); + return pimpl->auto_reorderer != nullptr; + } + + auto& get_composite_cache() + { + return pimpl->composite_cache; + } + + ::std::pair schedule_task(const task& t) const + { + assert(pimpl); + assert(pimpl->auto_scheduler); + return pimpl->auto_scheduler->schedule_task(t); + } + + void reorder_tasks(::std::vector& tasks, ::std::unordered_map& task_map) + { + assert(pimpl); + assert(pimpl->auto_reorderer); + pimpl->auto_reorderer->reorder_tasks(tasks, task_map); + } + + void increment_task_count() + { + ++pimpl->total_task_cnt; + } + + size_t task_count() const + { + return pimpl->total_task_cnt; + } + + /* Customize the allocator used by all logical data */ + void set_allocator(block_allocator_untyped custom) + { + pimpl->custom_allocator = mv(custom); + } + + /* Customize the uncached allocator used by other allocators */ + void set_uncached_allocator(block_allocator_untyped custom) + { + pimpl->uncached_allocator = mv(custom); + } + + auto& get_allocator() + { + return pimpl->custom_allocator; + } + const auto& get_allocator() const + { + return pimpl->custom_allocator; + } + + auto& get_default_allocator() + { + return pimpl->get_default_allocator(); + } + + auto& get_uncached_allocator() + { + return pimpl->get_uncached_allocator(); + } + + void update_uncached_allocator(block_allocator_untyped uncached_allocator) + { + pimpl->update_uncached_allocator(mv(uncached_allocator)); + } + + void attach_allocator(block_allocator_untyped a) + { + pimpl->attach_allocator(mv(a)); + } + + void add_transfer(const data_place& src_node, const data_place& dst_node, size_t s) + { + if (!pimpl->is_recording_stats) + { + return; + } + ::std::pair nodes(device_ordinal(src_node), device_ordinal(dst_node)); + // Increment the count for the pair + pimpl->transfers[nodes].first++; + // Add the 
value of s to the sum for the pair + pimpl->transfers[nodes].second += s; + } + + bool is_tracing() const + { + return pimpl->is_tracing; + } + + bool is_tracing_prereqs() const + { + return pimpl->is_tracing_prereqs; + } + + bool generate_event_symbols() const + { + return pimpl->generate_event_symbols; + } + + cudaGraph_t graph() const + { + return pimpl->graph(); + } + + event_list stream_to_event_list(cudaStream_t stream, ::std::string event_symbol) const + { + assert(pimpl); + return pimpl->stream_to_event_list(stream, mv(event_symbol)); + } + + size_t epoch() const + { + return pimpl->epoch(); + } + + // protected: + impl& get_state() + { + assert(pimpl); + return *pimpl; + } + + const impl& get_state() const + { + assert(pimpl); + return *pimpl; + } + + const auto& get_dot() const + { + assert(pimpl); + return pimpl->get_dot(); + } + + auto& get_dot() + { + assert(pimpl); + return pimpl->get_dot(); + } + + template + void set_parent_ctx(parent_ctx_t& parent_ctx) + { + reserved::per_ctx_dot::set_parent_ctx(parent_ctx.get_dot(), get_dot()); + } + + auto get_phase() const + { + return pimpl->get_phase(); + } + + void set_phase(backend_ctx_untyped::phase p) + { + pimpl->set_phase(p); + } + + bool has_start_events() const + { + return pimpl->has_start_events(); + } + + const event_list& get_start_events() const + { + return pimpl->get_start_events(); + } + + // Shortcuts to manipulate the current affinity stored in the async_resources_handle of the ctx + void push_affinity(::std::vector<::std::shared_ptr> p) const + { + async_resources().push_affinity(mv(p)); + } + void push_affinity(::std::shared_ptr p) const + { + async_resources().push_affinity(mv(p)); + } + void pop_affinity() const + { + async_resources().pop_affinity(); + } + + const ::std::vector<::std::shared_ptr>& current_affinity() const + { + return async_resources().current_affinity(); + } + + const exec_place& current_exec_place() const + { + assert(has_affinity() && "current_exec_place() cannot be called without setting the affinity first."); + assert(current_affinity().size() > 0); + return *(current_affinity()[0]); + } + + bool has_affinity() const + { + return async_resources().has_affinity(); + } + + // Determines the default execution place for a given context, which + // corresponds to the execution place when no place is provided. + // + // By default, we select the current device, unless an affinity was set in the + // context, in which case we take the first execution place in the current + // places. + exec_place default_exec_place() const + { + return has_affinity() ? current_exec_place() : exec_place::current_device(); + } + + // Automatically pick a CUDA stream from the pool attached to the current + // execution place + auto pick_dstream() + { + return default_exec_place().get_stream_pool(async_resources(), true).next(); + } + cudaStream_t pick_stream() + { + return pick_dstream().stream; + } + +private: + ::std::shared_ptr pimpl; +}; + +/** + * @brief This is a placeholder class so that we can put common utilities to design a + * backend ctx. The state of the backend itself is put elsewhere. + */ +template +class backend_ctx : public backend_ctx_untyped +{ +public: + backend_ctx(::std::shared_ptr impl) + : backend_ctx_untyped(mv(impl)) + { + static_assert(sizeof(*this) == sizeof(backend_ctx_untyped), "Derived value type cannot add state."); + } + + ~backend_ctx() = default; + + /** + * @brief Returns a `logical_data` object with the given shape, tied to this graph. Initial data place is invalid. 
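+   *
+   * A usage sketch (the element type and size are illustrative). Since the
+   * initial data place is invalid, the first task accessing the data is
+   * expected to write it:
+   *
+   * @code
+   * auto lX = ctx.logical_data(shape_of<slice<double>>(1024));
+   * @endcode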
+ * + * @tparam T Underlying type for the logical data object + * @param shape shape of the created object + * @return `logical_data` usable with this graph + */ + template + cuda::experimental::stf::logical_data logical_data(shape_of shape) + { + return cuda::experimental::stf::logical_data(*this, make_data_interface(shape), data_place::invalid); + } + + template + auto logical_data(T prototype, data_place dplace = data_place::host) + { + EXPECT(dplace != data_place::invalid); + assert(self()); + return cuda::experimental::stf::logical_data(*this, make_data_interface(prototype), mv(dplace)); + } + + template + auto logical_data(T (&array)[n], data_place dplace = data_place::host) + { + EXPECT(dplace != data_place::invalid); + return logical_data(make_slice(&array[0], n), mv(dplace)); + } + + template + auto logical_data(size_t elements, Sizes... more_sizes) + { + constexpr size_t num_sizes = sizeof...(Sizes) + 1; + return logical_data(shape_of>(elements, more_sizes...)); + } + + template + auto logical_data(T* p, size_t n, data_place dplace = data_place::host) + { + EXPECT(dplace != data_place::invalid); + return logical_data(make_slice(p, n), mv(dplace)); + } + + template + frozen_logical_data freeze(cuda::experimental::stf::logical_data d, + access_mode m = access_mode::read, + data_place where = data_place::invalid) + { + return frozen_logical_data(*this, mv(d), m, mv(where)); + } + + /** + * @brief Creates a typed task on the current CUDA device + * @return An instantiation of `task` with the appropriate arguments, suitable for use with `operator->*`. + */ + template + auto task(task_dep... deps) + { + return self().task(self().default_exec_place(), mv(deps)...); + } + + /** + * @brief Creates an object able to launch a lambda function on the host. + * + * @tparam Deps Dependency types + * @param deps dependencies + * @return `host_launch_scope` ready for the `->*` operator + */ + template + auto host_launch(task_dep... deps) + { + return reserved::host_launch_scope(self(), mv(deps)...); + } + + template + auto cuda_kernel(task_dep... deps) + { + return reserved::cuda_kernel_scope(self(), mv(deps)...); + } + + template + auto cuda_kernel_chain(task_dep... deps) + { + return reserved::cuda_kernel_scope(self(), mv(deps)...); + } + + template + auto launch(thread_hierarchy_spec_t spec, exec_place e_place, task_dep... deps) + { + return reserved::launch_scope(self(), mv(spec), mv(e_place), mv(deps)...); + } + + /* Using ctx.launch with a host place */ + template + auto launch(exec_place_host, task_dep... deps) + { + return reserved::host_launch_scope(self(), mv(deps)...); + } + + /* Default execution policy, explicit place */ + // default depth to avoid breaking all codes (XXX temporary) + template + auto launch(exec_place e_place, task_dep... deps) + { + return launch(par(par()), mv(e_place), mv(deps)...); + } + + /* Default execution policy, on automatically selected device */ + template + auto launch(task_dep... deps) + { + return launch(self().default_exec_place(), mv(deps)...); + } + + auto repeat(size_t count) + { + return reserved::repeat_scope(self(), count); + } + + auto repeat(::std::function condition) + { + return reserved::repeat_scope(self(), mv(condition)); + } + + /* + * parallel_for : apply an operation over a shaped index space + */ + template + auto parallel_for(exec_place e_place, S shape, task_dep... 
deps) + { + return reserved::parallel_for_scope(self(), mv(e_place), mv(shape), mv(deps)...); + } + + template + auto parallel_for(partitioner_t, exec_place e_place, S shape, task_dep... deps) + { + return reserved::parallel_for_scope(self(), mv(e_place), mv(shape), mv(deps)...); + } + + template + auto parallel_for(exec_place_grid e_place, S shape, task_dep... deps) = delete; + + template + auto parallel_for(partitioner_t p, exec_place_grid e_place, S shape, task_dep... deps) + { + return parallel_for(mv(p), exec_place(mv(e_place)), mv(shape), mv(deps)...); + } + + template + auto parallel_for(S shape, task_dep... deps) + { + return parallel_for(self().default_exec_place(), mv(shape), mv(deps)...); + } + +private: + Engine& self() + { + static_assert(::std::is_base_of_v, Engine>); + return static_cast(*this); + } + + template + auto make_data_interface(P&&... p) + { + return ::std::make_shared>(::std::forward

(p)...); + } +}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/constants.cuh b/cudax/include/cuda/experimental/__stf/internal/constants.cuh new file mode 100644 index 00000000000..2c3cbf31fc9 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/constants.cuh @@ -0,0 +1,88 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Different constant values and some related methods + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief Encodes an access mode (read, write, redux, ...) + */ +enum class access_mode : unsigned int +{ + none = 0, + read = 1, + write = 2, + rw = 3, // READ + WRITE + redux = 4, /* operator ? */ +}; + +/** + * @brief Combines two access mode into a compatible access mode for both + * accesses (eg. read+write = rw, read+read = read) + */ +inline access_mode operator|(access_mode lhs, access_mode rhs) +{ + assert(as_underlying(lhs) < 16); + assert(as_underlying(rhs) < 16); + EXPECT(lhs != access_mode::redux); + EXPECT(rhs != access_mode::redux); + return access_mode(as_underlying(lhs) | as_underlying(rhs)); +} + +inline access_mode& operator|=(access_mode& lhs, access_mode rhs) +{ + return lhs = lhs | rhs; +} + +/** + * @brief Convert an access mode into a C string + */ +inline const char* access_mode_string(access_mode mode) +{ + switch (mode) + { + case access_mode::none: + return "none"; + case access_mode::read: + return "read"; + case access_mode::rw: + return "rw"; + case access_mode::write: + return "write"; + case access_mode::redux: + return "redux"; // op ? + default: + assert(false); + abort(); + } +} + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/cooperative_group_system.cuh b/cudax/include/cuda/experimental/__stf/internal/cooperative_group_system.cuh new file mode 100644 index 00000000000..728b129fe07 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/cooperative_group_system.cuh @@ -0,0 +1,129 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#ifdef __CUDACC__ +# include +#endif + +namespace cuda::experimental::stf::reserved +{ + +/** + * This class implements a synchronization mechanism at system scale, in particular to mimic an implementation of + * multi-device cooperative kernels. + * + * The sync() method, in particular, assumes that all kernels are running at + * the same time. It is the responsability of the caller to ensure this is + * enforced. + */ +class cooperative_group_system +{ +public: + _CCCL_HOST_DEVICE cooperative_group_system(unsigned char* hostMemoryArrivedList = nullptr) + : hostMemoryArrivedList(hostMemoryArrivedList) + {} + + ///@{ @name Host Memory Arrived List getter/setter + void set_arrived_list(unsigned char* addr) + { + hostMemoryArrivedList = addr; + } + unsigned char* get_arrived_list() const + { + return hostMemoryArrivedList; + } + ///@} + +#ifdef __CUDACC__ + _CCCL_DEVICE void sync(size_t devid, size_t ndevs) const + { + auto grid = cooperative_groups::this_grid(); + grid.sync(); + + if (ndevs > 1) + { + assert(hostMemoryArrivedList != nullptr); + } + + // One thread from each grid participates in the sync. + if (grid.thread_rank() == 0) + { + if (devid == 0) + { + // Leader grid waits for others to join and then releases them. + // Other GPUs can arrive in any order, so the leader have to wait for + // all others. + for (int i = 0; i < ndevs - 1; i++) + { + while (load_arrived(&hostMemoryArrivedList[i]) == 0) + ; + } + for (int i = 0; i < ndevs - 1; i++) + { + store_arrived(&hostMemoryArrivedList[i], 0); + } + __threadfence_system(); + } + else + { + // Other grids note their arrival and wait to be released. + store_arrived(&hostMemoryArrivedList[devid - 1], 1); + while (load_arrived(&hostMemoryArrivedList[devid - 1]) == 1) + ; + } + } + + grid.sync(); + } +#endif // __CUDACC__ + +private: + unsigned char* hostMemoryArrivedList = nullptr; ///< Pointer to the host memory synchronization list. + +#ifdef __CUDACC__ + _CCCL_DEVICE unsigned char load_arrived(unsigned char* arrived) const + { +# if __CUDA_ARCH__ < 700 + return *(volatile unsigned char*) arrived; +# else + unsigned int result; + asm volatile("ld.acquire.sys.global.u8 %0, [%1];" : "=r"(result) : "l"(arrived) : "memory"); + return result; +# endif + } + + _CCCL_DEVICE void store_arrived(unsigned char* arrived, unsigned char val) const + { +# if __CUDA_ARCH__ < 700 + *(volatile unsigned char*) arrived = val; +# else + unsigned int reg_val = val; + asm volatile("st.release.sys.global.u8 [%1], %0;" ::"r"(reg_val) "l"(arrived) : "memory"); + // Avoids compiler warnings from unused variable val. 
+ (void) (reg_val = reg_val); +# endif + } +#endif // __CUDACC__ +}; + +} // end namespace cuda::experimental::stf::reserved diff --git a/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh b/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh new file mode 100644 index 00000000000..a68f0e43640 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh @@ -0,0 +1,450 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Low-level abstractions related to `logical_data` + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include + +#include +#include + +namespace cuda::experimental::stf +{ + +class logical_data_untyped; +class data_place; +class task; +template +class shape_of; + +namespace reserved +{ + +// Helper `struct` for deducing read-only types +template +struct readonly_type_of +{ + // The result of the deduction + using type = T; +}; + +// Specialization of `readonly_type_of` for `mdspan`. +template +struct readonly_type_of> +{ + using type = mdspan; +}; + +// Helper struct to deduce read-write types. +template +struct rw_type_of +{ + using type = ::std::remove_const_t; +}; + +// Specialization of rw_type_of for `mdspan`. +template +struct rw_type_of> +{ + using type = mdspan; +}; + +// Specialization of rw_type_of for const mdspan. +template +struct rw_type_of> +{ + using type = mdspan; +}; + +} // namespace reserved + +/** + * @brief Given a type `T`, returns a type that is suitable for read-only access. + * @tparam T Type to process + */ +template +using readonly_type_of = typename reserved::readonly_type_of::type; + +/** + * @brief Given a type `T`, returns the inverse of `constify`. + * @tparam T Type to process + */ +template +using rw_type_of = typename reserved::rw_type_of::type; + +namespace reserved +{ +template +void dep_allocate( + backend_ctx_untyped& ctx, + Data& d, + access_mode mode, + const data_place& dplace, + const ::std::optional eplace, + instance_id_t instance_id, + event_list& prereqs); +} // end namespace reserved + +/** + * @brief The data_interface class defines the methods used to allocate, deallocate, or transfer a piece of data with a + * specific data interface. This could be a block of data, a CSR matrix, or any other data structure. + */ +class data_interface +{ +public: + // Noncopyable, always use via pointer + data_interface(const data_interface&) = delete; + data_interface& operator=(const data_interface&) = delete; + + /// @brief Destructor for the data_interface + virtual ~data_interface() {} + + /// @brief Default constructor + data_interface() {} + + /** + * @brief Allocate data and return a prerequisite event list. 
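+   *
+   * The allocation may be asynchronous: implementations are expected to
+   * append to `prereqs` the events that must complete before the newly
+   * allocated instance may be used.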
+ * + * @param ctx Backend context state + * @param memory_node The memory node where the data is stored + * @param instance_id The ID of the data instance + * @param s Pointer to the size of the allocated data + * @param extra_args Additional arguments required for allocation + * @param prereqs Prerequisite event list, will be updated as a side effect + */ + virtual void data_allocate( + backend_ctx_untyped& ctx, + block_allocator_untyped& custom_allocator, + const data_place& memory_node, + instance_id_t instance_id, + ::std::ptrdiff_t& s, + void** extra_args, + event_list& prereqs) = 0; + + /** + * @brief Deallocate data and return a prerequisite event list. + * + * @param ctx Backend context state + * @param memory_node The memory node where the data is stored + * @param instance_id The ID of the data instance + * @param extra_args Additional arguments required for deallocation + * @param prereqs Prerequisite event list, will be updated as a side effect + */ + virtual void data_deallocate( + backend_ctx_untyped& ctx, + block_allocator_untyped& custom_allocator, + const data_place& memory_node, + instance_id_t instance_id, + void* extra_args, + event_list& prereqs) = 0; + + /** + * @brief Copy data from one memory node to another, returning a prerequisite event list. + * + * @param ctx Backend context state + * @param dst_memory_node The destination memory node + * @param dst_instance_id The destination instance ID + * @param src_memory_node The source memory node + * @param src_instance_id The source instance ID + * @param arg Additional arguments required for copying data + * @param prereqs Prerequisite event list, will be updated as a side effect + */ + virtual void data_copy( + backend_ctx_untyped& ctx, + const data_place& dst_memory_node, + instance_id_t dst_instance_id, + const data_place& src_memory_node, + instance_id_t src_instance_id, + event_list& prereqs) = 0; + + /** + * @brief Pin host memory. + * + * @param instance_id The ID of the data instance + * @return true if the instance was pinned, false otherwise + */ + virtual bool pin_host_memory(instance_id_t /*instance_id*/) + { + return false; + } + + virtual ::std::optional get_memory_type(instance_id_t) + { + return ::std::nullopt; + }; + + /// @brief Unpin host memory. + /// + /// @param instance_id The ID of the data instance + virtual void unpin_host_memory(instance_id_t /*instance_id*/) {} + + /** + * @brief Get the hash of the data representation for the given instance ID. + * + * @param instance_id The ID of the data instance + * @return The hash of the data representation + */ + virtual size_t data_hash(instance_id_t instance_id) const = 0; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // doxygen fails to parse this + /** + * @brief Returns the size of the data represented by this logical data. + * + * This may be an approximated value as this is only used for statistical + * purposes, or for the scheduling strategies. + */ + virtual size_t data_footprint() const = 0; +#endif // DOXYGEN_SHOULD_SKIP_THIS + + /** + * @brief Get the part of the data interface that is common to all data instances. 
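+   *
+   * For example (illustrative), a logical data described by a 1D slice of
+   * doubles exposes its extents through shape<shape_of<slice<double>>>().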
+ * + * @tparam T The type of the data shape + * @return A const reference to the data shape + */ + template + const T& shape() const + { + using R = rw_type_of; + const auto& result = *static_cast(get_common_impl(typeid(R), type_name)); + if constexpr (::std::is_same_v) + { + return result; // lvalue straight into the store + } + else + { + return T(result); // rvalue + } + } + + /** + * @brief Get the part of the data interface that is specific to each data instance. + * + * @tparam T The type of the data instance + * @param instance_id The ID of the data instance + * @return A reference or value of the data instance + */ + template + decltype(auto) instance(instance_id_t instance_id) + { + using R = rw_type_of; + auto& result = *static_cast(get_instance_impl(instance_id, typeid(R), type_name)); + if constexpr (::std::is_same_v) + { + return result; // lvalue straight into the store + } + else + { + return T(result); // rvalue + } + } + + /** + * @brief Get a const reference to the data instance of the given ID. + * + * @tparam T The type of the data instance + * @param instance_id The ID of the data instance + * @return A const reference to the data instance + */ + template + const T& instance_const(instance_id_t instance_id) const + { + return *static_cast(get_instance_impl(instance_id, typeid(T), type_name)); + } + + /** + * @brief Returns the index (ID) of the data instance for this logical data in the current task context. + * If this is called outside a task, this will result in an error. + * + * @param ctx The backend context state + * @param d The logical data_untyped + * @return The ID of the data instance for this logical data + */ + template + instance_id_t get_default_instance_id(backend_ctx_untyped&, const logical_data_untyped& d, task& tp) const + { + return tp.find_data_instance_id(d); + } + +private: + /** + * @brief Get the common implementation of the data interface. + * + * @param asked_ti The type info of the requested shape + * @param tname The name of the type + * @return A pointer to the common implementation + */ + virtual const void* get_common_impl(const ::std::type_info& asked_ti, ::std::string_view tname) const = 0; + + /** + * @brief Get the instance implementation of the data interface. + * + * @param instance_id The ID of the data instance + * @param asked_ti The type info of the requested instance + * @param tname The name of the type + * @return A pointer to the instance implementation + */ + virtual void* + get_instance_impl(instance_id_t instance_id, const ::std::type_info& asked_ti, ::std::string_view tname) = 0; + + /** + * @brief Get the const instance implementation of the data interface. + * + * @param instance_id The ID of the data instance + * @param asked_ti The type info of the requested instance + * @param tname The name of the type + * @return A const pointer to the instance implementation + */ + virtual const void* + get_instance_impl(instance_id_t instance_id, const ::std::type_info& asked_ti, ::std::string_view tname) const = 0; +}; + +/** + * @brief Base implementation of data_interface using Data as constant data and PerInstanceData for each instance. + * Adds the state, implements shape and instance and leaves everything else alone. 
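+ *
+ * A custom data interface for some type T would typically derive from
+ * data_impl_base<T> and only provide the allocation, deallocation and copy
+ * methods for its layout, reusing the shape/instance bookkeeping implemented
+ * here (see the slice interfaces for concrete examples).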
+ */ +template +class data_impl_base : public data_interface +{ +public: + using element_type = T; + using shape_t = shape_of; + + explicit data_impl_base(T object) + : shape(shape_t(object)) + , prototype(mv(object)) + {} + + explicit data_impl_base(shape_of shape) + : shape(mv(shape)) + , prototype() + {} + + /*nonvirtual*/ T& instance(instance_id_t instance_id) + { + assert(instance_id != instance_id_t::invalid && "instance: Invalid argument."); + const auto i = size_t(instance_id); + if (i >= store.size()) + { + if (i == store.size()) + { + // Make sure growth is scalable + store.push_back(prototype); + } + else + { + store.resize(i + 1, prototype); + } + } + return store[i]; + } + + /*nonvirtual*/ const T& instance(instance_id_t instance_id) const + { + assert(instance_id != instance_id_t::invalid && "instance: Invalid argument."); + return store[size_t(instance_id)]; + } + + size_t data_hash(instance_id_t instance_id) const final + { + const auto& inst_id = instance(instance_id); + return ::cuda::experimental::stf::hash{}(inst_id); + } + + size_t data_footprint() const final + { + return shape.size(); + } + +protected: + const shape_t shape; + const T prototype; + ::std::deque store; + +private: + const void* get_common_impl(const ::std::type_info& asked_ti, ::std::string_view tname) const final override + { + if (typeid(shape_t) != asked_ti) + { + fprintf(stderr, + "Shape type mismatch.\nAssumed: %.*s\nActual: %.*s\n", + static_cast(type_name.size()), + type_name.data(), + static_cast(tname.size()), + tname.data()); + abort(); + } + return &shape; + } + + void* get_instance_impl(instance_id_t instance_id, const ::std::type_info& ti, ::std::string_view tname) final + { + // We pass types where we removed const qualifiers in instance() + if (ti != typeid(rw_type_of) && ti != typeid(void)) + { + fprintf(stderr, + "Data interface type mismatch.\nAssumed: %.*s\nActual: %.*s\n", + static_cast(type_name.size()), + type_name.data(), + static_cast(tname.size()), + tname.data()); + abort(); + } + return &instance(instance_id); + } + + const void* + get_instance_impl(instance_id_t instance_id, const ::std::type_info& ti, ::std::string_view tname) const final + { + // We pass types where we removed const qualifiers in instance() + if (ti != typeid(rw_type_of) && ti != typeid(void)) + { + fprintf(stderr, + "Data interface type mismatch.\nAssumed: %.*s\nActual: %.*s\n", + static_cast(type_name.size()), + type_name.data(), + static_cast(tname.size()), + tname.data()); + abort(); + } + return &instance(instance_id); + } +}; + +/** + * @brief A free function which returns the shape of a data instance (e.g. a slice) + */ +template +inline _CCCL_HOST_DEVICE shape_of shape(const T& inst) +{ + return shape_of(inst); +} + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/dot.cuh b/cudax/include/cuda/experimental/__stf/internal/dot.cuh new file mode 100644 index 00000000000..2b675ed3e3a --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/dot.cuh @@ -0,0 +1,528 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Implements the generation of a DOT file to visualize the task graph + * + * CUDASTF_DOT_FILE + * CUDASTF_DOT_IGNORE_PREREQS + * CUDASTF_DOT_COLOR_BY_DEVICE + * CUDASTF_DOT_REMOVE_DATA_DEPS + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace cuda::experimental::stf::reserved +{ + +/** + * @brief Some helper type + */ +using IntPairSet = ::std::unordered_set<::std::pair, cuda::experimental::stf::hash<::std::pair>>; + +class dot; + +class per_ctx_dot +{ +public: + per_ctx_dot(bool _is_tracing, bool _is_tracing_prereqs) + : _is_tracing(_is_tracing) + , _is_tracing_prereqs(_is_tracing_prereqs) + {} + ~per_ctx_dot() = default; + + void finish() + { + prev_oss.push_back(mv(oss)); + } + + const auto& get_streams() const + { + return prev_oss; + } + + void set_ctx_symbol(::std::string s) + { + ctx_symbol = mv(s); + } + + void add_fence_vertex(int unique_id) + { + if (getenv("CUDASTF_DOT_NO_FENCE")) + { + return; + } + + ::std::lock_guard<::std::mutex> guard(mtx); + + vertices.push_back(unique_id); + + // Add a node in the DOT file + oss << "\"NODE_" << unique_id << "\" [style=\"filled\" fillcolor=\"red\" label=\"task fence\"]\n"; + } + + void add_prereq_vertex(const ::std::string& symbol, int prereq_unique_id) + { + if (!is_tracing_prereqs()) + { + return; + } + + ::std::lock_guard<::std::mutex> guard(mtx); + + vertices.push_back(prereq_unique_id); + + set_current_color_by_device(guard); + // Add an entry in the DOT file + oss << "\"NODE_" << prereq_unique_id << "\" [label=\"" << symbol << "\", style=dashed]\n"; + } + + // Edges are not rendered directly, so that we can decide to filter out + // some of them later. They are rendered when the graph is finalized. + // + // style = 0 => plain + // style = 1 => dashed + // + // Note that while tasks are topologically ordered, when we generate graphs + // which includes internal async events, we may have (prereq) nodes which + // are not ordered, so we cannot expect "id_from < id_to" + void add_edge(int id_from, int id_to, int style = 0) + { + if (!is_tracing()) + { + return; + } + + ::std::lock_guard<::std::mutex> guard(mtx); + + if (is_discarded(id_from, guard) || is_discarded(id_to, guard)) + { + return; + } + + if (style == 1 && getenv("CUDASTF_DOT_NO_FENCE")) + { + return; + } + + auto p = ::std::pair(id_from, id_to); + + // Note that since this is a set, there is no need to check if that + // was already in the set. 
+ existing_edges.insert(p); + } + + template + void add_vertex(task_type t) + { + // Do this work outside the critical section + const auto remove_deps = getenv("CUDASTF_DOT_REMOVE_DATA_DEPS"); + + ::std::lock_guard<::std::mutex> guard(mtx); + + if (!tracing_enabled) + { + discard(t.get_unique_id()); + return; + } + + set_current_color_by_device(guard); + + vertices.push_back(t.get_unique_id()); + + // Add an entry in the DOT file + oss << "\"NODE_" << t.get_unique_id() << "\" [style=\"filled\" fillcolor=\"" << get_current_color() << "\" label=\"" + << t.get_symbol(); + + // Append the text with a list of accessed logical data, and the corresponding access modes + if (!remove_deps) + { + const auto deps = t.get_task_deps(); + for (auto& e : deps) + { + data_type d = e.get_data(); + access_mode mode = e.get_access_mode(); + size_t size = d.get_data_interface().data_footprint(); + oss << "\\n" << d.get_symbol() << "(" << access_mode_string(mode) << ")(" << size << ") "; + } + } + + oss << "\"]\n"; + } + + template + void add_vertex_timing(task_type t, float time_ms, int device = -1) + { + ::std::lock_guard<::std::mutex> guard(mtx); + + if (!tracing_enabled) + { + return; + } + + oss << "// " << t.get_unique_id() << " : mapping_id=" << t.get_mapping_id() << " time=" << time_ms + << " device=" << device << "\n"; + } + + // Take a reference to an (unused) `::std::lock_guard<::std::mutex>` to make sure someone ddid take a lock. + void set_current_color_by_device(::std::lock_guard<::std::mutex>&) + { + if (getenv("CUDASTF_DOT_COLOR_BY_DEVICE")) + { + int dev; + cuda_safe_call(cudaGetDevice(&dev)); + EXPECT(dev < sizeof(colors) / sizeof(*colors)); + current_color = colors[dev]; + } + } + + void set_current_color(const char* color) + { + if (!is_tracing()) + { + return; + } + ::std::lock_guard<::std::mutex> guard(mtx); + current_color = color; + } + + const char* get_current_color() + { + return current_color; + } + + void change_epoch() + { + if (getenv("CUDASTF_DOT_DISPLAY_EPOCHS")) + { + ::std::lock_guard<::std::mutex> guard(mtx); + prev_oss.push_back(mv(oss)); + oss.clear(); + } + } + + // Change the current tracing mode to enable or disable it dynamically + void set_tracing(bool enable) + { + tracing_enabled = enable; + } + + void discard(int id) + { + single_threaded_section guard(mtx); + discarded_tasks.insert(id); + } + bool is_discarded(int id, ::std::lock_guard<::std::mutex>&) const + { + return discarded_tasks.find(id) != discarded_tasks.end(); + } + + static void set_parent_ctx(::std::shared_ptr parent_dot, ::std::shared_ptr child_dot) + { + parent_dot->children.push_back(child_dot); + child_dot->parent = mv(parent_dot); + } + + ::std::shared_ptr parent; + ::std::vector<::std::shared_ptr> children; + + const ::std::string get_ctx_symbol() const + { + return ctx_symbol; + } + +private: + mutable ::std::string ctx_symbol; + + mutable ::std::mutex mtx; + + ::std::vector vertices; + + ::std::unordered_set discarded_tasks; + +public: + // Keep track of existing edges, to make the output possibly look better + IntPairSet existing_edges; + + bool is_tracing() const + { + return _is_tracing; + } + bool _is_tracing; + bool is_tracing_prereqs() const + { + return _is_tracing_prereqs; + } + bool _is_tracing_prereqs; + + // We may temporarily discard some tasks + bool tracing_enabled = true; + + // A palette of colors + const char* const colors[8] = {"#ff5500", "#66ccff", "#9933cc", "#00cc66", "#ffcc00", "#00b3e6", "#cc0066", "#009933"}; + + const char* current_color = "white"; + +public: // XXX 
protected, friend : dot + // string for the current epoch + mutable ::std::ostringstream oss; + // strings of the previous epochs + mutable ::std::vector<::std::ostringstream> prev_oss; +}; + +class dot : public reserved::meyers_singleton +{ +protected: + dot() + { + const char* filename = getenv("CUDASTF_DOT_FILE"); + if (!filename) + { + return; + } + + dot_filename = filename; + //::std::cout << "Creating a DOT file in " << filename << ::std::endl; + + const char* ignore_prereqs_str = getenv("CUDASTF_DOT_IGNORE_PREREQS"); + tracing_prereqs = ignore_prereqs_str && atoi(ignore_prereqs_str) == 0; + } + + ~dot() + { + finish(); + } + +public: + void + print_one_context(::std::ofstream& outFile, size_t& ctx_cnt, bool display_clusters, ::std::shared_ptr pc) + { + // Pick up an identifier in DOT (we may update this value later) + size_t ctx_id = ctx_cnt++; + if (display_clusters) + { + outFile << "subgraph cluster_" << ctx_id << " {\n"; + } + size_t epoch_cnt = pc->prev_oss.size(); + for (size_t epoch_id = 0; epoch_id < epoch_cnt; epoch_id++) + { + if (epoch_cnt > 1) + { + outFile << "subgraph cluster_" << epoch_id << "_" << ctx_id << " {\n"; + outFile << "label=\"epoch " << epoch_id << "\"\n"; + } + + outFile << pc->prev_oss[epoch_id].str(); + + if (epoch_cnt > 1) + { + outFile << "} // end subgraph cluster_" << epoch_id << "_" << ctx_id << "\n"; + } + + if (!getenv("CUDASTF_DOT_SKIP_CHILDREN")) + { + for (auto& child_pc : pc->children) + { + print_one_context(outFile, ctx_cnt, display_clusters, child_pc); + } + } + } + if (display_clusters) + { + if (pc->get_ctx_symbol().empty()) + { + outFile << "label=\"cluster_" << ctx_id << "\"\n"; + } + else + { + outFile << "label=\"" << pc->get_ctx_symbol() << "\"\n"; + } + outFile << "} // end subgraph cluster_" << ctx_id << "\n"; + } + + for (const auto& e : pc->existing_edges) + { + existing_edges.insert(e); + } + } + + // This should not need to be called explicitly, unless we are doing some automatic tests for example + void finish() + { + single_threaded_section guard(mtx); + + if (dot_filename.empty()) + { + return; + } + + for (const auto& pc : per_ctx) + { + pc->finish(); + } + + ::std::ofstream outFile(dot_filename); + if (outFile.is_open()) + { + outFile << "digraph {\n"; + size_t ctx_cnt = 0; + bool display_clusters = (per_ctx.size() > 1); + /* + * For every context, we write the description of the DAG per + * epoch. Then we write the edges after removing redundant ones. 
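+ * An edge is redundant when it is already implied transitively: if the
+ * graph contains a -> b and b -> c, a direct a -> c edge adds no ordering
+ * information and is dropped, unless CUDASTF_DOT_KEEP_REDUNDANT is set.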
+ */ + for (const auto& pc : per_ctx) + { + // If the context has a parent, it will be printed by this parent itself + if (!pc->parent) + { + print_one_context(outFile, ctx_cnt, display_clusters, pc); + } + } + + if (!getenv("CUDASTF_DOT_KEEP_REDUNDANT")) + { + remove_redundant_edges(existing_edges); + } + + /* Edges do not have to belong to the cluster (Vertices do) */ + for (const auto& [from, to] : existing_edges) + { + outFile << "\"NODE_" << from << "\" -> \"NODE_" << to << "\"\n"; + } + + outFile << "}\n"; + + outFile.close(); + } + else + { + ::std::cerr << "Unable to open file: " << dot_filename << ::std::endl; + } + + dot_filename.clear(); + } + + bool is_tracing() const + { + return !dot_filename.empty(); + } + + bool is_tracing_prereqs() + { + return tracing_prereqs; + } + + ::std::vector<::std::shared_ptr> per_ctx; + +private: + bool reachable(int from, int to, ::std::unordered_set& visited) + { + visited.insert(to); + + for (auto p : predecessors[to]) + { + if (p == from) + { + return true; + } + + if (visited.find(p) == visited.end()) + { + if (reachable(from, p, visited)) + { + return true; + } + } + } + + return false; + } + + // This method will check whether an edge can be removed when there is already a direct path + void remove_redundant_edges(IntPairSet& edges) + { + single_threaded_section guard(mtx); + // We first dump the set of edges into a map of predecessors per node + for (const auto& [from, to] : edges) + { + predecessors[to].push_back(from); + } + + // Maybe this is not the most efficient, we could use a vector of + // pair if efficiency matters here. + IntPairSet edges_cpy = edges; + + // We will put back only those needed + edges.clear(); + + for (const auto& [from, to] : edges_cpy) + { + bool keep = true; + auto& preds = predecessors[to]; + ::std::sort(preds.begin(), preds.end()); + + // To avoid checking for the same pairs many times, we keep track + // of nodes already tested + ::std::unordered_set visited; + for (auto p : preds) + { + if (reachable(from, p, visited)) + { + // Put this edge back in the set of edges + keep = false; + break; + } + } + + if (keep) + { + edges.insert(::std::pair(from, to)); + } + } + } + + bool tracing_prereqs = false; + + // Keep track of existing edges, to make the output possibly look better + IntPairSet existing_edges; + + ::std::unordered_map> predecessors; + + mutable ::std::mutex mtx; + + ::std::string dot_filename; +}; + +} // namespace cuda::experimental::stf::reserved diff --git a/cudax/include/cuda/experimental/__stf/internal/exec_affinity.cuh b/cudax/include/cuda/experimental/__stf/internal/exec_affinity.cuh new file mode 100644 index 00000000000..58ca7a25f20 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/exec_affinity.cuh @@ -0,0 +1,109 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Facilities to define the place affinity. + * + * Setting an affinity will help defining what is the default execution place. + * They are used in combination with loop_dispatch to dispatch computation over + * a set of execution places. 
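+ *
+ * A minimal usage sketch (exec_place::device() is assumed to come from the
+ * places header; only exec_affinity itself is defined here):
+ *
+ * @code
+ * exec_affinity aff;
+ * aff.push(::std::make_shared<exec_place>(exec_place::device(0)));
+ * // constructs submitted while this affinity is active default to device 0
+ * aff.pop();
+ * @endcode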
+ * + * \see loop_dispatch + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include +#include +#include +#include + +namespace cuda::experimental::stf +{ + +class exec_place; + +/** + * @brief Defines the current execution places associated with a context. + * + * When an affinity is set, the first entry of the current execution places + * defines the default execution place used in CUDASTF constructs when no + * execution place is supplied. + */ +class exec_affinity +{ +public: + exec_affinity() = default; + ~exec_affinity() = default; + + /** + * @brief Set the current affinity to a vector of execution places + */ + void push(::std::vector<::std::shared_ptr> p) + { + s.push(mv(p)); + } + + /** + * @brief Set the current affinity to a single execution place + */ + void push(::std::shared_ptr p) + { + s.push(::std::vector<::std::shared_ptr>{::std::move(p)}); + } + + /** + * @brief Restore the affinity to its value before calling push + */ + void pop() + { + s.pop(); + } + + /** + * @brief Indicates if an affinity was set or not + */ + bool has_affinity() const + { + return !s.empty(); + } + + /** + * @brief Get a reference to the vector of place pointers at the top of the stack (ie. thread's current affinity) + */ + const auto& top() const + { + return s.top(); + } + +private: + // A stack per thread + // (We use vectors of shared_ptr because exec_place implementation cannot + // be available so we rely on type erasure) + static thread_local ::std::stack<::std::vector<::std::shared_ptr>> s; +}; + +// Define the static thread_local member outside the class +inline thread_local ::std::stack<::std::vector<::std::shared_ptr>> exec_affinity::s; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/execution_policy.cuh b/cudax/include/cuda/experimental/__stf/internal/execution_policy.cuh new file mode 100644 index 00000000000..d2cc954bfa9 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/execution_policy.cuh @@ -0,0 +1,776 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** @file + * @brief Execution policy header + */ +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief Typed numeric for representing memory allocation size (in bytes) for the `thread_hierarchy_spec` API below. + * Use `mem(n)` to pass memory size requirements to the `par()` and `con()` family of functions. + * + */ +enum class mem : size_t +{ +}; + +/** + * @brief Describes the hardware scope for a given synchronization operation. 
+ * + * This `enum class` defines various hardware scopes like `none`, `device`, `block`, + * `thread`, and `all`, which are useful for specifying the level of granularity + * at which an operation should occur. + * + * Note that we use powers of two so that it is easy to implement | or & operators. + */ +enum class hw_scope : unsigned int +{ + none = 0, ///< No hardware scope. + thread = 1, ///< Thread-level scope. + block = 2, ///< Block-level scope. + device = 4, ///< Device-level scope. + all = 7 ///< All levels of hardware scope. +}; + +/** + * @brief Bitwise OR operator for combining two `hw_scope` values. + * + * This function performs bitwise OR on the underlying types of two `hw_scope` + * values. It's useful for combining multiple hardware scopes into a single + * `hw_scope` value. + * + * @param lhs The left-hand side hw_scope. + * @param rhs The right-hand side hw_scope. + * @return A new hw_scope that is the bitwise OR of lhs and rhs. + * + * @note The function includes assertions to ensure that lhs and rhs are + * within the allowed range. + */ +inline hw_scope operator|(const hw_scope& lhs, const hw_scope& rhs) +{ + assert(as_underlying(lhs) <= as_underlying(hw_scope::all)); + assert(as_underlying(rhs) <= as_underlying(hw_scope::all)); + return hw_scope(as_underlying(lhs) | as_underlying(rhs)); +} + +/** + * @brief Bitwise AND operator for combining two `hw_scope` values. + * + * This function performs bitwise AND on the underlying types of two `hw_scope` + * values. It's useful for checking if a scope if included into another `hw_scope` value. + * + * @param lhs The left-hand side hw_scope. + * @param rhs The right-hand side hw_scope. + * @return A new hw_scope that is the bitwise AND of lhs and rhs. + * + * @note The function includes assertions to ensure that lhs and rhs are + * within the allowed range. + */ +inline hw_scope operator&(const hw_scope& lhs, const hw_scope& rhs) +{ + assert(as_underlying(lhs) <= as_underlying(hw_scope::all)); + assert(as_underlying(rhs) <= as_underlying(hw_scope::all)); + return hw_scope(as_underlying(lhs) & as_underlying(rhs)); +} + +inline hw_scope operator~(const hw_scope& s) +{ + assert(as_underlying(s) <= as_underlying(hw_scope::all)); + // Keep the bits beyond `all` zero at all times, and + auto x = ~as_underlying(s) & as_underlying(hw_scope::all); + return hw_scope(x); +} + +template +class thread_hierarchy_spec; + +namespace reserved +{ + +template +struct deduce_execution_policy; + +template +struct deduce_execution_policy +{ + using type = thread_hierarchy_spec; +}; + +template +struct deduce_execution_policy +{ + using type = typename deduce_execution_policy::type; +}; + +template +struct deduce_execution_policy, Ts...> +{ + static_assert(::std::is_same_v::type, thread_hierarchy_spec>, + "Only one argument of type deduce_execution_policy<...> is allowed."); + using type = thread_hierarchy_spec; +}; + +} // namespace reserved + +template +class thread_hierarchy; + +template <> +class thread_hierarchy_spec<> +{}; + +/** + * @brief A template class for specifying a thread hierarchy. + * @tparam can_sync A boolean indicating if synchronization is possible. + * @tparam width The width of the thread group at this level (0 to set it dynamically). + * @tparam lower_levels Further specifications for lower levels of the thread hierarchy. 
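+ *
+ * As an illustration, `par(con<128>())` (see the helper functions defined
+ * below) corresponds to `thread_hierarchy_spec<false, 0, true, 128>`: an
+ * outer, non-synchronizable level of dynamic width whose inner level is a
+ * synchronizable group of 128 threads.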
+ */ +template +class thread_hierarchy_spec +{ + static_assert(sizeof...(lower_levels) % 2 == 0, "Must specify level specifications as pairs of bool and size_t."); + +public: + using thread_hierarchy_t = thread_hierarchy; + + ///@{ @name Usual constructors + thread_hierarchy_spec() = default; + thread_hierarchy_spec(const thread_hierarchy_spec&) = default; + thread_hierarchy_spec(thread_hierarchy_spec&) = default; + thread_hierarchy_spec(thread_hierarchy_spec&&) = default; + ///@} + + /** + * @brief Constructor with variadic parameters (usually no needed; use `par` and `con` below instead) + * + * Parameters are used for initializing `this` depending on their types as follows: + * - If type is `thread_hierarchy_spec`, parameter is used to initalize the inner part + * - If type is `size_t`, parameter is used to initialize the width of the object + * - If type is `mem`, parameter is used to initialize the memory (bytes) attribute of the object + * - If type is `hw_scope`, parameter is used to initialize the scope attribute of the object + * + * Parameters can be specified in any order. All are optional. It is illegal to pass the same parameter type more + * than once. + * + * @param P... Types of parameters + * @param p Values of parameters + */ + template + explicit thread_hierarchy_spec(const P&... p) + : inner(only_convertible_or(thread_hierarchy_spec(), p...)) + , dynamic_width(only_convertible_or(decltype(dynamic_width)(), p...)) + , sync_scope(only_convertible_or(hw_scope::all, p...)) + , mem_bytes(only_convertible_or(mem(0), p...)) + { + shuffled_args_check(inner, dynamic_width, sync_scope, mem_bytes); + if constexpr (sizeof...(lower_levels) > 0) + { + // Unclear about the logic here. + if (inner.get_scope(0) != hw_scope::all) + { + sync_scope = sync_scope & ~inner.get_scope(0); + } + } + } + + constexpr bool operator==(const thread_hierarchy_spec& rhs) const noexcept + { + if (dynamic_width != rhs.dynamic_width || sync_scope != rhs.sync_scope || mem_bytes != rhs.mem_bytes) + { + return false; + } + if constexpr (sizeof...(lower_levels) > 0) + { + return inner == rhs.inner; + } + else + { + return true; + } + } + + // For other types ... + template >, int> = 0> + constexpr bool operator==(const thread_hierarchy_spec&) const noexcept + { + return false; + } + + constexpr bool operator!=(const thread_hierarchy_spec& rhs) const noexcept + { + return !(*this == rhs); + } + + template >, int> = 0> + constexpr bool operator!=(const thread_hierarchy_spec&) const noexcept + { + return true; + } + + /// @brief Compute the depth of the thread hierarchy. + /// @return The depth. + static constexpr size_t depth() + { + return 1 + sizeof...(lower_levels) / 2; + } + + /// @brief Check if synchronization is possible. + /// @return A boolean indicating if synchronization is possible. + static constexpr bool synchronizable() + { + return can_sync; + } + + /** + * @brief Set the inner thread hierarchy. + * @tparam P Template arguments of the inner thread_hierarchy_spec object. + * @param inner The inner thread_hierarchy_spec object. + */ + template + void set_inner(const thread_hierarchy_spec& inner) + { + this->inner = inner; + } + + /** + * @brief Get the memory bytes at a specific level. + * @param level The level. + * @return The memory bytes. + */ + constexpr mem get_mem(size_t level) const + { + if constexpr (depth() > 1) + { + if (level > 0) + { + return inner.get_mem(level - 1); + } + } + return mem_bytes; + } + + /** + * @brief Set the memory bytes at a specific level. 
+ * @param level The level. + * @param value The memory bytes. + */ + constexpr void set_mem(size_t level, mem value) + { + if constexpr (depth() > 1) + { + if (level > 0) + { + return inner.set_mem(level - 1, value); + } + } + mem_bytes = value; + } + + /** + * @brief Checks if the given `thread_hierarchy_spec` is synchronizable at the given `level`. + * + * @tparam level The level in the hierarchy to check for the `sync` property. Level starts from 0 (top-level). + */ +#ifndef DOXYGEN_SHOULD_SKIP_THIS // doxygen fails to parse this + template + static inline constexpr bool is_synchronizable = [] { + if constexpr (level > 0) + { + return thread_hierarchy_spec::template is_synchronizable; + } + else + { + return can_sync; + } + }(); +#else + template + static inline constexpr bool is_synchronizable; +#endif + + /** + * @brief Get the statically-specified width at a specific level + * @param level The level + * @return The width (0 if width is dynamic) + */ + static inline constexpr size_t static_width(size_t level) + { + size_t data[] = {width, lower_levels...}; + return data[2 * level]; + } + + /** + * @brief Get the width at a specific level. + * @param level The level. + * @return The width. + */ + constexpr size_t get_width(size_t level) const + { + if constexpr (depth() > 1) + { + if (level > 0) + { + return inner.get_width(level - 1); + } + } + return dynamic_width.get(); + } + + /** + * @brief Set the width. + * @param new_width The new width. + */ + void set_width(size_t level, size_t new_width) + { + if constexpr (depth() > 1) + { + if (level > 0) + { + return inner.set_width(level - 1, new_width); + } + } + if constexpr (width == 0) + { + dynamic_width = new_width; + } + else + { + assert(!"Cannot set width at this level."); + } + } + + /** + * @brief Get the scope at a specific level + */ + constexpr hw_scope get_scope(size_t level) const + { + if constexpr (depth() > 1) + { + if (level > 0) + { + return inner.get_scope(level - 1); + } + } + return sync_scope; + } + +private: + /// @brief The inner thread hierarchy. + [[no_unique_address]] thread_hierarchy_spec inner; + /// @brief The dynamic width, if applicable. + [[no_unique_address]] optionally_static dynamic_width; + /// @brief Synchronization level(s) + hw_scope sync_scope = hw_scope::none; + /// @brief The memory bytes. + mem mem_bytes = mem(0); +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // doxygen fails to parse this +/** + * @brief Creates and returns a `thread_hierarchy_spec` object with no synchronization and dynamic width. + * + * Parameters are used for initializing the result depending on their types as follows: + * - If type is `thread_hierarchy_spec`, parameter is used to initalize the inner part + * - If type is `size_t`, parameter is used to initialize the width of the object + * - If type is `mem`, parameter is used to initialize the memory (bytes) attribute of the object + * - If type is `hw_scope`, parameter is used to initialize the scope attribute of the object + * + * @tparam P... Types of parameters + * @param p Values of parameters + * @return A `thread_hierarchy_spec` instantiation with the appropriate arguments. + */ +template +constexpr auto par(const P&... p) +{ + using R = typename reserved::deduce_execution_policy::type; + return R(p...); +} + +/** + * @brief Creates and returns a `thread_hierarchy_spec` object with no synchronization and static width. 
+ * + * @overload + * + * Parameters are used for initializing the result depending on their types as follows: + * - If type is `thread_hierarchy_spec`, parameter is used to initalize the inner part + * - If type is `mem`, parameter is used to initialize the memory (bytes) attribute of the object + * - If type is `hw_scope`, parameter is used to initialize the scope attribute of the object + * + * @tparam width level width + * @tparam P... Types of parameters + * @param p Values of parameters + * @return A `thread_hierarchy_spec` instantiation with the appropriate arguments. + */ +template +constexpr auto par(const P&... p) +{ + using R = typename reserved::deduce_execution_policy::type; + return R(p...); +} + +/// @{ +/** + * @brief Creates and returns a `thread_hierarchy_spec` object with synchronization and dynamic width. + * + * Parameters are used for initializing the result depending on their types as follows: + * - If type is `thread_hierarchy_spec`, parameter is used to initalize the inner part + * - If type is `size_t`, parameter is used to initialize the width of the object + * - If type is `mem`, parameter is used to initialize the memory (bytes) attribute of the object + * - If type is `hw_scope`, parameter is used to initialize the scope attribute of the object + * + * @param P... Types of parameters + * @param p Values of parameters + * @return A `thread_hierarchy_spec` instantiation with the appropriate arguments. + */ +template +constexpr auto con(const P&... p) +{ + using R = typename reserved::deduce_execution_policy::type; + return R(p...); +} + +/** + * @brief Creates and returns a `thread_hierarchy_spec` object with synchronization and static width. + * + * @overload + * + * Parameters are used for initializing the result depending on their types as follows: + * - If type is `thread_hierarchy_spec`, parameter is used to initalize the inner part + * - If type is `mem`, parameter is used to initialize the memory (bytes) attribute of the object + * - If type is `hw_scope`, parameter is used to initialize the scope attribute of the object + * + * @param P... Types of parameters + * @param p Values of parameters + * @return A `thread_hierarchy_spec` instantiation with the appropriate arguments. + */ +template +constexpr auto con(const P&... 
p) +{ + using R = typename reserved::deduce_execution_policy::type; + return R(p...); +} +/// @} +#endif // DOXYGEN_SHOULD_SKIP_THIS + +#ifdef UNITTESTED_FILE + +// clang-format off +UNITTEST("par") { + { + auto x = par(); + static_assert(::std::is_same_v>); + static_assert(x.depth() == 1); + static_assert(!thread_hierarchy_spec::template is_synchronizable<0>); + static_assert(x.static_width(0) == 0); + ::std::ignore = x; + } + + { + auto x = par<1024>(); + static_assert(::std::is_same_v>); + static_assert(x.depth() == 1); + ::std::ignore = x; + } + + { + auto x = par(1024); + static_assert(::std::is_same_v>); + static_assert(x.depth() == 1); + ::std::ignore = x; + assert(x.get_width(1) == 1024); + } + + { + auto x = par(par()); + static_assert(::std::is_same_v>); + static_assert(x.depth() == 2); + static_assert(!thread_hierarchy_spec::template is_synchronizable<0>); + static_assert(!thread_hierarchy_spec::template is_synchronizable<1>); + ::std::ignore = x; + } + + { + auto x = par<1024>(par()); + static_assert(::std::is_same_v>); + static_assert(x.depth() == 2); + assert(x.get_width(0) == 1024); + ::std::ignore = x; + } + + { + auto x = par(par(), 512); + static_assert(::std::is_same_v>); + static_assert(x.depth() == 2); + assert(x.get_width(0) == 512); + ::std::ignore = x; + } + + { + auto x = par(par(), mem(512)); + static_assert(::std::is_same_v>); + static_assert(x.depth() == 2); + ::std::ignore = x; + } + + { + auto x = par<256>(par(), mem(512)); + static_assert(::std::is_same_v>); + static_assert(x.depth() == 2); + ::std::ignore = x; + } + + { + auto x = par(par<256>(), 1024, mem(512)); + static_assert(::std::is_same_v>); + static_assert(x.depth() == 2); + assert(x.get_width(0) == 1024); + assert(x.get_width(1) == 256); + assert(x.get_mem(0) == mem(512)); + ::std::ignore = x; + } +}; + +UNITTEST("con") { + { + auto x = con(); + static_assert(::std::is_same_v>); + static_assert(x.depth() == 1); + static_assert(thread_hierarchy_spec::template is_synchronizable<0>); + ::std::ignore = x; + } + + { + auto x = con<1024>(); + static_assert(::std::is_same_v>); + static_assert(x.depth() == 1); + ::std::ignore = x; + } + + { + auto x = con(1024); + static_assert(::std::is_same_v>); + static_assert(x.depth() == 1); + ::std::ignore = x; + assert(x.get_width(1) == 1024); + } + + { + auto x = con(con()); + static_assert(::std::is_same_v>); + static_assert(x.depth() == 2); + ::std::ignore = x; + } + + { + auto x = con<1024>(con()); + static_assert(::std::is_same_v>); + static_assert(x.depth() == 2); + assert(x.get_width(0) == 1024); + ::std::ignore = x; + } + + { + auto x = con(con(), 512); + static_assert(::std::is_same_v>); + static_assert(x.depth() == 2); + assert(x.get_width(0) == 512); + ::std::ignore = x; + } + + { + auto x = con(con(), mem(512)); + static_assert(::std::is_same_v>); + static_assert(x.depth() == 2); + ::std::ignore = x; + } + + { + auto x = con<256>(con(), mem(512)); + static_assert(::std::is_same_v>); + static_assert(x.depth() == 2); + ::std::ignore = x; + } + + { + auto x = con(con<256>(), 1024, mem(512)); + static_assert(::std::is_same_v>); + static_assert(x.depth() == 2); + assert(x.get_width(0) == 1024); + assert(x.get_width(1) == 256); + assert(x.get_mem(0) == mem(512)); + ::std::ignore = x; + } +}; +// clang-format on + +// unittest for core.h that can't be there +UNITTEST("optionally_static") +{ + optionally_static v1; + static_assert(v1.get() == 42); + static_assert(v1 == v1); + static_assert(v1 == 42UL); + + optionally_static v2; + static_assert(v2.get() == 43UL); 
+ + optionally_static v3; + EXPECT(v3.get() == 0); + v3 = 44; + EXPECT(v3.get() == 44UL); + +# if 0 + // TODO clarify these tests ! + + // Make sure the size is optimized properly + struct S1 + { + [[no_unique_address]] optionally_static x; + int y; + }; + static_assert(sizeof(S1) == sizeof(int)); + struct S2 + { + int y; + [[no_unique_address]] optionally_static x; + }; + static_assert(sizeof(S1) == sizeof(int)); +# endif + + // Multiplication + static_assert(v1 * v1 == 42UL * 42UL); + static_assert(v1 * v2 == 42UL * 43UL); + static_assert(v1 * 44 == 42UL * 44UL); + static_assert(44 * v1 == 42UL * 44UL); + EXPECT(v1 * v3 == 42 * 44); + + // Odds and ends + optionally_static<3, 18> v4; + optionally_static<6, 18> v5; + static_assert(v4 * v5 == 18UL); + static_assert(v4 * v5 == (optionally_static<18, 18>(18))); + +// TODO solve these there are some ambiguities ! +# if 0 + // Mutating operators + optionally_static<1, 1> v6; + v6 += v6; + EXPECT(v6 == 0); + v6 += 2; + EXPECT(v6 == 2); + v6 -= 1; + EXPECT(v6 == 1); + v6++; + ++v6; + EXPECT(v6 == 3); + --v6; + v6--; + EXPECT(v6 == 1); + EXPECT(-v6 == -1); +# endif +}; + +UNITTEST("thread hierarchy spec equality") +{ + EXPECT(par() == par()); + EXPECT(con() == con()); + EXPECT(con<128>() == con<128>()); + EXPECT(con(128) == con(128)); + + static_assert(con<128>() == con<128>()); + static_assert(con<128>() != con<64>()); + static_assert(con() != par()); + + EXPECT(par() != con()); + + EXPECT(par(par<256>(), 1024, mem(512)) == par(par<256>(), 1024, mem(512))); + + // Change one of the par to con + EXPECT(par(par<256>(), 1024, mem(512)) != par(con<256>(), 1024, mem(512))); + + EXPECT(con(mem(512)) != con()); + EXPECT(con(mem(512)) != con(mem(128))); +}; + +UNITTEST("spec with scope") +{ + auto spec = par(hw_scope::device | hw_scope::block, par<128>(hw_scope::thread)); + + EXPECT(spec.get_scope(0) == (hw_scope::device | hw_scope::block)); + EXPECT(spec.get_scope(1) == hw_scope::thread); +}; + +UNITTEST("spec with scope and implicit scopes") +{ + auto spec = par(par<128>(hw_scope::thread)); + + EXPECT(spec.get_scope(1) == hw_scope::thread); + + // Since the second level is bound to the thread scope, level 0 should be + // bound to blocks and devices instead. 
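+  // Equivalently (this follows from the scope adjustment performed in the
+  // thread_hierarchy_spec constructor): only blocks and devices remain.
+  EXPECT(spec.get_scope(0) == (hw_scope::device | hw_scope::block));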
+ EXPECT((spec.get_scope(0) & hw_scope::thread) == hw_scope::none); +}; + +UNITTEST("thread hierarchy spec equality") +{ + EXPECT(par() == par()); + EXPECT(con() == con()); + EXPECT(con<128>() == con<128>()); + EXPECT(con(128) == con(128)); + + static_assert(con<128>() == con<128>()); + static_assert(con<128>() != con<64>()); + static_assert(con() != par()); + + EXPECT(par() != con()); + + EXPECT(par(par<256>(), 1024, mem(512)) == par(par<256>(), 1024, mem(512))); + + // Change one of the par to con + EXPECT(par(par<256>(), 1024, mem(512)) != par(con<256>(), 1024, mem(512))); + + EXPECT(con(mem(512)) != con()); + EXPECT(con(mem(512)) != con(mem(128))); +}; + +UNITTEST("spec with scope") +{ + auto spec = par(hw_scope::device | hw_scope::block, par<128>(hw_scope::thread)); + + EXPECT(spec.get_scope(0) == (hw_scope::device | hw_scope::block)); + EXPECT(spec.get_scope(1) == hw_scope::thread); +}; + +#endif // UNITTESTED_FILE + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/frozen_logical_data.cuh b/cudax/include/cuda/experimental/__stf/internal/frozen_logical_data.cuh new file mode 100644 index 00000000000..0623066e254 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/frozen_logical_data.cuh @@ -0,0 +1,206 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Definition of `frozen_logical_data` + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +namespace cuda::experimental::stf +{ + +template +class logical_data; + +/** + * @brief Frozen logical data are logical data which can be accessed outside tasks + * + * They are created using ctx.freeze, which returns a frozen_logical_data + * object. The get() and unfreeze() method allow to get an instance of the + * frozen data that is valid until unfreeze is called. + */ +template +class frozen_logical_data +{ +private: + class impl + { + public: + impl(backend_ctx_untyped bctx_, logical_data ld_, access_mode m_, data_place place_) + : ld(mv(ld_)) + , m(m_) + , place(mv(place_)) + , bctx(mv(bctx_)) + { + ld.freeze(m, place); + + // A fake task is used to store output dependencies, so that future + // tasks (or frozen data) depending on the unfreeze operation might + // depend on something. No task is launch, and we simply use it for + // this purpose. + fake_task.set_symbol("FREEZE\n" + ld.get_symbol() + "(" + access_mode_string(m) + ")"); + + auto& dot = bctx.get_dot(); + if (dot->is_tracing()) + { + dot->template add_vertex(fake_task); + } + } + + /** + * @brief Get the instance of a frozen data on a data place. It returns + * the instance and the corresponding prereqs. 
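+ *
+ * From the public wrapper this is typically used as follows (a sketch; the
+ * slice type, the exact freeze signature and data_place::current_device()
+ * are assumptions based on the class-level description above):
+ *
+ * @code
+ * auto f  = ctx.freeze(lX, access_mode::read, data_place::current_device());
+ * auto dX = f.get(data_place::current_device(), stream); // instance is valid from here
+ * // user-managed CUDA work submitted on `stream` may read dX here
+ * f.unfreeze(stream);                                     // instance is no longer valid
+ * @endcode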
+ */ + ::std::pair get(data_place place_) + { + auto result = ld.template get_frozen(fake_task, mv(place_), m); + + auto& dot = bctx.get_dot(); + if (dot->is_tracing_prereqs()) + { + for (auto& e : result.second) + { + int fake_task_id = fake_task.get_unique_id(); + dot->add_edge(e->unique_prereq_id, fake_task_id, 1); + } + } + + /* Use the ID of the fake task to identify "get" events. This makes + * it possible to automatically synchronize with these events when calling + * task_fence. */ + bctx.get_stack().add_pending_freeze(fake_task, result.second); + + return mv(result); + } + + T get(data_place place_, cudaStream_t stream) + { + // Get the tuple and synchronize it with the user-provided stream + ::std::pair p = get(mv(place_)); + auto& prereqs = p.second; + prereqs.sync_with_stream(bctx, stream); + return p.first; + } + + void unfreeze(event_list prereqs) + { + auto& dot = bctx.get_dot(); + if (dot->is_tracing_prereqs()) + { + int fake_task_id = fake_task.get_unique_id(); + for (const auto& out_e : prereqs) + { + dot->add_edge(fake_task_id, out_e->unique_prereq_id, 1); + } + } + + // There is no need to automatically synchronize with the get() operation in task_fence now + bctx.get_stack().remove_pending_freeze(fake_task); + + fake_task.merge_event_list(prereqs); + ld.unfreeze(fake_task, mv(prereqs)); + } + + void unfreeze(cudaStream_t stream) + { + event_list prereqs = bctx.stream_to_event_list(stream, "unfreeze"); + unfreeze(mv(prereqs)); + + fake_task.clear(); + } + + void set_automatic_unfreeze(bool flag = true) + { + ld.set_automatic_unfreeze(fake_task, flag); + } + + private: + logical_data ld; + access_mode m; + data_place place; + backend_ctx_untyped bctx; + + // This is used internally to keep track of the dependencies of the + // unfreeze operations, so that future operations on the logical data + // may depend on them + task fake_task; + }; + +public: + frozen_logical_data(backend_ctx_untyped bctx, logical_data ld, access_mode m, data_place place) + : pimpl(::std::make_shared(mv(bctx), mv(ld), m, mv(place))) + {} + + // So that we can have a frozen data variable that is populated later + frozen_logical_data() = default; + + // Copy constructor + frozen_logical_data(const frozen_logical_data& other) = default; + + // Move constructor + frozen_logical_data(frozen_logical_data&& other) noexcept = default; + + // Copy assignment + frozen_logical_data& operator=(const frozen_logical_data& other) = default; + + // Move assignment + frozen_logical_data& operator=(frozen_logical_data&& other) noexcept = default; + + ::std::pair get(data_place place) + { + assert(pimpl); + return pimpl->get(mv(place)); + } + + T get(data_place place, cudaStream_t stream) + { + assert(pimpl); + return pimpl->get(mv(place), stream); + } + + void unfreeze(event_list prereqs) + { + assert(pimpl); + pimpl->unfreeze(mv(prereqs)); + } + + void unfreeze(cudaStream_t stream) + { + assert(pimpl); + pimpl->unfreeze(stream); + } + + frozen_logical_data& set_automatic_unfreeze(bool flag = true) + { + assert(pimpl); + pimpl->set_automatic_unfreeze(flag); + return *this; + } + +private: + ::std::shared_ptr pimpl = nullptr; +}; + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/hashtable_linearprobing.cuh b/cudax/include/cuda/experimental/__stf/internal/hashtable_linearprobing.cuh new file mode 100644 index 00000000000..7a5dac9abde --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/hashtable_linearprobing.cuh @@ -0,0 +1,266 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief A simple hashtable implementation based on https://nosferalatu.com/SimpleGPUHashTable.html + * The goal of this class is to illustrate the extensibility of our data interface mechanism. + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +namespace cuda::experimental::stf +{ + +namespace reserved +{ + +const uint32_t kEmpty = 0xffffffff; + +/** + * @brief Key/value pair for storing in a hashtable in device memory + * + */ +class KeyValue +{ +public: + ///@{ @name Constructors + KeyValue() = default; + _CCCL_HOST_DEVICE KeyValue(uint32_t key, uint32_t value) + : key(key) + , value(value) + {} + ///@} + uint32_t key; + uint32_t value; +}; + +/* Default capacity */ +const ::std::uint32_t kHashTableCapacity = 64 * 1024 * 1024; + +} // end namespace reserved + +/** + * @brief A simple hashtable that maps `size_t` to `size_t` + * + */ +class hashtable +{ +public: + /** @brief Default constructor (runs on host) */ + hashtable(uint32_t capacity = reserved::kHashTableCapacity) + : capacity(capacity) + { + init(); + } + + /** @brief Constructor from pointer to `KeyValue` (runs on host or device) */ + _CCCL_HOST_DEVICE hashtable(reserved::KeyValue* addr, uint32_t capacity = reserved::kHashTableCapacity) + : addr(addr) + , capacity(capacity) + {} + + hashtable(const hashtable&) = default; + + _CCCL_HOST_DEVICE ~hashtable() + { + if (automatically_allocated) + { + // TODO destroy + } + } + + /** + * @brief TODO + * + */ + void cpu_cat() const + { + for (size_t i = 0; i < capacity; i++) + { + if (addr[i].value != reserved::kEmpty) + { + fprintf(stderr, "VALID ENTRY at slot %zu, value %d key %d\n", i, addr[i].value, addr[i].key); + } + } + } + + /** + * @brief Get the entry with the requested key, if any + * + */ + _CCCL_HOST_DEVICE uint32_t get(uint32_t key) const + { + uint32_t slot = hash(key); + while (true) + { + uint32_t prev = addr[slot].key; + if (prev == reserved::kEmpty || prev == key) + { + return addr[slot].value; + } + slot = (slot + 1) & (capacity - 1); + } + } + + /** + * @brief TODO + * + */ + _CCCL_HOST_DEVICE void insert(uint32_t key, uint32_t value) + { + insert(reserved::KeyValue(key, value)); + } + + /** + * @brief Introduce a pair of key/value in a hashtable + * + */ + _CCCL_HOST_DEVICE void insert(const reserved::KeyValue& kvs) + { + uint32_t key = kvs.key; + uint32_t value = kvs.value; + uint32_t slot = hash(key); + + while (true) + { +#if defined(__CUDA_ARCH__) + uint32_t prev = atomicCAS(&addr[slot].key, reserved::kEmpty, key); +#else + uint32_t prev = addr[slot].key; +#endif + if (prev == reserved::kEmpty || prev == key) + { + addr[slot].value = value; +#if !defined(__CUDA_ARCH__) + addr[slot].key = key; +#endif + // printf("INSERT VALUE %d key %d at slot %d\n", value, key, slot); + return; + } + slot = 
(slot + 1) & (capacity - 1); + } + } + + reserved::KeyValue* addr; + + _CCCL_HOST_DEVICE size_t get_capacity() const + { + return capacity; + } + +private: + mutable size_t capacity; + + // 32 bit Murmur3 hash + inline _CCCL_HOST_DEVICE uint32_t hash(uint32_t k) const + { + k ^= k >> 16; + k *= 0x85ebca6b; + k ^= k >> 13; + k *= 0xc2b2ae35; + k ^= k >> 16; + return k & (capacity - 1); + } + + // Initialization of the table (host memory) + void init() + { + size_t sz = capacity * sizeof(reserved::KeyValue); + cuda_try(cudaHostAlloc(&addr, sz, cudaHostAllocMapped)); + memset(addr, 0xff, sz); + automatically_allocated = true; + } + + bool automatically_allocated = false; +}; + +template +class shape_of; + +/** + * @brief defines the shape of a hashtable + * + * @extends shape_of + */ +template <> +class shape_of +{ +public: + /** + * @brief Initialize with a specific capacity + */ + shape_of(uint32_t capacity = reserved::kHashTableCapacity) + : capacity(capacity) + {} + + /** + * @name Copies a shape. + * + * All `shape_of` specializations must define this constructor. + */ + shape_of(const shape_of&) = default; + + /** + * @brief Extracts the shape from a hashtable + * + * @param h hashtable to get the shape from + * + * All `shape_of` specializations must define this constructor. + */ + shape_of(const hashtable& h) + : shape_of(static_cast(h.get_capacity())) + {} + + // This size() value does not correspond to the actual amount of bytes + // transfered, but this value is only used for stat / scheduling purposes. + // We may just have an approximate reporting ... + // + // In practice, the current implementation effectively transfers the whole + // array so it is correct. + _CCCL_HOST_DEVICE size_t size() const + { + return capacity * sizeof(uint32_t); + } + + _CCCL_HOST_DEVICE uint32_t get_capacity() const + { + return capacity; + } + +private: + mutable uint32_t capacity; +}; + +template <> +struct hash +{ + ::std::size_t operator()(hashtable const& s) const noexcept + { + return ::std::hash{}(s.addr); + } +}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/hooks.cuh b/cudax/include/cuda/experimental/__stf/internal/hooks.cuh new file mode 100644 index 00000000000..a32b39a74ad --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/hooks.cuh @@ -0,0 +1,222 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +namespace cuda::experimental::stf +{ + +namespace reserved +{ + +/* + * When we dump the content of the logical data, we put them in files which are + * automatically named according to this counter. By having such a determinitic + * counter shared by all contexts, we can compare the content of files with the + * same index during different executions. 
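+ * For example, the first piece of data dumped during a run is always written
+ * to file "0" in the dump directory, the second to "1", and so on, so two
+ * runs can be compared file by file.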
+ */ +class dump_hook_cnt : public reserved::meyers_singleton +{ +protected: + dump_hook_cnt() + { + cnt = 0; + } + + ~dump_hook_cnt() = default; + +public: + static int get() + { + return instance().cnt++; + } + +private: + int cnt; +}; +} // namespace reserved + +template +void data_dump(Unknown, ::std::ostream& file = ::std::cerr) +{ + file << "Dunno how to dump object of type " << type_name << ".\n"; +} + +template +size_t data_hash(Unknown) +{ + return 0; +} + +namespace reserved +{ +inline void create_dump_dir(const ::std::string& dump_dir) +{ + // Create the directory (no-op if it already exists) + if (::std::filesystem::create_directories(dump_dir)) + { + //::std::cout << "Directory \"" << dump_dir << "\" was created successfully." << ::std::endl; + } + else + { + if (!::std::filesystem::exists(dump_dir)) + { + ::std::cerr << "An error occurred while trying to create the dump_dir \"" << dump_dir << "\"." << ::std::endl; + abort(); + } + } +} + +inline void ensure_directory_exists(const ::std::string& dir_path) +{ + // Check if the directory exists + if (!::std::filesystem::exists(dir_path)) + { + ::std::cerr << "Directory \"" << dir_path << "\" does not exist." << ::std::endl; + abort(); + } +} + +/* Compute a vector of hooks to dump modified logical data (using + * typed-erased hooks). This will generate one host_launch task for each + * modified logical data after task submission. */ +template +static ::std::vector<::std::function> get_dump_hooks(ctxt_t* ctx, task_dep... deps) +{ + ::std::vector<::std::function> hooks; + + // If the CUDASTF_AUTO_DUMP is not set, or set to 0, we don't save the content + const char* dump_str = ::std::getenv("CUDASTF_AUTO_DUMP"); + bool dump = dump_str && atoi(dump_str) != 0; + + const char* compare_str = ::std::getenv("CUDASTF_AUTO_COMPARE"); + bool compare = compare_str && atoi(compare_str) != 0; + + if (!dump && !compare) + { + return hooks; + } + + bool hash_only = ::std::getenv("CUDASTF_AUTO_DUMP_ONLY_HASH"); + + // Where do we write dumped content ? We postpone the creation of this + // directory to the first time we need to create a directory to avoid + // creating an empty dir if no data was dumped + const char* dump_dir_env = ::std::getenv("CUDASTF_AUTO_DUMP_DIR"); + ::std::string dump_dir = (dump_dir_env != nullptr) ? dump_dir_env : "dump/"; + + // For every dependency, we create a hook to dump the content of the + // logical data if it was modified. + auto dump_dep = [&, dump_dir](auto dep) { + auto dep_ld = dep.get_data(); + if (dep.get_access_mode() != access_mode::read && dep_ld.get_auto_dump()) + { + auto ro_dep = dep.as_read_mode(); + + /* We either make sure the directory exists or lazily create it if + * we need to add content when dumping data */ + if (compare) + { + ensure_directory_exists(dump_dir); + } + else + { + create_dump_dir(dump_dir); + } + + // Create a hook that will be executed after the submission of the + // tasks: this will submit a host callback to write the content in + // a file + auto h = [ctx, ro_dep, dump_dir, hash_only, compare]() { + // Get the next counter (to have a repeatable order) + int cnt = reserved::dump_hook_cnt::get(); + ::std::string filePath = dump_dir + "/" + ::std::to_string(cnt); + + if (compare) + { + // Instead of using a host callback which might have had + // better performance, we use a task and a synchronization + // because it is easier to break on errors with a debugger + // when a mismatch is found. 
+ ctx->task(exec_place::host, ro_dep).set_symbol("compare " + ::std::to_string(cnt)) + ->*[filePath](cudaStream_t stream, auto s) { + cuda_safe_call(cudaStreamSynchronize(stream)); + ::std::ifstream f(filePath); + if (!f.is_open()) + { + ::std::cerr << "Failed to open " << filePath << ::std::endl; + abort(); + } + + size_t saved_hash; + f >> saved_hash; + f.close(); + + size_t computed_hash = data_hash(s); + if (computed_hash != saved_hash) + { + ::std::cerr << "Hash mismatch : computed = " << computed_hash << ", saved = " << saved_hash + << " in " << filePath << ::std::endl; + if (getenv("CUDASTF_AUTO_COMPARE_ABORT_ON_ERRORS")) + { + abort(); + } + } + }; + } + else + { + ctx->host_launch(ro_dep).set_symbol("dump " + ::std::to_string(cnt))->*[filePath, hash_only](auto s) { + ::std::ofstream f(filePath); + if (!f.is_open()) + { + ::std::cerr << "Failed to open " << filePath << ::std::endl; + abort(); + } + // Compute a hash of the content, to easily compare equality + size_t hsh = data_hash(s); + f << hsh << ::std::endl; + + if (!hash_only) + { + // Dump the actual data content (may be very large) + data_dump(s, f); + } + f.close(); + }; + } + }; + hooks.push_back(h); + } + }; + ::std::ignore = dump_dep; + + (dump_dep(deps), ...); // Call dump_dep on every dependency + + return hooks; +} + +} // end namespace reserved + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/interpreted_execution_policy.cuh b/cudax/include/cuda/experimental/__stf/internal/interpreted_execution_policy.cuh new file mode 100644 index 00000000000..b9286f32078 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/interpreted_execution_policy.cuh @@ -0,0 +1,323 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include + +namespace cuda::experimental::stf +{ + +class exec_place; + +template +class thread_hierarchy; + +/** + * This corresponds to an execution_policy_spec (e.g. par(con(32))) which we + * map on an execution place. + * + * The goal of this intermediate class is to take the high-level spec + * description and compute how the different levels are mapped on the machine. + * In particular, how levels are mapped to the CUDA hierarchy (threads, blocks, + * devices). + */ +template +class interpreted_execution_policy +{ + static constexpr size_t depth = sizeof...(spec) / 2; + static_assert(sizeof...(spec) % 2 == 0, "Number of template arguments must be even."); + +public: + using thread_hierarchy_t = thread_hierarchy; + + /** + * Each level of the interpreted policy is a vector which describes how the + * level is spread across the machine. For example, a level could be (128 + * threads), or it could be (4 blocks) x (32 threads). In the latter + * example, the level is described as a vector of 2 subentries. 
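+ *
+ * For instance, the (4 blocks) x (32 threads) case above is described as
+ * (sizes are only illustrative):
+ * @code
+ * level l({::std::make_pair(hw_scope::block, 4UL), ::std::make_pair(hw_scope::thread, 32UL)});
+ * assert(l.width() == 4 * 32);
+ * @endcode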
+ */ + class level + { + public: + level(::std::pair desc, size_t local_mem = 0) + : local_mem(local_mem) + { + level_desc.push_back(mv(desc)); + } + level(::std::initializer_list<::std::pair> desc, size_t local_mem = 0) + : level_desc(desc.begin(), desc.end()) + , local_mem(local_mem) + {} + + // Get the total width of the level, which is the product of all sublevel sizes + size_t width() const + { + size_t res = 1; + for (auto& e : level_desc) + { + res *= e.second; + } + return res; + } + + void set_mem(size_t size) + { + local_mem = size; + } + size_t get_mem() const + { + return local_mem; + } + + void set_sync(bool sync) + { + may_sync = sync; + } + bool get_sync() const + { + return may_sync; + } + + const auto& get_desc() const + { + return level_desc; + } + + private: + ::std::vector<::std::pair> level_desc; + size_t local_mem; + bool may_sync = false; + }; + +public: + interpreted_execution_policy() = default; + ~interpreted_execution_policy() = default; + + /* A substem that contains sync() related functionality for the thread hierarchy */ + reserved::cooperative_group_system cg_system; + + template + interpreted_execution_policy(const thread_hierarchy_spec& p, const exec_place& where, const Fun& f); + + void add_level(level l) + { + levels.push_back(mv(l)); + } + + void set_level_mem(int level, size_t size) + { + assert(level < depth); + levels[level].set_mem(size); + } + + size_t get_level_mem(int level) const + { + assert(level < depth); + return levels[level].get_mem(); + } + + void set_level_sync(int level, bool sync) + { + assert(level < depth); + levels[level].set_sync(sync); + } + + bool get_level_sync(size_t level) const + { + assert(level < depth); + return levels[level].get_sync(); + } + + // Returns the width of a level + size_t width(int l) const + { + EXPECT(static_cast(l) < depth); + return levels[l].width(); + } + + const auto& get_levels() const + { + return levels; + } + + /* Compute the kernel configuration given all levels */ + ::std::array get_config() const + { + ::std::array config = {1, 1, 1}; + + // Scan all subsets of all levels. Consider three levels, level0 = (4 + // GPUs) level1 = (32 blocks x 4 threads) level 2= (32 threads). Then + // we would have an overall configuration of 4 GPUs, 32 blocks and + // 4*32=128 threads. + for (auto& l : levels) + { + for (auto& e : l.get_desc()) + { + switch (e.first) + { + case hw_scope::device: + config[0] *= e.second; + break; + + case hw_scope::block: + config[1] *= e.second; + break; + + case hw_scope::thread: + config[2] *= e.second; + break; + + default: + assert(!"Corrupt hw_scope value."); + abort(); + } + } + } + + return config; + } + + /* Helper function to return the hw scope of the last level in the interpreted thread hierarchy. Probably a more + * elegant and safer way to do the below exists. */ + hw_scope last_level_scope() const + { + return (levels[0].get_desc())[0].first; + } + + /* Compute overall memory requirements */ + ::std::array get_mem_config() const + { + /* We take the kernel configuration : config[2] is the number of + * threads per block, config[1] the number of blocks, config[0] the + * number of devices on which this kernel is launched. The total number + * of threads per scope corresponds to the product of these values. On + * each device, we for example get config[1] * config[2] threads, which + * is the product of the number of blocks,and the number of threads per + * block. 
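+     * For instance, with config = {2, 16, 32}, a level whose cumulated width is 32
+     * has its scratchpad counted as block (shared) memory, a width of 16 * 32 = 512
+     * maps to device memory, and 2 * 16 * 32 = 1024 to system-wide memory.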
*/ + ::std::array config = get_config(); + size_t system_scope_size = config[0] * config[1] * config[2]; + size_t device_scope_size = config[1] * config[2]; + size_t block_scope_size = config[2]; + + ::std::array mem_config = {0, 0, 0}; + + size_t width_product = 1; + + // Scan all levels in reversed order : we look for the number of + // threads contained in a level : the memory assigned to a level + // corresponds to the hardware level which "contains" this level. For + // example, if a level has the same number of threads as the device, + // this means we are going to allocate device memory for this + // scratchpad. + for (size_t i = levels.size(); i-- > 0;) + { + auto& l = levels[i]; + size_t w = l.width(); + width_product *= w; + + size_t m = l.get_mem(); + if (m == 0) + { + continue; + } + + // We only support levels that match boundaries of hw scopes for now + EXPECT((width_product == system_scope_size || width_product == device_scope_size + || width_product == block_scope_size)); + + /* Find which hardware level matches the size of that spec level */ + if (width_product == block_scope_size) + { + mem_config[2] += m; + } + else if (width_product == device_scope_size) + { + mem_config[1] += m; + } + else + { + EXPECT(width_product == system_scope_size); + mem_config[0] += m; + } + } + + return mem_config; + } + + bool need_cooperative_kernel_launch() const + { + ::std::array config = get_config(); + size_t block_scope_size = config[2]; + size_t width_product = 1; + + // Scan all levels in reversed order + for (size_t i = levels.size(); i-- > 0;) + { + auto& l = levels[i]; + size_t w = l.width(); + width_product *= w; + + if (width_product > block_scope_size && get_level_sync(i)) + { + return true; + } + } + + return false; + } + + void set_system_mem(void* addr) + { + system_mem = addr; + } + void* get_system_mem() const + { + return system_mem; + } + +private: + ::std::vector levels; + + // Stored for convenience here + void* system_mem = nullptr; +}; + +#ifdef UNITTESTED_FILE +UNITTEST("interpreted policy") +{ + interpreted_execution_policy p; + + // par(2) + p.add_level({::std::make_pair(hw_scope::device, 2UL)}); + p.set_level_mem(0, 0); + p.set_level_sync(0, false); + + // con(16*32) + p.add_level({::std::make_pair(hw_scope::block, 16UL), ::std::make_pair(hw_scope::thread, 32UL)}); + p.set_level_mem(1, 0); + p.set_level_sync(1, true); +}; +#endif // UNITTESTED_FILE + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh new file mode 100644 index 00000000000..9d9abad3bf5 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh @@ -0,0 +1,509 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include // launch_impl() uses execution_policy +#include +#include +#include +#include +#include // graph_launch_impl() uses SCOPE + +namespace cuda::experimental::stf +{ + +// This feature requires a CUDA compiler +#ifdef __CUDACC__ + +class stream_ctx; +template +class stream_task; + +namespace reserved +{ + +template +__global__ void launch_kernel(Fun f, Arg arg) +{ + ::std::apply(mv(f), mv(arg)); +} + +template +void cuda_launcher(interpreted_spec interpreted_policy, Fun&& f, void** args, Stream_t& stream) +{ + const ::std::array config = interpreted_policy.get_config(); + const ::std::array mem_config = interpreted_policy.get_mem_config(); + + bool cooperative_kernel = interpreted_policy.need_cooperative_kernel_launch(); + + cudaLaunchAttribute attrs[1]; + attrs[0].id = cudaLaunchAttributeCooperative; + attrs[0].val.cooperative = cooperative_kernel ? 1 : 0; + + cudaLaunchConfig_t lconfig; + lconfig.gridDim = static_cast(config[1]); + lconfig.blockDim = static_cast(config[2]); + lconfig.attrs = attrs; + lconfig.numAttrs = 1; + lconfig.dynamicSmemBytes = mem_config[2]; + lconfig.stream = stream; + + cuda_safe_call(cudaLaunchKernelExC(&lconfig, (void*) f, args)); +} + +template +void cuda_launcher_graph(interpreted_spec interpreted_policy, Fun&& f, void** args, cudaGraph_t& g, cudaGraphNode_t& n) +{ + const ::std::array config = interpreted_policy.get_config(); + const ::std::array mem_config = interpreted_policy.get_mem_config(); + + cudaKernelNodeParams kconfig; + kconfig.gridDim = static_cast(config[1]); + kconfig.blockDim = static_cast(config[2]); + kconfig.extra = nullptr; + kconfig.func = (void*) f; + kconfig.kernelParams = args; + kconfig.sharedMemBytes = static_cast(mem_config[2]); + + cuda_safe_call(cudaGraphAddKernelNode(&n, g, nullptr, 0, &kconfig)); + + // Enable cooperative kernel if necessary by updating the node attributes + + bool cooperative_kernel = interpreted_policy.need_cooperative_kernel_launch(); + + cudaKernelNodeAttrValue val; + val.cooperative = cooperative_kernel ? 
1 : 0; + cuda_safe_call(cudaGraphKernelNodeSetAttribute(n, cudaKernelNodeAttributeCooperative, &val)); +} + +template +void launch_impl(interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg arg, cudaStream_t stream, size_t rank) +{ + assert(!p.is_grid()); + + p->*[&] { + auto th = thread_hierarchy(static_cast(rank), interpreted_policy); + + void* th_dev_tmp_ptr = nullptr; + + /* Allocate temporary device memory */ + auto th_mem_config = interpreted_policy.get_mem_config(); + if (th_mem_config[0] > 0) + { + // Lazily initialize system memory if needed + void* sys_mem = interpreted_policy.get_system_mem(); + if (!sys_mem) + { + sys_mem = allocateManagedMemory(th_mem_config[0]); + interpreted_policy.set_system_mem(sys_mem); + } + + assert(sys_mem); + th.set_system_tmp(sys_mem); + } + + if (th_mem_config[1] > 0) + { + cuda_safe_call(cudaMallocAsync(&th_dev_tmp_ptr, th_mem_config[1], stream)); + th.set_device_tmp(th_dev_tmp_ptr); + } + + auto kernel_args = tuple_prepend(mv(th), mv(arg)); + using args_type = decltype(kernel_args); + void* all_args[] = {&f, &kernel_args}; + + cuda_launcher(interpreted_policy, reserved::launch_kernel, all_args, stream); + + if (th_mem_config[1] > 0) + { + cuda_safe_call(cudaFreeAsync(th_dev_tmp_ptr, stream)); + } + }; +} + +template +void graph_launch_impl(task_t& t, interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg arg, size_t rank) +{ + assert(!p.is_grid()); + + auto kernel_args = tuple_prepend(thread_hierarchy(static_cast(rank), interpreted_policy), mv(arg)); + using args_type = decltype(kernel_args); + void* all_args[] = {&f, &kernel_args}; + + p->*[&] { + cuda_launcher_graph( + interpreted_policy, reserved::launch_kernel, all_args, t.get_ctx_graph(), t.get_node()); + }; +} + +/** + * @brief a free-function implementation of the launch mechanism which can be + * used multiple times in a task, or possibly outside tasks + */ +template +class launch +{ +public: + /** + * @brief Constructor to initialize a light-weight launch + * @param ctx Execution context. + * @param spec The specification of the thread hierarchy. + * @param e_place Execution place (e.g., device, host). + * @param arg Arguments to be passed to the kernel. + */ + launch(spec_t spec, exec_place e_place, ::std::vector streams, Arg arg) + : arg(mv(arg)) + , e_place(mv(e_place)) + , spec(mv(spec)) + , streams(mv(streams)) + {} + + launch(exec_place e_place, ::std::vector streams, Arg arg) + : launch(spec_t(), mv(e_place), mv(arg), mv(streams)) + {} + + template + void operator->*(Fun&& f) + { +# if __NVCOMPILER + // With nvc++, all lambdas can run on host and device. + static constexpr bool is_extended_host_device_lambda_closure_type = true, + is_extended_device_lambda_closure_type = false; +# else + // With nvcpp, dedicated traits tell how a lambda can be executed. 
+ static constexpr bool is_extended_host_device_lambda_closure_type = + __nv_is_extended_host_device_lambda_closure_type(Fun), + is_extended_device_lambda_closure_type = __nv_is_extended_device_lambda_closure_type(Fun); +# endif + + static_assert(is_extended_host_device_lambda_closure_type || is_extended_device_lambda_closure_type, + "Cannot run launch() on the host"); + + EXPECT(e_place != exec_place::host, "Attempt to run a launch on the host."); + + const size_t grid_size = e_place.size(); + + using th_t = typename spec_t::thread_hierarchy_t; + using arg_type = decltype(tuple_prepend(th_t(), arg)); + + auto interpreted_policy = interpreted_execution_policy(spec, e_place, reserved::launch_kernel); + + SCOPE(exit) + { + /* If there was managed memory allocated we need to deallocate it */ + void* sys_mem = interpreted_policy.get_system_mem(); + if (sys_mem) + { + auto th_mem_config = interpreted_policy.get_mem_config(); + deallocateManagedMemory(sys_mem, th_mem_config[0], streams[0]); + } + + unsigned char* hostMemoryArrivedList = interpreted_policy.cg_system.get_arrived_list(); + if (hostMemoryArrivedList) + { + deallocateManagedMemory(hostMemoryArrivedList, grid_size, streams[0]); + } + }; + + /* Should only be allocated / deallocated if the last level used is system wide. Unnecessary and wasteful + * otherwise. */ + if (grid_size > 1) + { + if (interpreted_policy.last_level_scope() == hw_scope::device) + { + auto hostMemoryArrivedList = (unsigned char*) allocateManagedMemory(grid_size - 1); + // printf("About to allocate hostmemarrivedlist : %lu bytes\n", grid_size - 1); + memset(hostMemoryArrivedList, 0, grid_size - 1); + interpreted_policy.cg_system = reserved::cooperative_group_system(hostMemoryArrivedList); + } + } + + // t.get_stream_grid should return the stream from get_stream if this is not a grid ? + size_t p_rank = 0; + for (auto&& p : e_place) + { + launch_impl(interpreted_policy, p, f, arg, streams[p_rank], p_rank); + p_rank++; + } + } + +private: + template + void run_on_host(Fun&& f) + { + assert(!"Not yet implemented"); + abort(); + } + + Arg arg; + exec_place e_place; + ::std::string symbol; + spec_t spec; + ::std::vector streams; +}; + +/** + * @brief A class template for launching tasks with dependencies and execution context. + * @tparam Ctx Type of the execution context. + * @tparam Deps Types of dependencies. + */ +template +class launch_scope +{ +public: + /** + * @brief Constructor to initialize a launch_scope. + * @param ctx Execution context. + * @param spec The specification of the thread hierarchy. + * @param e_place Execution place (e.g., device, host). + * @param deps Dependencies for the task to be launched. + */ + launch_scope(Ctx& ctx, thread_hierarchy_spec_t spec, exec_place e_place, task_dep... deps) + : deps(deps...) + , ctx(ctx) + , e_place(mv(e_place)) + , spec(mv(spec)) + { + dump_hooks = reserved::get_dump_hooks(&ctx, deps...); + } + + /// Deleted copy constructor and copy assignment operator. + launch_scope(const launch_scope&) = delete; + launch_scope& operator=(const launch_scope&) = delete; + + /// Move constructor + launch_scope(launch_scope&&) = default; + + /** + * @brief Set the symbol for the task. + * @param s The symbol string. + * @return Reference to the current object for chaining. + */ + auto& set_symbol(::std::string s) + { + symbol = mv(s); + return *this; + } + + /** + * @brief Operator to launch the task. + * @tparam Fun Type of the function or lambda to be executed. + * @param f Function or lambda to be executed. 
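+   *
+   * A usage sketch (illustrative only: `ctx`, `lX` and `lY` are assumed to be an
+   * existing context and logical data, and the spec is arbitrary):
+   *
+   *   ctx.launch(par(con(128)), exec_place::current_device(), lX.read(), lY.rw())
+   *       ->*[] __device__(auto th, auto x, auto y) {
+   *         for (size_t i = th.rank(); i < y.size(); i += th.size())
+   *         {
+   *           y(i) += x(i);
+   *         }
+   *       };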
+ */ + template + void operator->*(Fun&& f) + { +# if __NVCOMPILER + // With nvc++, all lambdas can run on host and device. + static constexpr bool is_extended_host_device_lambda_closure_type = true, + is_extended_device_lambda_closure_type = false; +# else + // With nvcpp, dedicated traits tell how a lambda can be executed. + static constexpr bool is_extended_host_device_lambda_closure_type = + __nv_is_extended_host_device_lambda_closure_type(Fun), + is_extended_device_lambda_closure_type = __nv_is_extended_device_lambda_closure_type(Fun); +# endif + + static_assert(is_extended_device_lambda_closure_type || is_extended_host_device_lambda_closure_type, + "Cannot run launch() on the host"); + + EXPECT(e_place != exec_place::host, "Attempt to run a launch on the host."); + + auto& dot = *ctx.get_dot(); + auto& statistics = reserved::task_statistics::instance(); + + auto t = ctx.task(e_place); + + assert(e_place.affine_data_place() == t.get_affine_data_place()); + + /* + * If we have a grid of places, the implicit affine partitioner is the blocked_partition. + * + * An explicit composite data place is required per data dependency to customize this behaviour. + */ + if (e_place.is_grid()) + { + // Create a composite data place defined by the grid of places + the partitioning function + t.set_affine_data_place(data_place::composite(blocked_partition(), e_place.as_grid())); + } + + t.add_post_submission_hook(dump_hooks); + + t.add_deps(deps); + if (!symbol.empty()) + { + t.set_symbol(symbol); + } + + bool record_time = t.schedule_task(); + // Execution place may have changed during scheduling task + e_place = t.get_exec_place(); + + if (statistics.is_calibrating_to_file()) + { + record_time = true; + } + + nvtx_range nr(t.get_symbol().c_str()); + t.start(); + + if (dot.is_tracing()) + { + dot.template add_vertex(t); + } + + int device; + cudaEvent_t start_event, end_event; + + if constexpr (::std::is_same_v) + { + if (record_time) + { + cudaGetDevice(&device); // We will use this to force it during the next run + // Events must be created here to avoid issues with multi-gpu + cuda_safe_call(cudaEventCreate(&start_event)); + cuda_safe_call(cudaEventCreate(&end_event)); + cuda_safe_call(cudaEventRecord(start_event, t.get_stream())); + } + } + + const size_t grid_size = e_place.size(); + + // Put all data instances in a tuple + auto args = data2inst(t); + + using th_t = typename thread_hierarchy_spec_t::thread_hierarchy_t; + using args_type = decltype(tuple_prepend(th_t(), args)); + + auto interpreted_policy = interpreted_execution_policy(spec, e_place, reserved::launch_kernel); + + SCOPE(exit) + { + t.end_uncleared(); + + if constexpr (::std::is_same_v) + { + /* If there was managed memory allocated we need to deallocate it */ + void* sys_mem = interpreted_policy.get_system_mem(); + if (sys_mem) + { + auto th_mem_config = interpreted_policy.get_mem_config(); + deallocateManagedMemory(sys_mem, th_mem_config[0], t.get_stream()); + } + + unsigned char* hostMemoryArrivedList = interpreted_policy.cg_system.get_arrived_list(); + if (hostMemoryArrivedList) + { + deallocateManagedMemory(hostMemoryArrivedList, grid_size, t.get_stream()); + } + + if (record_time) + { + cuda_safe_call(cudaEventRecord(end_event, t.get_stream())); + cuda_safe_call(cudaEventSynchronize(end_event)); + + float milliseconds = 0; + cuda_safe_call(cudaEventElapsedTime(&milliseconds, start_event, end_event)); + + if (dot.is_tracing()) + { + dot.template add_vertex_timing>(t, milliseconds, device); + } + + if 
(statistics.is_calibrating()) + { + statistics.log_task_time(t, milliseconds); + } + } + } + + t.clear(); + }; + + /* Should only be allocated / deallocated if the last level used is system wide. Unnecessary and wasteful + * otherwise. */ + if (grid_size > 1) + { + if (interpreted_policy.last_level_scope() == hw_scope::device) + { + unsigned char* hostMemoryArrivedList; + hostMemoryArrivedList = (unsigned char*) allocateManagedMemory(grid_size - 1); + memset(hostMemoryArrivedList, 0, grid_size - 1); + interpreted_policy.cg_system = reserved::cooperative_group_system(hostMemoryArrivedList); + } + } + + size_t p_rank = 0; + if constexpr (::std::is_same_v) + { + for (auto p : e_place) + { + reserved::launch_impl(interpreted_policy, p, f, args, t.get_stream(p_rank), p_rank); + p_rank++; + } + } + else + { + for (auto p : e_place) + { + reserved::graph_launch_impl(t, interpreted_policy, p, f, args, p_rank); + p_rank++; + } + } + } + +private: + /** + * @brief Converts a set of slices to a tuple of instances. + * @tparam T Type of the task. + * @tparam S Type of the slice. + * @tparam MoreSlices Types of more slices. + * @param t The task object. + * @param i Index (used internally for recursion). + * @return A tuple containing instances corresponding to slices. + */ + template + auto data2inst(T& t, size_t i = 0) + { + S s = t.template get(i); + if constexpr (sizeof...(MoreSlices) == 0) + { + return ::std::make_tuple(s); + } + else + { + return tuple_prepend(s, data2inst(t, i + 1)); + } + } + + task_dep_vector deps; + Ctx& ctx; + exec_place e_place; + ::std::string symbol; + thread_hierarchy_spec_t spec; + + ::std::vector<::std::function> dump_hooks; +}; + +} // namespace reserved + +#endif // __CUDACC__ +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/logical_data.cuh b/cudax/include/cuda/experimental/__stf/internal/logical_data.cuh new file mode 100644 index 00000000000..06601227b40 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/logical_data.cuh @@ -0,0 +1,2482 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Definition of `logical_data` and `task_dep_vector_untyped` + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include // logical_data_untyped_impl has a backend_ctx_untyped +#include +#include + +#include +#include + +namespace cuda::experimental::stf +{ + +logical_data_untyped unpack_state(const ::std::shared_ptr&); + +class logical_data_untyped_impl; + +/** + * @brief This represents a set of tasks. + * + * The goal of this class is to avoid storing very large containers of task + * structure which may hold many events for a long time (and needlessly consume + * ressources), so that we can optimize the events. 
This class is useful when + * all tasks in the set are used together, for example when we enforce some + * write-after-read dependency when the writer tasks must depend on all + * previous readers. + * + * We can add an individual task to a set, or remove all of them, but we cannot + * remove a single task. + */ +class task_set +{ +public: + task_set() = default; + + /* Add a task to the set: this will copy the done prereqs, and save the ID */ + void add(const task& t) + { + done_prereqs.merge(t.get_done_prereqs()); + done_prereqs.optimize(); + // this will maybe depend on the use of dot or not + task_ids.push_back(t.get_unique_id()); + } + + void clear() + { + task_ids.clear(); + done_prereqs.clear(); + } + + bool empty() const + { + return task_ids.empty(); + } + + const event_list& get_done_prereqs() const + { + return done_prereqs; + } + + /* Get a reference to the vector of task IDs */ + const auto& get_ids() const + { + return task_ids; + } + +private: + /* All task identifiers in the set */ + ::std::vector task_ids; + + // Collection of all events marking the completion of the tasks in the set + event_list done_prereqs; +}; + +namespace reserved +{ + +/** + * @brief This class describes the status of a logical data (e.g. previous writer, + * readers...) in a specific task context. + * + * This makes it possible to use the same logical data in nested tasks while + * enforcing the STF coherency model. + * + * Implementing the STF model for example requires introducing implicit + * dependencies between a task making a write access and all preceding read + * accesses. The list of previous readers, the previous writer, etc. is kept on + * a per-context basis. + */ +struct logical_data_state +{ + // Reset structures which hold some state before we can call the destructor + void clear() + { + current_writer.reset(); + previous_writer.reset(); + current_readers.clear(); + } + + access_mode current_mode = access_mode::none; + + // A data structure to store the events to implement dependencies with + // previous read accesses: a future writer will have to wait for them (WaR + // accesses) + task_set current_readers; + + // Task currently making a write access: future readers or writers will + // have to wait for it (RaW or WaW accesses) + ::std::optional current_writer; + + // Previous writer which all new readers will need to sync with (RaW accesses) + // We use an optional so that we don't store a null task + ::std::optional previous_writer; + + /* If we are tracing dependencies to generate a DOT output, we keep track + * of the identifiers of the tasks which performed a reduction access on + * that piece of data. */ + ::std::vector pending_redux_id; +}; + +/* +Implementation for `logical_data_untyped`, which uses the pimpl idiom. This +class cannot be nested inside `logical_data_untyped` because it is used in +`task_state`, which holds a map of `logical_data_untyped_impl &`. + +We currently use shared_from_this() when reconstructing a logical data after a +reduction mode.
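+
+User-facing code does not manipulate this class directly: it is owned through a
+shared_ptr by logical_data_untyped (and its typed logical_data wrapper), which
+forwards calls to it.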
+*/ +class logical_data_untyped_impl : public ::std::enable_shared_from_this +{ +public: + logical_data_untyped_impl( + backend_ctx_untyped ctx_, ::std::shared_ptr dinterface_, const data_place& memory_node) + : ctx(mv(ctx_)) + , dinterface(mv(dinterface_)) + { + assert(this->ctx); + assert(this->dinterface); + + refcnt.store(0); + + // This will automatically create a weak_ptr from the shared_ptr + ctx.get_stack().logical_data_ids_mutex.lock(); + ctx.get_stack().logical_data_ids.emplace(get_unique_id(), *this); + ctx.get_stack().logical_data_ids_mutex.unlock(); + + // It is possible that there is no valid copy (e.g. with temporary accesses) + if (memory_node == data_place::invalid) + { + return; // no data instance present, so nothing to do for the time being + } + + // Get an id (and reserve it) + const instance_id_t instance_id = find_instance_id(memory_node); + + // Record this id for the write back mechanism + reference_instance_id = instance_id; + enable_write_back = true; + + // setup MSI status + auto& inst = get_data_instance(instance_id); + + // If this is host memory, try to pin it + if (memory_node == data_place::host) + { + // ret may be false if we detected that the instance was already pinned, for example + inst.automatically_pinned = dinterface->pin_host_memory(instance_id); + } + +#ifndef NDEBUG + auto mem_type = dinterface->get_memory_type(instance_id); + + if (memory_node == data_place::managed) + { + assert(!mem_type || *mem_type == cudaMemoryTypeManaged); + } + + if (memory_node.is_device()) + { + assert(!mem_type || *mem_type == cudaMemoryTypeDevice); + } +#endif + + inst.set_msir(reserved::msir_state_id::modified); + inst.set_allocated(true); + + assert(inst.get_read_prereq().size() == 0); + assert(inst.get_write_prereq().size() == 0); + + // If the beginning of the context depends on a prereq, we assume this logical data depends on it + if (ctx.has_start_events()) + { + inst.add_read_prereq(ctx.get_start_events()); + } + + // This is not an instance allocated by our library, so we will not + // consider it for reclaiming for example + inst.reclaimable = false; + } + + ~logical_data_untyped_impl() + { + erase(); + } + + /** + * @brief `erase` destroys all data instances that were automatically + * created, and stores a valid value in the reference data instance that + * was used to create the logical data, if any. + */ + void erase(); + + // Data handle unique id : when this object is moved, it is set automatically to -1 + reserved::unique_id_t unique_id; + + backend_ctx_untyped ctx; + + reserved::logical_data_state state; + + // For temporary or redux accesses, we need to be able to find an available entry + ::std::vector used_instances; + + // A string useful for debugging purpose + mutable ::std::string symbol; + + // This will be used to store methods to manage data coherency + // (transfers, allocations etc.) or to have data specific accessors. 
+ ::std::shared_ptr dinterface; + + void freeze(access_mode freeze_mode, data_place place = data_place::invalid) + { + // We cannot freeze some logical data that is already frozen + assert(!frozen_flag); + + if (freeze_mode != access_mode::read) + { + assert(place != data_place::invalid); + } + + frozen_flag = true; + frozen_mode = freeze_mode; + frozen_place = mv(place); + } + + void unfreeze(task& fake_task, event_list prereqs); + + void set_automatic_unfreeze(task& fake_task, bool flag) + { + automatic_unfreeze = flag; + + // Save for future use when destroying data + frozen_fake_task = fake_task; + } + + // This needs the full definition of logical_data_untyped so the implementation is deferred + template + ::std::pair get_frozen(task& fake_task, const data_place& dplace, access_mode m); + + /** + * @brief Indicates whether a logical data is frozen or not, and what is the corresponding mode + * + * This is for example used when creating a task to check that it does not + * try to make illegal accesses on frozen data (eg. concurrent writes) + */ + ::std::pair is_frozen() const + { + return ::std::pair(frozen_flag, frozen_mode); + } + + // Indicate if the data was frozen, and the corresponding mode (mode is ignored otherwise) + bool frozen_flag = false; + access_mode frozen_mode; // meaningful only if frozen + data_place frozen_place; // meaningful only if frozen and frozen_mode is write or rw + + // When set, frozen data will automatically be unfrozen when getting + // destroyed. This assumed all dependencies are solved by other means (eg. + // because it is used within other tasks) + bool automatic_unfreeze = false; + ::std::optional frozen_fake_task; + + // This defines how to allocate/deallocate raw buffers (ptr+size) within + // the interface, if undefined (set to nullptr), then the default allocator + // of the context will be used + block_allocator_untyped custom_allocator; + + template + static auto& get_allocator(Q& qthis) + { + if (qthis.custom_allocator) + { + return qthis.custom_allocator; + } + return qthis.ctx.get_allocator(); + } + + /* We get the per-logical data block_allocator, or the ctx one otherwise */ + const auto& get_allocator() const + { + return get_allocator(*this); + } + auto& get_allocator() + { + return get_allocator(*this); + } + + void set_allocator(block_allocator_untyped a) + { + custom_allocator = mv(a); + } + + // We keep a reference count to see whether there is a task currently + // accessing that piece of data, so that we cannot evict a piece of data + // that is used when looking to recycle memory (during the memory + // reclaiming mechanism). + ::std::atomic refcnt; + + // identifier of the instance used to create the logical_data_untyped + // -1 means there is no reference + // This id will be the instance which will be updated by the write-back + // mechanism when the data is destroyed, or when the context is + // synchronized. + instance_id_t reference_instance_id = instance_id_t::invalid; + + bool enable_write_back = false; + + // Enable or disable write-back. Enabling write-back will cause an error if there is no reference data instance. 
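+  // Note: a logical data created from a shape has no reference instance, so
+  // enabling write-back on it is an error; a logical data created from an
+  // existing user instance has write-back enabled by default.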
+ void set_write_back(bool flag) + { + if (flag) + { + // Do not enable write-back on a logical data that was initialized from a shape, for example + if (reference_instance_id == instance_id_t::invalid) + { + fprintf(stderr, "Error: cannot enable write-back on a logical data with no reference instance.\n"); + abort(); + } + } + + enable_write_back = flag; + } + + // Explicitly enable or disable auto dump for this logical data + // If true, the logical data will be dumped with CUDASTF_AUTO_DUMP is set, + // if false, CUDASTF_AUTO_DUMP will be ignored for that logical data. + void set_auto_dump(bool flag) + { + enable_auto_dump = flag; + } + + bool get_auto_dump() const + { + return enable_auto_dump; + } + + bool enable_auto_dump = true; + + bool was_erased = false; + + // Get the index of the first available instance_id + instance_id_t find_unused_instance_id(const data_place& dplace) + { + for (auto i : each(used_instances.size())) + { + if (!used_instances[i].get_used()) + { + used_instances[i].set_dplace(dplace); + used_instances[i].set_used(true); + return instance_id_t(i); + } + } + + used_instances.emplace_back(true, dplace); + return instance_id_t(used_instances.size() - 1); + } + + // Get the index of the first used instance_id that matches the data place, or get a new one + instance_id_t find_instance_id(const data_place& dplace) + { + // Try to find a used instance id that has the same data place + for (auto i : each(used_instances.size())) + { + // This will evaluate get_used() first and stop if this is false + if (used_instances[i].get_used() && (used_instances[i].get_dplace() == dplace)) + { + return instance_id_t(i); + } + } + + // We must use a new entry instead because there was no matching entry + return find_unused_instance_id(dplace); + } + + // Make it possible to reuse an id + void release_instance_id(instance_id_t instance_id) + { + const auto i = size_t(instance_id); + assert(instance_id != instance_id_t::invalid && i < used_instances.size() && used_instances[i].get_used()); + used_instances[i].set_used(false); + // note that we do not change dplace as it's supposed to be unused when + // the used flag is set to false. + } + + // Find a valid copy given a hint and return its instance id. Currently the hint parameter is not used. + instance_id_t find_source_instance_id(instance_id_t dst_instance_id) + { + ::std::ignore = dst_instance_id; + assert(get_data_instance(dst_instance_id).get_msir() == reserved::msir_state_id::invalid); + +// @@@@ TODO @@@@ reimplement ! 
+#if 0 + instance_id_t dst_node = dst_place.memory_node; + + // Force initialization + auto& machine = reserved::machine::instance(); + + // We iterate over nodes, in an order that is could improve locality + for (int n = 0; n < nnodes(); n++) { + int n_aux = machine.get_ith_closest_node(dst_node, n); + if (get_data_instance(n_aux).get_msir() != reserved::msir_state_id::invalid) { + return data_place(n_aux); + } + } +#else + auto nnodes = instance_id_t(get_data_instance_count()); + for (auto id : each(nnodes)) + { + if (get_data_instance(id).get_msir() != reserved::msir_state_id::invalid) + { + return id; + } + } +#endif + + fprintf(stderr, "FATAL: no valid source found.\n"); + abort(); + return instance_id_t::invalid; + } + + bool has_interface() const + { + return dinterface != nullptr; + } + + bool has_ref() const + { + assert(refcnt.load() >= 0); + return refcnt.load() != 0; + } + + void add_ref() + { + assert(refcnt.load() >= 0); + ++refcnt; + } + + void remove_ref() + { + assert(refcnt >= 1); + --refcnt; + } + + const data_place& get_instance_dplace(instance_id_t instance_id) const + { + return used_instances[size_t(instance_id)].get_dplace(); + } + + const data_instance& get_data_instance(instance_id_t instance_id) const + { + return used_instances[size_t(instance_id)]; + } + data_instance& get_data_instance(instance_id_t instance_id) + { + return used_instances[size_t(instance_id)]; + } + + size_t get_data_instance_count() const + { + return used_instances.size(); + } + + void allocate(const data_place& memory_node, + instance_id_t instance_id, + ::std::ptrdiff_t& s, + void** extra_args, + event_list& prereqs) + { + assert(memory_node != data_place::invalid); + assert(has_interface()); + // nvtx_range r("allocate"); + // Get the allocator for this logical data + return dinterface->data_allocate(ctx, get_allocator(), memory_node, instance_id, s, extra_args, prereqs); + } + + void deallocate(const data_place& memory_node, instance_id_t instance_id, void* extra_args, event_list& prereqs) + { + assert(memory_node != data_place::invalid); + assert(has_interface()); + // nvtx_range r("deallocate"); + return dinterface->data_deallocate(ctx, get_allocator(), memory_node, instance_id, extra_args, prereqs); + } + + void data_copy(const data_place& dst_node, + instance_id_t dst_instance_id, + const data_place& src_node, + instance_id_t src_instance_id, + event_list& prereqs) + { + assert(src_node != data_place::invalid); + assert(dst_node != data_place::invalid); + assert(has_interface()); + // nvtx_range r("data_copy"); + ctx.add_transfer(src_node, dst_node, dinterface->data_footprint()); + return dinterface->data_copy(ctx, dst_node, dst_instance_id, src_node, src_instance_id, prereqs); + } + + void write_back(const data_place& src_node, instance_id_t instance_id, event_list& prereqs) + { + assert(reference_instance_id != instance_id_t::invalid); + data_copy(get_instance_dplace(reference_instance_id), reference_instance_id, src_node, instance_id, prereqs); + } + + reserved::logical_data_state& get_state() + { + return state; + } + + int get_unique_id() const + { + return unique_id; + } + + const ::std::string& get_symbol() const + { + if (symbol.empty()) + { + symbol = ::std::to_string(get_unique_id()); + } + + return symbol; + } + + size_t hash() const + { + size_t h = 0; + for (auto i : each(used_instances.size())) + { + // Compute a hash of the MSI/alloc state + size_t h_i_state = used_instances[i].state_hash(); + + // Compute a hash of the actual interface representation + size_t 
h_i_data = dinterface->data_hash(instance_id_t(i)); + + // Combine it with previously computed hashes + hash_combine(h, h_i_state); + hash_combine(h, h_i_data); + } + + return h; + } + + void reclaim_update_state(const data_place& memory_node, instance_id_t instance_id, event_list& prereqs) + { + auto& current_instance = get_data_instance(instance_id); + auto current_state = current_instance.get_msir(); + + // static size_t total_write_back_cnt = 0; + + /* Update MSI status depending on the current states and the required access mode */ + switch (current_state) + { + case reserved::msir_state_id::invalid: + // fprintf(stderr, "RECLAIM %s WITH NO TRANSFER (INVALID)... (wb cnt = %ld)\n", get_symbol().c_str(), + // total_write_back_cnt); + // No-op ! + current_instance.add_read_prereq(prereqs); + break; + + case reserved::msir_state_id::modified: { + // Host becomes the only valid copy + // XXX @@@@TODO@@@ ! reclaims assumes that 0 == host we need a reference copy + auto& ref_instance = get_data_instance(instance_id_t(0)); + ref_instance.set_msir(reserved::msir_state_id::modified); + current_instance.set_msir(reserved::msir_state_id::invalid); + + prereqs.merge(ref_instance.get_read_prereq(), current_instance.get_read_prereq()); + + write_back(memory_node, instance_id, prereqs); + // total_write_back_cnt++; + // fprintf(stderr, "WRITE BACK... %s (%ld)!!\n", get_symbol().c_str(), total_write_back_cnt); + + ref_instance.add_read_prereq(prereqs); + current_instance.add_read_prereq(prereqs); + break; + } + + case reserved::msir_state_id::shared: { + // fprintf(stderr, "RECLAIM %s WITH NO TRANSFER (SHARED)... (wb cnt = %ld)\n", get_symbol().c_str(), + // total_write_back_cnt); + + // Invalidate this copy, others may become either reserved::msir_state_id::shared or + // reserved::msir_state_id::modified + int cpy_cnt = 0; + + const auto nnodes = instance_id_t(get_data_instance_count()); + for (auto n : each(nnodes)) + { + if (get_data_instance(n).get_msir() != reserved::msir_state_id::invalid) + { + cpy_cnt++; + } + } + + assert(cpy_cnt > 0); + + current_instance.set_msir(reserved::msir_state_id::invalid); + current_instance.add_read_prereq(prereqs); + + // Update other copies (if needed) + for (auto n : each(nnodes)) + { + auto& inst_n = get_data_instance(n); + if (inst_n.get_msir() != reserved::msir_state_id::invalid) + { + // If there was 2 shared copies, there only remain ones + // now, which becomes modified. If there were more they + // remain shared, there cannot be less than 2 copies if + // this is shared + auto new_state = (cpy_cnt == 2) ? 
reserved::msir_state_id::modified : reserved::msir_state_id::shared; + inst_n.set_msir(new_state); + } + } + + break; + } + + default: + assert(!"Corrupt MSIR state value found."); + abort(); + } + } + + // This does not consumes prereqs, but produces new ones + inline event_list do_reclaim(const data_place& memory_node, instance_id_t instance_id) + { + // Write back data on the host (presumably), but that could be to + // some other data place such as the memory of a device + + /* previous tasks using this data instance */; + event_list result = get_pending_done_prereqs(memory_node); + + reclaim_update_state(memory_node, instance_id, result); + + auto& inst = get_data_instance(instance_id); + + deallocate(memory_node, instance_id, inst.get_extra_args(), result); + + inst.set_allocated(false); + + release_instance_id(instance_id); + + return result; + } + + /* Returns a prereq with all pending tasks accessing this data instance */ + event_list get_pending_done_prereqs(instance_id_t instance_id) + { + const auto& i = get_data_instance(instance_id); + if (!i.has_last_task_relaxed()) + { + return event_list(); + } + + return i.get_last_task_relaxed().get_done_prereqs(); + } + + /* Returns a prereq with all pending tasks accessing this piece of data on the specified memory node */ + // TODO modify to take a logical_data_untyped::state_t + event_list get_pending_done_prereqs(const data_place&) + { + auto prereqs = event_list(); + auto& state_ = get_state(); + + if (state_.current_mode == access_mode::write) + { + if (state_.current_writer.has_value()) + { + prereqs = state_.current_writer->get_done_prereqs(); + } + } + else + { + prereqs.merge(state_.current_readers.get_done_prereqs()); + + if (state_.previous_writer.has_value()) + { + prereqs.merge(state_.previous_writer->get_done_prereqs()); + } + } + + return prereqs; + } + + // prereqs is used to record which prereqs are expected before using a + // piece of data. + // + // Returns prereq_out + void enforce_msi_protocol(instance_id_t instance_id, access_mode mode, event_list& prereqs) + { + // print("BEFORE"); + + auto& current_instance = get_data_instance(instance_id); + auto current_msir = current_instance.get_msir(); + + /* Update msir_statuses depending on the current states and the required access mode */ + switch (mode) + { + case access_mode::read: { + switch (current_msir) + { + case reserved::msir_state_id::modified: + case reserved::msir_state_id::shared: + /* no-op : just reuse previous reqs */ + prereqs.merge(current_instance.get_read_prereq()); + break; + + case reserved::msir_state_id::invalid: { + /* There must be at least a valid copy at another place, so this becomes shared */ + + // There is no local valid copy ... find one ! 
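+          // (Below: pick a valid source instance, issue the copy once both the
+          // source and destination prerequisites are fulfilled, and leave every
+          // valid instance, including this one, in the shared state.)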
+ instance_id_t dst_instance_id = instance_id; + auto& dst_instance = get_data_instance(dst_instance_id); + auto dst_dplace = dst_instance.get_dplace(); + auto dst_memory_node = dst_dplace; + + instance_id_t src_instance_id = find_source_instance_id(dst_instance_id); + auto& src_instance = get_data_instance(src_instance_id); + const auto src_dplace = src_instance.get_dplace(); + const auto src_memory_node = src_dplace; + + // Initiate the copy once src and dst are ready + auto src_avail_prereq = src_instance.get_read_prereq(); + src_avail_prereq.merge(dst_instance.get_read_prereq(), dst_instance.get_write_prereq(), prereqs); + + data_copy(dst_memory_node, dst_instance_id, src_memory_node, src_instance_id, src_avail_prereq); + + // Make sure this is finished before we delete the source, for example + // We do not remove existing prereqs as there can be concurrent copies along with existing read accesses + src_instance.add_write_prereq(src_avail_prereq); + dst_instance.set_read_prereq(src_avail_prereq); + // Everything is already in the read_prereq + dst_instance.clear_write_prereq(); + + src_avail_prereq.clear(); + + // This is our output + prereqs.merge(dst_instance.get_read_prereq()); + + /* + * Update MSI statuses + */ + dst_instance.set_msir(reserved::msir_state_id::shared); + + /* If there was a single copy, they are turned into shared if they were invalid */ + for (auto& inst : used_instances) + { + if (inst.get_msir() != reserved::msir_state_id::invalid) + { + inst.set_msir(reserved::msir_state_id::shared); + } + } + } + break; + + case reserved::msir_state_id::reduction: { + // This is where we should reconstruct the data ? + + // Invalidate all other existing copies but that one + for (auto& inst : used_instances) + { + inst.set_msir(reserved::msir_state_id::invalid); + } + + get_data_instance(instance_id).set_msir(reserved::msir_state_id::modified); + + break; + } + + default: + assert(!"Corrupt MSIR state detected."); + abort(); + } + + break; + } + + case access_mode::rw: + case access_mode::write: { + switch (current_msir) + { + case reserved::msir_state_id::modified: + /* no-op */ + prereqs.merge(current_instance.get_read_prereq(), current_instance.get_write_prereq()); + break; + + case reserved::msir_state_id::shared: { + // There is a local copy, but we need to invalidate others + prereqs.merge(current_instance.get_read_prereq(), current_instance.get_write_prereq()); + + for (size_t i = 0; i < used_instances.size(); i++) + { + // All other instances become invalid, and their content + // need not be preserved + if (i != size_t(instance_id)) + { + auto& inst_i = get_data_instance(instance_id_t(i)); + inst_i.set_msir(reserved::msir_state_id::invalid); + inst_i.clear_read_prereq(); + inst_i.clear_write_prereq(); + } + } + + current_instance.set_msir(reserved::msir_state_id::modified); + break; + } + case reserved::msir_state_id::invalid: { + // If we need to perform a copy, this will be the source instance + instance_id_t src_instance_id = instance_id_t::invalid; + // Do not find a source if this is write only + if (mode == access_mode::rw) + { + // There is no local valid copy ... find one ! 
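+          // (Below: for rw accesses a valid copy is first brought in from a source
+          // instance; for write-only accesses no copy is needed. Every other
+          // instance is then invalidated and this one becomes the single modified
+          // copy.)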
+ instance_id_t dst_instance_id = instance_id; + auto dst_dplace = get_instance_dplace(dst_instance_id); + const auto dst_memory_node = dst_dplace; + + src_instance_id = find_source_instance_id(dst_instance_id); + auto src_dplace = get_instance_dplace(src_instance_id); + const auto src_memory_node = src_dplace; + + // Initiate the copy once src and dst are ready + auto src_avail_prereq = get_data_instance(src_instance_id).get_read_prereq(); + src_avail_prereq.merge(get_data_instance(dst_instance_id).get_read_prereq(), + get_data_instance(dst_instance_id).get_write_prereq(), + prereqs); + + data_copy(dst_memory_node, dst_instance_id, src_memory_node, src_instance_id, src_avail_prereq); + + // Make sure this is finished before we delete the source, for example + // We remove previous prereqs since this data is normally only used for this copy, and invalidated + // then + /* TODO CHECK THIS ... being too convervative ? */ + // THIS INSTEAD ?get_data_instance(src_instance_id).set_write_prereq(dst_copied_prereq); + get_data_instance(src_instance_id).set_read_prereq(src_avail_prereq); + get_data_instance(dst_instance_id).set_read_prereq(src_avail_prereq); + + src_avail_prereq.clear(); + + // This is our output + prereqs.merge(get_data_instance(dst_instance_id).get_read_prereq()); + } + else + { + // Write only + assert(mode == access_mode::write); + } + + // Clear and all copies which become invalid, keep prereqs + // needed to perform the copy + for (size_t i = 0; i < used_instances.size(); i++) + { + auto& inst_i = get_data_instance(instance_id_t(i)); + if (i != size_t(instance_id) && i != size_t(src_instance_id)) + { + /* We do not clear write prereqs */ + inst_i.clear_read_prereq(); + } + + // including instance_id, but we will set it to modified after + inst_i.set_msir(reserved::msir_state_id::invalid); + } + + // This is the only valid instance now + current_instance.set_msir(reserved::msir_state_id::modified); + break; + } + + default: + assert(!"Corrupt MSIR state detected."); + abort(); + } + + break; + } + + case access_mode::redux: + current_instance.set_msir(reserved::msir_state_id::reduction); + break; + default: + assert(!"Corrupt MSIR state detected."); + abort(); + } + } + + auto& get_mutex() + { + return mutex; + } + +private: + ::std::mutex mutex; +}; + +} // namespace reserved + +/** @brief Base class of all `logical_data` types. It does not "know" the type of the data, so most of the time it's + * best to use `logical_data`. Use `logical_data_untyped` only in special circumstances. + */ +class logical_data_untyped +{ +public: + ///@{ + /** @name Constructors */ + logical_data_untyped() = default; + + logical_data_untyped(::std::shared_ptr p) + : pimpl(mv(p)) + {} + + /** + * @brief Constructs a new `logical_data_untyped object` with the provided context, backend, memory_node, symbol, + * and data prerequisites. + * + * @param ctx A context + * @param backend A `shared_ptr` to the `data_interface` object underlying the data, must be non-null + * @param memory_node initial data location + * + * The constructor initializes `this` with the provided context, backend, data place, and + * symbol. If `memory_node` is invalid, the constructor returns early without creating a data instance. Otherwise, + * it sets up the data instance, pins the host memory if required, and initializes the MSI status and prerequisite + * events. 
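+   *
+   * A construction sketch (illustrative; in typical user code logical data are
+   * obtained through a context, e.g. ctx.logical_data(...), rather than by calling
+   * this constructor directly):
+   *
+   *   double A[128];
+   *   auto lA = ctx.logical_data(A); // reference instance in host memory
+   *   auto lB = ctx.logical_data(shape_of<slice<double>>(128)); // shape only, no reference instance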
+ */ + logical_data_untyped(backend_ctx_untyped ctx, ::std::shared_ptr backend, const data_place& memory_node) + : pimpl(::std::make_shared(mv(ctx), mv(backend), memory_node)) + {} + ///@} + + ///@{ @name Symbol getter/setter + const ::std::string& get_symbol() const + { + return pimpl->get_symbol(); + } + void set_symbol(::std::string str) + { + pimpl->symbol = mv(str); + } + ///@} + + ///@{ @name allocator setter + void set_allocator(block_allocator_untyped a) + { + pimpl->set_allocator(mv(a)); + } + ///@} + + ///@{ @name Data interface getter + const data_interface& get_data_interface() const + { + assert(has_interface()); + return *pimpl->dinterface; + } + ///@} + + /** @name Get common data associated with this object. The type must be user-specified and is checked dynamically. + * @tparam T the type of common data, must be user-specified + */ + template + const T& common() const + { + assert(has_interface()); + return pimpl->dinterface->shape(); + } + + ///@{ + /** + * @name Retrieves the physical data from this logical data for a given instance id + * + * @tparam T Type stored in this object (must be supplied by user) + * @param instance_id ID of the instance for which the slice is fetched + * @return T Object represented by this handle + * + * The user-provided type is checked dynamically. Program is aborted if there's a mismatch. + */ + template + decltype(auto) instance(instance_id_t instance_id) + { + assert(has_interface()); + return pimpl->dinterface->instance(instance_id); + } + + template + decltype(auto) instance(instance_id_t instance_id) const + { + assert(has_interface()); + return pimpl->dinterface->instance(instance_id); + } + ///@} + + ///@{ + /** + * @name Retrieves the physical data from this logical data for the default instance + */ + template + decltype(auto) instance(task& tp) + { + return instance(pimpl->dinterface->get_default_instance_id(pimpl->ctx, *this, tp)); + } + + template + decltype(auto) instance(task& tp) const + { + return instance(pimpl->dinterface->get_default_instance_id(pimpl->ctx, *this, tp)); + } + ///@} + + void freeze(access_mode freeze_mode, data_place place = data_place::invalid) + { + pimpl->freeze(freeze_mode, place); + } + + template + ::std::pair get_frozen(task& fake_task, const data_place& dplace, access_mode m) + { + return pimpl->template get_frozen(fake_task, dplace, m); + } + + void unfreeze(task& fake_task, event_list prereqs = event_list()) + { + pimpl->unfreeze(fake_task, mv(prereqs)); + } + + void set_automatic_unfreeze(task& fake_task, bool flag) + { + pimpl->set_automatic_unfreeze(fake_task, flag); + } + + /** + * @brief Indicates whether a logical data is frozen or not, and what is the corresponding mode + * + */ + ::std::pair is_frozen() const + { + return pimpl->is_frozen(); + } + + /** + * @brief Allocate memory for this logical data + * + * @param ctx + * @param memory_node + * @param instance_id + * @param s + * @param extra_args + * @param prereqs + */ + void allocate(const data_place& memory_node, + instance_id_t instance_id, + ::std::ptrdiff_t& s, + void** extra_args, + event_list& prereqs) + { + pimpl->allocate(memory_node, instance_id, s, extra_args, prereqs); + } + + /** + * @brief Deallocate memory previously allocated with `allocate` + * + * @param ctx + * @param memory_node + * @param instance_id + * @param extra_args + * @param prereqs + */ + void deallocate(const data_place& memory_node, instance_id_t instance_id, void* extra_args, event_list& prereqs) + { + pimpl->deallocate(memory_node, instance_id, 
extra_args, prereqs); + } + + /** + * @brief Copy data + * + * @param ctx + * @param dst_node + * @param dst_instance_id + * @param src_node + * @param src_instance_id + * @param arg + * @param prereqs + */ + void data_copy(const data_place& dst_node, + instance_id_t dst_instance_id, + const data_place& src_node, + instance_id_t src_instance_id, + event_list& prereqs) + { + pimpl->data_copy(dst_node, dst_instance_id, src_node, src_instance_id, prereqs); + } + + /** + * @brief Writes back data + * + * @param ctx + * @param src_node + * @param instance_id + * @param prereqs + */ + void write_back(const data_place& src_node, instance_id_t instance_id, event_list& prereqs) + { + pimpl->write_back(src_node, instance_id, prereqs); + } + + /** + * @brief Get the index of the first available instance_id + * + * @param dplace + * @return The found instance_id + */ + instance_id_t find_unused_instance_id(const data_place& dplace) + { + return pimpl->find_unused_instance_id(dplace); + } + + /** + * @brief Get the index of the first used instance_id that matches the data place, or get a new one + * + * @param dplace + * @return The found instance_id + */ + instance_id_t find_instance_id(const data_place& dplace) + { + return pimpl->find_instance_id(dplace); + } + + /** + * @brief Make it possible to reuse an id + * + * @param instance_id + */ + void release_instance_id(instance_id_t instance_id) + { + pimpl->release_instance_id(instance_id); + } + + /** + * @brief Get the data pace associated with a given instance + * + * @param instance_id + * @return const data_place& + */ + const data_place& get_instance_dplace(instance_id_t instance_id) const + { + assert(pimpl); + return pimpl->get_instance_dplace(instance_id); + } + + ///@{ + /** + * @name Get the data pace associated with a given instance + */ + const data_instance& get_data_instance(instance_id_t instance_id) const + { + assert(pimpl); + return pimpl->get_data_instance(instance_id); + } + data_instance& get_data_instance(instance_id_t instance_id) + { + return pimpl->get_data_instance(instance_id); + } + ///@} + + ///@{ @name Reference count query and manipulation + bool has_ref() const + { + return pimpl->has_ref(); + } + void add_ref() + { + pimpl->add_ref(); + } + void remove_ref() + { + pimpl->remove_ref(); + } + ///@} + + /** + * @name Returns a dependency object for read/write/read and write access to this logical data. + * + * @return task_dep_untyped The dependency object corresponding to this logical data + */ + ///@{ + task_dep_untyped read(data_place dp = data_place::affine) + { + return task_dep_untyped(*this, access_mode::read, mv(dp)); + } + + task_dep_untyped write(data_place dp = data_place::affine) + { + return task_dep_untyped(*this, access_mode::write, mv(dp)); + } + + task_dep_untyped rw(data_place dp = data_place::affine) + { + return task_dep_untyped(*this, access_mode::rw, mv(dp)); + } + + task_dep_untyped redux(::std::shared_ptr op, data_place dp = data_place::affine) + { + return task_dep_untyped(*this, access_mode::redux, mv(dp), op); + } + + ///@} + + /** + * @brief Returns true if the data interface of this object has been set + */ + bool has_interface() const + { + assert(pimpl); + return pimpl->dinterface != nullptr; + } + + // This function applies the reduction operator over 2 instances, the one + // identified by "in_instance_id" is not modified, the one identified as + // "inout_instance_id" is where the result is put. 
+ // This method assumes that instances are properly allocated, but ignores + // the MSI state which is managed at other places. + // The reduction occurs on the execution place e_place, and we assume that + // both instances are located at the same data place. + void apply_redux_op(const data_place& memory_node, + const exec_place& e_place, + instance_id_t inout_instance_id, + instance_id_t in_instance_id, + event_list& prereqs) + { + // fprintf(stderr, "APPLY REDUX op(in: %d -> inout %d)\n", in_instance_id, inout_instance_id); + + assert(in_instance_id != inout_instance_id); + + auto& inst_in = get_data_instance(in_instance_id); + auto& inst_inout = get_data_instance(inout_instance_id); + + assert(inst_in.is_allocated()); + assert(inst_inout.is_allocated()); + + // In addition to existing prerequisite, we must make sure all pending operations are done + prereqs.merge(inst_in.get_read_prereq(), inst_inout.get_read_prereq(), inst_inout.get_write_prereq()); + + // Apply reduction operator + ::std::shared_ptr ops_in = get_data_instance(in_instance_id).get_redux_op(); + ::std::shared_ptr ops_inout = get_data_instance(inout_instance_id).get_redux_op(); + assert(ops_in != nullptr || ops_inout != nullptr); + + ::std::shared_ptr ops = (ops_in != nullptr) ? ops_in : ops_inout; + + ops->op_untyped(*this, memory_node, inout_instance_id, memory_node, in_instance_id, e_place, prereqs); + + // Both instances now depend on the completion of this operation + inst_in.set_read_prereq(prereqs); + inst_inout.set_read_prereq(prereqs); + } + + // Perform reductions within a node over a set of instances + // The last item of the list will contain the reduction result + void apply_redux_on_node( + const data_place& memory_node, const exec_place& e_place, ::std::vector& ids, event_list& prereqs) + { + if (ids.size() < 2) + { + return; + } + + // Apply reduction operator over all instances (there are at least two) + for (size_t i = 1; i < ids.size(); i++) + { + auto in_id = ids[i - 1]; + auto inout_id = ids[i]; + apply_redux_op(memory_node, e_place, inout_id, in_id, prereqs); + } + } + + // Asynchronously reconstruct a piece of data into the data instance + // identified by instance_id. This may require to perform reduction on + // differement data places, to perform data transfers, and to reduce these + // temporary results as well. + // If we have a non relaxed type of access after a reduction + // instance_id is the data instance which should have a coherent copy after the reduction + void reconstruct_after_redux(instance_id_t instance_id, const exec_place& e_place, event_list& prereqs) + { + // @@@@TODO@@@@ get from somewhere else (machine ?) + const size_t max_nodes = cuda_try() + 2; + ::std::vector<::std::vector> per_node(max_nodes); + + // Valid copies, we will select only one + ::std::vector ref_copies; + + // These are instances on the target memory node which we can use as + // temporary storage (ie. to receive copies) + ::std::vector available_instances; + + auto& target_instance = get_data_instance(instance_id); + auto& target_dplace = target_instance.get_dplace(); + auto target_memory_node = target_dplace; + instance_id_t target_instance_id = instance_id; + + // fprintf(stderr, "Start to plan reduction of %p on instance %d (memory_node %d)\n", d, instance_id, + // e_place->memory_node); + + // ALL is serialized for now ... 
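+    // (Plan built below: reduce the pending instances of each non-target node in
+    // place, copy each per-node partial result to the target data place, and
+    // finally reduce those partial results into the target instance.)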
+ const auto nnodes = instance_id_t(get_data_instance_count()); + +#ifdef REDUCTION_DEBUG + fprintf(stderr, "make_reduction_plan :: target instance %d\n", target_instance_id); + for (size_t i = 0; i < nnodes; i++) + { + auto& instance_i = get_data_instance(i); + fprintf(stderr, + "make_reduction_plan :: instance %d : ops %p dplace %d\n", + i, + instance_i.get_redux_op().get(), + int(instance_i.get_dplace())); + } +#endif + + // We go through all data instances + for (auto i : each(nnodes)) + { + auto& instance_i = get_data_instance(i); + +#ifdef REDUCTION_DEBUG + fprintf(stderr, + "make_reduction_plan :: instance %d status %s\n", + i, + reserved::status_to_string(instance_i.get_msir()).c_str()); +#endif + + // We exclude the target instance id because we want to reduce to this id, not from this id + if (i != instance_id && instance_i.get_msir() == reserved::msir_state_id::reduction) + { + const data_place& dplace = instance_i.get_dplace(); + per_node[to_index(dplace)].push_back(i); + // fprintf(stderr, "instance %d : get_msir() == reserved::msir_state_id::reduction memory_node = %d\n", i, + // memory_node); + continue; + } + + // If this is a valid copy, we put this aside and will select one of these + if (instance_i.get_msir() == reserved::msir_state_id::modified + || instance_i.get_msir() == reserved::msir_state_id::shared) + { + ref_copies.push_back(i); + continue; + } + + // If this is an invalid copy, and that it is on the target memory node, we can use this as a temporary + // storage + if (instance_i.get_msir() == reserved::msir_state_id::invalid && instance_i.get_dplace() == target_memory_node) + { + available_instances.push_back(i); + continue; + } + } + +#ifdef REDUCTION_DEBUG + for (auto r : ref_copies) + { + fprintf(stderr, "make_reduction_plan :: ref copy %d\n", r); + } +#endif + + // Add "the" reference copy (if any) to the list of items to reduce + if (ref_copies.size() > 0) + { + // TODO : we should sort ref_copies to avoid transfers if possible + auto ref_instance_id = ref_copies[0]; + + auto& ref_instance = get_data_instance(ref_instance_id); + const auto ref_memory_node = ref_instance.get_dplace(); + +#ifdef REDUCTION_DEBUG + fprintf(stderr, "Using %d as the reference\n", ref_instance_id); +#endif + + // The target instance id is implicitely the one where to reduce last, so we don't add it + if (ref_instance_id != target_instance_id) + { + // fprintf(stderr, "...adding %d to per_node[%d]\n", ref_instance_id, ref_memory_node); + per_node[to_index(ref_memory_node)].push_back(ref_instance_id); + } + + // Consider whether remaining copies can be used as temporary storage + // A copy that is not used as "the" reference copy, but which is on + // the target memory node, can be used + if (ref_copies.size() > 1) + { + for (auto i : each(1, ref_copies.size())) + { + auto& inst = get_data_instance(instance_id_t(i)); + if (inst.get_dplace() == target_memory_node) + { + available_instances.push_back(ref_copies[i]); + } + } + } + } + +#ifdef REDUCTION_DEBUG + for (auto a : available_instances) + { + fprintf(stderr, "make_reduction_plan :: available instance %d (can be overwritten)\n", a); + } +#endif + + // First start by invalidating all data instances ! 
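The loop above has just classified the data instances into three groups: per-node reduction partials, valid reference copies, and reusable storage on the target node. A deliberately simplified sketch of that triage, with hypothetical plain structs in place of the real data_instance machinery (illustrative only, not part of the patch):

#include <cstddef>
#include <vector>

enum class state { invalid, modified, shared, reduction };
struct instance { state s; int node; };

// Simplified stand-in for the classification above: reduction partials are
// grouped per node, valid copies are set aside as candidate references, and
// invalid instances already on the target node become temporary storage.
void triage(const std::vector<instance>& inst, int target_node, size_t target_id,
            std::vector<std::vector<size_t>>& per_node,
            std::vector<size_t>& ref_copies, std::vector<size_t>& available)
{
  for (size_t i = 0; i < inst.size(); ++i)
  {
    if (i != target_id && inst[i].s == state::reduction)
    {
      per_node[inst[i].node].push_back(i);
    }
    else if (inst[i].s == state::modified || inst[i].s == state::shared)
    {
      ref_copies.push_back(i);
    }
    else if (inst[i].s == state::invalid && inst[i].node == target_node)
    {
      available.push_back(i);
    }
  }
}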
+ for (auto i : each(nnodes)) + { + get_data_instance(i).set_msir(reserved::msir_state_id::invalid); + } + + // Naive plan : reduce all local instances (except on target node), copy to target node, reduce local instances + // on target node + for (auto n : each(max_nodes)) + { +#ifdef REDUCTION_DEBUG + fprintf(stderr, "make_reduction_plan :: node %d per_node size %d\n", n, per_node[n].size()); +#endif + // Skip if there is nothing to do on that memory node + size_t per_node_size = per_node[n].size(); + if (per_node_size == 0) + { + continue; + } + + // TODO THIS MAY BE A BUG: do we care about managed devices or host? + const auto memory_node = data_place::device(static_cast(n - 2)); + // Skip the target memory node in this step + if (memory_node == target_memory_node) + { + continue; + } + + // fprintf(stderr, "SET EXEC CTX associated to MEMORY NODE %d \n", n); + + // We now ensure that the current execution place is appropriate to + // manipulate data on that memory node (data place). This will + // typically change the current device if needed, or select an + // appropriate affinity mask. + + exec_place e_place_n = memory_node.get_affine_exec_place(); + + auto saved_place = e_place_n.activate(pimpl->ctx); + + // Reduce instances if there are more than one + if (per_node[n].size() > 1) + { + // Apply reduction operator over all instances on node n (there are at least two) +#ifdef REDUCTION_DEBUG + fprintf(stderr, "apply_redux_on_node %d\n", int(memory_node)); +#endif + apply_redux_on_node(memory_node, e_place_n, per_node[n], prereqs); + } + + instance_id_t copy_instance_id; + + // We first try to get an available instance + if (available_instances.size() > 0) + { + copy_instance_id = available_instances.back(); + available_instances.pop_back(); + + // This instance will be used, and we add the current list of events to its existing one + auto& inst = get_data_instance(copy_instance_id); + inst.add_read_prereq(prereqs); + + // fprintf(stderr, "REUSE INSTANCE %d to copy\n", copy_instance_id); +#ifdef REDUCTION_DEBUG + fprintf(stderr, "make_reduction_plan :: reuse instance %d for copy\n", copy_instance_id); +#endif + } + else + { + // There was no available instance, so we allocate a new one, and assign a new instance id + copy_instance_id = find_unused_instance_id(target_dplace); + // fprintf(stderr, "RESERVE ID %d on node %d\n", copy_instance_id, n); + +#ifdef REDUCTION_DEBUG + fprintf(stderr, "make_reduction_plan :: find_unused_instance_id => %d\n", copy_instance_id); +#endif + } + + auto& copy_inst = get_data_instance(copy_instance_id); + if (!copy_inst.is_allocated()) + { + // Allocate an instance on the memory node (data place) + // fprintf(stderr, "ALLOCATE ID %d on node %d\n", copy_instance_id, n); + + // mode is rather meaningless here (?) 
+ // fprintf(stderr, "ALLOCATE ID %d on node %d\n", copy_instance_id, int(target_memory_node)); + reserved::dep_allocate( + pimpl->ctx, *this, access_mode::read, target_memory_node, e_place_n, copy_instance_id, prereqs); + } + + // Copy the last instance to the destination + instance_id_t src_instance_id = per_node[n].back(); + // fprintf(stderr, "COPY id %d (node %d) => id %d (node %d)\n", src_instance_id, n, copy_instance_id, + // target_memory_node); + + assert(get_data_instance(src_instance_id).is_allocated()); + assert(get_data_instance(copy_instance_id).is_allocated()); + // fprintf(stderr, "REDUCTION : copy id %d to id %d\n", src_instance_id, copy_instance_id); + data_copy(target_memory_node, copy_instance_id, memory_node, src_instance_id, prereqs); + + // The copied instance will use the same operator as the source + auto ops = get_data_instance(src_instance_id).get_redux_op(); + if (ops.get()) + { + // fprintf(stderr, "SET REDUX OP for id %d (use that of id %d)\n", copy_instance_id, src_instance_id); + get_data_instance(copy_instance_id).set_redux_op(ops); + } + + // Add this instance to the list of instances on the target node + // If this is the reference id, we don't add it because it will be implicitely the last element where to + // reduce ! + if (copy_instance_id != target_instance_id) + { + // fprintf(stderr, "ADD instance %d to per_node[%d]\n", copy_instance_id, target_memory_node); + per_node[to_index(target_memory_node)].push_back(copy_instance_id); + } + + // Restore the execution place to its previous state (e.g. current CUDA device) + // fprintf(stderr, "RESET CTX\n"); + e_place_n.deactivate(pimpl->ctx, saved_place); + } + + if (per_node[to_index(target_memory_node)].size() > 1) + { + // Reduce all instances on the target node, including temporary ones + apply_redux_on_node(target_memory_node, e_place, per_node[to_index(target_memory_node)], prereqs); + } + + // Reduce to the target id, unless it was obtained from a copy + if (per_node[to_index(target_memory_node)].size() > 0) + { + instance_id_t in_id = per_node[to_index(target_memory_node)].back(); + instance_id_t inout_id = target_instance_id; + + // fprintf(stderr, "REDUCE TO TARGET ID=%d....\n", target_instance_id); + // fprintf(stderr, "\top(in: %d -> inout %d)\n", in_id, inout_id); + + // It is possible (likely) that the target id was not used for a + // reduction, so we use the same operator as the other data + // instance which we are reducing with + auto ops = get_data_instance(in_id).get_redux_op(); + if (ops.get()) + { + get_data_instance(inout_id).set_redux_op(ops); + } + + apply_redux_op(target_memory_node, e_place, inout_id, in_id, prereqs); + } + + // This is likely not optimal : but we assume the only valid copy is + // the last one. Note that in some situations, we may have other valid + // copies if we did copy the reduced data, and did not modify it again + get_data_instance(target_instance_id).set_msir(reserved::msir_state_id::modified); + } + + size_t get_data_instance_count() const + { + return pimpl->get_data_instance_count(); + } + + // prereqs is used to record which prereqs are expected before using a + // piece of data. + // + // Returns prereq_out + void enforce_msi_protocol(instance_id_t instance_id, access_mode mode, event_list& prereqs) + { + pimpl->enforce_msi_protocol(instance_id, mode, prereqs); + } + + // Find a valid copy given a hint and return its instance id. Currently the hint parameter is not used. 
+ instance_id_t find_source_instance_id(instance_id_t dst_instance_id) + { + return pimpl->find_source_instance_id(dst_instance_id); + } + + size_t hash() const + { + return pimpl->hash(); + } + + // Enable or disable write-back. Enabling write-back will cause an error if there is no reference data instance. + void set_write_back(bool flag) + { + pimpl->set_write_back(flag); + } + + // Explicitly enable or disable auto dump for this logical data + // If true, the logical data will be dumped with CUDASTF_AUTO_DUMP is set, + // if false, CUDASTF_AUTO_DUMP will be ignored for that logical data. + void set_auto_dump(bool flag) + { + pimpl->set_auto_dump(flag); + } + + bool get_auto_dump() const + { + return pimpl->get_auto_dump(); + } + + reserved::logical_data_state& get_state() + { + return pimpl->get_state(); + } + + auto& get_ctx() const + { + return pimpl->ctx; + } + + bool is_initialized() const + { + return pimpl.get() != nullptr; + } + + bool operator==(const logical_data_untyped& other) const + { + return pimpl == other.pimpl; + } + + friend inline ::std::shared_ptr pack_state(const logical_data_untyped& d) + { + return d.pimpl; + } + + friend inline logical_data_untyped unpack_state(const ::std::shared_ptr& p) + { + assert(p); + return logical_data_untyped(::std::static_pointer_cast(p)); + } + + auto& get_mutex() + { + return pimpl->get_mutex(); + } + +private: + int get_unique_id() const + { + return pimpl->get_unique_id(); + } + + ::std::shared_ptr pimpl; +}; + +// This implementation is deferred because we need the logical_data_untyped type in it +inline void reserved::logical_data_untyped_impl::erase() +{ + // fprintf(stderr, "ERASING ... %s - get_unique_id() %d was_erased %d\n", get_symbol().c_str(), + // int(get_unique_id()), + // was_erased ? 1 : 0); + + // Early exit if: + if (get_unique_id() == -1 // this logical_data_untyped was moved to another logical_data_untyped + // class which will take care of the necessary cleanups later on, OR + || !ctx // the context was already null, OR + || was_erased) + { // the logical_data_untyped was already erased + return; + } + + if (frozen_flag) + { + if (!automatic_unfreeze) + { + fprintf(stderr, "Error: destroying frozen logical data without unfreeze and no automatic unfreeze\n"); + abort(); + } + + // Freeze data automatically : we assume all dependencies on that + // frozen data are solved by other means (this is the requirement of + // the set_automatic_unfreeze API) + assert(frozen_fake_task.has_value()); + unfreeze(frozen_fake_task.value(), event_list()); + } + + auto& cs = ctx.get_stack(); + + auto wb_prereqs = event_list(); + auto& h_state = get_state(); + + /* If there is a reference instance id, it needs to be updated with a + * valid copy if that is not the case yet */ + if (enable_write_back) + { + instance_id_t ref_id = reference_instance_id; + assert(ref_id != instance_id_t::invalid); + + // Get the state in which we store previous writer, readers, ... 
+ if (h_state.current_mode == access_mode::redux) + { + // Reconstruction of the data on the reference data place needed + + // We create a logical_data from a pointer to an implementation + logical_data_untyped l(shared_from_this()); + + data_instance& ref_instance = get_data_instance(ref_id); + const data_place& ref_dplace = ref_instance.get_dplace(); + auto e = ref_dplace.get_affine_exec_place(); + l.reconstruct_after_redux(ref_id, e, wb_prereqs); + + h_state.current_mode = access_mode::none; + } + + auto s = used_instances[size_t(ref_id)].get_msir(); + if (s == reserved::msir_state_id::invalid) + { + // Write-back needed + // fprintf(stderr, "Write-back needed %s\n", get_symbol().c_str()); + + // Look where to take the valid copy from + instance_id_t src_id = find_source_instance_id(ref_id); + data_instance& src_instance = get_data_instance(src_id); + const data_place& src_dplace = src_instance.get_dplace(); + + data_instance& dst_instance = get_data_instance(ref_id); + + // Initiate the copy once src and dst are ready + auto reqs = src_instance.get_read_prereq(); + reqs.merge(dst_instance.get_read_prereq(), dst_instance.get_write_prereq()); + + write_back(src_dplace, src_id, reqs); + + src_instance.add_write_prereq(reqs); + dst_instance.set_read_prereq(reqs); + + // nobody waits for these events, so we put them in the list of dangling events + cs.add_dangling_events(reqs); + } + } + + for (auto i : each(instance_id_t(used_instances.size()))) + { + auto& inst_i = used_instances[size_t(i)]; + // Unpin an instance if it was automatically pinned + if (inst_i.automatically_pinned) + { + assert(inst_i.get_dplace() == data_place::host); + assert(dinterface); + dinterface->unpin_host_memory(i); + } + + if (inst_i.is_allocated() && inst_i.reclaimable) + { + // Make sure copies or reductions initiated by the erase are finished + auto inst_prereqs = wb_prereqs; + // Wait for preceding tasks + inst_prereqs.merge(get_pending_done_prereqs(inst_i.get_dplace())); + + inst_prereqs.merge(inst_i.get_read_prereq(), inst_i.get_write_prereq()); + + // We now ask to deallocate that piece of data + deallocate(inst_i.get_dplace(), i, inst_i.get_extra_args(), inst_prereqs); + cs.add_dangling_events(inst_prereqs); + } + + inst_i.clear(); + } + + if (wb_prereqs.size() > 0) + { + // nobody waits for these events, so we put them in the list of dangling events + cs.add_dangling_events(wb_prereqs); + } + + // Clear the state which may contain references (e.g. shared_ptr) to other + // resources. This must be done here because the destructor of the logical + // data may be called after finalize() + h_state.clear(); + + cs.logical_data_ids_mutex.lock(); + + // This unique ID is not associated with a pointer anymore (and should never be reused !) + auto& logical_data_ids = cs.logical_data_ids; + + // fprintf(stderr, "REMOVE %d from logical_data_ids %p (id count %zu)\n", get_unique_id(), + // &logical_data_ids, logical_data_ids.size()); + + // This SHOULD be in the table because that piece of data was created + // in this context and cannot already have been destroyed. + auto erased = logical_data_ids.erase(get_unique_id()); + EXPECT(erased == 1UL, "ERROR: prematurely destroyed data"); + + cs.logical_data_ids_mutex.unlock(); + + // fprintf(stderr, "AFTER REMOVE %d from logical_data_ids %p (id count %zu)\n", get_unique_id(), + // &logical_data_ids, logical_data_ids.size()); + + // Make sure we do not erase this twice.
For example after calling + // finalize() there is no need to erase it again in the constructor + was_erased = true; +} + +namespace reserved +{ + +/** + * @brief Implements STF dependencies. + * + * This method ensures that the current task (task) depends on the appropriate + * predecessors. A second method enforce_stf_deps_after will be called to make + * sure future tasks depend on that task. + */ +template +inline event_list enforce_stf_deps_before( + backend_ctx_untyped& bctx, + logical_data_untyped& handle, + const instance_id_t instance_id, + const task_type& task, + const access_mode mode, + const ::std::optional eplace) +{ + auto result = event_list(); + auto& cs = bctx.get_stack(); + // Get the context in which we store previous writer, readers, ... + auto& ctx_ = handle.get_state(); + + auto& dot = *bctx.get_dot(); + const bool dot_is_tracing = dot.is_tracing(); + + if (mode == access_mode::redux) + { + // A reduction only needs to wait for previous accesses on the data instance + ctx_.current_mode = access_mode::redux; + + if (dot_is_tracing) + { + // Add this task to the list of task accessing the logical data in redux mode + // We only store its id since this is used for dot + ctx_.pending_redux_id.push_back(task.get_unique_id()); + } + + // XXX with a mv we may avoid copies + const auto& data_instance = handle.get_data_instance(instance_id); + result.merge(data_instance.get_read_prereq(), data_instance.get_write_prereq()); + return result; + } + + // This is not a reduction, but perhaps we need to reconstruct the data first? + if (ctx_.current_mode == access_mode::redux) + { + assert(eplace.has_value()); + if (dot_is_tracing) + { + // Add a dependency between previous tasks accessing the handle + // in redux mode, and this task which forces its + // reconstruction. + for (const int redux_task_id : ctx_.pending_redux_id) + { + dot.add_edge(redux_task_id, task.get_unique_id()); + } + ctx_.pending_redux_id.clear(); + } + handle.reconstruct_after_redux(instance_id, eplace.value(), result); + ctx_.current_mode = access_mode::none; + } + + // @@@TODO@@@ cleaner ... 
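The branch that follows distinguishes the classic dependence cases of the sequential task flow model: a new writer waits for the previous writer (Write-after-Write) or for all current readers (Write-after-Read), while a reader waits for the last writer (Read-after-Write). A standalone sketch of these rules on plain task ids (illustrative only, not part of the patch):

#include <cstdio>
#include <vector>

// Illustrative-only sketch of STF dependency tracking on one piece of data:
// each new access returns the ids of the tasks it must wait for.
struct stf_state
{
  int last_writer = -1;     // -1 means "no writer yet"
  std::vector<int> readers; // tasks reading since the last write
};

std::vector<int> access(stf_state& s, int task_id, bool write)
{
  std::vector<int> wait_for;
  if (write)
  {
    if (!s.readers.empty())
    {
      wait_for = s.readers; // Write-after-Read: wait for all current readers
      s.readers.clear();
    }
    else if (s.last_writer != -1)
    {
      wait_for.push_back(s.last_writer); // Write-after-Write: wait for the previous writer
    }
    s.last_writer = task_id;
  }
  else
  {
    if (s.last_writer != -1)
    {
      wait_for.push_back(s.last_writer); // Read-after-Write: wait for the last writer
    }
    s.readers.push_back(task_id);
  }
  return wait_for;
}

int main()
{
  stf_state s;
  access(s, 0, true);              // task 0 writes
  access(s, 1, false);             // task 1 reads  -> waits for task 0
  access(s, 2, false);             // task 2 reads  -> waits for task 0
  auto waits = access(s, 3, true); // task 3 writes -> waits for tasks 1 and 2
  std::printf("task 3 waits for %zu tasks\n", waits.size()); // 2
  return 0;
}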
+ const bool write = (mode == access_mode::rw || mode == access_mode::write); + + // ::std::cout << "Notifying " << (write?"W":"R") << " access on " << get_symbol() << " by task " << + // task->get_symbol() << ::std::endl; + if (write) + { + if (ctx_.current_mode == access_mode::write) + { + // Write after Write (WAW) + assert(ctx_.current_writer.has_value()); + + const auto& cw = ctx_.current_writer.value(); + result.merge(cw.get_done_prereqs()); + + const auto cw_id = cw.get_unique_id(); + + if (dot_is_tracing) + { + dot.add_edge(cw_id, task.get_unique_id()); + } + + cs.remove_leaf_task(cw_id); + + // Replace previous writer + ctx_.previous_writer = cw; + } + else + { + // Write after read + + // The writer depends on all current readers + auto& current_readers = ctx_.current_readers; + result.merge(current_readers.get_done_prereqs()); + + for (const int reader_task_id : current_readers.get_ids()) + { + if (dot_is_tracing) + { + dot.add_edge(reader_task_id, task.get_unique_id()); + } + cs.remove_leaf_task(reader_task_id); + } + + current_readers.clear(); + ctx_.current_mode = access_mode::write; + } + // Note the task will later be set as the current writer + } + else + { + // This is a read access + if (ctx_.current_mode == access_mode::write) + { + // Read after Write + // Current writer becomes the previous writer, and all future readers will depend on this previous + // writer + assert(ctx_.current_writer.has_value()); + ctx_.previous_writer = mv(ctx_.current_writer); + const auto& pw = ctx_.previous_writer; + + result.merge(pw->get_done_prereqs()); + + const int pw_id = pw->get_unique_id(); + + if (dot_is_tracing) + { + dot.add_edge(pw_id, task.get_unique_id()); + } + + cs.remove_leaf_task(pw_id); + + ctx_.current_mode = access_mode::none; + // ::std::cout << "CHANGING to FALSE for " << symbol << ::std::endl; + } + else if (ctx_.previous_writer.has_value()) + { + const auto& pw = ctx_.previous_writer; + result.merge(pw->get_done_prereqs()); + + const int pw_id = pw->get_unique_id(); + if (dot_is_tracing) + { + dot.add_edge(pw_id, task.get_unique_id()); + } + + cs.remove_leaf_task(pw_id); + } + + // Note : the task will later be added to the list of readers + } + + return result; +} + +template +inline void enforce_stf_deps_after(logical_data_untyped& handle, const task_type& task, const access_mode mode) +{ + if (mode == access_mode::redux) + { + // no further action is required + return; + } + + // Get the context in which we store previous writer, readers, ... + auto& ctx_ = handle.get_state(); + + if (mode == access_mode::rw || mode == access_mode::write) + { + ctx_.current_writer = task; + } + else + { + // Add to the list of readers + ctx_.current_readers.add(task); + } +} + +/* Enforce task dependencies, allocations, and copies ... 
*/ +inline void fetch_data( + backend_ctx_untyped& ctx, + logical_data_untyped& d, + const instance_id_t instance_id, + task& t, + access_mode mode, + const ::std::optional<exec_place> eplace, + const data_place& dplace, + event_list& result) +{ + event_list stf_prereq = reserved::enforce_stf_deps_before(ctx, d, instance_id, t, mode, eplace); + + if (d.has_interface()) + { + // Allocate data if needed (and possibly reclaim memory to do so) + reserved::dep_allocate(ctx, d, mode, dplace, eplace, instance_id, stf_prereq); + + /* + * DATA LAZY UPDATE (relying on the MSI protocol) + */ + + // This will initiate a copy if the data was not valid + d.enforce_msi_protocol(instance_id, mode, stf_prereq); + + stf_prereq.optimize(); + + // Gather all prereqs required to fetch this piece of data into the + // dependencies of the task. + // Even a temporary allocation may require enforcing dependencies + // because, for instance, we may be reclaiming memory. + result.merge(mv(stf_prereq)); + } +} + +}; // namespace reserved + +// This implementation is deferred because we need the logical_data_untyped type in it +template +::std::pair +reserved::logical_data_untyped_impl::get_frozen(task& fake_task, const data_place& dplace, access_mode m) +{ + event_list prereqs; + + // This will pick an instance id, either already valid or which needs to be allocated/populated + auto id = find_instance_id(dplace); + + // Get the logical_data_untyped from the current impl + logical_data_untyped d(shared_from_this()); + + // Exactly like a task that would fetch a piece of data, this introduces + // appropriate dependencies with previous readers/writers (or frozen data + // deps !). Then, if the data wasn't available on the data place, it can be + // allocated and a copy from a valid source can be made. + // This will also update the MSI states of the logical data instances.
+ reserved::fetch_data(ctx, d, id, fake_task, m, ::std::nullopt, dplace, prereqs); + + // Make sure we now have a valid copy + assert(used_instances[int(id)].is_allocated()); + assert(used_instances[int(id)].get_msir() != reserved::msir_state_id::invalid); + + return ::std::pair(dinterface->instance(id), mv(prereqs)); +} + +// This implementation is deferred because we need the logical_data_untyped type in it +inline void reserved::logical_data_untyped_impl::unfreeze(task& fake_task, event_list prereqs) +{ + assert(frozen_flag); + + // Unlike regular tasks, unfreeze may affects multiple data instances, so + // we do not reuse the same code path to update prereqs on data instances + for (auto i : each(used_instances.size())) + { + if (frozen_mode == access_mode::read) + { + used_instances[i].add_write_prereq(prereqs); + } + else + { + // rw or write + used_instances[i].set_read_prereq(prereqs); + used_instances[i].clear_write_prereq(); + } + } + + // Get the logical_data_untyped from the current impl + logical_data_untyped d(shared_from_this()); + + // Keep track of the previous readers/writers and generate dot + reserved::enforce_stf_deps_after(d, fake_task, frozen_mode); + + frozen_flag = false; +} + +inline void backend_ctx_untyped::impl::erase_all_logical_data() +{ + /* Since we modify the map while iterating on it, we will copy it */ + stack.logical_data_ids_mutex.lock(); + auto logical_data_ids_cpy = stack.logical_data_ids; + stack.logical_data_ids_mutex.unlock(); + + /* Erase all logical data created in this context */ + for (auto p : logical_data_ids_cpy) + { + auto& d = p.second; + d.erase(); + } +} + +// Defined here to avoid circular dependencies +inline logical_data_untyped task_dep_untyped::get_data() const +{ + assert(data); + return unpack_state(data); +} + +// Defined here to avoid circular dependencies +template +inline decltype(auto) task_dep::instance(task& tp) const +{ + auto t = get_data(); + return static_cast&>(t).instance(tp); +} + +// Defined here to avoid circular dependencies +inline instance_id_t task::find_data_instance_id(const logical_data_untyped& d) const +{ + for (auto& it : pimpl->deps) + { + if (d == it.get_data()) + { + // We found the data + return it.get_instance_id(); + } + } + + // This task does not has d in its dependencies + fprintf(stderr, "FATAL: could not find this piece of data in the current task.\n"); + abort(); + + return instance_id_t::invalid; +} + +template +inline decltype(auto) task::get(size_t submitted_index) const +{ + if (pimpl->reordered_indexes.empty()) + { + pimpl->initialize_reordered_indexes(); + } + + EXPECT(submitted_index < pimpl->reordered_indexes.size()); + size_t reordered_id = pimpl->reordered_indexes[submitted_index]; + instance_id_t instance_id = pimpl->deps[reordered_id].get_instance_id(); + logical_data_untyped d = pimpl->deps[reordered_id].get_data(); + return d.template instance(instance_id); +} + +/** + * @brief Represents typed logical data. + * + * @tparam T The type of the underlying data. 
+ */ +template +class logical_data : public logical_data_untyped +{ +public: + /// @brief Alias for `T` + using element_type = T; + /// @brief Alias for `shape_of` + using shape_t = shape_of; + + /// @brief Default constructor + logical_data() = default; + + /// @brief Constructor from an untyped logical data + /// + /// Warning : no checks are done to ensure the type used to create the + /// untyped logical data matches, it is the responsability of the caller to + /// ensure this is a valid conversion + logical_data(logical_data_untyped&& u) + : logical_data_untyped(u) + {} + + /** + * @brief Constructor + * + * @tparam U Backend type + * @param ctx Backend context + * @param instance Reference instance used for initializing this logical data + * @param dp Data place + * @param data_prereq + */ + template + logical_data(backend_ctx_untyped ctx, ::std::shared_ptr instance, data_place dp) + : logical_data_untyped(mv(ctx), mv(instance), mv(dp)) + { + // Note that we did not put this static_assertion in the default + // constructor to keep using = default which is preferred + static_assert(sizeof(logical_data) == sizeof(logical_data_untyped), + "Cannot add state here because it would be lost through slicing"); + + EXPECT(get_ctx()); + static_assert(::std::is_same_v); + static_assert(::std::is_same_v, typename U::shape_t>); + } + + ///@{ @name Execution place getter + const shape_t& shape() const + { + return logical_data_untyped::common(); + } + ///@} + + ///@{ @name Instance getter for a given instance or the default instance + decltype(auto) instance(instance_id_t instance_id) + { + return logical_data_untyped::instance(instance_id); + } + decltype(auto) instance(task& tp) + { + return instance(get_data_interface().get_default_instance_id(get_ctx(), *this, tp)); + } + ///@} + + ///@{ @name Assign a symbolic name for this object + logical_data& set_symbol(::std::string str) + { + logical_data_untyped::set_symbol(mv(str)); + return *this; + } + ///@} + + ///@{ @name Select a custom allocator for this logical data + logical_data& set_allocator(block_allocator_untyped custom_allocator) + { + logical_data_untyped::set_allocator(mv(custom_allocator)); + return *this; + } + ///@} + + ///@{ @name Get hash value + size_t hash() const + { + return logical_data_untyped::hash(); + } + ///@} + + ///@{ + /** + * @name Return a task_dep object for reading and/or writing this logical data. + * + * @tparam Pack Additional parameter types for `task_dep`'s constructor, if any + * @param pack Additional arguments for `task_dep`'s constructor, if any + * @return task_dep The object encapsulating access + */ + template + auto read(Pack&&... pack) const + { + using U = readonly_type_of; + // The constness of *this implies that access mode is read + return task_dep(*this, /* access_mode::read, */ ::std::forward(pack)...); + } + + template + task_dep write(Pack&&... pack) + { + return task_dep(*this, access_mode::write, ::std::forward(pack)...); + } + + template + task_dep rw(Pack&&... pack) + { + return task_dep(*this, access_mode::rw, ::std::forward(pack)...); + } + + template + task_dep redux(Pack&&... pack) + { + return task_dep(*this, access_mode::redux, ::std::forward(pack)...); + } + ///@} +}; + +/** + * @brief Reclaims memory from allocated data instances. + * + * Reclaims memory for requested size. It considers different passes depending on the specified criteria. Memory is + * reclaimed from data instances that are not currently in use and meet the pass criteria. 
+ * + * @param[in] ctx Pointer to the backend context state. + * @param[in] place Data place from where the memory should be reclaimed. + * @param[in] requested_s The size of memory to be reclaimed. + * @param[out] reclaimed_s The size of memory that was successfully reclaimed. + * @param[in,out] prereqs A unique pointer to a list of events that need to be completed before memory can be + * reclaimed. + */ +inline void reclaim_memory( + backend_ctx_untyped& ctx, const data_place& place, size_t requested_s, size_t& reclaimed_s, event_list& prereqs) +{ + const auto memory_node = to_index(place); + + auto& cs = ctx.get_stack(); + + reclaimed_s = 0; + + ctx.get_dot()->set_current_color("red"); + SCOPE(exit) + { + ctx.get_dot()->set_current_color("white"); + }; + + int first_pass = 0; + if (getenv("NAIVE_RECLAIM")) + { + first_pass = 2; + } + + /* + * To implement a Last Recently Used (LRU) reclaiming policy, we + * approximate access time by the largest prereq ID associated to a data + * instance. We thus don't reclaim data in the order they appear in the + * map, but accordingly to this largest ID. + * + * We therefore create a vector of data to select the most appropriate data + * after sorting it. + */ + using tup_t = ::std::tuple; + ::std::vector eligible_data; + + // Go through all entries + // Pass 0 : only locally invalid instances + // Pass 1 : only shared instances + // Pass 2 : all instances + // We stop as soon as we have reclaimed enough memory + size_t eligible_data_size = 0; + for (int pass = first_pass; (reclaimed_s < requested_s) && (eligible_data_size < requested_s) && (pass <= 2); pass++) + { + // Get the table of all logical data ids used in this context (the parent of the task) + ::std::lock_guard<::std::mutex> guard(cs.logical_data_ids_mutex); + auto& logical_data_ids = cs.logical_data_ids; + for (auto& e : logical_data_ids) + { + auto& d = e.second; + + // fprintf(stderr, "Trying to reclaim logical data (id %d)\n", e.first); + + if (d.has_ref()) + { + // Ignore logical data which are currently being used (e.g. those being acquired) + continue; + } + + // Do not try to reclaim memory from a frozen logical data (XXX we might restrict this for write-only or rw + // frozen data) + if (d.frozen_flag) + { + continue; + } + + // We go through all data instances + + size_t ninstances = d.get_data_instance_count(); + for (auto i : each(instance_id_t(ninstances))) + { + auto& inst = d.get_data_instance(i); + + // Some data instances are not eligible for reclaiming (e.g. memory allocated by users) + if (!inst.reclaimable) + { + continue; + } + + // Only reclaim allocated memory + if (!inst.is_allocated()) + { + continue; + } + + // Only reclaim on the requested data place + if (to_index(inst.get_dplace()) != memory_node) + { + continue; + } + + assert(inst.get_used()); + + bool eligible = true; + auto current_state = inst.get_msir(); + + switch (pass) + { + case 0: + // Only process locally invalid instances + if (current_state != reserved::msir_state_id::invalid) + { + eligible = false; + } + break; + case 1: + // Only process instances which will not require a write back (ie. 
exclude + // reserved::msir_state_id::modified) + if (current_state == reserved::msir_state_id::modified) + { + eligible = false; + } + break; + default: + // no-op + break; + } + + // If this data instance meets all criteria, do reclaim it + if (eligible) + { + if (pass == 2 || pass == 1) + { + // In this case, we do not try to reclaim it + // immediately, but we put it in a vector which will be + // sorted after in order to implement a LRU policy + eligible_data.emplace_back(&inst, &d, instance_id_t(i), inst.max_prereq_id()); + eligible_data_size += inst.allocated_size; + continue; + } + // This does not consumes prereqs, but produces new ones + prereqs.merge(d.do_reclaim(place, i)); + + reclaimed_s += inst.allocated_size; + // fprintf(stderr, "RECLAIMED %ld / %ld - last prereq id was %d\n", reclaimed_s, requested_s); + if (reclaimed_s >= requested_s) + { + return; + } + } + } // end loop over of data instances + } // end loop over logical data ids + } // end loop pass + + // Setting this variable to a non-zero value will disable the LRU policy + const char* str = getenv("RECLAIM_NO_SORT"); + if (!str || atoi(str) == 0) + { + ::std::sort(eligible_data.begin(), eligible_data.end(), [](const auto& a, const auto& b) { + return ::std::get<3>(a) < ::std::get<3>(b); + }); + } + + for (auto& [inst, d, i, _] : eligible_data) + { + prereqs.merge(d->do_reclaim(place, i)); + reclaimed_s += inst->allocated_size; + // fprintf(stderr, "RECLAIMED %ld / %ld - last prereq id was %d\n", reclaimed_s, requested_s); + + // Stop when we have reclaimed enough memory + if (reclaimed_s >= requested_s) + { + return; + } + } +} + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/machine.cuh b/cudax/include/cuda/experimental/__stf/internal/machine.cuh new file mode 100644 index 00000000000..a58a7377875 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/machine.cuh @@ -0,0 +1,132 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief The machine class abstract low-level CUDA mechanisms such as enabling P2P accesses + * + * This class should also provide information about the topology of the machine + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include +#include + +#include + +namespace cuda::experimental::stf::reserved +{ + +/** + * @brief Singleton object abstracting a machine able to set up CUDA peer accesses. + * + */ +class machine : public reserved::meyers_singleton +{ +protected: + machine() + { + cuda_safe_call(cudaGetDeviceCount(&ndevices)); + // TODO: remove this call? Currently anyone getting a machine gets peer access + // and subsequent explicit calls to enable_peer_accesses() are no-ops. + enable_peer_accesses(); + } + + // Nobody can copy or assign. 
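Returning briefly to the reclaiming pass that ends above: the candidates gathered in passes 1 and 2 are sorted by the largest prerequisite id they have seen, a cheap proxy for the time of last use, and reclaimed in that order until the request is satisfied. A standalone sketch of that selection (illustrative only, not part of the patch; the struct and sizes are made up):

#include <algorithm>
#include <cstdio>
#include <vector>

// Illustrative-only sketch of the LRU-style selection: candidates are sorted
// by a proxy for their last use and reclaimed until enough memory is freed.
struct candidate { size_t bytes; int last_prereq_id; };

int main()
{
  std::vector<candidate> eligible = {{256, 42}, {128, 7}, {512, 19}};
  const size_t requested = 300;

  std::sort(eligible.begin(), eligible.end(),
            [](const candidate& a, const candidate& b) { return a.last_prereq_id < b.last_prereq_id; });

  size_t reclaimed = 0;
  for (const auto& c : eligible)
  {
    reclaimed += c.bytes; // the real code issues an asynchronous deallocation here
    if (reclaimed >= requested)
    {
      break;
    }
  }
  std::printf("reclaimed %zu bytes\n", reclaimed); // 128 + 512 = 640
  return 0;
}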
+ machine& operator=(const machine&) = delete; + machine(const machine&) = delete; + // Clients can't destroy an object (because they can't create one in the first place). + ~machine() = default; + +public: + void enable_peer_accesses() + { + // Only once + if (initialized_peer_accesses) + { + return; + } + + int current_dev; + cuda_safe_call(cudaGetDevice(¤t_dev)); + + for (int d = 0; d < ndevices; d++) + { + cuda_safe_call(cudaSetDevice(d)); + + cudaMemPool_t mempool; + cuda_safe_call(cudaDeviceGetDefaultMemPool(&mempool, d)); + + for (int peer_d = 0; peer_d < ndevices; peer_d++) + { + if (peer_d == d) + { + continue; + } + int can_access_peer; + cuda_safe_call(cudaDeviceCanAccessPeer(&can_access_peer, d, peer_d)); + + uint64_t threshold = UINT64_MAX; + cuda_safe_call(cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold)); + + if (can_access_peer) + { + cudaError_t res = cudaDeviceEnablePeerAccess(peer_d, 0); + assert(res == cudaErrorPeerAccessAlreadyEnabled || res == cudaSuccess); + if (res == cudaErrorPeerAccessAlreadyEnabled) + { + fprintf(stderr, "[DEV %d] peer access already enabled with device %d\n", d, peer_d); + } + + // Enable access to remote memory pool + cudaMemAccessDesc desc = {.location = {.type = cudaMemLocationTypeDevice, .id = peer_d}, + .flags = cudaMemAccessFlagsProtReadWrite}; + cuda_safe_call(cudaMemPoolSetAccess(mempool, &desc, 1 /* numDescs */)); + // ::std::cout << "[DEV " << d << "] cudaMemPoolSetAccess to peer "<< peer_d << ::std::endl; + } + else + { + fprintf(stderr, "[DEV %d] cannot enable peer access with device %d\n", d, peer_d); + } + } + } + + cuda_safe_call(cudaSetDevice(current_dev)); + + initialized_peer_accesses = true; + }; + + // Naive solution + int get_ith_closest_node(int node, int ith) + { + int nnodes = ndevices + 1; + + return (node + ith) % nnodes; + } + +private: + bool initialized_peer_accesses = false; + int ndevices; +}; + +} // namespace cuda::experimental::stf::reserved diff --git a/cudax/include/cuda/experimental/__stf/internal/msir.cuh b/cudax/include/cuda/experimental/__stf/internal/msir.cuh new file mode 100644 index 00000000000..26cd92f0c9c --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/msir.cuh @@ -0,0 +1,226 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +/* + * We here define the protocol to keep data copies up to date + * Task dependencies are supposed to be enforced by the STF model, so this is + * intended to implement the required data movements/allocations. 
+ * + * M : msir_state_id::modified + * S : msir_state_id::shared + * I : msir_state_id::invalid + * R : msir_state_id::reduction + */ + +#include + +namespace cuda::experimental::stf +{ + +namespace reserved +{ + +enum class msir_state_id +{ + invalid, + modified, + shared, + reduction, +}; + +inline ::std::string status_to_string(msir_state_id status) +{ + switch (status) + { + case msir_state_id::modified: + return "msir_state_id::modified"; + case msir_state_id::shared: + return "msir_state_id::shared"; + case msir_state_id::invalid: + return "msir_state_id::invalid"; + case msir_state_id::reduction: + return "REDUCTION"; + } + + return "UNKNOWN"; +} + +inline char status_to_char(msir_state_id status) +{ + switch (status) + { + case msir_state_id::modified: + return 'M'; + case msir_state_id::shared: + return 'S'; + case msir_state_id::invalid: + return 'I'; + case msir_state_id::reduction: + return 'R'; + } + + return 'U'; +} + +/* + * Generic interface that does not suppose how memory places are organized. + * Could be a concept in the future. Passive documentation for now. + */ +/* +template class msi_state { +public: + // Returns the state of a piece of data at a specific place + int shape(memory_place_interface place, class event_list **msi_prereq); + + // Sets the status for a specific place + void set_state(memory_place_interface place, int state, class event_list *new_msi_prereq); + + // Update data status when accessing data at the specified place with the specified access type + // Returns prereq_out + class event_list *update_state(memory_place_interface place, int access_mode, class event_list *prereq_in); + + // Find a valid copy to move a piece of data to dst_place. Returned value is a possible source place + // TODO perhaps we should return a list + // TODO perhaps this should return a pair of (place+prereq) + memory_place_interface find_source_place(memory_place_interface dst_place); + +private: +}; +*/ + +class per_data_instance_msi_state +{ +public: + per_data_instance_msi_state() {} + + ~per_data_instance_msi_state() {} + + msir_state_id get_msir() const + { + return msir; + } + + void set_msir(msir_state_id _msir) + { + msir = _msir; + } + + bool is_allocated() const + { + return allocated; + } + + void set_allocated(bool _allocated) + { + allocated = _allocated; + } + + const event_list& get_read_prereq() const + { + return read_prereq; + } + + const event_list& get_write_prereq() const + { + return write_prereq; + } + + void set_read_prereq(event_list prereq) + { + read_prereq = mv(prereq); + } + + void set_write_prereq(event_list prereq) + { + write_prereq = mv(prereq); + } + + /** + * @brief Compute the maximum async prereq ID in this list (for both read and write accesses). 
+ */ + int max_prereq_id() const + { + int res = read_prereq.max_prereq_id(); + res = ::std::max(res, write_prereq.max_prereq_id()); + return res; + } + + template + void add_read_prereq(T&& prereq) + { + read_prereq.merge(::std::forward(prereq)); + + if (read_prereq.size() > 16) + { + read_prereq.optimize(); + } + } + template + void add_write_prereq(T&& prereq) + { + write_prereq.merge(::std::forward(prereq)); + + if (write_prereq.size() > 16) + { + write_prereq.optimize(); + } + } + + void clear_read_prereq() + { + read_prereq.clear(); + } + void clear_write_prereq() + { + write_prereq.clear(); + } + + size_t hash() const + { + return hash_all(allocated, (int) msir); + } + +private: + // We need to fulfill these events __and those in read_prereq__ to modify the instance + event_list write_prereq; + + // We need to fulfill these events to read the instance without modifying it + event_list read_prereq; + + msir_state_id msir = msir_state_id::invalid; // MSIR = msir_state_id::modified, ... + bool allocated = false; +}; + +} // end namespace reserved + +// Overload hash to compute the hash of a per_data_instance_msi_state +// class from the MSI and allocated states. +template <> +struct hash +{ + ::std::size_t operator()(reserved::per_data_instance_msi_state const& s) const noexcept + { + return s.hash(); + } +}; + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh new file mode 100644 index 00000000000..46a80e01895 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh @@ -0,0 +1,475 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include // for null_partition +#include +#include + +#include + +namespace cuda::experimental::stf +{ + +#ifdef __CUDACC__ + +class stream_ctx; +class graph_ctx; + +namespace reserved +{ + +/* + * @brief A CUDA kernel for executing a function `f` in parallel over `n` threads. + * + * This kernel takes a function `f` and its parameters `p`, then executes `f(i, p...)` + * for each thread index `i` from 0 through `n-1`. + * + * @tparam F The type of the function to execute. + * @tparam P The types of additional parameters to pass to the function. + * + * @param n The number of times to execute the function in parallel. + * @param f The function to execute. + * @param p The additional parameters to pass to the function `f`. 
+ */ +template +__global__ void loop(const _CCCL_GRID_CONSTANT size_t n, shape_t shape, F f, tuple_args targs) +{ + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + const size_t step = blockDim.x * gridDim.x; + + // This will explode the targs tuple into a pack of data + + // Help the compiler which may not detect that a device lambda is calling a device lambda + CUDASTF_NO_DEVICE_STACK + auto explode_args = [&](auto... data) { + // For every linearized index in the shape + for (; i < n; i += step) + { + CUDASTF_NO_DEVICE_STACK + auto explode_coords = [&](auto... coords) { + f(coords..., data...); + }; + ::std::apply(explode_coords, shape.index_to_coords(i)); + } + }; + ::std::apply(explode_args, mv(targs)); +} + +/** + * @brief Supporting class for the parallel_for construct + * + * This is used to implement operators such as ->* on the object produced by `ctx.parallel_for` + * + * @tparam deps_t + */ +template +class parallel_for_scope +{ +public: + /// @brief Constructor + /// @param ctx Reference to context (it will not be copied, so careful with lifetimes) + /// @param e_place Execution place for this parallel_for + /// @param shape Shape to iterate + /// @param ...deps Dependencies + parallel_for_scope(context& ctx, exec_place e_place, shape_t shape, task_dep... deps) + : deps{deps...} + , ctx(ctx) + , e_place(mv(e_place)) + , shape(mv(shape)) + { + dump_hooks = reserved::get_dump_hooks(&ctx, deps...); + } + + parallel_for_scope(const parallel_for_scope&) = delete; + parallel_for_scope(parallel_for_scope&&) = default; + parallel_for_scope& operator=(const parallel_for_scope&) = delete; + + /** + * @brief Retrieves the task dependencies in an untyped vector. + * + * @return The task dependencies as a `task_dep_vector_untyped` object. + */ + const task_dep_vector_untyped& get_task_deps() const + { + return deps; + } + + /** + * @brief Retrieves the symbol associated with the task. + * + * @return A constant reference to the symbol string. + */ + const ::std::string& get_symbol() const + { + return symbol; + } + + /** + * @brief Sets the symbol associated with the task. + * + * This method uses a custom move function `mv` to handle the transfer of ownership. + * + * @param s The new symbol string. + * @return A reference to the current object, allowing for method chaining. + */ + auto& set_symbol(::std::string s) + { + symbol = mv(s); + return *this; + } + + /** + * @brief Overloads the `operator->*` to perform parallel computations using a user-defined function or lambda. + * + * This method initializes various runtime entities (task, CUDA events, etc.) and applies the parallel_for construct + * over the task's shape. It can work with different execution places and also supports partitioning the shape based + * on a given partitioner. + * + * @tparam Fun The type of the user-defined function or lambda. + * + * @param f The user-defined function or lambda that is to be applied in parallel. + * + * @note + * The method will perform several operations such as: + * - Initialize and populate the task with the required dependencies and symbol. + * - Create CUDA events and record them if needed. + * - Determine whether the operation is to be performed on device or host and proceed accordingly. + * - If there is a partitioner, the method can also break the task's shape into sub-shapes and perform + * computations in parts. + * - Log task time if statistics gathering is enabled. 
+ * + * @pre `e_place.affine_data_place()` should match `t.get_affine_data_place()` where `t` is the task generated from + * `ctx.task(e_place)`. + */ + template + void operator->*(Fun&& f) + { + auto& dot = *ctx.get_dot(); + auto& statistics = reserved::task_statistics::instance(); + auto t = ctx.task(e_place); + + assert(e_place.affine_data_place() == t.get_affine_data_place()); + + // If there is a partitioner, we ensure there is a proper affine data place for this execution place + if constexpr (!::std::is_same_v) + { + // This is only meaningful for grid of places + if (e_place.is_grid()) + { + // Create a composite data place defined by the grid of places + the partitioning function + t.set_affine_data_place(data_place::composite(partitioner_t(), e_place.as_grid())); + } + } + + t.add_post_submission_hook(dump_hooks); + + t.add_deps(deps); + if (!symbol.empty()) + { + t.set_symbol(symbol); + } + + cudaEvent_t start_event, end_event; + + const bool record_time = t.schedule_task() || statistics.is_calibrating_to_file(); + + nvtx_range nr(t.get_symbol().c_str()); + t.start(); + + int device = -1; + + SCOPE(exit) + { + t.end_uncleared(); + if constexpr (::std::is_same_v) + { + if (record_time) + { + cuda_safe_call(cudaEventRecord(end_event, t.get_stream())); + cuda_safe_call(cudaEventSynchronize(end_event)); + + float milliseconds = 0; + cuda_safe_call(cudaEventElapsedTime(&milliseconds, start_event, end_event)); + + if (dot.is_tracing()) + { + dot.template add_vertex_timing(t, milliseconds, device); + } + + if (statistics.is_calibrating()) + { + statistics.log_task_time(t, milliseconds); + } + } + } + + t.clear(); + }; + + if constexpr (::std::is_same_v) + { + if (record_time) + { + cuda_safe_call(cudaGetDevice(&device)); // We will use this to force it during the next run + // Events must be created here to avoid issues with multi-gpu + cuda_safe_call(cudaEventCreate(&start_event)); + cuda_safe_call(cudaEventCreate(&end_event)); + cuda_safe_call(cudaEventRecord(start_event, t.get_stream())); + } + } + + if (dot.is_tracing()) + { + dot.template add_vertex(t); + } + +# if __NVCOMPILER + // With nvc++, all lambdas can run on host and device. + static constexpr bool is_extended_host_device_lambda_closure_type = true, + is_extended_device_lambda_closure_type = false; +# else + // With nvcpp, dedicated traits tell how a lambda can be executed. + static constexpr bool is_extended_host_device_lambda_closure_type = + __nv_is_extended_host_device_lambda_closure_type(Fun), + is_extended_device_lambda_closure_type = __nv_is_extended_device_lambda_closure_type(Fun); +# endif + + if constexpr (is_extended_host_device_lambda_closure_type) + { + // Can run on both - decide dynamically + if (e_place == exec_place::host) + { + return do_parallel_for_host(::std::forward(f), shape, t); + } + // Fall through for the device implementation + } + else if constexpr (is_extended_device_lambda_closure_type) + { + // Lambda can run only on device - make sure they're not trying it on the host + EXPECT(e_place != exec_place::host, "Attempt to run a device function on the host."); + // Fall through for the device implementation + } + else + { + // Lambda can run only on the host - make sure they're not trying it elsewhere + EXPECT(e_place == exec_place::host, "Attempt to run a host function on a device."); + return do_parallel_for_host(::std::forward(f), shape, t); + } + + // Device land. Must use the supplemental if constexpr below to avoid compilation errors. 
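Whichever backend is selected below, the device path ultimately launches the reserved::loop kernel shown earlier, which is a conventional grid-stride loop over the linearized shape. A minimal standalone CUDA sketch of that pattern (illustrative only, not part of the patch):

#include <cstdio>
#include <cuda_runtime.h>

// Grid-stride loop: each thread starts at its global index and advances by the
// total number of threads in the grid until the whole range is covered.
__global__ void axpy(size_t n, double a, const double* x, double* y)
{
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  const size_t step = blockDim.x * gridDim.x;
  for (; i < n; i += step)
  {
    y[i] += a * x[i];
  }
}

int main()
{
  const size_t n = 1 << 20;
  double *x, *y;
  cudaMallocManaged(&x, n * sizeof(double));
  cudaMallocManaged(&y, n * sizeof(double));
  for (size_t i = 0; i < n; i++)
  {
    x[i] = 1.0;
    y[i] = 2.0;
  }
  axpy<<<128, 256>>>(n, 3.0, x, y);
  cudaDeviceSynchronize();
  std::printf("y[0] = %f\n", y[0]); // 5.0
  cudaFree(x);
  cudaFree(y);
  return 0;
}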
+ if constexpr (is_extended_host_device_lambda_closure_type || is_extended_device_lambda_closure_type) + { + if (!e_place.is_grid()) + { + // Apply the parallel_for construct over the entire shape on the + // execution place of the task + do_parallel_for(f, e_place, shape, t); + } + else + { + if constexpr (::std::is_same_v) + { + fprintf(stderr, "Fatal: Grid execution requires a partitioner.\n"); + abort(); + } + else + { + size_t grid_size = t.grid_dims().size(); + for (size_t i = 0; i < grid_size; i++) + { + t.set_current_place(pos4(i)); + const auto sub_shape = partitioner_t::apply(shape, pos4(i), t.grid_dims()); + do_parallel_for(f, t.get_current_place(), sub_shape, t); + t.unset_current_place(); + } + } + } + } + else + { + // This point is never reachable, but we can't prove that statically. + assert(!"Internal CUDASTF error."); + } + } + + // Executes the loop on a device, or use the host implementation + template + void do_parallel_for( + Fun&& f, const exec_place& sub_exec_place, const sub_shape_t& sub_shape, typename context::task_type& t) + { + // parallel_for never calls this function with a host. + assert(sub_exec_place != exec_place::host && "Internal CUDASTF error."); + + if (sub_exec_place == exec_place::device_auto) + { + // We have all latitude - recurse with the current device. + return do_parallel_for(::std::forward(f), exec_place::current_device(), sub_shape, t); + } + + using Fun_no_ref = ::std::remove_reference_t; + + static const auto conf = [] { + // We are using int instead of size_t because CUDA API uses int for occupancy calculations + int min_grid_size = 0, max_block_size = 0, block_size_limit = 0; + // compute_kernel_limits will return the min number of blocks/max + // block size to optimize occupancy, as well as some block size + // limit. We choose to dimension the kernel of the parallel loop to + // optimize occupancy. + reserved::compute_kernel_limits( + &reserved::loop>, + min_grid_size, + max_block_size, + 0, + false, + block_size_limit); + return ::std::pair(size_t(min_grid_size), size_t(max_block_size)); + }(); + + const auto [block_size, min_blocks] = conf; + size_t n = sub_shape.size(); + + // If there is no item in that shape, no need to launch a kernel ! + if (n == 0) + { + // fprintf(stderr, "Empty shape, no kernel ...\n"); + return; + } + + // max_blocks is computed so we have one thread per element processed + const auto max_blocks = (n + block_size - 1) / block_size; + + // TODO: improve this + size_t blocks = ::std::min(min_blocks * 3 / 2, max_blocks); + + if constexpr (::std::is_same_v) + { + reserved::loop<<(blocks), static_cast(block_size), 0, t.get_stream()>>>( + static_cast(n), sub_shape, mv(f), deps.instance(t)); + } + else if constexpr (::std::is_same_v) + { + // Put this kernel node in the child graph that implements the graph_task<> + cudaKernelNodeParams kernel_params; + + kernel_params.func = (void*) reserved::loop>; + + kernel_params.gridDim = dim3(static_cast(blocks)); + kernel_params.blockDim = dim3(static_cast(block_size)); + + auto arg_instances = deps.instance(t); + // It is ok to use reference to local variables because the arguments + // will be used directly when calling cudaGraphAddKernelNode + void* kernelArgs[] = {&n, const_cast(static_cast(&sub_shape)), &f, &arg_instances}; + kernel_params.kernelParams = kernelArgs; + kernel_params.extra = nullptr; + + kernel_params.sharedMemBytes = 0; + + // This task corresponds to a single graph node, so we set that + // node instead of creating an child graph. 
Input and output + // dependencies will be filled later. + cuda_safe_call(cudaGraphAddKernelNode(&t.get_node(), t.get_ctx_graph(), nullptr, 0, &kernel_params)); + // fprintf(stderr, "KERNEL NODE => graph %p, gridDim %d blockDim %d (n %ld)\n", t.get_graph(), + // kernel_params.gridDim.x, kernel_params.blockDim.x, n); + } + else + { + fprintf(stderr, "Internal error.\n"); + abort(); + } + } + + // Executes loop on the host. + template + void do_parallel_for_host(Fun&& f, const sub_shape_t& shape, typename context::task_type& t) + { + const size_t n = shape.size(); + + // Wrap this for_each_n call in a host callback launched in CUDA stream associated with that task + // To do so, we pack all argument in a dynamically allocated tuple + // that will be deleted by the callback + auto args = + new ::std::tuple(deps.instance(t), n, mv(f), shape); + + // The function which the host callback will execute + auto host_func = [](void* untyped_args) { + auto p = static_cast(untyped_args); + SCOPE(exit) + { + delete p; + }; + + auto& data = ::std::get<0>(*p); + const size_t n = ::std::get<1>(*p); + Fun&& f = mv(::std::get<2>(*p)); + const sub_shape_t& shape = ::std::get<3>(*p); + + auto explode_coords = [&](size_t i, deps_t... data) { + auto h = [&](auto... coords) { + f(coords..., data...); + }; + ::std::apply(h, shape.index_to_coords(i)); + }; + + // Finally we get to do the workload on every 1D item of the shape + for (size_t i = 0; i < n; ++i) + { + ::std::apply(explode_coords, ::std::tuple_cat(::std::make_tuple(i), data)); + } + }; + + if constexpr (::std::is_same_v) + { + cuda_safe_call(cudaLaunchHostFunc(t.get_stream(), host_func, args)); + } + else if constexpr (::std::is_same_v) + { + cudaHostNodeParams params; + params.userData = args; + params.fn = host_func; + + // Put this host node into the child graph that implements the graph_task<> + cuda_safe_call(cudaGraphAddHostNode(&t.get_node(), t.get_ctx_graph(), nullptr, 0, ¶ms)); + } + else + { + fprintf(stderr, "Internal error.\n"); + abort(); + } + } + +private: + task_dep_vector deps; + context& ctx; + exec_place e_place; + ::std::string symbol; + shape_t shape; + + ::std::vector<::std::function> dump_hooks; +}; +} // end namespace reserved + +#endif // __CUDACC__ + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/reduction_base.cuh b/cudax/include/cuda/experimental/__stf/internal/reduction_base.cuh new file mode 100644 index 00000000000..0b6a0cd7c78 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/reduction_base.cuh @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +namespace cuda::experimental::stf +{ +class logical_data_untyped; + +/** + * @brief Interface of the reduction operators (how to initialize data, how to + * apply the reduction) + */ +class reduction_operator_base +{ +public: + // For now, we ignore commutativity ! + reduction_operator_base(bool /* is_commutative */) {} + + virtual ~reduction_operator_base() {} + + reduction_operator_base& operator=(const reduction_operator_base&) = delete; + reduction_operator_base(const reduction_operator_base&) = delete; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // doxygen fails here + + // Reduction operator (inout, in) + virtual void op_untyped( + logical_data_untyped& d, + const data_place& inout_memory_node, + instance_id_t inout_instance_id, + const data_place& in_memory_node, + instance_id_t in_instance_id, + const exec_place& e, + event_list& prereq_in) = 0; + + // Initialization operator + virtual void init_op_untyped( + logical_data_untyped& d, + const data_place& out_memory_node, + instance_id_t out_instance_id, + const exec_place& e, + event_list& prereq_in) = 0; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +private: + // not used for now ... + // bool is_commutative; +}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/reorderer.cuh b/cudax/include/cuda/experimental/__stf/internal/reorderer.cuh new file mode 100644 index 00000000000..690f5061515 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/reorderer.cuh @@ -0,0 +1,381 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** @file + * + * @brief Implements automatic task reordering + * + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include // reorderer_payload uses task_dep_vector_untyped +#include // heft_scheduler uses statistics_t + +#include // ::std::shuffle +#include // ::std::function +#include // ::std::unique_ptr +#include +#include +#include +#include +#include + +namespace cuda::experimental::stf::reserved +{ + +/** + * @brief We cannot pass deferred_stream_task<> to the reorderer due to circular + * dependencies, so we store all the necessary info in this struct instead. 
+ */ +struct reorderer_payload +{ + reorderer_payload( + ::std::string s, int id, ::std::unordered_set succ, ::std::unordered_set pred, task_dep_vector_untyped d) + : symbol(mv(s)) + , mapping_id(id) + , successors(mv(succ)) + , predecessors(mv(pred)) + , deps(mv(d)) + {} + + reorderer_payload() = delete; + + bool done = false; + bool done_execution = false; + double upward_rank = -1.0; + int device = -1; + + size_t num_successors() const + { + return successors.size(); + } + size_t num_predecessors() const + { + return predecessors.size(); + } + + /// Needed for task_statistics + const ::std::string& get_symbol() const + { + return symbol; + } + + /// Needed for task_statistics + const task_dep_vector_untyped& get_task_deps() const + { + return deps; + } + + ::std::string symbol; + int mapping_id; + ::std::unordered_set successors; + ::std::unordered_set predecessors; + task_dep_vector_untyped deps; +}; + +/** + * @brief The reorderer class defines the interface for all reorderers + */ +class reorderer +{ +public: + /** + * @brief Reorder a vector of tasks + * + * @param tasks The vector of tasks to be reordered + */ + virtual void reorder_tasks(::std::vector& tasks, ::std::unordered_map& task_map) = 0; + + /// @brief Destructor for the reorderer + virtual ~reorderer() = default; + + static ::std::unique_ptr make(const char* reorderer_type); + +protected: + reorderer() = default; + + const int num_devices = cuda_try(); +}; + +class random_reorderer : public reorderer +{ +public: + random_reorderer() = default; + + void reorder_tasks(::std::vector& tasks, ::std::unordered_map&) override + { + ::std::shuffle(::std::begin(tasks), ::std::end(tasks), gen); + } + +private: + ::std::mt19937 gen = ::std::mt19937(::std::random_device()()); +}; + +class heft_reorderer : public reorderer +{ +public: + heft_reorderer() + : reorderer() + { + const char* filename = getenv("CUDASTF_TASK_STATISTICS"); + + if (filename) + { + statistics.read_statistics_file(filename); + } + else + { + statistics.enable_calibration(); + } + } + + void reorder_tasks(::std::vector& tasks, ::std::unordered_map& task_map) override + { + calculate_upward_ranks(tasks, task_map); + rearrange_tasks(tasks, task_map); + } + +private: + void calculate_upward_ranks(const ::std::vector& tasks, + ::std::unordered_map& task_map) const + { + ::std::queue work_list; // queue of mapping ids + ::std::unordered_set tasks_done; + + double comm_cost = 0.2; + + // Initialize the work_list with the leaf tasks + for (int id : tasks) + { + auto& t = task_map.at(id); + if (t.num_successors() == 0) + { + work_list.push(t.mapping_id); + } + } + + while (work_list.size() > 0) + { + auto& current_task = task_map.at(work_list.front()); + work_list.pop(); + + tasks_done.insert(current_task.mapping_id); + current_task.done = true; + + // The second term in the upward_rank equation that gets added to the task cost + double second_term = 0.0; + for (int s : current_task.successors) + { + const auto& succ = task_map.at(s); + assert(succ.upward_rank != -1); + second_term = ::std::max(second_term, comm_cost + succ.upward_rank); + } + + ::std::pair stats; + double task_cost; + if (current_task.get_symbol().rfind("task ", 0) == 0) + { + task_cost = 0; + } + else + { + stats = statistics.get_task_stats(current_task); + task_cost = ::std::get<0>(stats); + } + current_task.upward_rank = task_cost + second_term; + + for (int p : current_task.predecessors) + { + const auto& pred = task_map.at(p); + if (tasks_done.count(p)) + { + continue; + } + + bool add_it = true; + 
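+          // A predecessor may only be pushed onto the work list once every one
+          // of its successors has a computed upward rank; otherwise its own
+          // rank (task cost + max over successors) would use stale values.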
for (int s : pred.successors) + { + const auto& succ = task_map.at(s); + if (succ.upward_rank == -1) + { + add_it = false; + break; + } + } + + if (add_it) + { + work_list.push(pred.mapping_id); + } + } + } + } + + /** + * @brief Now that we've calculated the upward ranks, we need to rearrange + * the tasks. This isn't as simple as sorting the vector of tasks according + * to the upward rank, as we need to take into account when tasks are ready. + */ + void rearrange_tasks(::std::vector& tasks, ::std::unordered_map& task_map) const + { + using task_priority = ::std::pair; // the double is the upward rank, needed for sorting. Should I just use + // the reorderer_payload + auto cmp = [](const task_priority& p1, const task_priority& p2) { + return p1.second < p2.second; + }; + ::std::priority_queue, decltype(cmp)> ready_tasks(cmp); + + for (int id : tasks) + { + const auto& t = task_map.at(id); + if (t.num_predecessors() == 0) + { + ready_tasks.emplace(t.mapping_id, t.upward_rank); + } + } + + ::std::unordered_set tasks_done; // shouldn't be necessary but we need it now + ::std::vector actual_order; + + while (ready_tasks.size() > 0) + { + auto [id, upward_rank] = ready_tasks.top(); + ready_tasks.pop(); + + auto& current_task = task_map.at(id); + current_task.done_execution = true; + tasks_done.insert(id); + actual_order.push_back(id); + + for (int succ_id : current_task.successors) + { + if (tasks_done.count(succ_id)) + { + continue; + } + + const auto& succ = task_map.at(succ_id); + bool is_ready_now = true; + + for (int pred_id : succ.predecessors) + { + const auto& pred = task_map.at(pred_id); + + if (!pred.done_execution) + { + is_ready_now = false; + } + } + + if (is_ready_now) + { + ready_tasks.emplace(succ_id, succ.upward_rank); + } + } + } + + tasks = mv(actual_order); + } + + task_statistics& statistics = task_statistics::instance(); +}; + +class post_mortem_reorderer : public reorderer +{ +public: + post_mortem_reorderer(const char* order_file) + : reorderer() + { + read_order_file(order_file); + } + +private: + void reorder_tasks(::std::vector& tasks, ::std::unordered_map&) override + { + tasks = file_order; + } + + /* Read the csv schedule file mapping tasks to devices */ + void read_order_file(const char* filename) + { + ::std::ifstream file(filename); + EXPECT(file, "Failed to open order file: '", filename, "'."); + + int current_line = 0; + for (::std::string line; ::std::getline(file, line); ++current_line) + { + ::std::stringstream ss(line); + + int mapping_id = -1; + + int column = 0; + for (::std::string cell; ::std::getline(ss, cell, ','); ++column) + { + if (column == 1) + { + mapping_id = ::std::stoi(cell); + } + } + + EXPECT(mapping_id >= 0, "Invalid mapping id value '", mapping_id, "' provided on line '", current_line, "'."); + + file_order.push_back(mapping_id); + } + } + + ::std::vector file_order; +}; + +inline ::std::unique_ptr reorderer::make(const char* reorderer_type) +{ + if (!reorderer_type) + { + return nullptr; + } + + const auto reorderer_type_s = ::std::string(reorderer_type); + + if (reorderer_type_s == "random") + { + return ::std::make_unique(); + } + + if (reorderer_type_s == "heft") + { + return ::std::make_unique(); + } + + if (reorderer_type_s == "post_mortem") + { + const char* order_file = getenv("CUDASTF_ORDER_FILE"); + + EXPECT(order_file, "CUDASTF_TASK_ORDER set to 'post_mortem' but CUDASTF_SCHEDULE_FILE is unset."); + EXPECT(::std::filesystem::exists(order_file), "CUDASTF_ORDER_FILE '", order_file, "' does not exist"); + + return 
::std::make_unique(order_file); + } + + fprintf(stderr, "Invalid CUDASTF_TASK_ORDER value '%s'\n", reorderer_type); + abort(); +} + +} // namespace cuda::experimental::stf::reserved diff --git a/cudax/include/cuda/experimental/__stf/internal/repeat.cuh b/cudax/include/cuda/experimental/__stf/internal/repeat.cuh new file mode 100644 index 00000000000..20a75762aed --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/repeat.cuh @@ -0,0 +1,91 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include +#include + +namespace cuda::experimental::stf::reserved +{ + +/** + * @brief Repeats a blocks of code for a fixed number of steps + * + * This construct aims at allowing CUDASTF to automate some optimizations such + * as introducing new epochs, or doing some scheduling work. + */ +template +class repeat_scope +{ +public: + static constexpr size_t tasks_per_epoch = 200; + + repeat_scope(context_t& ctx, size_t count) + : condition(count) + , ctx(ctx) + {} + repeat_scope(context_t& ctx, ::std::function condition) + : condition(mv(condition)) + , ctx(ctx) + {} + + template + void operator->*(Fun&& f) + { + size_t task_cnt = 0; + for (size_t iter = 0; next(); ++iter) + { + size_t before_cnt = ctx.task_count(); + + static_assert(::std::is_invocable_v, "Incorrect lambda function signature."); + + f(ctx, iter); + + size_t after_cnt = ctx.task_count(); + assert(after_cnt >= before_cnt); + + // If there is more than a specific number of tasks, fire a new epoch ! + task_cnt += after_cnt - before_cnt; + if (task_cnt > tasks_per_epoch) + { + ctx.change_epoch(); + task_cnt = 0; + } + } + } + +private: + bool next() + { + return condition.index() == 0 + ? ::std::get(condition)-- > 0 + : ::std::get<::std::function>(condition)(); + } + + // Number of iterations, or a function which evaluates if we continue + ::std::variant> condition; + // The supporting context for this construct + context_t& ctx; +}; + +} // end namespace cuda::experimental::stf::reserved diff --git a/cudax/include/cuda/experimental/__stf/internal/scheduler.cuh b/cudax/include/cuda/experimental/__stf/internal/scheduler.cuh new file mode 100644 index 00000000000..da9d3290cff --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/scheduler.cuh @@ -0,0 +1,467 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Implements automatic task scheduling + * + * CUDASTF_SCHEDULE + * CUDASTF_SCHEDULE_FILE + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include // scheduler uses task +#include // heft_scheduler uses statistics_t + +#include // rand() +#include // ::std::filesystem::exists +#include // ::std::ifstream +#include +#include // ::std::numeric_limits::max() +#include // random_scheduler uses rng +#include // ::std::stringstream +#include +#include +#include + +namespace cuda::experimental::stf::reserved +{ + +/** + * @brief The scheduler class defines the interface that all schedulers must follow to assign tasks to devices + */ +class scheduler +{ +public: + scheduler() + { + cuda_safe_call(cudaGetDeviceCount(&num_devices)); + assert(num_devices > 0); + } + + /** + * @brief Assign a task to a device + * + * @param mapping_id The mapping ID of the task + * @return The device ID of the assigned device and a boolean whether this task still needs calibration + */ + virtual ::std::pair schedule_task(const task& t) = 0; + + /// @brief Destructor for the scheduler + virtual ~scheduler() = default; + + static ::std::unique_ptr make(const char* schedule_type); + +protected: + int num_devices = 0; + + // Map from task id to device + using schedule_t = ::std::unordered_map>; +}; + +class random_scheduler : public scheduler +{ +public: + ::std::pair schedule_task(const task&) override + { + return {exec_place::device(dist(gen)), false}; + } + +private: + ::std::mt19937 gen = ::std::mt19937(::std::random_device()()); + ::std::uniform_int_distribution<> dist = ::std::uniform_int_distribution<>(0, num_devices - 1); +}; + +class round_robin_scheduler : public scheduler +{ +public: + round_robin_scheduler() = default; + + ::std::pair schedule_task(const task&) override + { + return {exec_place::device(current_device++ % num_devices), false}; + } + +private: + int current_device = 0; +}; + +class post_mortem_scheduler : public scheduler +{ +public: + post_mortem_scheduler(const char* schedule_file) + { + read_schedule_file(schedule_file); + } + + ::std::pair schedule_task(const task& t) override + { + return {exec_place::device(schedule[t.get_mapping_id()]), false}; + } + +private: + schedule_t schedule; + + /* Read the csv schedule file mapping tasks to devices */ + void read_schedule_file(const char* filename) + { + ::std::ifstream file(filename); + EXPECT(file, "Failed to open schedule file: '", filename, "'."); + + int current_line = 0; + for (::std::string line; ::std::getline(file, line); ++current_line) + { + ::std::stringstream ss(line); + + int mapping_id = -1; + int dev_id = -1; + + int column = 0; + for (::std::string cell; ::std::getline(ss, cell, ','); ++column) + { + if (column == 0) + { + dev_id = ::std::stoi(cell); + } + else if (column == 1) + { + mapping_id = ::std::stoi(cell); + } + } + + EXPECT(dev_id >= 0, "Invalid device id value '", dev_id, "' provided on line '", current_line, "'."); + EXPECT(mapping_id >= 0, "Invalid mapping id value '", mapping_id, "' provided on line '", current_line, "'."); + + schedule[mapping_id] = dev_id; + } + } +}; + +class heft_scheduler : public scheduler +{ +public: + heft_scheduler() + : 
gpu_loads(num_devices, 0.0) + , msi(num_devices) + { + const char* filename = getenv("CUDASTF_TASK_STATISTICS"); + + if (filename) + { + statistics.read_statistics_file(filename); + } + else + { + statistics.enable_calibration(); + } + } + + ::std::pair schedule_task(const task& t) override + { + auto [task_cost, num_calls] = statistics.get_task_stats(t); + + if (num_calls == 0) + { + task_cost = default_cost; + } + + double best_end = ::std::numeric_limits::max(); + int best_device = -1; + + for (int i = 0; i < num_devices; i++) + { + int current_device = i; + + double total_cost = cost_on_device(t, current_device, task_cost, gpu_loads[current_device]); + if (total_cost < best_end) + { + best_device = current_device; + best_end = total_cost; + } + } + + gpu_loads[best_device] = best_end; + schedule[t.get_mapping_id()] = best_device; + + auto& deps = t.get_task_deps(); + for (const auto& dep : deps) + { + msi.update_msi_for_dep(best_device, dep, best_end); + } + + bool needs_calibration = num_calls < num_samples; + + return {exec_place::device(best_device), needs_calibration}; + } + + ~heft_scheduler() + { + const char* schedule_file = getenv("CUDASTF_HEFT_SCHEDULE"); + if (schedule_file) + { + write_schedule_file(schedule_file); + } + } + +private: + class msi_protocol + { + public: + msi_protocol(int num_devices) + : num_devices(num_devices) + {} + + double when_available(int device_id, const task_dep_untyped& dep) + { + auto& info = get_symbol_info(dep.get_symbol()); + const ::std::pair& device_info = info[device_id]; + + msi_state device_state = device_info.first; + switch (device_state) + { + case msi_state::modified: + case msi_state::shared: + return device_info.second; + default: + break; + } + + double earliest = get_earliest(dep); + return earliest; + } + + void update_msi_for_dep(int device_id, const task_dep_untyped& dep, double task_end) + { + const ::std::string& symbol = dep.get_symbol(); + const access_mode mode = dep.get_access_mode(); + + auto& info = get_symbol_info(symbol); + ::std::pair& device_info = info[device_id]; + + // Update local state first + switch (device_info.first) + { + case msi_state::modified: + device_info.second = task_end; + break; + case msi_state::shared: + if (mode != access_mode::read) + { + device_info.first = msi_state::modified; + device_info.second = task_end; + } + break; + case msi_state::invalid: + if (mode == access_mode::read) + { + device_info.first = msi_state::shared; + device_info.second = get_earliest(dep); + } + else + { + device_info.first = msi_state::modified; + device_info.second = task_end; + } + break; + } + + for (int i = 0; i < num_devices; i++) + { + if (i == device_id) // already updated + { + continue; + } + + ::std::pair& i_info = info[i]; + + msi_state state = i_info.first; + switch (state) + { + case msi_state::modified: + if (mode == access_mode::read) + { + i_info.first = msi_state::shared; + } + else + { + i_info.first = msi_state::invalid; + i_info.second = -1.0; + } + break; + case msi_state::shared: + if (mode != access_mode::read) + { + i_info.first = msi_state::invalid; + i_info.second = -1.0; + } + break; + default: + assert(state == msi_state::invalid); + break; + } + } + } + + private: + enum class msi_state : unsigned int + { + modified = 0, + shared = 1, + invalid = 2 + }; + + int num_devices; + const double bandwidth = 250 * 1e5; // Bytes/ms, Obtained by running + // cuda-samples/Samples/5_Domain_Specific/p2pBandwidthLatencyTest + using cache_state = ::std::vector<::std::pair>; + 
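+    // Per-symbol cache: for every tracked logical data symbol we keep, per
+    // device, a pair (MSI state, time at which that copy becomes valid).
+    //
+    // Illustrative walk-through (hypothetical timings): a task writing "X" on
+    // device 0 and finishing at t=10 leaves device 0 Modified(10) and every
+    // other device Invalid. A later read of "X" scheduled on device 1 becomes
+    // available at 10 + footprint/bandwidth, marks device 1 Shared, and
+    // downgrades device 0 from Modified to Shared.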
::std::unordered_map<::std::string, cache_state> cache; + + cache_state& get_symbol_info(const ::std::string& symbol) + { + auto it = cache.find(symbol); + if (it == cache.end()) + { + cache_state state(num_devices, ::std::make_pair(msi_state::invalid, -1.0)); + cache.emplace(symbol, mv(state)); + it = cache.find(symbol); + } + + return it->second; + } + + double get_earliest(const task_dep_untyped& dep) const + { + const ::std::string& symbol = dep.get_symbol(); + const auto& info = cache.at(symbol); // need to use at() to keep method const + double earliest = ::std::numeric_limits::max(); + + bool found_one = false; + + double comm_cost = dep.get_data_footprint() / bandwidth; + + for (int i = 0; i < num_devices; i++) + { + const ::std::pair& device_info = info[i]; + if (device_info.first != msi_state::invalid) + { + found_one = true; + earliest = ::std::min(earliest, device_info.second + comm_cost); + } + } + + if (!found_one) // If the data is not on any device + { + earliest = comm_cost; + } + + return earliest; + } + }; + + template + double cost_on_device(const task_type& t, int device_id, double task_cost, double when_can_start) + { + double data_available = 0.0; + + const auto& deps = t.get_task_deps(); + for (const auto& dep : deps) + { + access_mode mode = dep.get_access_mode(); + switch (mode) + { + case access_mode::read: + case access_mode::rw: + data_available = ::std::max(data_available, msi.when_available(device_id, dep)); + break; + default: + break; + } + } + + double possible_start = ::std::max(when_can_start, data_available); + double end = possible_start + task_cost; + + return end; + } + + void write_schedule_file(const char* schedule_file) const + { + ::std::ofstream file(schedule_file); + if (!file) + { + ::std::cerr << "Failed to write to heft schedule file '" << schedule_file << "'.\n"; + return; + } + + for (const auto& [device_id, mapping_id] : schedule) + { + file << mapping_id << "," << device_id << '\n'; + } + } + + ::std::vector gpu_loads; // TODO: is it better to use a ::std::array? 
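+  // Worked example of the selection above (hypothetical numbers): with two
+  // devices whose accumulated loads are 4 and 6, and a task of cost 2 whose
+  // single input becomes available (transfer included) at time 5 on device 0
+  // and at time 3 on device 1, the estimated ends are max(4, 5) + 2 = 7 on
+  // device 0 and max(6, 3) + 2 = 8 on device 1, so device 0 is picked and
+  // gpu_loads[0] becomes 7.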
+ task_statistics& statistics = task_statistics::instance(); + msi_protocol msi; + schedule_t schedule; + const int num_samples = 5; + const double default_cost = 0.5; +}; + +inline ::std::unique_ptr scheduler::make(const char* schedule_type) +{ + if (!schedule_type) + { + return nullptr; + } + + const auto schedule_type_s = ::std::string(schedule_type); + + if (schedule_type_s == "post_mortem") + { + const char* schedule_file = getenv("CUDASTF_SCHEDULE_FILE"); + + EXPECT(schedule_file, "CUDASTF_SCHEDULE set to 'post_mortem' but CUDASTF_SCHEDULE_FILE is unset."); + EXPECT(::std::filesystem::exists(schedule_file), "CUDASTF_SCHEDULE_FILE '", schedule_file, "' does not exist"); + + return ::std::make_unique(schedule_file); + } + + if (schedule_type_s == "random") + { + return ::std::make_unique(); + } + + if (schedule_type_s == "round_robin") + { + return ::std::make_unique(); + } + + if (schedule_type_s == "heft") + { + return ::std::make_unique(); + } + + ::std::cerr << "Invalid CUDASTF_SCHEDULE value '" << schedule_type << "'\n"; + exit(EXIT_FAILURE); +} + +} // namespace cuda::experimental::stf::reserved diff --git a/cudax/include/cuda/experimental/__stf/internal/slice.cuh b/cudax/include/cuda/experimental/__stf/internal/slice.cuh new file mode 100644 index 00000000000..2e2a627d33d --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/slice.cuh @@ -0,0 +1,1193 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Definition of `slice` and related artifacts + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include +#include +#include +#include + +#include + +namespace cuda::experimental::stf +{ +using ::cuda::std::mdspan; +} + +namespace cuda::experimental::stf +{ + +/** + * @brief Abstraction for the shape of a structure such as a multidimensional view, i.e. everything but the data + * itself. A type to be used with cudastf must specialize this template. + * + * @tparam T The data corresponding to this shape. + * + * Any specialization must be default constructible, copyable, and assignable. The required methods are constructor from + * `const T&`. All other definitions are optional and should provide full information about the structure ("shape") of + * an object of type `T`, without actually allocating any data for it. + * + * @class shape_of + */ +template +class shape_of; + +#if defined(CUDASTF_BOUNDSCHECK) && defined(NDEBUG) +# error "CUDASTF_BOUNDSCHECK requires that NDEBUG is not defined." +#endif + +/** + * @brief A layout stride that can be used with `mdspan`. + * + * In debug mode (i.e., `NDEBUG` is not defined) all uses of `operator()()` are bounds-checked by means of `assert`. 
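+ *
+ * Minimal illustrative sketch (the buffer and view names are only examples):
+ * @code
+ * double buf[6];
+ * auto v = make_slice(buf, 2, 3); // contiguous 2 x 3 view over buf
+ * v(1, 2) = 1.0;                  // in bounds
+ * // v(2, 0);                     // with CUDASTF_BOUNDSCHECK defined, asserts
+ * @endcode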
+ */ +struct layout_stride : ::cuda::std::layout_stride +{ + template + struct mapping : ::cuda::std::layout_stride::mapping + { + constexpr mapping() = default; + + template + constexpr _CCCL_HOST_DEVICE mapping(A&&... a) + : ::cuda::std::layout_stride::mapping(::std::forward(a)...) + {} + + template + constexpr _CCCL_HOST_DEVICE auto operator()(is_t&&... is) const + { +#ifdef CUDASTF_BOUNDSCHECK + each_in_pack( + [&](auto r, const auto& i) { + assert(i < this->extents().extent(r) && "Index out of bounds."); + }, + is...); +#endif + return ::cuda::std::layout_stride::mapping::operator()(::std::forward(is)...); + } + }; +}; + +/** + * @brief Slice based on `mdspan`. + * + * @tparam T + * @tparam dimensions + */ +template +using slice = mdspan, layout_stride>; + +/** + * @brief Compute how many dimensions of a slice are actually contiguous. + * + * @tparam T Base type for the slice + * @tparam dimensions + * @param span + * @return size_t + */ +template +size_t contiguous_dims(const S& span) +{ + if (span.rank() < 2) + { + return span.rank(); + } + + // Repeat until size != stride + size_t contiguous_dims = 1; + size_t prod_strides = 1; + while (contiguous_dims < span.rank() + && span.extent(contiguous_dims - 1) * prod_strides == span.stride(contiguous_dims)) + { + prod_strides *= span.stride(contiguous_dims); + contiguous_dims++; + } + + assert(contiguous_dims <= span.rank()); + return contiguous_dims; +} + +/** + * @brief Returns a `slice` object starting at `data` with the given extents and strides. + * + * @tparam ElementType Element type of the slice + * @tparam Extents Extents of the slice in each dimension packed in a tuple + * @tparam Strides Strides of the slice in each dimension + * @param data pointer to the beginning of data + * @param extents Values for the extents, e.g. `std::tuple{1024, 512}` + * @param strides Values for the strides; fastest-moving one is implicitly 1. + * @return auto `slice` + */ +template +_CCCL_HOST_DEVICE auto make_slice(ElementType* data, const ::std::tuple& extents, const Strides&... strides) +{ + static_assert(sizeof...(Extents) == sizeof...(Strides) + 1); + using Result = slice; + auto sizes = ::std::apply( + [&](auto&&... e) { + return ::cuda::std::array{size_t(e)...}; + }, + extents); + ::cuda::std::array mdspan_strides{1, size_t(strides)...}; + return Result(data, typename Result::mapping_type(sizes, mdspan_strides)); +} + +/** + * @brief Returns a contiguous mdspan for a dense multidimensional array. + * + * @tparam T Element type + * @param data Pointer to first element of the data + * @param extents Sizes + * @return auto + */ +template +_CCCL_HOST_DEVICE auto make_slice(T* data, const Extents&... extents) +{ + using Result = slice; + static_assert(sizeof...(Extents) == Result::rank()); + + if constexpr (sizeof...(Extents) == 0) + { + return Result(data, typename Result::mapping_type()); + } + else + { + ::cuda::std::array sizes{size_t(extents)...}, strides; + for (size_t i = 1; i < strides.size(); ++i) + { + strides[i] = sizes[i - 1]; + } + + strides[0] = sizes.size() != 0 && sizes.begin()[0] != 0; + for (size_t i = 2; i < strides.size(); ++i) + { + strides.begin()[i] *= strides.begin()[i - 1]; + } + return Result(data, typename Result::mapping_type(sizes, strides)); + } +} + +namespace reserved +{ + +template +auto make_mdview(Whatevs&&... 
whatevs) +{ + return make_slice(::std::forward(whatevs)...); +} + +} // namespace reserved + +#ifdef UNITTESTED_FILE +# ifdef STF_HAS_UNITTEST_WITH_ARGS +UNITTEST("slice usual suspects: default ctor, copy, move, assign", (slice())) +{ + using View = ::std::remove_reference_t; + UNITTEST("should work for several sizes", 1u, 13u) + { + View s; + EXPECT(s.extent(0) == 0); + EXPECT(s.extent(1) == 0); + EXPECT(s.size() == 0); + + double data[200 * 100]; + s = reserved::make_mdview(data, ::std::make_tuple(200, unittest_param), 200); + EXPECT(s.extent(0) == 200); + EXPECT(s.extent(1) == unittest_param); + EXPECT(s.size() == 200 * unittest_param); + }; +}; + +UNITTEST("2D slice basics", (slice())) +{ + using View = ::std::remove_reference_t; + // Bidimensional array of 200 rows of 100 elements each + double data[200 * 100]; + // Access the first 13 elements of each row in that array + auto s = reserved::make_mdview(data, ::std::make_tuple(200, 13), 200); + EXPECT(s.extent(0) == 200); + EXPECT(s.extent(1) == 13); + EXPECT(s.size() == 200 * 13); + EXPECT(&s(0, 0) == data); + EXPECT(&s(1, 0) == data + 1); + EXPECT(&s(2, 0) == data + 2); + EXPECT(&s(2, 3) == data + 2 + 3 * 200); +}; + +UNITTEST("3D slice basics", (slice())) +{ + using View = ::std::remove_reference_t; + // 3-dimensional array of 200 by 100 by 27 + double data[14 * 27 * 300]; + // Access the first 13*14 elements of each row in that array + auto s = reserved::make_mdview(data, ::std::tuple{200, 13, 14}, 300, 27 * 300); + EXPECT(s.extent(0) == 200); + EXPECT(s.extent(1) == 13); + EXPECT(s.extent(2) == 14); + EXPECT(s.size() == 200 * 13 * 14); + EXPECT(&s(0, 0, 0) == data); + EXPECT(&s(1, 0, 0) == data + 1); + EXPECT(&s(2, 3, 0) == data + 2 + 3 * 300); + EXPECT(&s(3, 5, 7) == data + 3 + 5 * 300 + 7 * 27 * 300); +}; + +UNITTEST("2D tiles", (slice())) +{ + using View = ::std::remove_reference_t; + // 6x4 matrix, 4 tiles of size 3x2, make sure we touch all entries of the + // original matrix once when iterating over the tiles + int data[6 * 4] = {0}; + auto s00 = reserved::make_mdview(data + 0, ::std::tuple{3, 2}, 6); + auto s10 = reserved::make_mdview(data + 3, ::std::tuple{3, 2}, 6); + auto s01 = reserved::make_mdview(data + 12, ::std::tuple{3, 2}, 6); + auto s11 = reserved::make_mdview(data + 15, ::std::tuple{3, 2}, 6); + + for (size_t j = 0; j < s00.extent(1); j++) + { + for (size_t i = 0; i < s00.extent(0); i++) + { + s00(i, j) = 42; + } + } + + for (size_t j = 0; j < s10.extent(1); j++) + { + for (size_t i = 0; i < s10.extent(0); i++) + { + s10(i, j) = 42; + } + } + + for (size_t j = 0; j < s01.extent(1); j++) + { + for (size_t i = 0; i < s01.extent(0); i++) + { + s01(i, j) = 42; + } + } + + for (size_t j = 0; j < s11.extent(1); j++) + { + for (size_t i = 0; i < s11.extent(0); i++) + { + s11(i, j) = 42; + } + } + + for (size_t i = 0; i < 4 * 6; i++) + { + EXPECT(data[i] == 42); + } +}; + +UNITTEST("contiguous_dims 1D", (slice())) +{ + using View = ::std::remove_reference_t; + int a[10]; + auto s = reserved::make_mdview(a, 10); + EXPECT(contiguous_dims(s) == 1); +}; + +UNITTEST("contiguous_dims 2D", (slice())) +{ + using View = ::std::remove_reference_t; + // This is non-contiguous + int a[3 * 2]; + auto s = reserved::make_mdview(a, ::std::tuple{3, 2}, 6); + EXPECT(contiguous_dims(s) == 1); + + // This is contiguous + int b[6 * 2]; + auto s2 = reserved::make_mdview(b, ::std::tuple{6, 2}, 6); + EXPECT(contiguous_dims(s2) == 2); +}; + +UNITTEST("contiguous_dims 3D", (slice())) +{ + using View = ::std::remove_reference_t; + int a[3 * 2 * 
10]; + auto s = reserved::make_mdview(a, ::std::tuple{3, 2, 10}, 6, 12); + EXPECT(contiguous_dims(s) == 1); + + auto s2 = reserved::make_mdview(a, ::std::tuple{3, 2, 10}, 3, 3 * 2); + EXPECT(contiguous_dims(s2) == 3); + + auto s3 = reserved::make_mdview(a, ::std::tuple{3, 2, 10}, 3, 10); + EXPECT(contiguous_dims(s3) == 2); +}; + +UNITTEST("implicit contiguous strides", (slice())) +{ + using View = ::std::remove_reference_t; + int a[3 * 2 * 10]; + auto s = reserved::make_mdview(a, 3, 2, 10); + EXPECT(s.stride(0) == 1); + EXPECT(s.stride(1) == 3); + EXPECT(s.stride(2) == 3 * 2); + EXPECT(contiguous_dims(s) == 3); +}; + +# endif // STF_HAS_UNITTEST_WITH_ARGS +#endif // UNITTESTED_FILE + +/** @brief Specialization of the `shape` template for `mdspan` + * + * @extends shape_of + */ +template +class shape_of> +{ +public: + using described_type = mdspan; + using coords_t = array_tuple; + + /** + * @brief Dimensionality of the slice. + * + */ + _CCCL_HOST_DEVICE static constexpr size_t rank() + { + return described_type::rank(); + } + + // Functions to create the begin and end iterators + _CCCL_HOST_DEVICE auto begin() + { + ::std::array sizes; + unroll([&](auto i) { + sizes[i] = extents.extent(i); + }); + return box(sizes).begin(); + } + + _CCCL_HOST_DEVICE auto end() + { + ::std::array sizes; + unroll([&](auto i) { + sizes[i] = extents.extent(i); + }); + return box(sizes).end(); + } + + /** + * @brief The default constructor builds a shape with size 0 in all dimensions. + * + * All `shape_of` specializations must define this constructor. + */ + shape_of() = default; + + /** + * @name Copies a shape. + * + * All `shape_of` specializations must define this constructor. + */ + shape_of(const shape_of&) = default; + + /** + * @brief Extracts the shape from a given slice. + * + * @param x object to get the shape from + * + * All `shape_of` specializations must define this constructor. + */ + explicit _CCCL_HOST_DEVICE shape_of(const described_type& x) + : extents(x.extents()) + , strides(x.mapping().strides()) + {} + + /** + * @brief Create a new `shape_of` object from a `coords_t` object. + * + * @tparam Sizes Types (all must convert to `size_t` implicitly) + * @param size0 Size for the first dimension ( + * @param sizes Sizes of data for the other dimensions, one per dimension + * + * Initializes dimensions to `size0`, `sizes...`. This constructor is optional. + */ + explicit shape_of(const coords_t& sizes) + : extents(reserved::to_cuda_array(sizes)) + { + size_t product_sizes = 1; + unroll([&](auto i) { + if (i == 0) + { + strides[i] = ::std::get<0>(sizes) != 0; + } + else + { + product_sizes *= extent(i - 1); + strides[i] = product_sizes; + } + }); + } + + /** + * @brief Create a new `shape_of` object taking exactly `dimension` sizes. + * + * @tparam Sizes Types (all must convert to `size_t` implicitly) + * @param size0 Size for the first dimension ( + * @param sizes Sizes of data for the other dimensions, one per dimension + * + * Initializes dimensions to `size0`, `sizes...`. This constructor is optional. + */ + template + explicit shape_of(size_t size0, Sizes&&... sizes) + : extents(size0, ::std::forward(sizes)...) 
+ { + static_assert(sizeof...(sizes) + 1 == rank(), "Wrong number of arguments passed to shape_of."); + + strides[0] = size0 != 0; + size_t product_sizes = 1; + for (size_t i = 1; i < rank(); ++i) + { + product_sizes *= extent(i - 1); + strides[i] = product_sizes; + } + } + + ///@{ @name Constructors + explicit shape_of(const ::std::array& sizes) + : extents(reserved::convert_to_cuda_array(sizes)) + {} + + explicit shape_of(const ::std::array& sizes, const ::std::array& _strides) + : shape_of(sizes) + { + if constexpr (rank() > 1) + { + size_t n = 0; + for (auto i = _strides.begin(); i != _strides.end(); ++i) + { + strides[n++] = *i; + } + for (auto i = strides.rbegin() + 1; i != strides.rend(); ++i) + { + *i *= i[1]; + } + } + } + ///@} + + bool operator==(const shape_of& other) const + { + return extents == other.extents && strides == other.strides; + } + + /** + * @brief Returns the size of a slice in a given dimension (run-time version) + * + * @tparam dim The dimension to get the size for. Must be `< dimensions` + * @return size_t The size + * + * This member function is optional. + */ + constexpr _CCCL_HOST_DEVICE size_t extent(size_t dim) const + { + assert(dim < rank()); + return extents.extent(dim); + } + + /** + * @brief Get the stride for a specified dimension. + * + * @param dim The dimension for which to get the stride. + * @return The stride for the specified dimension. + */ + constexpr _CCCL_HOST_DEVICE size_t stride(size_t dim) const + { + assert(dim < rank()); + return strides[dim]; + } + + /** + * @brief Total size of the slice in all dimensions (product of the sizes) + * + * @return size_t The total size + * + * This member function is optional. + */ + _CCCL_HOST_DEVICE size_t size() const + { + size_t result = 1; + for (size_t i = 0; i != rank(); ++i) + { + result *= extents.extent(i); + } + return result; + } + + /** + * @brief Returns an array with sizes in all dimensions + * + * @return const std::array& + */ + _CCCL_HOST_DEVICE ::std::array get_sizes() const + { + ::std::array result; + for (size_t i = 0; i < rank(); ++i) + { + result[i] = extents.extent(i); + } + return result; + } + + /** + * @brief Returns the extents as a dim4 type + * + * @return const dim4 + */ + dim4 get_data_dims() const + { + static_assert(rank() < 5); + + dim4 dims(0); + if constexpr (rank() >= 1) + { + dims.x = static_cast(extents.extent(0)); + } + if constexpr (rank() >= 2) + { + dims.y = static_cast(extents.extent(1)); + } + if constexpr (rank() >= 3) + { + dims.z = static_cast(extents.extent(2)); + } + if constexpr (rank() >= 4) + { + dims.t = static_cast(extents.extent(3)); + } + return dims; + } + + /** + * @brief Set contiguous strides based on the provided sizes. + * + * @note This function should not be called (calling it will engender a link-time error). + * + * @param sizes The sizes to set the contiguous strides. + */ + void set_contiguous_strides(const ::std::array& sizes); + + /** + * @brief Create a new `described_type` object with the given base pointer. + * + * @param base Base pointer to create the described_type object. + * @return described_type Newly created described_type object. 
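+ *
+ * Illustrative sketch (assumes the caller owns a buffer of at least size() elements):
+ * @code
+ * shape_of<slice<double, 2>> s(16, 8); // 16 x 8 with contiguous strides
+ * double* buf = new double[s.size()];  // s.size() == 128
+ * auto view = s.create(buf);           // mdspan sharing s's extents and strides
+ * @endcode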
+ */ + described_type create(T* base) const + { + return described_type(base, typename described_type::mapping_type(extents, strides)); + } + + // This transforms a tuple of (shape, 1D index) into a coordinate + _CCCL_HOST_DEVICE coords_t index_to_coords(size_t index) const + { + ::std::array coordinates{}; + // for (::std::ptrdiff_t i = _dimensions - 1; i >= 0; i--) + for (auto i : each(0, shape_of::rank())) + { + coordinates[i] = index % extent(i); + index /= extent(i); + } + + return ::std::apply( + [](const auto&... e) { + return ::std::make_tuple(e...); + }, + coordinates); + } + +private: + typename described_type::extents_type extents{}; + ::cuda::std::array strides{}; +}; + +#ifdef UNITTESTED_FILE +# ifdef STF_HAS_UNITTEST_WITH_ARGS +UNITTEST("shape_of for slice and mdspan", (slice())) +{ + using View = ::std::remove_reference_t; + using s = shape_of; + + static_assert(s::rank() == 3); + EXPECT(s::rank() == 3); + + s shape_obj1; + assert(shape_obj1.get_sizes() == (::std::array{0, 0, 0})); + EXPECT(shape_obj1.size() == 0); + EXPECT(shape_obj1.extent(0) == 0); + EXPECT(shape_obj1.extent(1) == 0); + EXPECT(shape_obj1.extent(2) == 0); + EXPECT(shape_obj1.stride(0) == 0); + EXPECT(shape_obj1.stride(1) == 0); + EXPECT(shape_obj1.stride(2) == 0); + + s shape_obj2{2, 3, 4}; + assert(shape_obj2.get_sizes() == (::std::array{2, 3, 4})); + EXPECT(shape_obj2.size() == 2 * 3 * 4); + EXPECT(shape_obj2.extent(0) == 2); + EXPECT(shape_obj2.extent(1) == 3); + EXPECT(shape_obj2.extent(2) == 4); + EXPECT(shape_obj2.stride(0) == 1); + EXPECT(shape_obj2.stride(1) == 2); + EXPECT(shape_obj2.stride(2) == 2 * 3); + + s shape_obj3{{200, 13, 14}, {300, 27 * 300}}; + assert(shape_obj3.get_sizes() == (::std::array{200, 13, 14})); + EXPECT(shape_obj3.size() == 200 * 13 * 14); + EXPECT(shape_obj3.extent(0) == 200); + EXPECT(shape_obj3.extent(1) == 13); + EXPECT(shape_obj3.extent(2) == 14); + + for (auto i : shape_obj3) + { + EXPECT(i[0] < shape_obj3.extent(0)); + EXPECT(i[1] < shape_obj3.extent(1)); + EXPECT(i[2] < shape_obj3.extent(2)); + } +}; + +UNITTEST("3D slice should be similar to 3D mdspan", (slice())) +{ + // 3-dimensional array of 3 by 4 by 5 + using View = ::std::remove_reference_t; + + double data[3 * 4 * 5]; + for (size_t i = 0; i < sizeof(data) / sizeof(data[0]); ++i) + { + data[i] = 1.0 * i; + } + // Access each element of the array + auto m = reserved::make_mdview(data, ::std::tuple{3, 4, 5}, 3, 3 * 4); + + EXPECT(m.extent(0) == 3); + EXPECT(m.extent(1) == 4); + EXPECT(m.extent(2) == 5); + + for (size_t i = 0; i < m.extent(0); ++i) + { + for (size_t j = 0; j < m.extent(1); ++j) + { + for (size_t k = 0; k < m.extent(2); ++k) + { + EXPECT(m(i, j, k) == i + j * 3 + k * 3 * 4); + } + } + } + + // An array without explicit strides should behave just the same as the one above + m = reserved::make_mdview(data, 3, 4, 5); + + EXPECT(m.extent(0) == 3); + EXPECT(m.extent(1) == 4); + EXPECT(m.extent(2) == 5); + + for (size_t i = 0; i < m.extent(0); ++i) + { + for (size_t j = 0; j < m.extent(1); ++j) + { + for (size_t k = 0; k < m.extent(2); ++k) + { + EXPECT(m(i, j, k) == i + j * 3 + k * 3 * 4); + } + } + } + + // To access a subset of the array, use the same strides but different extents + + auto m2 = reserved::make_mdview(data, ::std::tuple{3, 2, 1}, 3, 3 * 4); + + EXPECT(m2.extent(0) == 3); + EXPECT(m2.extent(1) == 2); + EXPECT(m2.extent(2) == 1); + + for (size_t i = 0; i < m2.extent(0); ++i) + { + for (size_t j = 0; j < m2.extent(1); ++j) + { + for (size_t k = 0; k < m2.extent(2); ++k) + { + // 
printf("m2(%zu, %zu, %zu) = %g\n", i, j, k, m2(i, j, k)); + m2(i, j, k) = -1; + } + } + } + + // first dim is the fastest moving + double witness[3 * 4 * 5] = { + /*0, 0, 0*/ -1, + /*1, 0, 0*/ -1, + /*2, 0, 0*/ -1, + /*0, 1, 0*/ -1, + /*1, 1, 0*/ -1, + /*2, 1, 0*/ -1, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59}; + EXPECT(::std::equal(data, data + 3 * 4 * 5, witness)); +}; + +# endif // STF_HAS_UNITTEST_WITH_ARGS +#endif // UNITTESTED_FILE + +/** + * @brief Pins a slice in host memory for efficient use with CUDA primitives + * + * @tparam T memory type + * @tparam dimensions slice dimension + * @param s slice to pin + */ +template +bool pin(mdspan& s) +{ + // We need the rank as a constexpr value + constexpr auto rank = mdspan::extents_type::rank(); + + if (address_is_pinned(s.data_handle())) + { + return false; + } + + if constexpr (rank == 0) + { + cuda_safe_call(pin_memory(s.data_handle(), 1)); + } + else if constexpr (rank == 1) + { + cuda_safe_call(pin_memory(s.data_handle(), s.extent(0))); + } + else if constexpr (rank == 2) + { + switch (contiguous_dims(s)) + { + case 1: + for (size_t index_1 = 0; index_1 < s.extent(1); index_1++) + { + cuda_safe_call(pin_memory(&s(0, index_1) + index_1 * s.stride(1), s.extent(0))); + } + break; + case 2: + // fprintf(stderr, "PIN 2D - contiguous\n"); + cuda_safe_call(pin_memory(s.data_handle(), s.extent(0) * s.extent(1))); + break; + default: + assert(false); + abort(); + } + } + else + { + static_assert(rank == 3, "Dimensionality not supported."); + switch (contiguous_dims(s)) + { + case 1: + for (size_t index_2 = 0; index_2 < s.extent(2); index_2++) + { + for (size_t index_1 = 0; index_1 < s.extent(1); index_1++) + { + // fprintf(stderr, "ADDR %d,%d,0 = %p \n", index_2, index_1, &s(index_2, index_1, 0)); + cuda_safe_call(pin_memory(&s(0, index_1, index_2), s.extent(0))); + } + } + break; + case 2: + for (size_t index_2 = 0; index_2 < s.extent(2); index_2++) + { + cuda_safe_call(pin_memory(&s(0, 0, index_2), s.extent(0) * s.extent(1))); + } + break; + case 3: + // fprintf(stderr, "PIN 3D - contiguous\n"); + cuda_safe_call(pin_memory(s.data_handle(), s.extent(0) * s.extent(1) * s.extent(2))); + break; + default: + assert(false); + abort(); + } + } + + return true; +} + +/** + * @brief Unpin the memory associated with an mdspan object. + * + * @tparam T The type of elements in the mdspan. + * @tparam P The properties of the mdspan. + * @param s The mdspan object to unpin memory for. 
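+ *
+ * Typical pairing with pin() (sketch; `s` is a host-resident slice):
+ * @code
+ * bool pinned_here = pin(s); // false if the memory was already pinned
+ * // ... use s with asynchronous CUDA transfers ...
+ * if (pinned_here) { unpin(s); }
+ * @endcode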
+ */ +template +void unpin(mdspan& s) +{ + // We need the rank as a constexpr value + constexpr auto rank = mdspan::extents_type::rank(); + + if constexpr (rank == 0) + { + unpin_memory(s.data_handle()); + } + else if constexpr (rank == 1) + { + unpin_memory(s.data_handle()); + } + else if constexpr (rank == 2) + { + switch (contiguous_dims(s)) + { + case 1: + for (size_t index_1 = 0; index_1 < s.extent(1); index_1++) + { + unpin_memory(&s(0, index_1) + index_1 * s.extent(0)); + } + break; + case 2: + // fprintf(stderr, "PIN 2D - contiguous\n"); + unpin_memory(s.data_handle()); + break; + default: + assert(false); + abort(); + } + } + else + { + static_assert(rank == 3, "Dimensionality not supported."); + switch (contiguous_dims(s)) + { + case 1: + for (size_t index_2 = 0; index_2 < s.extent(2); index_2++) + { + for (size_t index_1 = 0; index_1 < s.extent(1); index_1++) + { + // fprintf(stderr, "ADDR %d,%d,0 = %p \n", index_2, index_1, &s(index_2, index_1, 0)); + unpin_memory(&s(0, index_1, index_2)); + } + } + break; + case 2: + for (size_t index_2 = 0; index_2 < s.extent(2); index_2++) + { + unpin_memory(&s(0, 0, index_2)); + } + break; + case 3: + // fprintf(stderr, "PIN 3D - contiguous\n"); + unpin_memory(s.data_handle()); + break; + default: + assert(false); + abort(); + } + } +} + +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_MSVC(4702) // unreachable code + +template +size_t data_hash([[maybe_unused]] mdspan s, ::std::index_sequence = ::std::index_sequence<>()) +{ + using Slice = mdspan; + if constexpr (!reserved::has_std_hash_v && !reserved::has_cudastf_hash_v) + { + fprintf(stderr, "Error: cannot compute data_hash on a mdspan if ::std::hash is not defined.\n"); + abort(); + return 0; + } + else + { + if constexpr (sizeof...(i) != Slice::rank()) + { + return data_hash(s, ::std::make_index_sequence()); + } + else + { + if (s.size() == 0) + { + return 0; + } + + size_t h = 0; + auto content_hash = [&](auto... indices) -> void { + for (;;) + { + cuda::experimental::stf::hash_combine(h, s(indices...)); + + bool bump = true; + each_in_pack( + [&](auto current_dim, size_t& index) { + if (!bump) + { + return; + } + ++index; + if (index >= s.extent(current_dim)) + { + index = 0; + } + else + { + bump = false; + } + }, + indices...); + if (bump) + { + // Done with all dimensions + break; + } + } + }; + content_hash((i * 0)...); + return h; + } + } +} +_CCCL_DIAG_POP + +/** + * Write the content of the mdspan into a file + */ +template +void data_dump([[maybe_unused]] mdspan s, + ::std::ostream& file = ::std::cerr, + ::std::index_sequence = ::std::index_sequence<>()) +{ + using Slice = mdspan; + if constexpr (reserved::has_ostream_operator::value) + { + if constexpr (sizeof...(i) != Slice::rank()) + { + return data_dump(s, file, ::std::make_index_sequence()); + } + else + { + if (s.size() == 0) + { + return; + } + + auto print_element = [&](auto... indices) -> void { + for (;;) + { + file << s(indices...) << ' '; + bool bump = true; + each_in_pack( + [&](auto current_dim, size_t& index) { + if (!bump) + { + return; + } + ++index; + if (index >= s.extent(current_dim)) + { + // Done printing last element on the current dimension, move to the next one + if (current_dim < Slice::rank() - 1) + { + file << "; "; + } + index = 0; + } + else + { + bump = false; + } + }, + indices...); + if (bump) + { + // Done with all dimensions + break; + } + } + }; + + print_element((i * 0)...); + file << '\n'; + } + } + else + { + ::std::cerr << "Unsupported typed." 
<< ::std::endl; + } +} + +#ifdef UNITTESTED_FILE +UNITTEST("data_dump") +{ + double data[4] = {1, 2, 3, 4}; + auto s = make_slice(data, 4); + ::std::cerr << type_name << '\n'; + data_dump(s, ::std::cerr); + auto s1 = make_slice(data, 2, 2); + ::std::cerr << type_name << '\n'; + data_dump(s1, ::std::cerr); +}; + +UNITTEST("data_hash") +{ + // Select a random filename + double data[4] = {1, 2, 3, 4}; + auto s = make_slice(data, 4); + + double same_data[4] = {1, 2, 3, 4}; + auto same_s = make_slice(same_data, 4); + + EXPECT(data_hash(s) == data_hash(same_s)); + + double other_data[5] = {1, 2, 3, 4, 5}; + auto other_s = make_slice(other_data, 5); + + EXPECT(data_hash(s) != data_hash(other_s)); +}; +#endif + +// Note this is implementing it in the cudastf namespace, not std::hash +template +struct hash> +{ + ::std::size_t operator()(mdspan const& s) const noexcept + { + static constexpr auto _dimensions = mdspan::rank(); + // Combine hashes from the ptr, sizes and strides + size_t h = 0; + hash_combine(h, s.data_handle()); + + if constexpr (_dimensions > 0) + { + for (size_t i = 0; i < _dimensions; i++) + { + hash_combine(h, s.extent(i)); + } + } + + if constexpr (_dimensions > 1) + { + for (size_t i = 1; i < _dimensions; i++) + { + hash_combine(h, s.stride(i)); + } + } + + return h; + } +}; + +#ifdef UNITTESTED_FILE + +UNITTEST("slice hash") +{ + double A[5 * 2]; + auto s = make_slice(A, ::std::tuple{5, 2}, 5); + auto s2 = make_slice(A, ::std::tuple{4, 2}, 5); + + size_t h = hash>{}(s); + size_t h2 = hash>{}(s2); + + EXPECT(h != h2); +}; + +UNITTEST("slice hash 3D") +{ + double A[5 * 2 * 40]; + + // contiguous + auto s = make_slice(A, ::std::tuple{5, 2, 40}, 5, 5 * 2); + + // non-contiguous + auto s2 = make_slice(A, ::std::tuple{4, 2, 40}, 5, 5 * 2); + + size_t h = hash>{}(s); + size_t h2 = hash>{}(s2); + + EXPECT(h != h2); +}; + +UNITTEST("shape_of basics") +{ + using namespace cuda::experimental::stf; + auto s1 = shape_of>(1, 2, 3); + auto s2 = shape_of>(::std::tuple(1, 2, 3)); + EXPECT(s1 == s2); +}; + +#endif // UNITTESTED_FILE + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/task.cuh b/cudax/include/cuda/experimental/__stf/internal/task.cuh new file mode 100644 index 00000000000..6b81bdb6090 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/task.cuh @@ -0,0 +1,693 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Implement the task class and methods to implement the STF programming model + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +/* + * This is a generic class of "tasks" that are synchronized according to + * accesses on "data" depending on in/out dependencies + */ + +#include +#include // task has-a task_dep_vector_untyped + +#include + +#include + +namespace cuda::experimental::stf +{ + +namespace reserved +{ + +class mapping_id_tag +{}; + +using mapping_id_t = reserved::unique_id; + +} // end namespace reserved + +class backend_ctx_untyped; +class logical_data_untyped; +class exec_place; + +void reclaim_memory( + backend_ctx_untyped& ctx, const data_place& place, size_t requested_s, size_t& reclaimed_s, event_list& prereqs); + +/** + * @brief Generic implementation of a task based on asynchronous events and accessing logical data + */ +class task +{ +public: + /** + * @brief current task status + * + * We keep track of the status of task so that we do not make API calls at an + * inappropriate time, such as setting the symbol once the task has already + * started, or releasing a task that was not started yet. + */ + enum class phase + { + setup, // the task has not started yet + running, // between acquire and release + finished, // we have released + }; + +private: + // pimpl + class impl + { + public: + impl(const impl&) = delete; + impl& operator=(const impl&) = delete; + + impl(exec_place where = exec_place::current_device()) + : e_place(mv(where)) + , affine_data_place(e_place.affine_data_place()) + {} + + // Vector of user provided deps + task_dep_vector_untyped deps; + + // This list is only useful when calling the get() method of a task, to + // reduce overheads, we initialize this vector lazily + void initialize_reordered_indexes() + { + // This list gives converts the original index to the sorted index + // For example the first entry of the list before being ordered has order t->reordered_index[0] + reordered_indexes.resize(deps.size()); + + int sorted_index = 0; + for (auto& it : deps) + { + reordered_indexes[it.dependency_index] = sorted_index; + sorted_index++; + } + } + + // Get the index of the dependency after reordering, for example + // deps[reordered_index[0]] is the first piece of data + ::std::vector reordered_indexes; + + // Indices of logical data which were locked (non skipped). Indexes are + // those obtained after sorting. + ::std::vector<::std::pair> unskipped_indexes; + + // Extra events that need to be done before the task starts. 
These are + // "extra" as these are in addition to the events that will be required to + // acquire the logical_data_untypeds accessed by the task + event_list input_events; + + // A string useful for debugging purpose + mutable ::std::string symbol; + + // This points to the prerequisites for this task's termination + event_list done_prereqs; + + // Used to uniquely identify the task + reserved::unique_id_t unique_id; + + // Used to uniquely identify the task for mapping purposes + reserved::mapping_id_t mapping_id; + + // This is a pointer to a generic data structure used by "unset_place" to + // restore previous context + exec_place saved_place_ctx; + + // Indicate the status of the task + task::phase phase = task::phase::setup; + + // This is where the task is executed + exec_place e_place; + + // This is the default data place for the task. In general this is the + // affine data place of the execution place, but this can be a + // composite data place when using a grid of places for example. + data_place affine_data_place; + + ::std::vector<::std::function> post_submission_hooks; + }; + +protected: + // This is the only state + ::std::shared_ptr pimpl; + +public: + task() + : pimpl(::std::make_shared()) + {} + + task(exec_place ep) + : pimpl(::std::make_shared(mv(ep))) + {} + + task(const task& rhs) + : pimpl(rhs.pimpl) + {} + task(task&&) = default; + + task& operator=(const task& rhs) = default; + task& operator=(task&& rhs) = default; + + explicit operator bool() const + { + return pimpl != nullptr; + } + + bool operator==(const task& rhs) const + { + return pimpl == rhs.pimpl; + } + + /// Get the string attached to the task for debugging purposes + const ::std::string& get_symbol() const + { + if (pimpl->symbol.empty()) + { + pimpl->symbol = "task " + ::std::to_string(pimpl->unique_id); + } + return pimpl->symbol; + } + + /// Attach a string to this task, which can be useful for debugging purposes, or in tracing tools. + void set_symbol(::std::string new_symbol) + { + EXPECT(get_task_phase() == phase::setup); + pimpl->symbol = mv(new_symbol); + } + + /// Add one dependency + void add_dep(task_dep_untyped d) + { + EXPECT(get_task_phase() == phase::setup); + pimpl->deps.push_back(mv(d)); + } + + /// Add a set of dependencies + void add_deps(task_dep_vector_untyped input_deps) + { + EXPECT(get_task_phase() == phase::setup); + if (pimpl->deps.empty()) + { + // Frequent case + pimpl->deps = mv(input_deps); + } + else + { + pimpl->deps.insert( + pimpl->deps.end(), ::std::make_move_iterator(input_deps.begin()), ::std::make_move_iterator(input_deps.end())); + } + } + + /// Add a set of dependencies + template + void add_deps(task_dep_untyped first, Pack&&... 
pack) + { + EXPECT(get_task_phase() == phase::setup); + pimpl->deps.push_back(mv(first)); + if constexpr (sizeof...(Pack) > 0) + { + add_deps(::std::forward(pack)...); + } + } + + /// Get the dependencies of the task + const task_dep_vector_untyped& get_task_deps() const + { + return pimpl->deps; + } + + /// Specify where the task should run + task& on(exec_place p) + { + EXPECT(get_task_phase() == phase::setup); + // This defines an affine data place too + set_affine_data_place(p.affine_data_place()); + pimpl->e_place = mv(p); + return *this; + } + + /// Get and set the execution place of the task + const exec_place& get_exec_place() const + { + return pimpl->e_place; + } + exec_place& get_exec_place() + { + return pimpl->e_place; + } + void set_exec_place(const exec_place& place) + { + pimpl->e_place = place; + } + + /// Get and Set the affine data place of the task + const data_place& get_affine_data_place() const + { + return pimpl->affine_data_place; + } + + void set_affine_data_place(data_place affine_data_place) + { + pimpl->affine_data_place = mv(affine_data_place); + } + + dim4 grid_dims() const + { + return get_exec_place().grid_dims(); + } + + /// Get the list of events which mean that the task was executed + const event_list& get_done_prereqs() const + { + return pimpl->done_prereqs; + } + + /// Add an event list to the list of events which mean that the task was executed + template + void merge_event_list(T&& tail) + { + pimpl->done_prereqs.merge(::std::forward(tail)); + } + + /** + * @brief Get the identifier of a data instance used by a task + * + * We here find the instance id used by a given piece of data in a task. + * Note that this incurs a certain overhead because it searches through the + * list of logical data in the task. + */ + instance_id_t find_data_instance_id(const logical_data_untyped& d) const; + + /** + * @brief Generic method to retrieve the data instance associated to an + * index in a task. + * + * If `T` is the exact type stored, this returns a reference to a valid data instance in the task. If `T` is + * `constify`, where `U` is the type stored, this returns an rvalue of type `T`. + * + * Calling this outside the start()/end() section will result in undefined behaviour. + * + * @remark One should not forget the "template" keyword when using this API with a task `t` + * `T &res = t.template get(index);` + */ + template + decltype(auto) get(size_t submitted_index) const; + + // If there are extra input dependencies in addition to STF-induced events + void set_input_events(event_list _input_events) + { + EXPECT(get_task_phase() == phase::setup); + pimpl->input_events = mv(_input_events); + } + + const event_list& get_input_events() const + { + return pimpl->input_events; + } + + // Get the unique task identifier + int get_unique_id() const + { + return pimpl->unique_id; + } + + // Get the unique task mapping identifier + int get_mapping_id() const + { + return pimpl->mapping_id; + } + + size_t hash() const + { + return ::std::hash()(pimpl.get()); + } + + void add_post_submission_hook(::std::vector<::std::function>& hooks) + { + for (auto& h : hooks) + { + pimpl->post_submission_hooks.push_back(h); + } + } + + /** + * @brief Start a task + * + * SUBMIT = acquire + release at the same time ... 
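+   *
+   * For illustration, a simplified sketch of how a backend is expected to drive these
+   * two calls (the actual submission logic lives in the stream and graph backends of
+   * this patch; names below are illustrative only):
+   * @code
+   * // `t` is a task, `ctx` a backend_ctx_untyped
+   * event_list ready = t.acquire(ctx); // resolve deps, returns the events the work must wait on
+   * // ... submit the actual work once the events in `ready` are fulfilled ...
+   * event_list done;                   // events marking the completion of that work
+   * t.release(ctx, done);              // publish the task's outputs and mark the task finished
+   * @endcode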
+ */ + // Resolve all dependencies at the specified execution place + // Returns execution prereqs + event_list acquire(backend_ctx_untyped& ctx); + + void release(backend_ctx_untyped& ctx, event_list& done_prereqs); + + // Returns the current state of the task + phase get_task_phase() const + { + EXPECT(pimpl); + return pimpl->phase; + } + + /* When the task has ended, we cannot do anything with it. It is possible + * that the user-facing task object is not destroyed when the context is + * synchronized, so we clear it. + * + * This for example happens when doing : + * auto t = ctx.task(A.rw()); + * t->*[](auto A){...}; + * ctx.finalize(); + */ + void clear() + { + pimpl.reset((cuda::experimental::stf::task::impl*) nullptr); + } +}; + +namespace reserved +{ + +/* This method lazily allocates data (possibly reclaiming memory) and copies data if needed */ +template +void dep_allocate( + backend_ctx_untyped& ctx, + Data& d, + access_mode mode, + const data_place& dplace, + const ::std::optional eplace, + instance_id_t instance_id, + event_list& prereqs) +{ + auto& inst = d.get_data_instance(instance_id); + + /* + * DATA LAZY ALLOCATION + */ + bool already_allocated = inst.is_allocated(); + if (!already_allocated) + { + // nvtx_range r("acquire::allocate"); + /* Try to allocate memory : if we fail to do so, we must try to + * free other instances first, and retry */ + int alloc_attempts = 0; + while (true) + { + ::std::ptrdiff_t s = 0; + + prereqs.merge(inst.get_read_prereq(), inst.get_write_prereq()); + + // The allocation routine may decide to store some extra information + void* extra_args = nullptr; + + d.allocate(dplace, instance_id, s, &extra_args, prereqs); + + // Save extra_args + inst.set_extra_args(extra_args); + + if (s >= 0) + { + // This allocation was succesful + inst.allocated_size = s; + inst.set_allocated(true); + inst.reclaimable = true; + break; + } + + assert(s < 0); + + // Limit the number of attempts if it's simply not possible + EXPECT(alloc_attempts++ < 5); + + // We failed to allocate so we try to reclaim + size_t reclaimed_s = 0; + size_t needed = -s; + reclaim_memory(ctx, dplace, needed, reclaimed_s, prereqs); + } + + // After allocating a reduction instance, we need to initialize it + if (mode == access_mode::redux) + { + assert(eplace.has_value()); + // We have just allocated a new piece of data to perform + // reductions, so we need to initialize this with an + // appropriate user-provided operator + // First get the data instance and then its reduction operator + ::std::shared_ptr ops = inst.get_redux_op(); + ops->init_op_untyped(d, dplace, instance_id, eplace.value(), prereqs); + } + } +} + +} // end namespace reserved + +// inline size_t task_state::hash() const { +// size_t h = 0; +// for (auto& e: logical_data_ids) { +// int id = e.first; +// auto handle = e.second.lock(); +// // ignore expired handles +// if (handle) { +// hash_combine(h, ::std::hash {}(id)); +// hash_combine(h, handle->hash()); +// } +// } + +// return h; +// } + +/** + * @brief Data instance implementation + * + * This describes an "instance" of a logical data. This is one copy on a data + * place. There can be multiple data instances of the same logical data on + * different places, or at the same places when we have access modes such as + * reductions. + * + * The data instance contains everything required to keep track of the events + * which need to be fulfilled prior to using that copy of the data. The "state" + * field makes it possible to implement the MSI protocol. 
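+ *
+ * ("MSI" here refers to a Modified/Shared/Invalid-style coherence protocol; the
+ * per-instance states are those of reserved::msir_state_id.)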
+ * + * Note that a data instance may correspond to a piece of data that is out of + * sync, but that is allocated (or not). In this case, future accesses to the + * logical data associated to this data instance on that data place will + * transfer a copy from a data instance that is valid. */ +class data_instance +{ +public: + data_instance() {} + data_instance(bool used, data_place dplace) + : used(used) + , dplace(mv(dplace)) + { +#if 0 + // Since this will default construct a task, we need to decrement the id + reserved::mapping_id_t::decrement_id(); +#endif + } + + void set_used(bool flag) + { + assert(flag != used); + used = flag; + } + + bool get_used() const + { + return used; + } + + void set_dplace(data_place _dplace) + { + dplace = mv(_dplace); + } + const data_place& get_dplace() const + { + return dplace; + } + + // Returns what is the reduction operator associated to this data instance + ::std::shared_ptr get_redux_op() const + { + return redux_op; + } + // Sets the reduction operator associated to that data instance + void set_redux_op(::std::shared_ptr op) + { + redux_op = op; + } + + // Indicates if the data instance is allocated (ie. if it needs to be + // allocated prior to use). Note that we may have allocated instances that + // are out of sync too. + bool is_allocated() const + { + return state.is_allocated(); + } + + void set_allocated(bool b) + { + state.set_allocated(b); + } + + reserved::msir_state_id get_msir() const + { + return state.get_msir(); + } + + void set_msir(reserved::msir_state_id st) + { + state.set_msir(st); + } + + const event_list& get_read_prereq() const + { + return state.get_read_prereq(); + } + const event_list& get_write_prereq() const + { + return state.get_write_prereq(); + } + + void set_read_prereq(event_list prereq) + { + state.set_read_prereq(mv(prereq)); + } + void set_write_prereq(event_list prereq) + { + state.set_write_prereq(mv(prereq)); + } + + void add_read_prereq(const event_list& _prereq) + { + state.add_read_prereq(_prereq); + } + void add_write_prereq(const event_list& _prereq) + { + state.add_write_prereq(_prereq); + } + + void clear_read_prereq() + { + state.clear_read_prereq(); + } + void clear_write_prereq() + { + state.clear_write_prereq(); + } + + bool has_last_task_relaxed() const + { + return last_task_relaxed.has_value(); + } + void set_last_task_relaxed(task t) + { + last_task_relaxed = mv(t); + } + const task& get_last_task_relaxed() const + { + assert(last_task_relaxed.has_value()); + return last_task_relaxed.value(); + } + + int max_prereq_id() const + { + return state.max_prereq_id(); + } + + // Compute a hash of the MSI/Alloc state + size_t state_hash() const + { + return hash{}(state); + } + + void set_extra_args(void* args) + { + extra_args = args; + } + + void* get_extra_args() const + { + return extra_args; + } + + void clear() + { + clear_read_prereq(); + clear_write_prereq(); + last_task_relaxed.reset(); + } + +private: + // Is this instance available or not ? If not we can reuse this data + // instance when looking for an available slot in the vector of data + // instances attached to the logical data + bool used = false; + + // If the used flag is set, this tells where this instance is located + data_place dplace; + + // Reduction operator attached to the data instance + ::std::shared_ptr redux_op; + + // @@@@TODO@@@@ There are a lot of unchecked forwarding with this variable, + // which is public in practice ... 
+ // + // This structure contains everything to implement the MSI protocol, + // including asynchronous prereqs so that we only use a data instance once + // it's ready to do so + reserved::per_data_instance_msi_state state; + + // This stores the last task which used this instance with a relaxed coherence mode (redux) + ::std::optional last_task_relaxed; + + // This generic pointer can be used to store some information in the + // allocator which is passed to the deallocation routine. + void* extra_args = nullptr; + +public: + // Size of the memory allocation (bytes). Only valid for allocated instances. + size_t allocated_size = 0; + + // A false value indicates that this instance cannot be a candidate for + // memory reclaiming (e.g. because this corresponds to memory allocated by + // the user) + bool reclaimable = false; + + bool automatically_pinned = false; +}; + +template <> +struct hash +{ + ::std::size_t operator()(const task& t) const + { + return t.hash(); + } +}; + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/task_dep.cuh b/cudax/include/cuda/experimental/__stf/internal/task_dep.cuh new file mode 100644 index 00000000000..1b089af5fae --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/task_dep.cuh @@ -0,0 +1,294 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +namespace cuda::experimental::stf +{ + +::std::shared_ptr pack_state(const logical_data_untyped&); + +class task; + +/** + * @brief Type storing dependency information for one data item + * + * A task dependency (`task_dep`) object stores a pointer to the logical data, the + * requested access mode, and other requirements such as the data placement policy. + * + * Note that the instance id is set automatically when the systems assigns an + * id for this specific instance of data. 
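+ *
+ * Task dependencies are normally not constructed by hand: they are produced by the
+ * access helpers of a logical data and handed to a task. A simplified sketch, following
+ * the pattern used by the examples shipped with this patch (names are illustrative):
+ * @code
+ * auto lX = ctx.logical_data(X);
+ * auto lY = ctx.logical_data(Y);
+ * // lX.read() and lY.rw() each build a task dependency with the requested access mode
+ * ctx.task(lX.read(), lY.rw())->*[](cudaStream_t s, auto dX, auto dY) {
+ *   // dX is a read-only instance of X, dY a read-write instance of Y
+ * };
+ * @endcode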
+ */ +class task_dep_untyped +{ +public: + // dependency with an explicit data_place + task_dep_untyped(logical_data_untyped& d, + access_mode m, + data_place dplace, + ::std::shared_ptr redux_op = nullptr) + : data(pack_state(d)) + , m(m) + , dplace(mv(dplace)) + , redux_op(mv(redux_op)) + {} + + task_dep_untyped( + const logical_data_untyped& d, data_place dplace, ::std::shared_ptr redux_op = nullptr) + : task_dep_untyped(const_cast(d), access_mode::read, mv(dplace), mv(redux_op)) + {} + + // dependency without an explicit data_place : using data_place::affine + task_dep_untyped(logical_data_untyped& d, access_mode m, ::std::shared_ptr redux_op = nullptr) + : task_dep_untyped(d, m, data_place::affine, mv(redux_op)) + {} + + task_dep_untyped(const logical_data_untyped& d, ::std::shared_ptr redux_op = nullptr) + : task_dep_untyped(const_cast(d), access_mode::read, mv(redux_op)) + {} + + // @@@@ TODO @@@@ We may make this class non-copyable, but for now we are + // actually copying it when adding dependencies to a task. + + logical_data_untyped get_data() const; + + instance_id_t get_instance_id() const + { + return instance_id; + } + + /* Returns the same untyped task dependency with a read-only access mode */ + task_dep_untyped as_read_mode_untyped() const + { + task_dep_untyped res(*this); + res.m = access_mode::read; + return res; + } + + // We should only assign it once + void set_instance_id(instance_id_t id) + { + EXPECT(instance_id == instance_id_t::invalid); + instance_id = id; + } + + access_mode get_access_mode() const + { + return m; + } + + const data_place& get_dplace() const + { + return dplace; + } + + ::std::shared_ptr get_redux_op() const + { + return redux_op; + } + + // Index of the dependency within a task (only valid after acquiring !) + int dependency_index = -1; + + // When acquiring multiple pieces of data, it is possible that we have + // dependencies that can be merged. For example a task with Ar, Br and Aw + // is equivalent to a task acquiring Arw and Br. In this case we may skip + // some of the dependencies because they have already been fulfilles + bool skipped = false; + + bool operator<(const task_dep_untyped& rhs) const + { + assert(data && rhs.data); + return data < rhs.data; + } + + bool operator==(const task_dep_untyped& rhs) const + { + assert(data && rhs.data); + return data == rhs.data; + } + + bool operator!=(const task_dep_untyped& rhs) const + { + return !(*this == rhs); + } + + void set_symbol(::std::string s) const + { + symbol = mv(s); + } + + const ::std::string& get_symbol() const + { + return symbol; + } + + void set_data_footprint(size_t f) const + { + data_footprint = f; + } + + size_t get_data_footprint() const + { + return data_footprint; + } + + void reset_logical_data() + { + data = nullptr; + } + +private: + ::std::shared_ptr data; + access_mode m = access_mode::none; + instance_id_t instance_id = instance_id_t::invalid; + + // Copied from logical_data.h since they are needed for scheduling. Will not + // be valid until set through setters above. Marked as mutable to allow + // setting them only during scheduling (since task_dep can only be accessed + // as const ref) + mutable ::std::string symbol; + mutable size_t data_footprint = 0; + + mutable data_place dplace; + ::std::shared_ptr redux_op; +}; + +template +class logical_data; + +/** + * @brief Type storing dependency information for one data item, including the data type + * + * @tparam `T` The user-level type involved (e.g. `slice` or `slice` for a contiguous array). 
Note + * that `T` may be different in `const` qualifiers than the actual type stored in the dependency information. + */ +template +class task_dep : public task_dep_untyped +{ +public: + using data_t = T; + + template + task_dep(Pack&&... pack) + : task_dep_untyped(::std::forward(pack)...) + { + static_assert(sizeof(task_dep) == sizeof(task_dep_untyped), + "Cannot add state here because it would be lost through slicing"); + } + + /* Returns the same task dependency with a read-only access mode */ + task_dep as_read_mode() const + { + return task_dep(as_read_mode_untyped()); + } + + /** + * @brief Returns a reference to `T` if `T` is the same as the stored type, or an rvalue of type `T` if `T` has + * additional `const` qualifiers. + * + * @return decltype(auto) `T&` or `T` + */ + decltype(auto) instance(task&) const; +}; + +// A vector of dependencies +using task_dep_vector_untyped = ::std::vector; + +/** + * @brief Typed dependencies - carries dependency objects along with their types. + * + * @tparam Data Types of the objects the task depends on + * + * If types are not needed, this type can be converted implicitly to `task_dep_vector_untyped`. + */ +template +class task_dep_vector : public task_dep_vector_untyped +{ +public: + /** + * @brief Constructor (applies only for `task_dep_vector`) + * + * @tparam T Data type + * @param d typed task dependency object + */ + template + task_dep_vector(task_dep d) + : task_dep_vector_untyped(1, mv(d)) + { + static_assert(sizeof(task_dep_vector) == sizeof(task_dep_vector_untyped)); + static_assert(sizeof...(Data) == 1); + static_assert(::std::is_same_v); + } + + /** + * @brief Create given a number of `task_dep` instantiations. + * + * @param deps typed dependency objects + */ + task_dep_vector(task_dep... deps) + : task_dep_vector_untyped{mv(deps)...} + {} + + /** + * @brief Get the type depended upon at position `i`. + * + * @tparam i + * + * For example, `typename task_dep_vector::type_at<1>` is `double`. + */ + template + using type_at = ::std::tuple_element_t>; + + /** + * @brief Get the `task_dep` instantiation at position `i`. + * + * @tparam i + * + * For example, `typename task_dep_vector::type_at<1>` is an object type `task_dep`. + */ + template + task_dep>& at() + { + return static_cast>&>(task_dep_vector_untyped::at(i)); + } + + /** + * @brief Extracts physical data from this object to an `::std::tuple` object. + * + * @return ::std::tuple + * + * The physical data extracted is usable only after the dependencies have been satisfied. + */ + ::std::tuple instance(task& t) + { + return make_tuple_indexwise([&](auto i) { + return at().instance(t); + }); + } + + // All instantiations of task_dep_vector are friends with one another + template + friend class task_dep_vector; +}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/task_state.cuh b/cudax/include/cuda/experimental/__stf/internal/task_state.cuh new file mode 100644 index 00000000000..dac1a185916 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/task_state.cuh @@ -0,0 +1,254 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +namespace cuda::experimental::stf::reserved +{ + +class logical_data_untyped_impl; // TODO: this should never be used outside logical_data_untyped + +/* To support task nesting, we create a stack of task contexts. Every time a + * new task is executed (between acquire and release), a new task_state is pushed + * on the stack. There is always a "root ctx" which corresponds to the state of + * the library when there is no task under execution. + */ +class ctx_stack +{ +public: + /* Add one task to the leaf tasks */ + void add_leaf_task(const task& t) + { + // this will create a new key in the map + leaf_tasks_mutex.lock(); + event_list& done_prereqs = leaf_tasks[t.get_unique_id()]; + leaf_tasks_mutex.unlock(); + + // XXX we need a copy method for event_list + done_prereqs.merge(t.get_done_prereqs()); + } + + /* Remove one task (if it is still a leaf task, otherwise do nothing) */ + void remove_leaf_task(int task_id) + { + // Erase that leaf task if it is found, or do nothing + auto guard = ::std::lock_guard(leaf_tasks_mutex); + leaf_tasks.erase(task_id); + } + + const auto& get_leaf_tasks() const + { + return leaf_tasks; + } + + auto& get_leaf_tasks() + { + return leaf_tasks; + } + + void add_pending_freeze(const task& fake_t, const event_list& events) + { + auto guard = ::std::lock_guard(pending_freeze_mutex); + + // This creates an entry if necessary (there can be multiple gets) + event_list& prereqs = pending_freeze[fake_t.get_unique_id()]; + + // Add these events to the stored list + prereqs.merge(events); + } + + // When we unfreeze a logical data, there is no need to automatically sync + // with the get events because unfreezing implies the get events where + // sync'ed with + void remove_pending_freeze(const task& fake_t) + { + auto guard = ::std::lock_guard(pending_freeze_mutex); + pending_freeze.erase(fake_t.get_unique_id()); + } + + bool has_start_events() const + { + return (start_events.size() > 0); + } + + void add_start_events(const event_list& lst) + { + start_events.merge(lst); + + // We only add events at the beginning of the context, but use them + // often, so it's good to optimize anyhow + start_events.optimize(); + } + + const event_list& get_start_events() const + { + return start_events; + } + + void add_dangling_events(const event_list& lst) + { + auto guard = ::std::lock_guard(dangling_events_mutex); + dangling_events.merge(lst); + /* If the number of dangling events gets too high, we try to optimize + * the list to avoid keeping events alive for no reason. */ + if (dangling_events.size() > 16) + { + dangling_events.optimize(); + } + } + + ctx_stack() + { + // This forces us to call the dtor of the singleton AFTER the destructor of the CUDA runtime. + // If all ressources are cleaned up by the time we destroy this ctx_stack singleton, we are "safe" + cudaError_t ret = cudaFree(0); + + // If we are running the task in the context of a CUDA callback, we are + // not allowed to issue any CUDA API call. 
+ EXPECT((ret == cudaSuccess || ret == cudaErrorNotPermitted)); + } + + ~ctx_stack() + { + // Make sure everything is clean before leaving that context + assert(dangling_events.size() == 0); + + // Otherwise there are tasks which were not completed + assert(leaf_tasks.size() == 0); + } + + ctx_stack(const ctx_stack&) = delete; + ctx_stack& operator=(const ctx_stack&) = delete; + + // Insert a fence with all pending asynchronous operations on the current context + [[nodiscard]] inline event_list insert_task_fence(reserved::per_ctx_dot& dot) + { + auto prereqs = event_list(); + // Create a node in the DOT output (if any) + int fence_unique_id = -1; + bool dot_is_tracing = dot.is_tracing(); + if (dot_is_tracing) + { + fence_unique_id = reserved::unique_id_t(); + dot.add_fence_vertex(fence_unique_id); + } + + { + auto guard = ::std::lock_guard(leaf_tasks_mutex); + + // Sync with the events of all leaf tasks + for (auto& [t_id, t_done_prereqs] : get_leaf_tasks()) + { + // Add the events associated with the termination of that leaf tasks to the list of events + prereqs.merge(mv(t_done_prereqs)); + + // Add an edge between that leaf task and the fence node in the DOT output + if (dot_is_tracing) + { + dot.add_edge(t_id, fence_unique_id, 1); + } + } + + /* Remove all leaf tasks */ + leaf_tasks.clear(); + + /* Erase start events if any */ + start_events.clear(); + + assert(get_leaf_tasks().size() == 0); + } + + { + // Wait for all pending get() operations associated to frozen logical data + auto guard = ::std::lock_guard(pending_freeze_mutex); + + for (auto& [fake_t_id, get_prereqs] : pending_freeze) + { + // Depend on the get() operation + prereqs.merge(mv(get_prereqs)); + + // Add an edge between that freeze and the fence node in the DOT output + if (dot_is_tracing) + { + dot.add_edge(fake_t_id, fence_unique_id, 1); + } + } + + pending_freeze.clear(); + } + + // Sync with events which have not been synchronized with, and which are + // not "reachable". For example if some async operations occurred in a data + // handle destructor there could be some remaining events to sync with to + // make sure data were properly deallocated. + auto guard = ::std::lock_guard(dangling_events_mutex); + if (dangling_events.size() > 0) + { + prereqs.merge(mv(dangling_events)); + + // We consider that dangling events have been sync'ed with, so there is + // no need to keep track of them. + dangling_events.clear(); + } + + assert(dangling_events.size() == 0); + + return prereqs; + } + +public: + ::std::unordered_map logical_data_ids; + ::std::mutex logical_data_ids_mutex; + +private: + // To synchronize with all work submitted in this context, we need to + // synchronize will all "leaf tasks". Leaf tasks are task that have no + // outgoing dependencies. Leaf tasks will eventually depend on tasks which + // are not leaf, so it is sufficient to wait for leaf tasks. + // + // Instead of storing tasks, we store a map of id to event lists + ::std::unordered_map leaf_tasks; + ::std::mutex leaf_tasks_mutex; + + // Some asynchronous operations cannot be waited on when they occur. + // For example, when destroying a logical data, it is possible that + // asynchronous operations are not completed immediately (write back + // copies, deallocations, ...). A fence can be used to wait on these + // "dangling" events. + event_list dangling_events; + mutable ::std::mutex dangling_events_mutex; + + // To automatically synchronize with pending get() operartion for + // frozen_logical_data, we keep track of the events. 
The freeze operation + // is identified by the id of the "fake" task, and this map should be + // cleaned when unfreezing which means it has been synchronized with. + ::std::unordered_map pending_freeze; + ::std::mutex pending_freeze_mutex; + + // Events which denote the beginning of the context : any task with no + // dependency, or logical data with a reference copy should depend on it. + event_list start_events; +}; + +} // namespace cuda::experimental::stf::reserved diff --git a/cudax/include/cuda/experimental/__stf/internal/task_statistics.cuh b/cudax/include/cuda/experimental/__stf/internal/task_statistics.cuh new file mode 100644 index 00000000000..a15e264f516 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/task_statistics.cuh @@ -0,0 +1,303 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Implements the tracking and recording of task statistics. Each task is uniquely + * identified by the user set task symbol and the size of its dependencies. Can be enabled + * by setting the CUDASTF_CALIBRATION_FILE environment variable to point to the file which + * will store the results. + * + * + * CUDASTF_CALIBRATION_FILE + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace cuda::experimental::stf::reserved +{ + +/** + * @brief This class stores statistics about task execution time + */ +class task_statistics : public reserved::meyers_singleton +{ +protected: + task_statistics() + { + const char* filename = ::std::getenv("CUDASTF_CALIBRATION_FILE"); + + if (!filename) + { + calibrating = false; + return; + } + + calibrating = true; + calibration_file = ::std::string(filename); + } + + ~task_statistics() + { + if (!calibration_file.empty()) + { + write_stats(); + } + } + +public: + bool is_calibrating() const + { + return calibrating; + } + + bool is_calibrating_to_file() const + { + return !calibration_file.empty(); + } + + /** + * @brief Enable online calibration + */ + void enable_calibration() + { + calibrating = true; + } + + template + void log_task_time(const task_type& t, double time) + { + auto key = ::std::pair{t.get_symbol(), get_data_footprint(t)}; + + auto it = statistics.find(key); + if (it == statistics.end()) + { + statistics.emplace(key, statistic(time)); + } + else + { + it->second.update(time); + } + } + + class statistic + { + public: + statistic() = delete; + explicit statistic(double initial_time) + : num_calls(1) + , mean(initial_time) + , squares(0.0) + {} + + // Constructor when reading from file + statistic(int num_calls, double mean, double stddev) + : num_calls(num_calls) + , mean(mean) + , stddev(stddev) + {} + + void update(double new_time) + { + // Use Welford's method to compute mean https://stackoverflow.com/a/15638726 + num_calls++; + + 
double delta = new_time - mean; + + mean += delta / num_calls; + squares += delta * (new_time - mean); + + double variance = (num_calls == 1 ? 0 : squares / (num_calls - 1)); + stddev = ::std::sqrt(variance); + } + + int get_num_calls() const + { + return num_calls; + } + double get_mean() const + { + return mean; + } + double get_stddev() const + { + return stddev; + } + + private: + int num_calls = 0; + double mean = 0.0; + double squares = 0.0; + + // Only used when reading file + double stddev = 0.0; + }; + + using statistics_map_key_t = ::std::pair<::std::string, size_t>; + using statistics_map_t = + ::std::unordered_map>; + + void read_statistics_file(const char* filename) + { + assert(!is_calibrating() && "Cannot calibrate if we read task statistics from a file."); + + ::std::ifstream file(filename); + EXPECT(file, "Failed to read statistics file '", filename, "'."); + + int current_line = 0; + for (::std::string line; ::std::getline(file, line); ++current_line) + { + if (current_line == 0) // This is the csv header + { + continue; + } + + ::std::stringstream ss(line); + + ::std::string task_name; + size_t size = 0; + double time = -1.0; + int num_calls = -1; + double stddev = -1.0; + + int column = 0; + for (::std::string cell; ::std::getline(ss, cell, ','); ++column) + { + EXPECT(column < 5, "Invalid number of columns in statistics file ", column + 1, " (expected 5)."); + + if (column == 0) + { + task_name = cell; + } + else if (column == 1) + { + size = ::std::stoul(cell); + } + else if (column == 2) + { + num_calls = ::std::stoi(cell); + } + else if (column == 3) + { + time = ::std::stod(cell); + } + else if (column == 4) + { + stddev = ::std::stod(cell); + } + } + + EXPECT(time >= 0, "Invalid time value '", time, "' provided on line '", current_line, "'."); + EXPECT(num_calls >= 0, "Invalid num_calls value '", num_calls, "' provided on line '", current_line, "'."); + EXPECT(stddev >= 0, "Invalid stddev value '", stddev, "' provided on line '", current_line, "'."); + + ::std::pair<::std::string, size_t> key(task_name, size); + statistics.emplace(key, statistic(num_calls, time, stddev)); + } + } + + /** + * @brief Get statistics associated with a specific task + * + * @tparam Type of task + * @param The specified task + * @return A pair of the task time and the number of calls so far + */ + template + ::std::pair get_task_stats(const task_type& t) + { + const ::std::string& task_name = t.get_symbol(); + size_t data_footprint = get_data_footprint(t); + + ::std::pair<::std::string, size_t> key(task_name, data_footprint); + auto it = statistics.find(key); + if (it != statistics.end()) + { + const statistic& s = it->second; + return {s.get_mean(), s.get_num_calls()}; + } + + // If we do not have the task in the map, this means we have to be calibrating online. 
+ // A missing task implies an incomplete stats file was provided + EXPECT(is_calibrating(), "Task '", task_name, "' with size ", data_footprint, " not provided in stats file."); + + return {0.0, 0}; + } + +private: + ::std::string calibration_file; + bool calibrating = false; + + template + size_t get_data_footprint(const task_type& t) const + { + size_t data_footprint = 0; + const auto deps = t.get_task_deps(); + + for (auto it = deps.begin(); it < deps.end(); it++) + { + size_t new_size = it->get_data_footprint(); + data_footprint += new_size; + assert(data_footprint >= new_size); // Check for overflow + } + + return data_footprint; + } + + void write_stats() const + { + ::std::ofstream file(calibration_file); + if (!file) + { + ::std::cerr << "Failed to write to calibration file '" << calibration_file << "'.\n"; + return; + } + + file << "task,size,num_calls,mean,stddev\n"; + for (const auto& [key, value] : statistics) + { + double stddev = value.get_stddev(); + file << key.first << "," << key.second << "," << value.get_num_calls() << "," << value.get_mean() << "," << stddev + << '\n'; + } + + file.close(); + if (file.rdstate() & ::std::ifstream::failbit) + { + ::std::cerr << "ERROR: Closing calibration file failed.\n"; + } + } + + statistics_map_t statistics; +}; + +} // namespace cuda::experimental::stf::reserved diff --git a/cudax/include/cuda/experimental/__stf/internal/thread_hierarchy.cuh b/cudax/include/cuda/experimental/__stf/internal/thread_hierarchy.cuh new file mode 100644 index 00000000000..cbe2e779791 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/thread_hierarchy.cuh @@ -0,0 +1,531 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief The thread hierarchy class describes how threads are organized in a kernel + * + * It can be used to describe a hierarchy of threads, the size of the different + * levels, and whether they can synchronize or not, for example. + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief This describes a hierarchy of threads used to implement a launch construct. + * + * It corresponds to a thread_hierarchy_spec which was mapped on the execution + * place, and provides simple mechanisms at different levels in the hierarchy : + * - getting the rank and the size of the calling thread in the hierarchy + * - synchronizing all threads in a specific level + * - getting a local storage attached to a specific level. 
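+ *
+ * For illustration (the exact `launch` overloads are defined in launch.cuh within this
+ * patch), a kernel written with the launch construct receives such a hierarchy as its
+ * first argument and can use rank()/size() to distribute work (simplified sketch,
+ * illustrative names):
+ * @code
+ * // lX is assumed to be a logical_data of a 1D slice
+ * ctx.launch(lX.rw())->*[] __device__ (auto th, auto x) {
+ *   for (size_t i = th.rank(); i < x.size(); i += th.size()) {
+ *     x(i) += 1.0;
+ *   }
+ * };
+ * @endcode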
+ * + * This class is intended to be passed by value from the host to CUDA kernels + * so it does not contain pointers or indirection (except in the implementation + * of system-wide barriers) + */ +template +class thread_hierarchy +{ + // Depth of this hierarchy (each level has two spec values, `sync` and `width`) + static constexpr size_t depth = [](auto x, auto y, auto...) { + // Also run some checks + static_assert(::std::is_same_v, "You must use bool for the odd arguments of thread_hierarchy."); + static_assert(::std::is_same_v, + "You must use size_t for the even arguments of thread_hierarchy."); + // Two spec parameters per depth level + return sizeof...(spec) / 2; + }(spec..., false, size_t(0)); + + // Fetch the inner thread_hierarchy (peels off two template arguments) + template + struct inner_t; + template + struct inner_t + { + using type = thread_hierarchy; + }; + +public: + // For tag-types + thread_hierarchy() = default; + + template + friend class thread_hierarchy; + + // Construct from an outer thread hierarchy (peel off one level) + template + _CCCL_HOST_DEVICE thread_hierarchy(const thread_hierarchy& outer) + : devid(outer.devid) + , launch_config(outer.launch_config) + , cg_system(outer.cg_system) + , device_tmp(outer.device_tmp) + , system_tmp(outer.system_tmp) + { + for (size_t i : each(0, depth)) + { + level_sizes[i] = outer.level_sizes[i + 1]; + mem_sizes[i] = outer.mem_sizes[i + 1]; + } + } + + /** + * This takes an interpreted_execution_policy which is the mapping of a spec + * on the hardware, and generates a thread_hierarchy object that can be passed + * to kernels as an argument + */ + thread_hierarchy(int devid, interpreted_execution_policy& p) + : devid(devid) + { + launch_config = p.get_config(); + + // If we may synchronize accross multiple devices. + cg_system = p.cg_system; + + size_t i = 0; + for (auto& l : p.get_levels()) + { + level_sizes[i] = l.width(); + assert(l.width() > 0); + mem_sizes[i] = l.get_mem(); + i++; + } + } + + /** + * @brief Get the statically-specified width at a specific level + * @param level The level + * @return The width (0 if width is dynamic) + */ + static inline constexpr size_t static_width(size_t level) + { + size_t data[] = {spec...}; + return data[1 + 2 * level]; + } + + _CCCL_HOST_DEVICE const ::std::array& get_config() const + { + return launch_config; + } + + // Rank from the root + _CCCL_HOST_DEVICE size_t rank(int level, int root_level) const + { +#ifndef __CUDA_ARCH__ + (void) level; + (void) root_level; + return 0; +#else + int tid = threadIdx.x; + int bid = blockIdx.x; + // config : ndevs = launch_config[0] + // nblocks = launch_config[1] + // nthreads = launch_config[2] + const int nblocks = launch_config[1]; + const int nthreads = launch_config[2]; + + int global_id = tid + bid * nthreads + devid * nblocks * nthreads; + + // An entry at level l represents a total of width[level] * width[level+1] ... * width[depth - 1] threads. 
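+    // The rank of this thread at `level`, relative to its enclosing entity at
+    // `root_level`, is therefore the global id reduced modulo the number of
+    // threads spanned by one root_level entity, divided by the number of threads
+    // spanned by one level entity (both products are computed below).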
+ size_t level_effective_size = 1; + size_t root_level_effective_size = 1; + + if constexpr (depth > 0) + { + for (size_t l = level + 1; l < depth; l++) + { + level_effective_size *= level_sizes[l]; + } + + for (size_t l = root_level + 1; l < depth; l++) + { + root_level_effective_size *= level_sizes[l]; + } + } + + return (global_id % root_level_effective_size) / level_effective_size; +#endif + } + + _CCCL_HOST_DEVICE size_t size(int level, int root_level) const + { + if constexpr (depth == 0) + { + return 1; + } + + assert(root_level < level); +#ifndef __CUDA_ARCH__ + ::std::ignore = level; + ::std::ignore = root_level; + return 1; +#else + // Could have been : return level_prod_size[level]/level_prod_size[root_level]; + // Divisions are probably more expensive. + size_t s = 1; + for (int l = root_level; l < level; l++) + { + s *= level_sizes[l + 1]; + } + + return s; +#endif + } + + _CCCL_HOST_DEVICE size_t size(int level = int(depth) - 1) const + { + return size(level, -1); + } + + _CCCL_HOST_DEVICE size_t rank(int level = int(depth) - 1) const + { + return rank(level, -1); + } + + _CCCL_HOST_DEVICE void sync(int level = 0) + { + assert(level >= 0); + assert(level < depth); +#ifndef __CUDA_ARCH__ + ::std::ignore = level; + return; +#else + + // Check that it is legal to synchronize (use a static assertion in the future) + assert(may_sync(level)); + + // We compute the products of level_sizes and config in reversed order. + + // This is the number of threads to synchronize + size_t target_size = 1; + for (int l = level; l < depth; l++) + { + target_size *= level_sizes[l]; + } + + // We then compute the number of threads for each of the different + // scopes (system, device, blocks) ... and compare with this number of + // threads + + // Test the different config levels + size_t block_scope_size = launch_config[2]; + if (target_size == block_scope_size) + { + cooperative_groups::this_thread_block().sync(); + return; + } + + size_t device_scope_size = block_scope_size * launch_config[1]; + if (target_size == device_scope_size) + { + cooperative_groups::this_grid().sync(); + return; + } + + size_t ndevs = launch_config[0]; + size_t system_scope_size = device_scope_size * ndevs; + if (target_size == system_scope_size) + { + cg_system.sync(devid, ndevs); + return; + } + + // Unsupported configuration + assert(0); +#endif + } + + template + auto remove_first_tuple_element(const ::std::tuple& t) + { + return ::std::make_tuple(::std::get(t)...); + } + + template + _CCCL_HOST_DEVICE auto apply_partition(const shape_t& s, const ::std::tuple& t) const + { + auto s0 = P::apply(s, pos4(rank(0)), dim4(size(0))); + auto sans_first = make_tuple_indexwise([&](auto index) { + return ::std::get(t); + }); + if constexpr (sizeof...(sub_partitions)) + { + return inner().apply_partition(s0, sans_first); + } + else + { + return s0; + } + } + + // Default partitioner tuple + template + _CCCL_HOST_DEVICE auto apply_partition(const shape_t& s) const + { + if constexpr (depth == 1) + { + return cyclic_partition::apply(s, pos4(rank(0)), dim4(size(0))); + } + else + { + auto s0 = blocked_partition::apply(s, pos4(rank(0)), dim4(size(0))); + return inner().apply_partition(s0); + } + } + + /** + * @brief Get the inner thread hierarchy (starting one level down) + * + * @return `thread_hierarchy` instantiated with `spec` sans the first two arguments + */ + _CCCL_HOST_DEVICE auto inner() const + { + return typename inner_t::type(*this); + } + + template + _CCCL_HOST_DEVICE slice storage(int level) + { + assert(level >= 
0); + assert(level < depth); + + // We compute the products of level_sizes and config in reversed order. + + // This is the number of threads to synchronize + size_t target_size = 1; + for (int l = level; l < depth; l++) + { + target_size *= level_sizes[l]; + } + + // We then compute the number of threads for each of the different + // scopes (system, device, blocks) ... and compare with this number of + // threads + +#ifdef __CUDA_ARCH__ + size_t nelems = mem_sizes[level] / sizeof(T); + + // Test the different config levels + size_t block_scope_size = launch_config[2]; + if (target_size == block_scope_size) + { + // Use dynamic shared memory + extern __shared__ T dyn_buffer[]; + return make_slice(&dyn_buffer[0], nelems); + } + + size_t device_scope_size = block_scope_size * launch_config[1]; + if (target_size == device_scope_size) + { + // Use device memory + return make_slice(static_cast(device_tmp), nelems); + } + + size_t ndevs = launch_config[0]; + size_t system_scope_size = device_scope_size * ndevs; + if (target_size == system_scope_size) + { + // Use system memory (managed memory) + return make_slice(static_cast(system_tmp), nelems); + } +#endif + + // Unsupported configuration : memory must be a scope boundaries + assert(!"Unsupported configuration : memory must be a scope boundaries"); + return make_slice(static_cast(nullptr), 0); + } + + void set_device_tmp(void* addr) + { + device_tmp = addr; + } + + void set_system_tmp(void* addr) + { + system_tmp = addr; + } + + void set_devid(int d) + { + devid = d; + } + +private: + // On which device is this running ? (or -1 if host or ignored) + int devid = -1; + + // ndevs * nblocks * nthreads + ::std::array launch_config = {}; + + reserved::cooperative_group_system cg_system; + + // witdh of each level + ::std::array level_sizes = {}; + + // What is the memory associated to each level ? + ::std::array mem_sizes = {}; + + // Address of the per-device buffer in device memory, if any + void* device_tmp = nullptr; + + // Base address for system-wide buffers (managed memory), if any + void* system_tmp = nullptr; + + /* Special case to work-around spec = <> */ + template + _CCCL_HOST_DEVICE bool may_sync_impl(int) const + { + return false; + } + + template + _CCCL_HOST_DEVICE bool may_sync_impl(int level) const + { + static_assert(Idx < sizeof...(spec) / 2); + return (level == Idx) ? 
sync : may_sync_impl(level); + } + + /* Evaluate at runtime if we can call sync on the specified level */ + _CCCL_HOST_DEVICE bool may_sync(int level) const + { + return may_sync_impl<0, spec...>(level); + } +}; + +#ifdef UNITTESTED_FILE +# ifdef __CUDACC__ +namespace reserved +{ +template +__global__ void unit_test_thread_hierarchy(thread_hierarchy h) +{ + assert(h.size(0) == 2); + assert(h.size(1) == 2 * 32); + + // Rank within the whole system + assert(h.rank(1, -1) == 32 + threadIdx.x + blockIdx.x * blockDim.x); + assert(h.rank(1) == 32 + threadIdx.x + blockIdx.x * blockDim.x); + + assert(h.rank(0, -1) == 1); // device + assert(h.rank(1, 0) == threadIdx.x + blockIdx.x * blockDim.x); +} +} // end namespace reserved + +UNITTEST("thread hierarchy indexing") +{ + interpreted_execution_policy p; + + p.add_level({::std::make_pair(hw_scope::device, 2UL)}); + p.add_level({::std::make_pair(hw_scope::block, 8UL), ::std::make_pair(hw_scope::thread, 4UL)}); + + int dev_id = 1; + auto h = thread_hierarchy(dev_id, p); + + static_assert(h.static_width(0) == 0); + static_assert(h.static_width(1) == 0); + + auto config = p.get_config(); + reserved::unit_test_thread_hierarchy<<>>(h); + + cuda_safe_call(cudaDeviceSynchronize()); +}; + +namespace reserved +{ +template +__global__ void unit_test_thread_hierarchy_sync(thread_hierarchy h) +{ + assert(h.rank(1, 0) == threadIdx.x); + assert(h.rank(0, -1) == blockIdx.x); + h.sync(1); + h.sync(0); +} +} // end namespace reserved + +UNITTEST("thread hierarchy sync") +{ + interpreted_execution_policy p; + + // 2 levels + p.add_level({::std::make_pair(hw_scope::block, 8UL)}); + p.set_level_sync(0, true); + p.add_level({::std::make_pair(hw_scope::thread, 4UL)}); + p.set_level_sync(1, true); + + size_t dev_id = 1; + auto h = thread_hierarchy(dev_id, p); + + auto config = p.get_config(); + + void* args[] = {&h}; + cuda_safe_call(cudaLaunchCooperativeKernel( + (void*) reserved::unit_test_thread_hierarchy_sync, + config[1], + config[2], + args, + 0, + 0)); + + cuda_safe_call(cudaDeviceSynchronize()); +}; + +namespace reserved +{ +template +__global__ void unit_test_thread_hierarchy_inner_sync(thread_hierarchy h) +{ + h.inner().sync(); + auto h_inner = h.inner(); + assert(h_inner.size() == 4); +} +} // end namespace reserved + +UNITTEST("thread hierarchy inner sync") +{ + interpreted_execution_policy p; + + // 2 levels + p.add_level({::std::make_pair(hw_scope::block, 8UL)}); + p.set_level_sync(0, false); + p.add_level({::std::make_pair(hw_scope::thread, 4UL)}); + p.set_level_sync(1, true); + + int dev_id = 1; + auto h = thread_hierarchy(dev_id, p); + + auto config = p.get_config(); + reserved::unit_test_thread_hierarchy_inner_sync<<>>(h); + + cuda_safe_call(cudaDeviceSynchronize()); +}; + +# endif // __CUDACC__ +#endif // UNITTESTED_FILE + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/void_interface.cuh b/cudax/include/cuda/experimental/__stf/internal/void_interface.cuh new file mode 100644 index 00000000000..f03d0ba1aaa --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/internal/void_interface.cuh @@ -0,0 +1,206 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief This implements a void data interface useful to implement STF + * dependencies without actual data (e.g. to enforce task dependencies) + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +namespace cuda::experimental::stf +{ + +template +class shape_of; + +template +struct streamed_interface_of; + +template +struct graphed_interface_of; + +class void_interface +{}; + +/** + * @brief defines the shape of a void interface + * + * Note that we specialize cuda::experimental::stf::shape_of to avoid ambiguous specialization + * + * @extends shape_of + */ +template <> +class shape_of +{ +public: + shape_of() = default; + shape_of(const shape_of&) = default; + shape_of(const void_interface&) + : shape_of() + {} + + /// Mandatory method : defined the total number of elements in the shape + size_t size() const + { + return 0; + } +}; + +/** + * @brief Data interface to manipulate the void interface in the CUDA stream backend + */ +class void_stream_interface : public stream_data_interface_simple +{ +public: + using base = stream_data_interface_simple; + using base::shape_t; + + void_stream_interface(void_interface m) + : base(::std::move(m)) + {} + void_stream_interface(typename base::shape_t s) + : base(s) + {} + + /// Copy the content of an instance to another instance : this is a no-op + void stream_data_copy(const data_place&, instance_id_t, const data_place&, instance_id_t, cudaStream_t) override {} + + /// Pretend we allocate an instance on a specific data place : we do not do any allocation here + void stream_data_allocate( + backend_ctx_untyped&, const data_place&, instance_id_t, ::std::ptrdiff_t& s, void**, cudaStream_t) override + { + // By filling a non negative number, we notify that the allocation was succesful + s = 0; + } + + /// Pretend we deallocate an instance (no-op) + void stream_data_deallocate(backend_ctx_untyped&, const data_place&, instance_id_t, void*, cudaStream_t) override {} + + bool pin_host_memory(instance_id_t) override + { + // no-op + return false; + } + + void unpin_host_memory(instance_id_t) override {} +}; + +/** + * @brief Define how the CUDA stream backend must manipulate this void interface + * + * Note that we specialize cuda::experimental::stf::shape_of to avoid ambiguous specialization + * + * @extends streamed_interface_of + */ +template <> +struct streamed_interface_of +{ + using type = void_stream_interface; +}; + +/** + * @brief Data interface to manipulate the void interface in the CUDA graph backend + */ +class void_graph_interface : public graph_data_interface +{ +public: + /// @brief Alias for the base class + using base = graph_data_interface; + /// @brief Alias for the shape type + using base::shape_t; + + void_graph_interface(void_interface s) + : base(mv(s)) + {} + void_graph_interface(shape_of s) + : base(mv(s)) + {} + + void data_allocate( + backend_ctx_untyped&, + block_allocator_untyped&, + const data_place&, + instance_id_t, + ::std::ptrdiff_t& s, + void**, + event_list&) override + { + s = 0; + } + + void data_deallocate( + backend_ctx_untyped&, block_allocator_untyped&, const data_place&, instance_id_t, void*, event_list&) final + {} + + cudaGraphNode_t 
graph_data_copy( + cudaMemcpyKind, + instance_id_t, + instance_id_t, + cudaGraph_t graph, + const cudaGraphNode_t* input_nodes, + size_t input_cnt) override + { + cudaGraphNode_t dummy; + cuda_safe_call(cudaGraphAddEmptyNode(&dummy, graph, input_nodes, input_cnt)); + return dummy; + } + + bool pin_host_memory(instance_id_t) override + { + // no-op + return false; + } + + void unpin_host_memory(instance_id_t) override {} +}; + +/** + * @brief Define how the CUDA stream backend must manipulate this void interface + * + * Note that we specialize cuda::experimental::stf::shape_of to avoid ambiguous specialization + * + * @extends graphed_interface_of + */ +template <> +struct graphed_interface_of +{ + using type = void_graph_interface; +}; + +/** + * @brief A hash of the matrix + */ +template <> +struct hash +{ + ::std::size_t operator()(void_interface const&) const noexcept + { + return 42; + } +}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh b/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh new file mode 100644 index 00000000000..a912433dea6 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh @@ -0,0 +1,376 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Implementation of the localized_array class which is used to allocate a piece + * of data that is dispatched over multiple data places + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include + +#include +#include + +namespace cuda::experimental::stf::reserved +{ + +/** + * @brief An allocator that takes a mapping function to dispatch an allocation over multiple data places. + * + * This is the mechanism used to implement the data_place of a grid of execution places. 
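+ *
+ * A single contiguous range of virtual addresses is reserved with the CUDA virtual
+ * memory management API (cuMemAddressReserve), then backed block by block with
+ * physical allocations created on the device chosen by the mapping function
+ * (cuMemCreate / cuMemMap), and finally made accessible to the other devices where
+ * peer access can be enabled.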
+ */ +class localized_array +{ + struct metadata + { + metadata(int dev_, size_t size_, size_t offset_) + : alloc_handle{} + , dev(dev_) + , size(size_) + , offset(offset_) + {} + + CUmemGenericAllocationHandle alloc_handle; + int dev; + size_t size; + size_t offset; + }; + +public: + // ::std::function delinearize : translate the index in a buffer into a position in the data + // TODO pass mv(place) + template + localized_array(exec_place_grid grid, + get_executor_func_t mapper, + F&& delinearize, + size_t total_size, + size_t elemsize, + dim4 data_dims) + : grid(mv(grid)) + , mapper(mv(mapper)) + , total_size_bytes(total_size * elemsize) + , data_dims(data_dims) + , elemsize(elemsize) + { + // Regardless of the grid, we allow all devices to access that localized array + const int ndevs = cuda_try(); + CUdevice dev = cuda_try(); + + /* Check whether the current device supports UVA */ + int supportsVMM = cuda_try(CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, dev); + // fprintf(stderr, "VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED ? %d\n", supportsVMM); + EXPECT(supportsVMM == 1, "Cannot create a localized_array object on this machine because it does not support VMM."); + + /* Get allocation granularity */ + + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location = {.type = CU_MEM_LOCATION_TYPE_DEVICE, .id = dev}; + + size_t alloc_granularity_bytes = cuda_try(&prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM); + // fprintf(stderr, "GRANULARITY = %ld KB\n", alloc_granularity_bytes / 1024); + + // To make our life simpler for now: we assume that we only allocate full blocks + block_size_bytes = alloc_granularity_bytes; + + vm_total_size_bytes = + ((total_size_bytes + alloc_granularity_bytes - 1) / alloc_granularity_bytes) * alloc_granularity_bytes; + + // Number of pages to assign (note that we will try to make less allocations in practice by grouping pages) + size_t nblocks = vm_total_size_bytes / alloc_granularity_bytes; + + // Reserve a range of virtual addresses, round up size to accomodate granularity requirements + cuda_try(cuMemAddressReserve(&base_ptr, vm_total_size_bytes, 0ULL, 0ULL, 0ULL)); + + // fprintf(stderr, "cuMemAddressReserve => %p + %ld (%ld KB)\n", (void *)base_ptr, vm_total_size_bytes, + // vm_total_size_bytes / 1024); + + ::std::vector accessDesc(ndevs); + for (int d = 0; d < ndevs; d++) + { + accessDesc[d].location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc[d].location.id = d; + accessDesc[d].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + } + + // Compute mapping at allocation granularity + ::std::vector owner; + owner.reserve(nblocks); + for (size_t i = 0; i < nblocks; i++) + { + owner.push_back( + block_to_grid_pos(i * block_size_bytes / elemsize, alloc_granularity_bytes / elemsize, delinearize)); + } + + // We create one allocation handle per block + meta.reserve(nblocks); + + // Try to merge blocks with the same position + for (size_t i = 0; i < nblocks;) + { + pos4 p = owner[i]; + size_t j = 0; + // Count consecutive blocks with the same position in the grid + while ((i + j < nblocks) && (owner[i + j] == p)) + { + j++; + } + + meta.emplace_back(grid_pos_to_dev(p), j * alloc_granularity_bytes, i * block_size_bytes); + + i += j; + } + + // fprintf(stderr, "GOT %ld effective blocks (%ld blocks)\n", nblocks_effective, nblocks); + + // Create a physical allocation per block, this is not mapped in + // virtual memory yet. 
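+    // For each merged block: create the physical allocation on its owner device,
+    // map it at its offset within the reserved virtual range, and grant read/write
+    // access to every device for which peer access (or ownership) allows it.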
+ for (auto& item : meta) + { + CUmemAllocationProp create_prop = {}; + create_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + create_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + // Physically allocate this block on the appropriate device + // fprintf(stderr, "MAP EFFECTIVE BLOCK %d on dev %d\n", i, hdl_dev[i]); + create_prop.location.id = item.dev; + + cuda_safe_call(cuMemCreate(&item.alloc_handle, item.size, &create_prop, 0)); + + // Device where this was allocated + + assert(item.offset + item.size <= vm_total_size_bytes); + // fprintf(stderr, "cuMemMap(base_ptr %p = %p + %p, %p, 0ULL, hdl[%d], 0ULL)\n", base_ptr + offset, + // base_ptr, + // offset, sz, i); + cuda_safe_call(cuMemMap(base_ptr + item.offset, item.size, 0ULL, item.alloc_handle, 0ULL)); + + for (int d = 0; d < ndevs; d++) + { + int set_access = 1; + if (item.dev != d) + { + cuda_safe_call(cudaDeviceCanAccessPeer(&set_access, d, item.dev)); + + if (!set_access) + { + fprintf(stderr, "Warning : Cannot enable peer access between devices %d and %d\n", d, item.dev); + } + } + + if (set_access == 1) + { + cuda_safe_call(cuMemSetAccess(base_ptr + item.offset, item.size, &accessDesc[d], 1ULL)); + } + } + } + // fprintf(stderr, "localized_array (this = %p) : nblocks_effective %ld\n", this, nblocks_effective); + } + + localized_array() = delete; + localized_array(const localized_array&) = delete; + localized_array(localized_array&&) = delete; + localized_array& operator=(const localized_array&) = delete; + localized_array& operator=(localized_array&&) = delete; + + ~localized_array() + { + // fprintf(stderr, "~localized_array (this = %p) ... base ptr %p vm_total_size_bytes %ld - nblocks_effective + // %ld\n", this, (void *)base_ptr, vm_total_size_bytes, nblocks_effective); + for (auto& item : meta) + { + size_t offset = item.offset; + size_t sz = item.size; + cuda_safe_call(cuMemUnmap(base_ptr + offset, sz)); + cuda_safe_call(cuMemRelease(item.alloc_handle)); + } + + cuda_safe_call(cuMemAddressFree(base_ptr, vm_total_size_bytes)); + } + + // Convert the device pointer in the device API back to a raw void * pointer + void* get_base_ptr() const + { + return reinterpret_cast(base_ptr); + } + + /* + * This equality operator is for example used to find entries in an allocation cache which match a specific request + */ + template + bool operator==(::std::tuple t) const + { + // tuple arguments : + // 0 : grid, 1 : mapper, 2 : delinearize function, 3 : total size, 4 elem_size, 5 : data_dims + bool result = grid == ::std::get<0>(t) && mapper == ::std::get<1>(t) + && this->total_size_bytes == ::std::get<3>(t) * ::std::get<4>(t) && elemsize == ::std::get<4>(t); + if (result) + { + assert(this->total_size_bytes == ::std::get<3>(t) * ::std::get<4>(t)); + assert(data_dims == ::std::get<5>(t)); + } + return result; + } + + void merge(const event_list& source) + { + prereqs.merge(source); + } + + void merge_into(event_list& target) + { + target.merge(mv(prereqs)); + prereqs.clear(); + } + +private: + int grid_pos_to_dev(pos4 grid_pos) + { + return device_ordinal(grid.get_place(grid_pos).affine_data_place()); + } + + // linearized_index : expressed in number of entries from the base, not bytes + // allocation_granularity expressed in number of entries + template + pos4 block_to_grid_pos(size_t linearized_index, size_t allocation_granularity, F&& delinearize) + { +#if 0 + // Our first strategy consists in mapping the block at the location of the first entry of the block + return index_to_grid_pos(linearized_index, delinearize); +#else + 
::std::random_device rd; + ::std::mt19937 gen(rd()); + ::std::uniform_int_distribution<> dis(0, static_cast(allocation_granularity - 1)); + + const size_t nsamples = 10; + ::std::array sampled_pos; + for (size_t sample = 0; sample < nsamples; sample++) + { + size_t index = linearized_index + dis(gen); + sampled_pos[sample] = index_to_grid_pos(index, delinearize); + } + + // Count the number of occurences of each pos + ::std::unordered_map> sample_cnt; + for (auto& s : sampled_pos) + { + ++sample_cnt[s]; + } + + size_t max_cnt = 0; + pos4 max_pos; + for (auto& s : sample_cnt) + { + if (s.second > max_cnt) + { + max_pos = s.first; + max_cnt = s.second; + } + } + + // ::std::cout << "GOT BEST POS for offset " << linearized_index << " -> " << max_pos.string() << ::std::endl; + + return max_pos; +#endif + } + + template + pos4 index_to_grid_pos(size_t linearized_index, F&& delinearize) + { + // Logical coordinates of this index + pos4 coords = delinearize(linearized_index); + + pos4 eplace_coords = mapper(coords, data_dims, grid.get_dims()); + + return eplace_coords; + } + + event_list prereqs; // To allow reuse in a cache + exec_place_grid grid; + get_executor_func_t mapper = nullptr; + ::std::vector meta; + + // sizes in number of elements, not bytes !! TODO rename + size_t block_size_bytes = 0; + size_t total_size_bytes = 0; + + // size of the VA reservation in bytes + size_t vm_total_size_bytes = 0; + + // Start of the VA reservation + CUdeviceptr base_ptr = 0; + + // Parameter saved to allow reusing data + dim4 data_dims; + size_t elemsize = 0; +}; + +/** + * @brief A very simple allocation cache for slices in composite data places + */ +class composite_slice_cache +{ +public: + composite_slice_cache() = default; + composite_slice_cache(const composite_slice_cache&) = delete; + composite_slice_cache(composite_slice_cache&) = delete; + composite_slice_cache(composite_slice_cache&&) = default; + + [[nodiscard]] event_list deinit() + { + event_list result; + cache.each([&](auto& obj) { + obj.merge_into(result); + }); + return result; + } + + // Save one localized array in the cache + void put(::std::unique_ptr a, const event_list& prereqs) + { + EXPECT(a.get()); + a->merge(prereqs); + cache.put(mv(a)); + } + + // Look if there is a matching entry. Return it if found, create otherwise + template + ::std::unique_ptr + get(const data_place& place, + get_executor_func_t mapper, + F&& delinearize, + size_t total_size, + size_t elem_size, + dim4 data_dims) + { + EXPECT(place.is_composite()); + return cache.get(place.get_grid(), mapper, ::std::forward(delinearize), total_size, elem_size, data_dims); + } + +private: + reserved::linear_pool cache; +}; + +} // end namespace cuda::experimental::stf::reserved diff --git a/cudax/include/cuda/experimental/__stf/places/blocked_partition.cuh b/cudax/include/cuda/experimental/__stf/places/blocked_partition.cuh new file mode 100644 index 00000000000..9999e3e7f02 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/places/blocked_partition.cuh @@ -0,0 +1,130 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Definition of the `blocked_partition` strategy + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +namespace cuda::experimental::stf +{ + +template <::std::ptrdiff_t which_dim = -1> +class blocked_partition_custom +{ +public: + blocked_partition_custom() = default; + + template + _CCCL_HOST_DEVICE static auto apply(const box& in, pos4 place_position, dim4 grid_dims) + { + ::std::array<::std::pair<::std::ptrdiff_t, ::std::ptrdiff_t>, dimensions> bounds; + size_t target_dim = (which_dim == -1) ? dimensions - 1 : size_t(which_dim); + if (target_dim > dimensions - 1) + { + target_dim = dimensions - 1; + } + + // if constexpr (dimensions > 1) { + for (size_t d = 0; d < dimensions; d++) + { + // First position in this dimension (included) + bounds[d].first = in.get_begin(d); + // Last position in this dimension (excluded) + bounds[d].second = in.get_end(d); + } + // } + + size_t nplaces = grid_dims.x; + ::std::ptrdiff_t dim_beg = bounds[target_dim].first; + ::std::ptrdiff_t dim_end = bounds[target_dim].second; + size_t cnt = dim_end - dim_beg; + ::std::ptrdiff_t part_size = (cnt + nplaces - 1) / nplaces; + + // If first = second, this means it's an empty shape. This may happen + // when there are more entries in grid_dims than in the shape for + // example + bounds[target_dim].first = ::std::min(dim_beg + part_size * place_position.x, dim_end); + bounds[target_dim].second = ::std::min(dim_beg + part_size * (place_position.x + 1), dim_end); + + return box(bounds); + } + + template + _CCCL_HOST_DEVICE static auto apply(const mdspan_shape_t& in, pos4 place_position, dim4 grid_dims) + { + constexpr size_t dimensions = mdspan_shape_t::rank(); + + ::std::array<::std::pair<::std::ptrdiff_t, ::std::ptrdiff_t>, dimensions> bounds; + for (size_t d = 0; d < dimensions; d++) + { + bounds[d].first = 0; + bounds[d].second = in.extent(d); + } + + // Last position in this dimension (excluded) + size_t target_dim = (which_dim == -1) ? dimensions - 1 : size_t(which_dim); + if (target_dim > dimensions - 1) + { + target_dim = dimensions - 1; + } + + ::std::ptrdiff_t dim_end = in.extent(target_dim); + + // The last dimension is split accross the different places + size_t nplaces = grid_dims.x; + size_t part_size = (in.extent(target_dim) + nplaces - 1) / nplaces; + bounds[target_dim].first = ::std::min((::std::ptrdiff_t) part_size * place_position.x, dim_end); + bounds[target_dim].second = ::std::min((::std::ptrdiff_t) part_size * (place_position.x + 1), dim_end); + + return box(bounds); + } + + _CCCL_HOST_DEVICE static pos4 get_executor(pos4 data_coords, dim4 data_dims, dim4 grid_dims) + { + // Find the largest dimension + size_t rank = data_dims.get_rank(); + size_t target_dim = (which_dim == -1) ? 
rank : size_t(which_dim); + if (target_dim > rank) + { + target_dim = rank; + } + + size_t extent = data_dims.get(target_dim); + + size_t nplaces = grid_dims.x; + size_t part_size = (extent + nplaces - 1) / nplaces; + + // Get the coordinate in the selected dimension + size_t c = data_coords.get(target_dim); + + return pos4(c / part_size); + } +}; + +using blocked_partition = blocked_partition_custom<>; + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/places/cyclic_shape.cuh b/cudax/include/cuda/experimental/__stf/places/cyclic_shape.cuh new file mode 100644 index 00000000000..6cbea121f7d --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/places/cyclic_shape.cuh @@ -0,0 +1,332 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Define explicit shapes + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief An cyclic shape is a shape or rank 'dimensions' where the bounds are + * explicit in each dimension, and where we jump between elements with a + * specific stride. 
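+ *
+ * For example, iterating over cyclic_shape<1>{{::std::make_tuple(0, 10, 3)}}
+ * visits the indices 0, 3, 6 and 9.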
+ * + * @tparam dimensions the rank of the shape + */ +template +class cyclic_shape +{ +public: + ///@{ @name Constructors + + /// Construct and explicit shape from a list of lower and upper bounds + _CCCL_HOST_DEVICE explicit cyclic_shape(const ::std::array<::std::tuple, dimensions>& list) + { + size_t i = 0; + for (auto& e : list) + { + begins[i] = ::std::get<0>(e); + ends[i] = ::std::get<1>(e); + strides[i] = ::std::get<2>(e); + // printf("CYCLIC SHAPE : dim %ld : %ld %ld %ld\n", i, begin, end, stride); + i++; + } + } + + /// Construct and explicit shape from a list of lower and upper bounds + _CCCL_HOST_DEVICE cyclic_shape(const ::std::array& begins_, + const ::std::array& ends_, + const ::std::array& strides_) + : begins(begins_) + , ends(ends_) + , strides(strides_) + {} + + _CCCL_HOST_DEVICE void print() const + { + printf("CYCLIC SHAPE\n"); + for (size_t i = 0; i < dimensions; i++) + { + printf("\t%ld:%ld:%ld\n", begins[i], ends[i], strides[i]); + } + } + + ///@} + + /// Get the total number of elements in this explicit shape + _CCCL_HOST_DEVICE ::std::ptrdiff_t size() const + { + size_t res = 1; + for (size_t d = 0; d < dimensions; d++) + { + // TODO: should this be (ends[d] - begins[d] + strides[d] - 1) / strides[d]; + res *= (ends[d] - begins[d]) / strides[d]; + } + + return res; + } + + // Overload the equality operator to check if two shapes are equal + _CCCL_HOST_DEVICE bool operator==(const cyclic_shape& other) const + { + for (size_t i = 0; i < dimensions; ++i) + { + if (begins[i] != other.begins[i] || ends[i] != other.ends[i] || strides[i] != other.strides[i]) + { + // printf("BEGIN[%ld] %ld != OTHER BEGIN[%ld] %ld\n", i, begins[i] , i , other.begins[i]); + return false; + } + } + return true; + } + + /// get the dimensionnality of the explicit shape + _CCCL_HOST_DEVICE size_t get_rank() const + { + return dimensions; + } + + // Iterator class for cyclic_shape + class iterator + { + private: + cyclic_shape* shape; + size_t index; + ::std::array current; // Array to store the current position in each dimension + + public: + _CCCL_HOST_DEVICE iterator(cyclic_shape& s, size_t idx = 0) + : shape(&s) + , index(idx) + , current(s.begins) + {} + + // Overload the dereference operator to get the current position + _CCCL_HOST_DEVICE auto& operator*() + { + if constexpr (dimensions == 1) + { + return current[0]; + } + else + { + return current; + } + } + + // Overload the pre-increment operator to move to the next position + _CCCL_HOST_DEVICE iterator& operator++() + { + index++; + + for (auto dim : each(0, dimensions)) + { + current[dim] += shape->strides[dim]; // Increment the current position by the stride + // fprintf(stderr, "current[%ld] += %ld\n", dim, shape.get_stride(dim)); + + // printf("TEST current[%ld] (%ld) > shape.ends[%ld] (%ld)\n", dim, current[dim], dim, shape.ends[dim]); + if (current[dim] < shape->ends[dim]) + { + break; + } + // Wrap around to the begin value if the current position exceeds the end value + current[dim] = shape->begins[dim]; + } + + return *this; + } + + // Overload the equality operator to check if two iterators are equal + _CCCL_HOST_DEVICE bool operator==(const iterator& other) const + { /*printf("EQUALITY TEST index %d %d shape equal ? %s\n", index, + other.index, (&shape == &other.shape)?"yes":"no"); */ + // printf("check == : index %d %d => %s shape equal ? 
%s\n", index, other.index, (index == + // other.index)?"yes":"no", (shape == other.shape)?"yes":"no"); + return shape == other.shape && index == other.index; + } + + // Overload the inequality operator to check if two iterators are not equal + _CCCL_HOST_DEVICE bool operator!=(const iterator& other) const + { + return !(*this == other); + } + }; + + // Functions to create the begin and end iterators + _CCCL_HOST_DEVICE iterator begin() + { + return iterator(*this); + } + + _CCCL_HOST_DEVICE iterator end() + { + size_t total_positions = 1; + for (auto i : each(0, dimensions)) + { + auto begin = begins[i]; // included + auto end = ends[i]; // excluded + auto stride = strides[i]; + total_positions *= (end - begin + stride - 1) / stride; + } + // The first invalid position is index with "total_positions", even if there is no entry in the range + return iterator(*this, total_positions); + } + +private: + ::std::array begins; + ::std::array ends; + ::std::array strides; +}; + +/** + * @brief Apply a round-robin distribution of elements + */ +class cyclic_partition +{ +public: + cyclic_partition() = default; + + template + _CCCL_HOST_DEVICE static auto apply(const box& in, pos4 place_position, dim4 grid_dims) + { + ::std::array begins; + ::std::array ends; + ::std::array strides; + for (size_t d = 0; d < dimensions; d++) + { + begins[d] = in.get_begin(d) + place_position.get(d); + ends[d] = in.get_end(d); + strides[d] = grid_dims.get(d); + } + + return cyclic_shape(begins, ends, strides); + } + + template + _CCCL_HOST_DEVICE static auto apply(const mdspan_shape_t& in, pos4 place_position, dim4 grid_dims) + { + constexpr size_t dimensions = mdspan_shape_t::rank(); + + ::std::array<::std::tuple, dimensions> bounds; + for (size_t d = 0; d < dimensions; d++) + { + // Can't assign the whole tuple because the assignment needs to run on device. 
+ ::std::get<0>(bounds[d]) = place_position.get(d); + ::std::get<1>(bounds[d]) = in.extent(d); + ::std::get<2>(bounds[d]) = grid_dims.get(d); + } + + return cyclic_shape(bounds); + } + + _CCCL_HOST_DEVICE static pos4 get_executor(pos4 /*unused*/, dim4 /*unused*/, dim4 /*unused*/) + { + abort(); + return pos4(0); + } +}; + +#ifdef UNITTESTED_FILE +UNITTEST("cyclic_shape<3>") +{ + // Expect to iterate over Card({0, 2, 4, 6}x{1, 3}x{10, 15}) = 4*2*2 = 16 items + const size_t expected_cnt = 16; + size_t cnt = 0; + cyclic_shape<3> shape{{::std::make_tuple(0, 7, 2), ::std::make_tuple(1, 5, 2), ::std::make_tuple(10, 20, 5)}}; + for ([[maybe_unused]] const auto& pos : shape) + { + //// Use the position in each dimension + // ::std::cout << "("; + // for (const auto& p: pos) { + // ::std::cout << p << ", "; + //} + // ::std::cout << ")" << ::std::endl; + EXPECT(cnt < expected_cnt); + cnt++; + } + + EXPECT(cnt == expected_cnt); +}; + +UNITTEST("empty cyclic_shape<1>") +{ + cyclic_shape<1> shape{{::std::make_tuple(7, 7, 1)}}; + + auto it_end = shape.end(); + auto it_begin = shape.begin(); + if (it_end != it_begin) + { + fprintf(stderr, "Error: begin() != end()\n"); + abort(); + } + + // There should be no entry in this range + for ([[maybe_unused]] const auto& pos : shape) + { + abort(); + } +}; + +UNITTEST("apply cyclic ") +{ + box<3> e({{0, 7}, {1, 5}, {10, 20}}); + + size_t dim0 = 2; + size_t dim1 = 2; + + size_t cnt = 0; + + size_t expected_cnt = 7 * 4 * 10; + + for (size_t i0 = 0; i0 < dim0; i0++) + { + for (size_t i1 = 0; i1 < dim1; i1++) + { + auto c = cyclic_partition::apply(e, pos4(i0, i1), dim4(dim0, dim1)); + for ([[maybe_unused]] const auto& pos : c) + { + //// Use the position in each dimension + // ::std::cout << " ("; + // for (const auto& p: pos) { + // ::std::cout << p << ", "; + //} + // ::std::cout << ")" << ::std::endl; + + // avoid infinite loops + EXPECT(cnt < expected_cnt); + + cnt++; + } + } + } + + // We must have gone over all elements exactly once + EXPECT(cnt == expected_cnt); +}; +#endif // UNITTESTED_FILE + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh b/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh new file mode 100644 index 00000000000..31841e728f9 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh @@ -0,0 +1,99 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief CUDA stream execution place implementation + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief Designates execution that is to run on a specific CUDA stream + * + */ +class exec_place_cuda_stream : public exec_place +{ +public: + class impl : public exec_place::impl + { + public: + impl(const decorated_stream& _dstream) + : exec_place::impl(data_place::device(_dstream.dev_id)) + , dstream(_dstream) + { + // Create a dummy pool + dummy_pool.payload.push_back(dstream); + } + + /* We set the current device to be the device on which the CUDA stream was created */ + exec_place activate(backend_ctx_untyped& bctx) const override + { + return exec_place::device(dstream.dev_id).activate(bctx); + } + + void deactivate(backend_ctx_untyped& bctx, const exec_place& prev) const override + { + return exec_place::device(dstream.dev_id).deactivate(bctx, prev); + } + + stream_pool& get_stream_pool(async_resources_handle&, bool) const override + { + return dummy_pool; + } + + ::std::string to_string() const override + { + return "exec(stream id=" + ::std::to_string(dstream.id) + " dev=" + ::std::to_string(dstream.dev_id) + ")"; + } + + private: + decorated_stream dstream; + // We create a dummy pool of streams which only consists in a single stream in practice. + mutable stream_pool dummy_pool; + }; + +public: + exec_place_cuda_stream(const decorated_stream& dstream) + : exec_place(::std::make_shared(dstream)) + { + static_assert(sizeof(exec_place_cuda_stream) == sizeof(exec_place), + "exec_place_cuda_stream cannot add state; it would be sliced away."); + } +}; + +inline exec_place_cuda_stream exec_place::cuda_stream(cudaStream_t stream) +{ + int devid = get_device_from_stream(stream); + return exec_place_cuda_stream(decorated_stream(stream, -1, devid)); +} + +inline exec_place_cuda_stream exec_place::cuda_stream(const decorated_stream& dstream) +{ + return exec_place_cuda_stream(dstream); +} + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh new file mode 100644 index 00000000000..49d37486d2e --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh @@ -0,0 +1,276 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Implementation of green context places + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#if CUDA_VERSION >= 12040 +namespace cuda::experimental::stf +{ + +/* Get the unique ID associated with a context (overloaded) */ +inline unsigned long long get_cuda_context_id(CUcontext ctx) +{ + unsigned long long ctx_id; + cuda_safe_call(cuCtxGetId(ctx, &ctx_id)); + + return ctx_id; +} + +/* Get the unique ID associated with a green context (overloaded) */ +inline unsigned long long get_cuda_context_id(CUgreenCtx gctx) +{ + CUcontext primary_ctx; + cuda_safe_call(cuCtxFromGreenCtx(&primary_ctx, gctx)); + return get_cuda_context_id(primary_ctx); +} + +/** + * @brief Helper class to create views of green contexts that can be used as execution places + */ +class green_context_helper +{ +public: + /* Create green contexts with sm_count SMs per context on a specific device (current device by default) */ + green_context_helper(int sm_count, int devid = cuda_try()) + : devid(devid) + , numsm(sm_count) + { + assert(devid >= 0); + const int old_device = cuda_try(); + // Change device only if necessary. + if (devid != old_device) + { + cuda_safe_call(cudaSetDevice(devid)); + } + + /* Make sure we aren't requesting more SMs than the GPU has available */ + int max_SMs; + cuda_safe_call(cudaDeviceGetAttribute(&max_SMs, cudaDevAttrMultiProcessorCount, devid)); + assert(max_SMs >= int(numsm)); + + /* Determine the device's resources */ + CUdevice device; + cuda_safe_call(cuDeviceGet(&device, devid)); + + /* Retain the primary ctx in order to get a set of SM resources for that device */ + CUcontext primaryCtx; + CUdevResource input; + cuda_safe_call(cuDevicePrimaryCtxRetain(&primaryCtx, device)); + cuCtxGetDevResource(primaryCtx, &input, CU_DEV_RESOURCE_TYPE_SM); + cuDevicePrimaryCtxRelease(device); + + // First we query how many groups should be created + unsigned int nbGroups; + cuda_safe_call(cuDevSmResourceSplitByCount(NULL, &nbGroups, &input, NULL, 0, sm_count)); + + // Split the resources as requested + assert(nbGroups >= 1); + resources.resize(nbGroups); + cuda_safe_call(cuDevSmResourceSplitByCount(resources.data(), &nbGroups, &input, &remainder, 0, sm_count)); + + /* Create a green context for each group */ + ctxs.resize(nbGroups); + + // Create pools of CUDA streams + pools.reserve(nbGroups); + + for (int i = 0; i < static_cast(nbGroups); i++) + { + if (resources[i].type != CU_DEV_RESOURCE_TYPE_INVALID) + { + // Create a descriptor and a green context with that descriptor: + CUdevResourceDesc localdesc; + /* The generated resource descriptor is necessary for the creation of green contexts via the + * cuGreenCtxCreate API. The API expects nbResources == 1, as there is only one type of resource and + * merging the same types of resource is currently not supported. 
*/ + cuda_safe_call(cuDevResourceGenerateDesc(&localdesc, &resources[i], 1)); + // Create a green context + cuda_safe_call(cuGreenCtxCreate(&ctxs[i], localdesc, device, CU_GREEN_CTX_DEFAULT_STREAM)); + + CUcontext green_primary; + cuda_safe_call(cuCtxFromGreenCtx(&green_primary, ctxs[i])); + + pools.push_back(::std::make_shared(async_resources_handle::pool_size, devid, green_primary)); + } + } + } + + green_context_helper() = default; + ~green_context_helper() = default; + +public: + size_t get_device_id() const + { + return devid; + } + CUgreenCtx partition(size_t partition = 0) + { + return ctxs[partition]; + } + + green_ctx_view get_view(size_t id) + { + return green_ctx_view(ctxs[id], pools[id], devid); + } + + size_t get_count() const + { + return ctxs.size(); + } + +private: + friend class exec_place; + + // resources to define how we split the device(s) into green contexts + ::std::vector resources; + + // Pools of CUDA streams associated to each green context (lazily created streams) + ::std::vector<::std::shared_ptr> pools; + + CUdevResource remainder = {}; + int devid = -1; + + // Number of SMs requested per green context + size_t numsm = 0; + + ::std::vector ctxs; +}; + +/** + * @brief Designates execution that is to run on a green context. Initialize with the device ordinal and green_context + */ +class exec_place_green_ctx : public exec_place +{ +public: + class impl : public exec_place::impl + { + public: + /* Note that we are using the data place of the device here as the affine data place */ + impl(green_ctx_view gc_view) + : exec_place::impl(data_place::green_ctx(gc_view)) + , devid(gc_view.devid) + , g_ctx(gc_view.g_ctx) + , pool(mv(gc_view.pool)) + {} + + // This is used to implement deactivate and wrap an existing context + impl(CUcontext saved_context) + : driver_context(saved_context) + {} + + exec_place activate(backend_ctx_untyped&) const override + { + // Save the current context and transform it into a fake green context place + CUcontext current_ctx; + cuda_safe_call(cuCtxGetCurrent(¤t_ctx)); + exec_place result = exec_place(::std::make_shared(current_ctx)); + + // Convert the green context to a primary context (TODO cache this ?) 
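+      //
+      // activate() makes the context derived from the green context current and
+      // returns the previously current CUcontext wrapped in an exec_place, so
+      // that deactivate() can restore it afterwards.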
+ cuda_safe_call(cuCtxFromGreenCtx(&driver_context, g_ctx)); + +# if 0 + // for debug purposes, display the affinity + { + CUdevResource check_resource; + cuda_safe_call(cuGreenCtxGetDevResource(g_ctx, &check_resource, CU_DEV_RESOURCE_TYPE_SM)); + unsigned long long check_ctxId; + cuda_safe_call(cuCtxGetId(driver_context, &check_ctxId)); + fprintf(stderr, "ACTIVATE : set affinity with %d SMs (ctx ID = %llu)\n", check_resource.sm.smCount, + check_ctxId); + } +# endif + + cuda_safe_call(cuCtxSetCurrent(driver_context)); + + return result; + } + + void deactivate(backend_ctx_untyped&, const exec_place& prev) const override + { + auto prev_impl = ::std::static_pointer_cast(prev.get_impl()); + CUcontext saved_ctx = prev_impl->driver_context; + +# ifdef DEBUG + // Ensure that the current context is the green context that we have activated before + CUcontext current_ctx; + cuda_safe_call(cuCtxGetCurrent(¤t_ctx)); + assert(get_cuda_context_id(current_ctx) == get_cuda_context_id(driver_context)); +# endif + + cuda_safe_call(cuCtxSetCurrent(saved_ctx)); + } + + ::std::string to_string() const override + { + return "green ctx ( id=" + ::std::to_string(get_cuda_context_id(g_ctx)) + " dev_id =" + ::std::to_string(devid) + + ")"; + } + + stream_pool& get_stream_pool(async_resources_handle&, bool) const override + { + return *pool; + } + + private: + int devid = -1; + CUgreenCtx g_ctx = {}; + // a context created from the green context (or used to store an existing context to implement + // activate/deactivate) + mutable CUcontext driver_context = {}; + ::std::shared_ptr pool; + }; + +public: + exec_place_green_ctx(green_ctx_view gc_view) + : exec_place(::std::make_shared(mv(gc_view))) + { + static_assert(sizeof(exec_place_green_ctx) <= sizeof(exec_place), + "exec_place_green_ctx cannot add state; it would be sliced away."); + } + + exec_place_green_ctx(::std::shared_ptr gc_view_ptr) + : exec_place(::std::make_shared(*gc_view_ptr)) + { + static_assert(sizeof(exec_place_green_ctx) <= sizeof(exec_place), + "exec_place_green_ctx cannot add state; it would be sliced away."); + } +}; + +inline exec_place exec_place::green_ctx(const green_ctx_view& gc_view) +{ + return exec_place_green_ctx(gc_view); +} + +inline exec_place exec_place::green_ctx(const ::std::shared_ptr& gc_view_ptr) +{ + return exec_place_green_ctx(gc_view_ptr); +} + +} // end namespace cuda::experimental::stf +#endif diff --git a/cudax/include/cuda/experimental/__stf/places/exec/green_ctx_view.cuh b/cudax/include/cuda/experimental/__stf/places/exec/green_ctx_view.cuh new file mode 100644 index 00000000000..ed3efd03898 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/places/exec/green_ctx_view.cuh @@ -0,0 +1,69 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +/** + * @file + * @brief Implementation of green context views + */ + +#include +#include + +#if CUDA_VERSION >= 12040 +namespace cuda::experimental::stf +{ + +// Green contexts are only supported since CUDA 12.4 +/** + * @brief View of a green context and a pool of CUDA streams + */ +class green_ctx_view +{ +public: + green_ctx_view(CUgreenCtx g_ctx, ::std::shared_ptr pool, int devid) + : g_ctx(g_ctx) + , pool(mv(pool)) + , devid(devid) + {} + + CUgreenCtx g_ctx; + ::std::shared_ptr pool; + int devid; + + bool operator==(const green_ctx_view& other) const + { + return (g_ctx == other.g_ctx) && (pool.get() == other.pool.get()) && (devid == other.devid); + } +}; + +template <> +struct hash +{ + ::std::size_t operator()(const green_ctx_view& k) const + { + return hash_all(k.g_ctx, k.pool, k.devid); + } +}; + +} // end namespace cuda::experimental::stf + +#endif diff --git a/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh b/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh new file mode 100644 index 00000000000..6b56b3b1c3f --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh @@ -0,0 +1,606 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief An experimental mechanism to launch callback on specific CPU threads + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include +#include + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // do not document + +# if !defined(_CCCL_COMPILER_MSVC) +# define STATEFUL_CALLBACKS + +namespace cuda::experimental::stf +{ + +class cb; + +# ifdef STATEFUL_CALLBACKS +class cudaCallbackStateCtxKeys : public reserved::meyers_singleton +{ +protected: + cudaCallbackStateCtxKeys() = default; + ~cudaCallbackStateCtxKeys() = default; + +public: + // per thread current callback + pthread_key_t cb_key; + + // Initialize cb_key once only + pthread_once_t cb_key_once; +}; + +class cudaCallbackStateCtx : public reserved::meyers_singleton +{ +protected: + cudaCallbackStateCtx() + { + // Create a pthread_key to store the current callback + // This is done only once for all, so that all pthread share the key + pthread_once(&cudaCallbackStateCtxKeys::instance().cb_key_once, &cb_key_init); + } + + ~cudaCallbackStateCtx() + { + pthread_key_delete(cudaCallbackStateCtxKeys::instance().cb_key_once); + } + +public: + void set_current_cb(cb* cb) + { + pthread_setspecific(cudaCallbackStateCtxKeys::instance().cb_key, cb); + } + + cb* get_current_cb() + { + return ((cb*) pthread_getspecific(cudaCallbackStateCtxKeys::instance().cb_key)); + } + +private: + static void cb_key_init() + { + // We cannot pass an argument in this function with the + // pthread_once interface, so we retrieve the static instance + // instead. 
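+    //
+    // cb_key is a per-thread slot: the worker thread stores the callback it is
+    // currently executing there, which is what allows
+    // cudaCallbackGetStatus/cudaCallbackSetStatus to be called from inside the
+    // user-provided callback.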
+ cudaCallbackStateCtxKeys* ctx = &cudaCallbackStateCtxKeys::instance(); + pthread_key_create(&ctx->cb_key, NULL); + pthread_setspecific(ctx->cb_key, NULL); + } +}; +# endif // STATEFUL_CALLBACKS + +class callback_queue; + +bool cudaCallbackQueueProgress(callback_queue* q, bool flag); +cudaError_t cudaStreamAddCallbackWithQueue( + cudaStream_t stream, cudaStreamCallback_t callback, void* userData, unsigned int flags, class callback_queue* q); +int* cf_pop(callback_queue* q); + +# define USE_COMPLETION_FLAG_POOL 1 + +# ifdef USE_COMPLETION_FLAG_POOL +class completion_flag_pool +{ +public: + void init() + { + cnt = 1024; + cudaMallocManaged((void**) &pool, cnt * sizeof(int)); + memset(pool, 0, cnt * sizeof(int)); + + int ii; + for (ii = 0; ii < cnt; ii++) + { + stack.push(&pool[ii]); + } + } + + // TODO destructor + int* pop() + { + if (stack.empty()) + { + return NULL; + } + + int* ptr = stack.top(); + stack.pop(); + + return ptr; + } + + void push(int* flag_ptr) + { + stack.push(flag_ptr); + } + +private: + int cnt; + int* pool; + ::std::stack stack; // a stack to store available entries in the pool + + // A mutex to protect the pool of completion flags + pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; +}; +# endif + +class cb +{ +public: + // Graph Host nodes slightly differ as they don't have a status + bool is_graph_host_node; + cudaHostFn_t graph_callback; + cudaGraph_t graph; + + // For streams + cudaError_t status; + cudaStream_t stream; + cudaStreamCallback_t callback; + + void* userData; + + class callback_queue* queue; + +# ifdef STATEFUL_CALLBACKS + // To deal with restartable callbacks + int step = 0; + void* private_ptr = NULL; +# endif + + // This serves as a spinlock to unlock the stream after the execution + // of the callback. + // TODO: we could have a pool of suck spinlocks or an array (per stream ??)? + int* completion_flag; + + /* Callback from cudaGraph */ + cb(cudaHostFn_t _graph_callback, void* _userData, class callback_queue* _queue) + { + is_graph_host_node = true; + graph_callback = _graph_callback; + callback = NULL; + userData = _userData; + queue = _queue; + +# ifdef STATEFUL_CALLBACKS + step = 0; + private_ptr = NULL; +# endif + +# ifdef USE_COMPLETION_FLAG_POOL + completion_flag = cf_pop(queue); +# else + cudaMallocManaged((void**) &completion_flag, sizeof(int)); +# endif + } + + /* Callback from CUDA streams */ + cb(cudaStream_t _stream, cudaStreamCallback_t _callback, void* _userData, class callback_queue* _queue) + { + is_graph_host_node = false; + stream = _stream; + userData = _userData; + callback = _callback; + graph_callback = NULL; + queue = _queue; + +# ifdef STATEFUL_CALLBACKS + step = 0; + private_ptr = NULL; +# endif + +# ifdef USE_COMPLETION_FLAG_POOL + completion_flag = cf_pop(queue); +# else + cudaMallocManaged((void**) &completion_flag, sizeof(int)); +# endif + } +}; + +class callback_queue : public reserved::meyers_singleton +{ +protected: + callback_queue() + { + init(); + launch_worker(); + } + + ~callback_queue() = default; + +public: + ::std::deque dq; + + // 0 = running + // 1 = destroyed + int status = 0; + + pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; + pthread_cond_t cond = PTHREAD_COND_INITIALIZER; + + void init() + { + // Create a pool of completion flags + cfp.init(); + } + + void async_destroy(cudaStream_t stream) + { + cudaStreamAddCallbackWithQueue(stream, NULL, NULL, 0, this); + } + + void launch_worker() + { + // Avoid C++ whining ... 
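+    //
+    // pthread_create expects a void* (*)(void*) entry point; callback_queue_worker
+    // is a static member with that signature, and the cast below keeps the
+    // compiler quiet about the function-pointer conversion.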
+ typedef void* (*func_ptr)(void*); + pthread_create(&progress_thread, NULL, (func_ptr) &callback_queue::callback_queue_worker, this); + } + + void wait_worker() + { + pthread_join(progress_thread, NULL); + } + + /* Helper routines to create a thread dedicated to this queue ! */ + pthread_t progress_thread; + +# ifdef USE_COMPLETION_FLAG_POOL + class completion_flag_pool cfp; +# endif + + static void* callback_queue_worker(void* args) + { + fprintf(stderr, "CALLBACK RUNNING...\n"); + class callback_queue* cbq = (callback_queue*) args; + cudaCallbackQueueProgress(cbq, 1); + fprintf(stderr, "CALLBACK HALTING...\n"); + + return NULL; + } +}; + +inline callback_queue* default_callback_queue() +{ + class callback_queue* default_cb = &callback_queue::instance(); + return default_cb; +} + +inline int* cf_pop(callback_queue* q) +{ + return q->cfp.pop(); +} + +inline void callback_dispatcher(cudaStream_t, cudaError_t, void* userData) +{ + class cb* cb_ = (cb*) userData; + class callback_queue* queue = cb_->queue; + + // Protect the queue + pthread_mutex_lock(&queue->mutex); + queue->dq.push_back(cb_); + pthread_cond_broadcast(&queue->cond); + pthread_mutex_unlock(&queue->mutex); +} + +inline void cudagraph_callback_dispatcher(void* userData) +{ + cb* cb_ = (cb*) userData; + + class callback_queue* queue = cb_->queue; + + // Protect the queue + pthread_mutex_lock(&queue->mutex); + queue->dq.push_back(cb_); + pthread_cond_broadcast(&queue->cond); + pthread_mutex_unlock(&queue->mutex); +} + +// There is likely a more efficient way in the current implementation of callbacks ! +__global__ void callback_completion_kernel(int* completion_flag) +{ + // Loop until *completion_flag == 1 + while (1 != (atomicCAS(completion_flag, 1, 1))) + { + } + + // printf("notified %p\n", completion_flag); +} + +# ifdef STATEFUL_CALLBACKS +inline void set_current_cb(cb* cb) +{ + cudaCallbackStateCtx::instance().set_current_cb(cb); +} + +inline class cb* get_current_cb() +{ + return cudaCallbackStateCtx::instance().get_current_cb(); +} + +cudaError_t cudaCallbackSetStatus(int step, void* private_ptr) +{ + class cb* current_cb = get_current_cb(); + assert(current_cb); + current_cb->step = step; + current_cb->private_ptr = private_ptr; + return cudaSuccess; +} + +cudaError_t cudaCallbackGetStatus(int* step, void** private_ptr) +{ + class cb* current_cb = get_current_cb(); + assert(current_cb); + + if (step) + { + *step = current_cb->step; + } + + if (private_ptr) + { + *private_ptr = current_cb->private_ptr; + } + + return cudaSuccess; +} + +// Helpers +inline int cudaCallbackGetStep() +{ + int step; + cudaCallbackGetStatus(&step, NULL); + return step; +} + +inline void* cudaCallbackGetPrivatePtr() +{ + void* private_ptr; + cudaCallbackGetStatus(NULL, &private_ptr); + return private_ptr; +} +# endif // STATEFUL_CALLBACKS + +inline void execute_callback(cb* cb) +{ + // Indicates if a stateful callback needs to be resubmitted Note that we do + // not check for STATEFUL_CALLBACKS to avoid really miserable code ... 
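+  //
+  // Protocol: the callback runs on the queue's worker thread. With stateful
+  // callbacks, leaving step > 0 (via cudaCallbackSetStatus) requests that the
+  // callback be re-queued; once the callback is done, completion_flag is set
+  // to 1, which releases the spin-waiting completion kernel submitted right
+  // after it in the stream or graph.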
+ int cb_restart = 0; + + if (cb->callback || cb->graph_callback) + { +# ifdef STATEFUL_CALLBACKS + // This makes it possible to retrieve the state of the callback from + // the user-provided callback function itself + set_current_cb(cb); +# endif + if (cb->is_graph_host_node) + { + // fprintf(stderr, "EXECUTING cb->graph_callback\n"); + cb->graph_callback(cb->userData); + } + else + { + // fprintf(stderr, "EXECUTING cb->callback\n"); + cb->callback(cb->stream, cb->status, cb->userData); + } + +# ifdef STATEFUL_CALLBACKS + // Values equal to 0 or strictly negative indicate the callback is over + cb_restart = (cb->step > 0); + set_current_cb(NULL); +# endif + } + + // If the callback is over + if (!cb_restart) + { + // Notify the GPU-side kernel that we can go-on + // fprintf(stderr, "NOTIFY flag %p\n", cb->completion_flag); + *(cb->completion_flag) = 1; + } +} + +/* + * Public functions + */ + +/** + * @brief Submit a callback in a specific queue + * + * @param stream the CUDA stream which should be used to synchronized with the callback + * @param callback the description of the callback + * @param userData a generic pointer passed to the callback + * @param flags (ignored for now) + * @param q the queue where to submit the callback + * @return cudaError_t indicating if the submission was successful + */ +inline _CCCL_HOST cudaError_t cudaStreamAddCallbackWithQueue( + cudaStream_t stream, cudaStreamCallback_t callback, void* userData, unsigned int flags, callback_queue* q) +{ + if (q) + { + // We store the arguments in a structure that will be destroyed later on + class cb* data = new cb(stream, callback, userData, q); + cudaError_t err = cudaStreamAddCallback(stream, callback_dispatcher, data, flags); + + // Submit completion kernel in the stream ... + callback_completion_kernel<<<1, 1, 0, stream>>>(data->completion_flag); + + return err; + } + else + { + // If there is no queue, we use the usual callback mechanism + return cudaStreamAddCallback(stream, callback, userData, flags); + } +} + +/** + * @brief Submit a callback in a specific queue + * + * @param stream the CUDA stream which should be used to synchronized with the callback + * @param fn the function implementing the callback + * @param userData a generic pointer passed to the callback + * @param q the queue where to submit the callback + * @return cudaError_t indicating if the submission was successful + */ +inline _CCCL_HOST cudaError_t +cudaLaunchHostFuncWithQueue(cudaStream_t stream, cudaHostFn_t fn, void* userData, class callback_queue* q) +{ + assert(q); + // We store the arguments in a structure that will be destroyed later on + class cb* data = new cb(fn, userData, q); + cudaError_t err = cudaLaunchHostFunc(stream, (cudaHostFn_t) cudagraph_callback_dispatcher, data); + + // Submit completion kernel in the stream ... + callback_completion_kernel<<<1, 1, 0, stream>>>(data->completion_flag); + + return err; +} + +/** + * @brief Submit a host node in a CUDA graph into a specific queue + */ +inline _CCCL_HOST cudaError_t cudaGraphAddHostNodeWithQueue( + cudaGraphNode_t* node, + cudaGraph_t graph, + cudaGraphNode_t* deps, + size_t ndeps, + const cudaHostNodeParams* params, + class callback_queue* q) +{ + assert(q); + + // We store the arguments in a structure that will be destroyed later on + class cb* data = new cb(params->fn, params->userData, q); + + // XXX we should expose a child graph ... 
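+  //
+  // The host node is emulated with two chained graph nodes: a host node that
+  // hands the callback over to the queue's worker thread, followed by a kernel
+  // node spinning on completion_flag, so that successors in the graph only run
+  // once the callback has actually completed.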
+ cudaGraphNode_t node0; + + cudaHostNodeParams dispatcher_params; + dispatcher_params.fn = (cudaHostFn_t) cudagraph_callback_dispatcher; + dispatcher_params.userData = data; + cudaError_t err = cudaGraphAddHostNode(&node0, graph, deps, ndeps, &dispatcher_params); + if (err != cudaSuccess) + { + return err; + } + + // Submit completion kernel in the stream ... + // callback_completion_kernel<<<1,1,0,stream>>>(data->completion_flag); + cudaKernelNodeParams kernel_node_params; + kernel_node_params.func = (void*) callback_completion_kernel; + kernel_node_params.gridDim = 1; + kernel_node_params.blockDim = 1; + kernel_node_params.kernelParams = new void*[1]; + kernel_node_params.kernelParams[0] = (void*) &data->completion_flag; + err = cudaGraphAddKernelNode(node, graph, &node0, 1, &kernel_node_params); + + return err; +} + +/** + * @brief Progress method of a callback queue + * + * This function processes the callback queue for a CUDA device. It can operate in blocking or non-blocking mode, + * depending on the `blocking` parameter. In blocking mode, the function processes the queue until it is empty. In + * non-blocking mode, the function processes the queue for one iteration and then returns. + * + * @param q A pointer to the callback_queue to be processed. + * @param blocking mode of operation (`false` for non-blocking, `true` for blocking). + * @return `true` if the queue's status is set to 1, indicating that the queue has been processed and is empty, `false` + * otherwise. + */ +inline bool cudaCallbackQueueProgress(callback_queue* q, bool flag) +{ + int blocking = (flag == 1); + + int stop = 0; + + if (q->status == 1) + { + return true; + } + + while (!stop) + { + class cb* cb = NULL; + + // Protect the queue + pthread_mutex_lock(&q->mutex); + + // If there is something in this queue, handle it + if (!q->dq.empty()) + { + // Pop the front element + assert(q->dq.size() > 0); + cb = q->dq.front(); + q->dq.pop_front(); + } + else + { + pthread_cond_wait(&q->cond, &q->mutex); + } + + pthread_mutex_unlock(&q->mutex); + + if (cb) + { + execute_callback(cb); + +# ifdef STATEFUL_CALLBACKS + int cb_restart = (cb->step > 0); + if (cb_restart) + { + pthread_mutex_lock(&q->mutex); + q->dq.push_back(cb); + pthread_cond_broadcast(&q->cond); + pthread_mutex_unlock(&q->mutex); + } +# endif + + if (cb->callback == NULL && cb->graph_callback == NULL) + { + // destroy + q->status = 1; + return true; + } + } + + if (blocking) + { + stop = 0; + } + } + + return false; +} + +} // end namespace cuda::experimental::stf + +# endif // !_CCCL_COMPILER_MSVC +#endif // DOXYGEN_SHOULD_SKIP_THIS do not document diff --git a/cudax/include/cuda/experimental/__stf/places/inner_shape.cuh b/cudax/include/cuda/experimental/__stf/places/inner_shape.cuh new file mode 100644 index 00000000000..04b2badf7f2 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/places/inner_shape.cuh @@ -0,0 +1,136 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Implement a mechanism to compute the inner part of a shape + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +namespace cuda::experimental::stf +{ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // doxygen fails to parse this + +/** + * @brief Applying "inner" on a mdspan shape returns an explicit shape which extents + * have been diminished by a "thickness" constant. + * + * For example, a applying inner<2> on mdspan of dimension {M, N} will produce + * an explicit shape ({2, M-2}, {2, N-2}) + * + * @tparam thickness number of elements to remove in every direction + * @tparam rank dimension of the box + * @param s input box shape + * + */ +template +_CCCL_HOST_DEVICE box::rank()> inner(const shape_of>& s) +{ + using m = mdspan; + constexpr size_t rank = m::rank(); + + const ::std::array sizes = s.get_sizes(); + + ::std::array<::std::pair<::std::ptrdiff_t, ::std::ptrdiff_t>, rank> inner_extents; + for (size_t i = 0; i < rank; i++) + { + inner_extents[i].first = thickness; + inner_extents[i].second = sizes[i] - thickness; + } + + return box(inner_extents); +} + +/** + * @brief Applying "inner" on an explicit shape returns another explicit shape which + * extents have been diminished by a "thickness" constant. + * + * @overload + * + * For example, a applying inner<2> on an explicit shape {{10, 100}, {-10, 10}} + * will produce the explicit shape ({12, 98}, {-8, 8}) + * + * @tparam thickness number of elements to remove in every direction + * @tparam rank dimension of the box + * @param s input box shape + */ +template +_CCCL_HOST_DEVICE box inner(const box& s) +{ + ::std::array<::std::pair<::std::ptrdiff_t, ::std::ptrdiff_t>, rank> inner_extents; + for (size_t i = 0; i < rank; i++) + { + inner_extents[i].first = s.get_begin(i) + thickness; + inner_extents[i].second = s.get_end(i) - thickness; + } + + return box(inner_extents); +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +#ifdef UNITTESTED_FILE +UNITTEST("inner explicit shape (explicit bounds)") +{ + box s({10, 100}, {-10, 10}); + static_assert(::std::is_same_v>); + + auto i = inner<2>(s); + EXPECT(i.get_begin(0) == 12); + EXPECT(i.get_end(0) == 98); + EXPECT(i.get_begin(1) == -8); + EXPECT(i.get_end(1) == 8); +}; + +UNITTEST("inner explicit shape (sizes)") +{ + box s(10, 100, 12); + static_assert(::std::is_same_v>); + + auto i = inner<2>(s); + EXPECT(i.get_begin(0) == 2); + EXPECT(i.get_end(0) == 8); + EXPECT(i.get_begin(1) == 2); + EXPECT(i.get_end(1) == 98); + EXPECT(i.get_begin(2) == 2); + EXPECT(i.get_end(2) == 10); +}; + +UNITTEST("inner mdspan shape") +{ + auto s = shape_of>(10, 100, 12); + + auto i = inner<2>(s); + EXPECT(i.get_begin(0) == 2); + EXPECT(i.get_end(0) == 8); + EXPECT(i.get_begin(1) == 2); + EXPECT(i.get_end(1) == 98); + EXPECT(i.get_begin(2) == 2); + EXPECT(i.get_end(2) == 10); +}; + +#endif // UNITTESTED_FILE + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/places/loop_dispatch.cuh b/cudax/include/cuda/experimental/__stf/places/loop_dispatch.cuh new file mode 100644 index 00000000000..3d53ab4a02d --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/places/loop_dispatch.cuh @@ -0,0 +1,171 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Mechanism to dispatch computation over a set of execution places. + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +namespace cuda::experimental::stf +{ + +namespace reserved +{ + +/* A very simple hash function to give the impression of a random distribution of indices */ +inline size_t customHash(size_t value) +{ + size_t hash = value; + + // Mix the bits using XOR and bit shifts + hash ^= (hash << 13); + hash ^= (hash >> 17); + hash ^= (hash << 5); + + return hash; +} + +} // end namespace reserved + +/* TODO : introduce a policy to decide whether or not to use threads, and the thread-index mapping (currently random) */ +template +inline void loop_dispatch( + context_t ctx, + exec_place_t root_exec_place, + place_partition_scope scope, + size_t start, + size_t end, + ::std::function func) +{ + auto partition = place_partition(ctx.async_resources(), root_exec_place, scope); + + size_t cnt = end - start; + + size_t place_cnt = partition.size(); + + size_t nthreads = ::std::min(place_cnt, cnt); + + ::std::vector<::std::thread> threads; + + // loop in reversed order so that tid=0 comes last, and we can execute it + // in the calling thread while other threads are working + for (size_t tid = nthreads; tid-- > 0;) + { + // Work that should be performed by thread "tid" + auto tid_work = [=, &func]() { + // Distribute subplaces in a round robin fashion + ::std::vector<::std::shared_ptr> thread_affinity; + for (size_t i = tid; i < place_cnt; i += nthreads) + { + thread_affinity.push_back(::std::make_shared(partition.get(tid))); + } + ctx.push_affinity(mv(thread_affinity)); + + for (size_t i = start; i < end; i++) + { + if (reserved::customHash(i) % nthreads == tid) + { + func(i); + } + } + + ctx.pop_affinity(); + }; + + if (!use_threads || tid == 0) + { + // This is done by the current thread + tid_work(); + } + else + { + threads.emplace_back(tid_work); + } + } + + // If we don't use threads, check none was created + assert(use_threads || threads.size() == 0); + + // Wait for all threads to complete + if (use_threads) + { + for (auto& thread : threads) + { + thread.join(); + } + } +} + +/* + * Overload of loop_dispatch which automatically selects the partitioning scope + */ +template +inline void +loop_dispatch(context_t ctx, exec_place_t root_exec_place, size_t start, size_t end, ::std::function func) +{ + // Partition among devices by default + place_partition_scope scope = place_partition_scope::cuda_device; + + if (getenv("CUDASTF_GREEN_CONTEXT_SIZE")) + { + scope = place_partition_scope::green_context; + } + + loop_dispatch(mv(ctx), mv(root_exec_place), scope, start, end, mv(func)); +} + +/* + * Overload of loop_dispatch which automatically selects the current affinity + * based on the ctx (or takes all devices) and 
selects the scope + */ +template +inline void loop_dispatch(context_t ctx, size_t start, size_t end, ::std::function func) +{ + // Partition among devices by default + place_partition_scope scope = place_partition_scope::cuda_device; + + if (getenv("CUDASTF_GREEN_CONTEXT_SIZE")) + { + scope = place_partition_scope::green_context; + } + + // The type of the "exec_place_t" differs if we already use a vector of + // pointers to places, or an actual execution place, so we do not factorize + // it yet. + if (ctx.has_affinity()) + { + loop_dispatch>, use_threads>( + mv(ctx), ctx.current_affinity(), scope, start, end, mv(func)); + } + else + { + loop_dispatch( + mv(ctx), exec_place::all_devices(), scope, start, end, mv(func)); + } +} + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/places/place_partition.cuh b/cudax/include/cuda/experimental/__stf/places/place_partition.cuh new file mode 100644 index 00000000000..77867522b2b --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/places/place_partition.cuh @@ -0,0 +1,218 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Facilities to manipulate subset of places + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief Defines a partitioning granularity + * + * This should be used in combination with `place_partition` + */ +enum class place_partition_scope +{ + cuda_device, + green_context, + cuda_stream, +}; + +/** + * @brief Convert a place_partition_scope value to a string (for debugging purpose) + */ +inline ::std::string place_partition_scope_to_string(place_partition_scope scope) +{ + switch (scope) + { + case place_partition_scope::cuda_device: + return "cuda_device"; + case place_partition_scope::green_context: + return "green_context"; + case place_partition_scope::cuda_stream: + return "cuda_stream"; + } + + abort(); + return "unknown"; +} + +// TODO method to get the scope of an exec place + +/** + * @brief Get subsets of an execution place. + * + * This computes a vector of execution places which are the partition of the + * input execution place divided with a specific partitioning granularity. One + * may partition a place into devices, or green contexts, for example. + * + * An iterator is provided to dispatch computation over the subplaces. 
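+ *
+ * Usage sketch (where `handle` is an async_resources_handle):
+ *
+ *   for (auto& sub_place :
+ *        place_partition(handle, exec_place::all_devices(), place_partition_scope::cuda_device))
+ *   {
+ *     // ... dispatch work on the execution place `sub_place` ...
+ *   }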
+ */ +class place_partition +{ +public: + /** @brief Partition an execution place into a vector of subplaces */ + place_partition(async_resources_handle& handle, exec_place place, place_partition_scope scope) + { +#if CUDA_VERSION < 12040 + assert(scope != place_partition_scope::green_context && "Green contexts unsupported."); +#endif + compute_subplaces(handle, place, scope); + } + + /** @brief Partition a vector of execution places into a vector of subplaces */ + place_partition( + async_resources_handle& handle, ::std::vector<::std::shared_ptr> places, place_partition_scope scope) + { + for (const auto& place : places) + { + compute_subplaces(handle, *place, scope); + } + } + + ~place_partition() = default; + + // To iterate over all subplaces + using iterator = ::std::vector::iterator; + iterator begin() + { + return sub_places.begin(); + } + iterator end() + { + return sub_places.end(); + ; + } + + /** @brief Number of subplaces in the partition */ + size_t size() const + { + return sub_places.size(); + } + + /** @brief Get the i-th place */ + exec_place& get(size_t i) + { + return sub_places[i]; + } + + /** @brief Get the i-th place (const reference) */ + const exec_place& get(size_t i) const + { + return sub_places[i]; + } + +private: + /** @brief Compute the subplaces of a place at the specified granularity (scope) into the sub_places vector */ + void compute_subplaces(async_resources_handle& handle, const exec_place& place, place_partition_scope scope) + { + if (place.is_grid() && scope == place_partition_scope::cuda_device) + { + exec_place_grid g = place.as_grid(); + // Copy the vector of places + sub_places = g.get_places(); + return; + } + + if (place.is_device() && scope == place_partition_scope::cuda_device) + { + sub_places.push_back(place); + return; + } + + if (place.is_grid() && scope == place_partition_scope::cuda_stream) + { + // Recursively partition grid into devices, then into streams + for (auto& device_p : place_partition(handle, place, place_partition_scope::cuda_device)) + { + auto device_p_places = place_partition(handle, device_p, place_partition_scope::cuda_stream).sub_places; + sub_places.insert(sub_places.end(), device_p_places.begin(), device_p_places.end()); + } + return; + } + + if (place.is_device() && scope == place_partition_scope::cuda_stream) + { + auto& pool = place.get_stream_pool(handle, true); + for (size_t i = 0; i < pool.size(); i++) + { + // As a side effect, this will populate the pool + decorated_stream dstream = pool.next(); + sub_places.push_back(exec_place::cuda_stream(dstream)); + } + return; + } + +// Green contexts are only supported since CUDA 12.4 +#if CUDA_VERSION >= 12040 + if (place.is_grid() && scope == place_partition_scope::green_context) + { + // Recursively partition grid into devices, then into green contexts + for (auto& device_p : place_partition(handle, place, place_partition_scope::cuda_device)) + { + auto device_p_places = place_partition(handle, device_p, place_partition_scope::green_context).sub_places; + sub_places.insert(sub_places.end(), device_p_places.begin(), device_p_places.end()); + } + return; + } + + if (place.is_device() && scope == place_partition_scope::green_context) + { + // Find the device associated to the place, and get the cached green context helper in the + // async_resources_handle (if any) + int dev_id = device_ordinal(place.affine_data_place()); + ::std::shared_ptr& h = handle.gc_helper(dev_id); + // Lazy initialization : if this is a nullptr we haven't created the helper yet + if (h.get() == 
nullptr) + { + // 8 SMs per green context is a granularity that should work on any arch. + const char* env = getenv("CUDASTF_GREEN_CONTEXT_SIZE"); + int sm_cnt = env ? atoi(env) : 8; + h = ::std::make_shared(sm_cnt, dev_id); + } + + // Get views of green context out of the helper to create execution places + size_t cnt = h->get_count(); + for (size_t i = 0; i < cnt; i++) + { + sub_places.push_back(exec_place::green_ctx(h->get_view(i))); + } + + return; + } +#endif + assert(!"Internal error: unreachable code."); + } + + /** A vector with all subplaces (computed once in compute_subplaces) */ + ::std::vector sub_places; +}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh new file mode 100644 index 00000000000..32a37658bda --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/places/places.cuh @@ -0,0 +1,1701 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** @file + * + * @brief Defines abstractions for places where data is stored and places where execution is carried. + * + * TODO Add more documentation about this file here. + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include + +// Sync only will not move data.... +// Data place none? + +namespace cuda::experimental::stf +{ + +class backend_ctx_untyped; +class exec_place; +class exec_place_host; +class exec_place_grid; +class exec_place_cuda_stream; + +// Green contexts are only supported since CUDA 12.4 +#if CUDA_VERSION >= 12040 +class exec_place_green_ctx; +#endif + +using get_executor_func_t = pos4 (*)(pos4, dim4, dim4); + +/** + * @brief Designates where data will be stored (CPU memory vs. on device 0 (first GPU), device 1 (second GPU), ...) + * + * This typed `enum` is aligned with CUDA device ordinals but does not implicitly convert to `int`. See `device_ordinal` + * below. + */ +class data_place +{ + // Constructors and factory functions below forward to this. + explicit data_place(int devid) + : devid(devid) + {} + +public: + /** + * @brief Default constructor. The object is intialized as invalid. + */ + data_place() = default; + + /** + * @brief Constant representing an invalid `data_place` object. + */ + static const data_place invalid; + + /** + * @brief Constant representing the host CPU as the `data_place`. + */ + static const data_place host; + + /** + * @brief Constant representing a managed memory location as the `data_place`. 
+ */ + static const data_place managed; + + /// This actually does not define a data_place, but means that we should use + /// the data place affine to the execution place + static const data_place affine; + + /** + * @brief Constant representing a placeholder that lets the library automatically select a GPU device as the + * `data_place`. + */ + static const data_place device_auto; + + /** @brief Data is placed on device with index dev_id. Two relaxations are allowed: -1 can be passed to create a + * placeholder for the host, and -2 can be used to create a placeholder for a managed device. + */ + static data_place device(int dev_id = 0) + { + static int const ndevs = [] { + int result; + cuda_safe_call(cudaGetDeviceCount(&result)); + return result; + }(); + + EXPECT((dev_id >= managed_devid && dev_id < ndevs), "Invalid device ID ", dev_id); + return data_place(dev_id); + } + + /** + * @brief Select the embedded memory of the current device as `data_place`. + */ + static data_place current_device() + { + return device(cuda_try()); + } + + // User-visible API when using a different partitioner than the one of the grid + template + static data_place composite(partitioner_t p, const exec_place_grid& g); + + static data_place composite(get_executor_func_t f, const exec_place_grid& grid); + +#if CUDA_VERSION >= 12040 + static data_place green_ctx(const green_ctx_view& gc_view); +#endif + + bool operator==(const data_place& rhs) const; + + bool operator!=(const data_place& rhs) const + { + return !(*this == rhs); + } + + /// checks if this data place is a composite data place + bool is_composite() const + { + return (composite_desc != nullptr); + } + + /// checks if this data place is a green context data place + bool is_green_ctx() const + { +#if CUDA_VERSION >= 12040 + return gc_view != nullptr; +#else + return false; +#endif + } + + /// checks if this data place corresponds to a specific device + bool is_device() const + { + return !is_composite() && !is_green_ctx() && (devid >= 0); + } + + ::std::string to_string() const + { + if (*this == host) + { + return "host"; + } + if (*this == managed) + { + return "managed"; + } + if (*this == device_auto) + { + return "auto"; + } + if (*this == invalid) + { + return "invalid"; + } + + if (is_green_ctx()) + { + return "green ctx"; + } + + if (is_composite()) + { + return "composite" + ::std::to_string(devid); + } + + return "dev" + ::std::to_string(devid); + } + + /** + * @brief Returns an index guaranteed to be >= 0 (0 for managed CPU, 1 for pinned CPU, 2 for device 0, 3 for device + * 1, ...). Requires that `p` is initialized and different from `data_place::invalid`. + */ + friend inline size_t to_index(const data_place& p) + { + EXPECT(p.devid >= -2, "Data place with device id ", p.devid, " does not refer to a device."); + // This is not stricly a problem in this function, but it's not legit either. So let's assert. + assert(p.devid < cuda_try()); + return p.devid + 2; + } + + /** + * @brief Returns the device ordinal (0 = first GPU, 1 = second GPU, ... and by convention the CPU is -1) + * Requires that `p` is initialized. + */ + friend inline int device_ordinal(const data_place& p) + { + if (p.is_green_ctx()) + { +#if CUDA_VERSION >= 12040 + return p.gc_view->devid; +#else + assert(0); +#endif + } + + // TODO: restrict this function, i.e. sometimes it's called with invalid places. 
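+    // For reference, the plain-device mapping follows the devid constants
+    // defined further down (illustrative values):
+    //   device_ordinal(data_place::host)      == -1
+    //   device_ordinal(data_place::managed)   == -2
+    //   device_ordinal(data_place::device(0)) ==  0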
+ // EXPECT(p != invalid, "Invalid device id ", p.devid, " for data place."); + // EXPECT(p.devid >= -2, "Data place with device id ", p.devid, " does not refer to a device."); + // assert(p.devid < cuda_try()); + return p.devid; + } + + const exec_place_grid& get_grid() const; + const get_executor_func_t& get_partitioner() const; + + exec_place get_affine_exec_place() const; + + decorated_stream getDataStream(async_resources_handle& async_resources) const; + +private: + /** + * @brief Store the fields specific to a composite data place + * Definition comes later to avoid cyclic dependencies. + */ + class composite_state; + + //{ state + int devid = invalid_devid; // invalid by default + // Stores the fields specific to composite data places + ::std::shared_ptr composite_desc; + +public: +#if CUDA_VERSION >= 12040 + ::std::shared_ptr gc_view; +#endif + //} state + +private: + /* Constants to implement data_place::invalid, data_place::host, etc. */ + enum devid : int + { + invalid_devid = ::std::numeric_limits::min(), + device_auto_devid = -4, + affine_devid = -3, + managed_devid = -2, + host_devid = -1, + }; +}; + +inline const data_place data_place::invalid(invalid_devid); +inline const data_place data_place::host(host_devid); +inline const data_place data_place::managed(managed_devid); +inline const data_place data_place::device_auto(device_auto_devid); +inline const data_place data_place::affine(affine_devid); + +/** + * @brief Indicates where a computation takes place (CPU, dev0, dev1, ...) + * + * Currently data and computation are together `(devid == int(data_place))`. + */ +class exec_place +{ +public: + /* + * @brief Using the pimpl idiom. Public because a number of classes inehrit from this. + */ + class impl + { + public: + // Note that the default ctor assumes an invalid affine data place + impl() = default; + impl(const impl&) = delete; + impl& operator=(const impl&) = delete; + virtual ~impl() = default; + + explicit impl(data_place place) + : affine(mv(place)) + {} + + virtual exec_place activate(backend_ctx_untyped&) const + { + if (affine.is_device()) + { + auto old_dev_id = cuda_try(); + auto new_dev_id = device_ordinal(affine); + if (old_dev_id != new_dev_id) + { + cuda_safe_call(cudaSetDevice(new_dev_id)); + } + + auto old_dev = data_place::device(old_dev_id); + return exec_place(mv(old_dev)); + } + return exec_place(); + } + + virtual void deactivate(backend_ctx_untyped&, const exec_place& prev) const + { + if (affine.is_device()) + { + auto current_dev_id = cuda_try(); + auto restored_dev_id = device_ordinal(prev.pimpl->affine); + if (current_dev_id != restored_dev_id) + { + cuda_safe_call(cudaSetDevice(restored_dev_id)); + } + } + } + + virtual const data_place& affine_data_place() const + { + return affine; + } + + virtual ::std::string to_string() const + { + return "exec(" + affine.to_string() + ")"; + } + + virtual bool is_device() const + { + return affine.is_device(); + } + + virtual bool is_grid() const + { + return false; + } + + virtual size_t size() const + { + return 1; + } + + virtual void set_affine_data_place(data_place place) + { + affine = mv(place); + }; + + virtual bool operator==(const impl& rhs) const + { + return affine == rhs.affine; + } + + /* Return the pool associated to this place + * + * If the stream is expected to perform computation, the + * for_computation should be true. If we plan to use this stream for data + * transfers, or other means (graph capture) we set the value to false. 
(This + * flag is intended for performance matters, not correctness */ + virtual stream_pool& get_stream_pool(async_resources_handle& async_resources, bool for_computation) const + { + if (!affine.is_device()) + { + fprintf(stderr, "Error: get_stream_pool virtual method is not implemented for this exec place.\n"); + abort(); + } + + int dev_id = device_ordinal(affine); + return async_resources.get_device_stream_pool(dev_id, for_computation); + } + + decorated_stream getStream(async_resources_handle& async_resources, bool for_computation) const + { + return get_stream_pool(async_resources, for_computation).next(); + } + + protected: + friend class exec_place; + explicit impl(int devid) + : affine(data_place::device(devid)) + {} + data_place affine = data_place::invalid; + }; + + exec_place() = default; + exec_place(const data_place& affine) + : pimpl(affine.is_device() ? device(device_ordinal(affine)).pimpl : ::std::make_shared(affine)) + { + EXPECT(pimpl->affine != data_place::host, "To create an execution place for the host, use exec_place::host."); + } + + bool operator==(const exec_place& rhs) const + { + return *pimpl == *rhs.pimpl; + } + bool operator!=(const exec_place& rhs) const + { + return !(*this == rhs); + } + + /** + * @brief an iterator class which goes over all subplaces in an exec place. + * + * This is a trivial singleton unless we have a grid of places. + */ + class iterator + { + public: + iterator(::std::shared_ptr impl, size_t index) + : it_impl(mv(impl)) + , index(index) + {} + + exec_place operator*(); + + iterator& operator++() + { + index++; + return *this; + } + + bool operator==(const iterator& other) const + { + return index == other.index; + } + + bool operator!=(const iterator& other) const + { + return !(*this == other); + } + + private: + ::std::shared_ptr it_impl; + size_t index; + }; + + iterator begin() + { + return iterator(pimpl, 0); + } + iterator end() + { + return iterator(pimpl, pimpl->size()); + } + + /** + * @brief Returns a string representation of the execution place object. + * + * @return std::string + */ + ::std::string to_string() const + { + return pimpl->to_string(); + } + + /** + * @brief Returns the `data_place` naturally associated with this execution place. + */ + const data_place& affine_data_place() const + { + return pimpl->affine_data_place(); + } + + void set_affine_data_place(data_place place) + { + pimpl->set_affine_data_place(mv(place)); + }; + + stream_pool& get_stream_pool(async_resources_handle& async_resources, bool for_computation) const + { + return pimpl->get_stream_pool(async_resources, for_computation); + } + + decorated_stream getStream(async_resources_handle& async_resources, bool for_computation) const + { + return pimpl->getStream(async_resources, for_computation); + } + + // TODO make protected ! + const ::std::shared_ptr& get_impl() const + { + return pimpl; + } + + /** + * @brief Set computation to run on this place. + * + * @return `exec_place` The previous execution place. See `deactivate` below. + */ + exec_place activate(backend_ctx_untyped& state) const + { + return pimpl->activate(state); + } + + /** + * @brief Undoes the effect of `activate`. Call with the previous `exec_place` object retured by `activate`. + * + * @warning Undefined behavior if you don't pass the result of `activate`. 
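+   *
+   * A minimal pairing sketch (illustrative; `ctx` stands for any
+   * `backend_ctx_untyped`):
+   * @code
+   * exec_place e    = exec_place::device(0);
+   * exec_place prev = e.activate(ctx);  // switches the current device if needed
+   * // ... submit work for the place ...
+   * e.deactivate(ctx, prev);            // restores the previous device
+   * @endcode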
+ */ + void deactivate(backend_ctx_untyped& state, const exec_place& p) const + { + pimpl->deactivate(state, p); + } + + bool is_device() const + { + return pimpl->is_device(); + } + + bool is_grid() const + { + return pimpl->is_grid(); + } + + size_t size() const + { + return pimpl->size(); + } + + // Get the implementation assuming this is a grid + // We need to defer the implementation after exec_place_grid has been + // defined because this requires a ::std::static_pointer_cast from the base + // class to exec_place_grid + exec_place_grid as_grid() const; + + size_t grid_dim(int axid_is) const; + dim4 grid_dims() const; + + /* These helper methods provide convenient way to express execution places, + * for example exec_place::host or exec_place::device(4). + */ + static const exec_place_host host; + static const exec_place device_auto; + static exec_place device(int devid); + +// Green contexts are only supported since CUDA 12.4 +#if CUDA_VERSION >= 12040 + static exec_place green_ctx(const green_ctx_view& gc_view); + static exec_place green_ctx(const ::std::shared_ptr& gc_view_ptr); +#endif + + static exec_place_cuda_stream cuda_stream(cudaStream_t stream); + static exec_place_cuda_stream cuda_stream(const decorated_stream& dstream); + + /** + * @brief Returns the currently active device. + * + * @return exec_place + */ + static exec_place current_device() + { + return exec_place::device(cuda_try()); + } + + static exec_place_grid all_devices(); + + static exec_place_grid n_devices(size_t n, dim4 dims); + + static exec_place_grid n_devices(size_t n); + + // For debug purpose on a machine with a single GPU, for example + static exec_place_grid repeat(const exec_place& e, size_t cnt); + + /** + * @brief Execute lambda on this place. + * + * This method accepts a functor, saves the current CUDA device, changes it to the current execution place, + * invokes the lambda, and finally sets the current device back to the previous one. The last step is + * taken even if the lambda throws an exception. + * + * @tparam Fun A callable entity type + * @param fun Input functor that will be forwarded and executed + * + * @return auto the result of the executed functor. + * + */ + template + auto operator->*(Fun&& fun) const + { + const int new_device = device_ordinal(pimpl->affine); + if (new_device >= 0) + { + // We're on a device + // Change device only if necessary. + const int old_device = cuda_try(); + if (new_device != old_device) + { + cuda_safe_call(cudaSetDevice(new_device)); + } + + SCOPE(exit) + { + // It is the responsibility of the client to ensure that any change of the current device in this + // section was reverted. + if (new_device != old_device) + { + cuda_safe_call(cudaSetDevice(old_device)); + } + }; + return ::std::forward(fun)(); + } + else + { + // We're on the host, just call the function with no further ado. + return ::std::forward(fun)(); + } + } + +public: + exec_place(::std::shared_ptr pimpl) + : pimpl(mv(pimpl)) + {} + +private: + // No other state + ::std::shared_ptr pimpl; +}; + +/** + * @brief Designates execution that is to run on the host. 
+ * + */ +class exec_place_host : public exec_place +{ +public: + // Implementation of the exec_place_device class + class impl : public exec_place::impl + { + public: + impl() + : exec_place::impl(data_place::host) + {} + exec_place activate(backend_ctx_untyped&) const override + { + return exec_place(); + } // no-op + void deactivate(backend_ctx_untyped&, const exec_place& p) const override + { + EXPECT(!p.get_impl()); + } // no-op + virtual const data_place& affine_data_place() const override + { + return data_place::host; + } + virtual stream_pool& get_stream_pool(async_resources_handle& async_resources, bool for_computation) const override + { + // There is no pool attached to the host itself, so we use the pool attached to the execution place of the + // current device + return exec_place::current_device().get_stream_pool(async_resources, for_computation); + } + }; + + static ::std::shared_ptr make() + { + static impl result; + return ::std::shared_ptr(&result, [](impl*) {}); // no-op deleter + } + +private: + friend class exec_place; + /** + * @brief Constructor + */ + exec_place_host() + : exec_place(make()) + { + static_assert(sizeof(exec_place) == sizeof(exec_place_host), + "exec_place_host cannot add state; it would be sliced away."); + } +}; + +inline const exec_place_host exec_place::host{}; +inline const exec_place exec_place::device_auto{data_place::device_auto}; + +UNITTEST("exec_place_host::operator->*") +{ + bool witness = false; + exec_place::host->*[&] { + witness = true; + }; + EXPECT(witness); +}; + +inline exec_place exec_place::device(int devid) +{ + // Create a static vector of impls - there's exactly one per device. + static int ndevices; + static impl* impls = [] { + cuda_safe_call(cudaGetDeviceCount(&ndevices)); + auto result = static_cast(::operator new[](ndevices * sizeof(impl))); + for (int i : each(ndevices)) + { + new (result + i) impl(i); + } + return result; + }(); + assert(devid >= 0); + assert(devid < ndevices); + return ::std::shared_ptr(&impls[devid], [](impl*) {}); // no-op deleter +} + +#ifdef UNITTESTED_FILE +UNITTEST("exec_place ->* operator") +{ + exec_place e = exec_place::device(0); + e->*[]() { + int current_dev = cuda_try(); + EXPECT(current_dev == 0); + }; + + // Ensure the ->* operator works with a const exec place + const exec_place ce = exec_place::device(0); + ce->*[]() { + int current_dev = cuda_try(); + EXPECT(current_dev == 0); + }; +}; + +UNITTEST("exec_place assignments") +{ + // Make sure we can use exec_place by values, replace it, etc... 
+ exec_place e; + int ndevices = cuda_try(); + if (ndevices >= 1) + { + e = exec_place::device(0); + } + if (ndevices >= 2) + { + e = exec_place::device(1); + } + e = exec_place::host; +}; + +UNITTEST("exec_place movable") +{ + exec_place e = exec_place::device(0); + exec_place e2 = mv(e); +}; + +UNITTEST("exec_place copyable") +{ + exec_place e = exec_place::device(0); + exec_place e2 = e; +}; +#endif // UNITTESTED_FILE + +class exec_place_grid : public exec_place +{ +public: + /* + * Implementation of the exec_place_grid + */ + class impl : public exec_place::impl + { + public: + // Define a grid directly from a vector of places + // This creates an execution grid automatically + impl(::std::vector _places) + : dims(static_cast(_places.size()), 1, 1, 1) + , places(mv(_places)) + { + assert(!places.empty()); + assert(dims.x > 0); + assert(affine == data_place::invalid); + } + + // With a "dim4 shape" + impl(::std::vector _places, const dim4& _dims) + : dims(_dims) + , places(mv(_places)) + { + assert(dims.x > 0); + assert(affine == data_place::invalid); + } + + // TODO improve with a better description + ::std::string to_string() const final + { + return ::std::string("GRID place"); + } + + exec_place activate(backend_ctx_untyped&) const override + { + // No-op + return exec_place(); + } + + // TODO : shall we deactivate the current place, if any ? + void deactivate(backend_ctx_untyped&, const exec_place& _prev) const override + { + // No-op + EXPECT(!_prev.get_impl(), "Invalid execution place."); + } + + /* Dynamically checks whether an execution place is a device */ + bool is_device() const override + { + return false; + } + + /* Dynamically checks whether an execution place is a grid */ + bool is_grid() const override + { + return true; + } + + bool operator==(const exec_place::impl& rhs) const override + { + // First, check if rhs is of type exec_place_grid::impl + auto other = dynamic_cast(&rhs); + if (!other) + { + return false; // rhs is not a grid, so they are not equal + } + + // Compare two grids + return *this == *other; + } + + // Compare two grids + bool operator==(const impl& rhs) const + { + // First, compare base class properties + if (!exec_place::impl::operator==(rhs)) + { + return false; + } + + // Compare grid-specific properties + return dims == rhs.dims && places == rhs.places; + } + + const ::std::vector& get_places() const + { + return places; + } + + exec_place grid_activate(backend_ctx_untyped& ctx, size_t i) const + { + const auto& v = get_places(); + return v[i].activate(ctx); + } + + void grid_deactivate(backend_ctx_untyped& ctx, size_t i, exec_place p) const + { + const auto& v = get_places(); + v[i].deactivate(ctx, p); + } + + const exec_place& get_current_place() + { + return get_places()[current_p_1d]; + } + + // Set the current place from the 1D index within the grid (flattened grid) + void set_current_place(backend_ctx_untyped& ctx, size_t p_index) + { + // Unset the previous place, if any + if (current_p_1d >= 0) + { + // First deactivate the previous place + grid_deactivate(ctx, current_p_1d, old_place); + } + + // get the 1D index for that position + current_p_1d = (::std::ptrdiff_t) p_index; + + // The returned value contains the state to restore when we deactivate the place + old_place = grid_activate(ctx, current_p_1d); + } + + // Set the current place, given the position in the grid + void set_current_place(backend_ctx_untyped& ctx, pos4 p) + { + size_t p_index = dims.get_index(p); + set_current_place(ctx, p_index); + } + + void 
unset_current_place(backend_ctx_untyped& ctx) + { + EXPECT(current_p_1d >= 0, "unset_current_place() called without corresponding call to set_current_place()"); + + // First deactivate the previous place + grid_deactivate(ctx, current_p_1d, old_place); + current_p_1d = -1; + } + + ::std::ptrdiff_t current_place_id() const + { + return current_p_1d; + } + + dim4 get_dims() const + { + return dims; + } + + int get_dim(int axis_id) const + { + return dims.get(axis_id); + } + + size_t size() const override + { + return dims.size(); + } + + /* Get the place associated to this position in the grid */ + const exec_place& get_place(pos4 p) const + { + return coords_to_place(p); + } + + const exec_place& get_place(size_t p_index) const + { + // TODO (miscco): should this method take an int? + return coords_to_place(static_cast(p_index)); + } + + virtual stream_pool& get_stream_pool(async_resources_handle& async_resources, bool for_computation) const override + { + // We "arbitrarily" select a pool from one of the place in the + // grid, which can be suffiicent for a data transfer, but we do not + // want to allow this for computation where we expect a more + // accurate placement. + assert(!for_computation); + assert(places.size() > 0); + return places[0].get_stream_pool(async_resources, for_computation); + } + + private: + // What is the execution place at theses coordinates in the exec place grid ? + const exec_place& coords_to_place(int c0, int c1 = 0, int c2 = 0, int c3 = 0) const + { + // Flatten the (c0, c1, c2, c3) vector into a global index + int index = c0 + dims.get(0) * (c1 + dims.get(1) * (c2 + c3 * dims.get(2))); + return places[index]; + } + + const exec_place& coords_to_place(pos4 coords) const + { + return coords_to_place(coords.x, coords.y, coords.z, coords.t); + } + + // current position in the grid (flattened to 1D) if we have a grid of + // execution place. -1 indicates there is no current position. + ::std::ptrdiff_t current_p_1d = -1; + + // saved state before setting the current place + exec_place old_place; + + // dimensions of the "grid" + dim4 dims; + ::std::vector places; + }; + + ///@{ @name Constructors + dim4 get_dims() const + { + return get_impl()->get_dims(); + } + + int get_dim(int axis_id) const + { + return get_dims().get(axis_id); + } + + size_t size() const + { + return get_dims().size(); + } + + explicit operator bool() const + { + return get_impl() != nullptr; + } + + /* Note that we compare against the exact same implementation : we could + * have equivalent grids with the same execution places, but to avoid a + * costly comparision we here only look for actually identical grids. 
+ */ + bool operator==(const exec_place_grid& rhs) const + { + return *get_impl() == *(rhs.get_impl()); + } + + ::std::ptrdiff_t current_place_id() const + { + return get_impl()->current_place_id(); + } + + const exec_place& get_place(pos4 p) const + { + return get_impl()->get_place(p); + } + + const ::std::vector& get_places() const + { + return get_impl()->get_places(); + } + + // Set the current place from the 1D index within the grid (flattened grid) + void set_current_place(backend_ctx_untyped& ctx, size_t p_index) + { + return get_impl()->set_current_place(ctx, p_index); + } + + // Get the current execution place + const exec_place& get_current_place() + { + return get_impl()->get_current_place(); + } + + // Set the current place, given the position in the grid + void set_current_place(backend_ctx_untyped& ctx, pos4 p) + { + return get_impl()->set_current_place(ctx, p); + } + + void unset_current_place(backend_ctx_untyped& ctx) + { + return get_impl()->unset_current_place(ctx); + } + + ::std::shared_ptr get_impl() const + { + assert(::std::dynamic_pointer_cast(exec_place::get_impl())); + return ::std::static_pointer_cast(exec_place::get_impl()); + } + + // Default constructor + exec_place_grid() + : exec_place(nullptr) + {} + + // private: + exec_place_grid(::std::shared_ptr p) + : exec_place(mv(p)) + {} + + exec_place_grid(::std::vector p, const dim4& d) + : exec_place(::std::make_shared(mv(p), d)) + {} +}; + +inline exec_place_grid make_grid(::std::vector places, const dim4& dims) +{ + return exec_place_grid(mv(places), dims); +} + +inline exec_place_grid make_grid(::std::vector places) +{ + assert(!places.empty()); + const auto x = static_cast(places.size()); + return make_grid(mv(places), dim4(x, 1, 1, 1)); +} + +/// Implementation deferred because we need the definition of exec_place_grid +inline exec_place exec_place::iterator::operator*() +{ + EXPECT(index < it_impl->size()); + if (it_impl->is_grid()) + { + return ::std::static_pointer_cast(it_impl)->get_place(index); + } + return exec_place(it_impl); +} + +inline exec_place_grid exec_place::repeat(const exec_place& e, size_t cnt) +{ + return make_grid(::std::vector(cnt, e)); +} + +/* Deferred implementation : ::std::static_pointer_cast requires that exec_place_grid is a complete type */ +inline exec_place_grid exec_place::as_grid() const +{ + // Make sure it is really a grid + EXPECT(is_grid()); + return exec_place_grid(::std::static_pointer_cast(pimpl)); +} + +inline dim4 exec_place::grid_dims() const +{ + EXPECT(is_grid()); + return ::std::static_pointer_cast(pimpl)->get_dims(); +} + +inline size_t exec_place::grid_dim(int axis_id) const +{ + EXPECT(is_grid()); + return ::std::static_pointer_cast(pimpl)->get_dim(axis_id); +} + +/* Get the first N available devices */ +inline exec_place_grid exec_place::n_devices(size_t n, dim4 dims) +{ + const int ndevs = cuda_try(); + + EXPECT(ndevs >= int(n)); + ::std::vector devices; + devices.reserve(n); + for (auto d : each(n)) + { + // TODO (miscco): Use proper type + devices.push_back(exec_place::device(static_cast(d))); + } + + return make_grid(mv(devices), dims); +} + +/* Get the first N available devices */ +inline exec_place_grid exec_place::n_devices(size_t n) +{ + return n_devices(n, dim4(static_cast(n), 1, 1, 1)); +} + +inline exec_place_grid exec_place::all_devices() +{ + return n_devices(cuda_try()); +} + +inline exec_place_grid partition_cyclic(const exec_place_grid& e_place, dim4 strides, pos4 tile_id) +{ + const auto& g = e_place.as_grid(); + dim4 g_dims = 
e_place.get_dims(); + + /* + * Example : strides = (3, 2). tile 1 id = (1, 0) + * 0 1 2 0 1 2 0 1 2 0 1 + * 3 4 5 3 4 5 3 4 5 3 4 + * 0 1 2 0 1 2 0 1 2 0 1 + */ + + // Dimension K_x of the new grid on axis x : + // pos_x + K_x stride_x = dim_x + // K_x = (dim_x - pos_x)/stride_x + dim4 size = dim4((g.get_dim(0) - tile_id.x + strides.x - 1) / strides.x, + (g.get_dim(1) - tile_id.y + strides.y - 1) / strides.y, + (g.get_dim(2) - tile_id.z + strides.z - 1) / strides.z, + (g.get_dim(3) - tile_id.t + strides.t - 1) / strides.t); + + // fprintf(stderr, "G DIM %d STRIDE %d ID %d\n", g_dims.x, strides.x, tile_id.x); + // fprintf(stderr, "G DIM %d STRIDE %d ID %d\n", g_dims.y, strides.y, tile_id.y); + // fprintf(stderr, "G DIM %d STRIDE %d ID %d\n", g_dims.z, strides.z, tile_id.z); + // fprintf(stderr, "G DIM %d STRIDE %d ID %d\n", g_dims.t, strides.t, tile_id.t); + + ::std::vector places; + places.reserve(size.x * size.y * size.z * size.t); + + for (int t = tile_id.t; t < g_dims.t; t += strides.t) + { + for (int z = tile_id.z; z < g_dims.z; z += strides.z) + { + for (int y = tile_id.y; y < g_dims.y; y += strides.y) + { + for (int x = tile_id.x; x < g_dims.x; x += strides.x) + { + places.push_back(g.get_place(pos4(x, y, z, t))); + } + } + } + } + + // fprintf(stderr, "ind %d (%d,%d,%d,%d)=%d\n", ind, size.x, size.y, size.z, size.t, + // size.x*size.y*size.z*size.t); + assert(int(places.size()) == size.x * size.y * size.z * size.t); + + return make_grid(mv(places), size); +} + +// example : +// auto sub_g = partition_tile(g, dim4(2,2), dim4(0,1)) +inline exec_place_grid partition_tile(const exec_place_grid& e_place, dim4 tile_sizes, pos4 tile_id) +{ + const auto& g = e_place.as_grid(); + + // TODO define dim4=dim4 * dim4 + dim4 begin_coords( + tile_id.x * tile_sizes.x, tile_id.y * tile_sizes.y, tile_id.z * tile_sizes.z, tile_id.t * tile_sizes.t); + + // TODO define dim4=MIN(dim4,dim4) + // upper bound coordinate (excluded) + dim4 end_coords(::std::min((tile_id.x + 1) * tile_sizes.x, g.get_dim(0)), + ::std::min((tile_id.y + 1) * tile_sizes.y, g.get_dim(1)), + ::std::min((tile_id.z + 1) * tile_sizes.z, g.get_dim(2)), + ::std::min((tile_id.t + 1) * tile_sizes.t, g.get_dim(3))); + + // fprintf(stderr, "G DIM %d TILE SIZE %d ID %d\n", g_dims.x, tile_sizes.x, tile_id.x); + // fprintf(stderr, "G DIM %d TILE SIZE %d ID %d\n", g_dims.y, tile_sizes.y, tile_id.y); + // fprintf(stderr, "G DIM %d TILE SIZE %d ID %d\n", g_dims.z, tile_sizes.z, tile_id.z); + // fprintf(stderr, "G DIM %d TILE SIZE %d ID %d\n", g_dims.t, tile_sizes.t, tile_id.t); + // + // + // fprintf(stderr, "BEGIN %d END %d\n", begin_coords.x, end_coords.x); + // fprintf(stderr, "BEGIN %d END %d\n", begin_coords.y, end_coords.y); + // fprintf(stderr, "BEGIN %d END %d\n", begin_coords.z, end_coords.z); + // fprintf(stderr, "BEGIN %d END %d\n", begin_coords.t, end_coords.t); + + dim4 size = dim4(end_coords.x - begin_coords.x, + end_coords.y - begin_coords.y, + end_coords.z - begin_coords.z, + end_coords.t - begin_coords.t); + + ::std::vector places; + places.reserve(size.x * size.y * size.z * size.t); + + for (int t = begin_coords.t; t < end_coords.t; t++) + { + for (int z = begin_coords.z; z < end_coords.z; z++) + { + for (int y = begin_coords.y; y < end_coords.y; y++) + { + for (int x = begin_coords.x; x < end_coords.x; x++) + { + places.push_back(g.get_place(pos4(x, y, z, t))); + } + } + } + } + + // fprintf(stderr, "ind %d (%d,%d,%d,%d)=%d\n", ind, size.x, size.y, size.z, size.t, + // size.x*size.y*size.z*size.t); + assert(int(places.size()) == 
size.x * size.y * size.z * size.t); + + return make_grid(mv(places), size); +} + +/* + * This is defined here so that we avoid cyclic dependencies. + */ +class data_place::composite_state +{ +public: + composite_state() = default; + + composite_state(exec_place_grid grid, get_executor_func_t partitioner_func) + : grid(mv(grid)) + , partitioner_func(mv(partitioner_func)) + {} + + const exec_place_grid& get_grid() const + { + return grid; + } + const get_executor_func_t& get_partitioner() const + { + return partitioner_func; + } + +private: + exec_place_grid grid; + get_executor_func_t partitioner_func; +}; + +inline data_place data_place::composite(get_executor_func_t f, const exec_place_grid& grid) +{ + data_place result; + + // Save the state that is specific to a composite data place into the + // data_place object. + result.composite_desc = ::std::make_shared(grid, f); + + return result; +} + +#if CUDA_VERSION >= 12040 +inline data_place data_place::green_ctx(const green_ctx_view& gc_view) +{ + data_place result; + result.gc_view = ::std::make_shared(gc_view); + return result; +} +#endif + +// User-visible API when the same partitioner as the one of the grid +template +data_place data_place::composite(partitioner_t, const exec_place_grid& g) +{ + return data_place::composite(&partitioner_t::get_executor, g); +} + +inline exec_place data_place::get_affine_exec_place() const +{ + // EXPECT(*this != affine); + // EXPECT(*this != data_place::invalid); + + if (*this == host) + { + return exec_place::host; + } + + // This is debatable ! + if (*this == managed) + { + return exec_place::host; + } + + if (is_composite()) + { + // Return the grid of places associated to that composite data place + return get_grid(); + } + +#if CUDA_VERSION >= 12040 + if (is_green_ctx()) + { + EXPECT(gc_view != nullptr); + return exec_place::green_ctx(gc_view); + } +#endif + + // This must be a device + return exec_place::device(devid); +} + +inline decorated_stream data_place::getDataStream(async_resources_handle& async_resources) const +{ + return get_affine_exec_place().getStream(async_resources, false); +} + +inline const exec_place_grid& data_place::get_grid() const +{ + return composite_desc->get_grid(); +}; +inline const get_executor_func_t& data_place::get_partitioner() const +{ + return composite_desc->get_partitioner(); +} + +inline bool data_place::operator==(const data_place& rhs) const +{ + if (is_composite() != rhs.is_composite()) + { + return false; + } + + if (is_green_ctx() != rhs.is_green_ctx()) + { + return false; + } + + if (!is_composite()) + { + return devid == rhs.devid; + } + + if (is_green_ctx()) + { +#if CUDA_VERSION >= 12040 + return *gc_view == *rhs.gc_view; +#else + assert(0); +#endif + } + + return (get_grid() == rhs.get_grid() && (get_partitioner() == rhs.get_partitioner())); +} + +#ifdef UNITTESTED_FILE +UNITTEST("Data place equality") +{ + EXPECT(data_place::managed == data_place::managed); + EXPECT(data_place::managed != data_place::host); +}; +#endif // UNITTESTED_FILE + +/** + * @brief ID of a data instance. A logical data can have multiple instances in various parts of memory + * (CPU and several GPUs). This type identifies the index of such an instance in the internal data structures. 
+ * + */ +enum class instance_id_t : size_t +{ + invalid = static_cast(-1) +}; + +#ifdef UNITTESTED_FILE +UNITTEST("places to_symbol") +{ + EXPECT(data_place::host.to_string() == ::std::string("host")); + EXPECT(exec_place::current_device().to_string() == ::std::string("exec(dev0)")); + EXPECT(exec_place::host.to_string() == ::std::string("exec(host)")); +}; + +UNITTEST("exec place equality") +{ + EXPECT(exec_place::current_device() == exec_place::current_device()); + + auto c1 = exec_place::current_device(); + auto c2 = exec_place::current_device(); + EXPECT(c1 == c2); + + EXPECT(exec_place::host != exec_place::current_device()); + + cuda_safe_call(cudaSetDevice(0)); // just in case the environment was somehow messed up + EXPECT(exec_place::device(0) == exec_place::current_device()); +}; + +UNITTEST("grid exec place equality") +{ + auto all = exec_place::all_devices(); + auto repeated_dev0 = exec_place::repeat(exec_place::device(0), 3); + + EXPECT(exec_place::all_devices() == exec_place::all_devices()); + + EXPECT(all != repeated_dev0); +}; +#endif // UNITTESTED_FILE + +namespace reserved +{ + +template +::std::pair +compute_occupancy(Kernel&& f, size_t dynamicSMemSize = 0, int blockSizeLimit = 0) +{ + using key_t = ::std::pair; + static ::std:: + unordered_map, ::cuda::experimental::stf::hash> + occupancy_cache; + const auto key = ::std::make_pair(dynamicSMemSize, blockSizeLimit); + + if (auto i = occupancy_cache.find(key); i != occupancy_cache.end()) + { + // Cache hit + return i->second; + } + // Miss + auto& result = occupancy_cache[key]; + cuda_safe_call(cudaOccupancyMaxPotentialBlockSize(&result.first, &result.second, f, dynamicSMemSize, blockSizeLimit)); + return result; +} + +/** + * This method computes the block and grid sizes to optimize thread occupancy. + * + * If cooperative kernels are needed, the grid size is capped to the number of + * blocks + * + * - min_grid_size and max_block_size are the grid and block sizes to + * _optimize_ occupancy + * - block_size_limit is the absolute maximum of threads in a block due to + * resource constraints + */ +template +void compute_kernel_limits( + const Fun&& f, + int& min_grid_size, + int& max_block_size, + size_t shared_mem_bytes, + bool cooperative, + int& block_size_limit) +{ + static_assert(::std::is_function::type>::value, + "Template parameter Fun must be a pointer to a function type."); + + ::std::tie(min_grid_size, max_block_size) = compute_occupancy(f, shared_mem_bytes); + + if (cooperative) + { + // For cooperative kernels, the number of blocks is limited. We compute the number of SM on device 0 and assume + // we have a homogeneous machine. + static const int sm_count = cuda_try(cudaDevAttrMultiProcessorCount, 0); + + // TODO there could be more than 1 block per SM, but we do not know the actual block sizes for now ... + min_grid_size = ::std::min(min_grid_size, sm_count); + } + + /* Compute the maximum block size (not the optimal size) */ + static const auto maxThreadsPerBlock = [&] { + cudaFuncAttributes result; + cuda_safe_call(cudaFuncGetAttributes(&result, f)); + return result.maxThreadsPerBlock; + }(); + block_size_limit = maxThreadsPerBlock; +} + +} // end namespace reserved + +template +template +interpreted_execution_policy::interpreted_execution_policy( + const thread_hierarchy_spec& p, const exec_place& where, const Fun& f) +{ + constexpr size_t pdepth = sizeof...(spec) / 2; + + if (where == exec_place::host) + { + // XXX this may not match the type of the spec if we are not using the default spec ... 
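+    // In other words, on the host every level degenerates to a single
+    // hw_scope::thread of width 1, whatever widths the spec requested.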
+ for (size_t d = 0; d < pdepth; d++) + { + this->add_level({::std::make_pair(hw_scope::thread, 1)}); + } + return; + } + + size_t ndevs = where.size(); + + if constexpr (pdepth == 1) + { + size_t l0_size = p.get_width(0); + bool l0_sync = thread_hierarchy_spec::template is_synchronizable<0>; + + int max_block_size = 0, min_grid_size = 0; + size_t shared_mem_bytes = 0; + int block_size_limit = 0; + + reserved::compute_kernel_limits(f, min_grid_size, max_block_size, shared_mem_bytes, l0_sync, block_size_limit); + + int grid_size = 0; + int block_size; + + if (l0_size == 0) + { + grid_size = min_grid_size; + // Maximum occupancy without exceeding limits + block_size = ::std::min(max_block_size, block_size_limit); + l0_size = ndevs * grid_size * block_size; + } + else + { + // Find grid_size and block_size such that grid_size*block_size = l0_size and block_size <= max_block_size + for (block_size = max_block_size; block_size >= 1; block_size--) + { + if (l0_size % block_size == 0) + { + grid_size = l0_size / block_size; + break; + } + } + } + + // Make sure we have computed the width if that was implicit + assert(l0_size > 0); + + assert(grid_size > 0); + assert(block_size <= max_block_size); + + assert(l0_size % ndevs == 0); + assert(l0_size % (ndevs * block_size) == 0); + + assert(ndevs * grid_size * block_size == l0_size); + + this->add_level({::std::make_pair(hw_scope::device, ndevs), + ::std::make_pair(hw_scope::block, grid_size), + ::std::make_pair(hw_scope::thread, block_size)}); + this->set_level_mem(0, size_t(p.get_mem(0))); + this->set_level_sync(0, l0_sync); + } + else if constexpr (pdepth == 2) + { + size_t l0_size = p.get_width(0); + size_t l1_size = p.get_width(1); + bool l0_sync = thread_hierarchy_spec::template is_synchronizable<0>; + bool l1_sync = thread_hierarchy_spec::template is_synchronizable<1>; + + int max_block_size = 0, min_grid_size = 0; + int block_size_limit = 0; + /* level 1 will be mapped on threads, level 0 on blocks and above */ + size_t shared_mem_bytes = size_t(p.get_mem(1)); + reserved::compute_kernel_limits(f, min_grid_size, max_block_size, shared_mem_bytes, l0_sync, block_size_limit); + + // For implicit widths, use sizes suggested by CUDA occupancy calculator + if (l1_size == 0) + { + // Maximum occupancy without exceeding limits + l1_size = ::std::min(max_block_size, block_size_limit); + } + else + { + if (int(l1_size) > block_size_limit) + { + fprintf(stderr, + "Unsatisfiable spec: Maximum block size %d threads, requested %zu (level 1)\n", + block_size_limit, + l1_size); + abort(); + } + } + + if (l0_size == 0) + { + l0_size = min_grid_size * ndevs; + } + + // Enforce the resource limits in the number of threads per block + assert(int(l1_size) <= block_size_limit); + + assert(l0_size % ndevs == 0); + + /* Merge blocks and devices */ + this->add_level({::std::make_pair(hw_scope::device, ndevs), ::std::make_pair(hw_scope::block, l0_size / ndevs)}); + this->set_level_mem(0, size_t(p.get_mem(0))); + this->set_level_sync(0, l0_sync); + + this->add_level({::std::make_pair(hw_scope::thread, l1_size)}); + this->set_level_mem(1, size_t(p.get_mem(1))); + this->set_level_sync(1, l1_sync); + } + else if constexpr (pdepth == 3) + { + size_t l0_size = p.get_width(0); + size_t l1_size = p.get_width(1); + size_t l2_size = p.get_width(1); + bool l0_sync = thread_hierarchy_spec::template is_synchronizable<0>; + bool l1_sync = thread_hierarchy_spec::template is_synchronizable<1>; + bool l2_sync = thread_hierarchy_spec::template is_synchronizable<2>; + + int 
max_block_size = 0, min_grid_size = 0; + int block_size_limit = 0; + /* level 2 will be mapped on threads, level 1 on blocks, level 0 on devices */ + size_t shared_mem_bytes = size_t(p.get_mem(2)); + reserved::compute_kernel_limits( + f, min_grid_size, max_block_size, shared_mem_bytes, l0_sync || l1_sync, block_size_limit); + + // For implicit widths, use sizes suggested by CUDA occupancy calculator + if (l2_size == 0) + { + // Maximum occupancy without exceeding limits + l2_size = ::std::min(max_block_size, block_size_limit); + } + else + { + if (int(l2_size) > block_size_limit) + { + fprintf(stderr, + "Unsatisfiable spec: Maximum block size %d threads, requested %zu (level 2)\n", + block_size_limit, + l2_size); + abort(); + } + } + + if (l1_size == 0) + { + l1_size = min_grid_size; + } + + if (l0_size == 0) + { + l0_size = ndevs; + } + + // Enforce the resource limits in the number of threads per block + assert(int(l2_size) <= block_size_limit); + assert(int(l0_size) <= ndevs); + + /* Merge blocks and devices */ + this->add_level({::std::make_pair(hw_scope::device, l0_size)}); + this->set_level_mem(0, size_t(p.get_mem(0))); + this->set_level_sync(0, l0_sync); + + this->add_level({::std::make_pair(hw_scope::block, l1_size)}); + this->set_level_mem(1, size_t(p.get_mem(1))); + this->set_level_sync(1, l1_sync); + + this->add_level({::std::make_pair(hw_scope::thread, l2_size)}); + this->set_level_mem(2, size_t(p.get_mem(2))); + this->set_level_sync(2, l2_sync); + } + else + { + static_assert(pdepth == 3); + } +} + +/** + * @brief Specialization of `std::hash` for `cuda::experimental::stf::data_place` to allow it to be used as a key in + * `std::unordered_map`. + */ +template <> +struct hash +{ + ::std::size_t operator()(const data_place& k) const + { + // Not implemented for composite places + EXPECT(!k.is_composite()); + + // TODO fix gc_view visibility or provide a getter + if (k.is_green_ctx()) + { +#if CUDA_VERSION >= 12040 + return hash()(*(k.gc_view)); +#else + assert(0); +#endif + } + + return ::std::hash()(device_ordinal(k)); + } +}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/places/tiled_partition.cuh b/cudax/include/cuda/experimental/__stf/places/tiled_partition.cuh new file mode 100644 index 00000000000..4492d87efd6 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/places/tiled_partition.cuh @@ -0,0 +1,173 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Definition of the `tiled_partition` strategy + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +namespace cuda::experimental::stf +{ + +namespace reserved +{ + +/* + * Define a tiled transformation to a shape of mdspan + */ +template +class tiled_mdspan_shape +{ +public: + // parts id, nparts ? 
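+  // part_id selects which part this shape describes and nparts is the total
+  // number of parts; whole tiles are dealt out to parts round-robin.
+  // Illustrative layout with tile_size = 4 and nparts = 3: part 0 owns
+  // elements [0..3], [12..15], ..., part 1 owns [4..7], [16..19], ...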
+ tiled_mdspan_shape(const mdspan_shape_t& s, size_t part_id, size_t nparts) + : original_shape(s) + , part_id(part_id) + , nparts(nparts) + {} + + /* The number of elements in this part */ + size_t size() const + { + assert(mdspan_shape_t::rank() == 1 && "Tiled mdspan shape only implemented in 1D yet"); + + const size_t n = original_shape.size(); + + // 0000 1111 2222 0000 11xx xxxx xxxx + // S=4, n=18 + // + // nparts=3 + // ntiles = (18+3)/4 = 5 + // tile_per_part = (5+2)/3 = 2 (1 or 2 tiles per part) + // cnt = 2*4 = 8 + // last_elem = {0 => (0+((2-1)*3)+1)*8=(1+3)*8=16} (n >= last_elem) => cnt = 8 + // {1 => (1+((2-1)*3)+1)*8=(2+3)*8=20} extra = min(4, 20-18)=2 => cnt = 6 + // {2 => (2+((2-1)*3)+1)*8=(3+3)*8=24} extra = min(4, 24-18)=4 => cnt = 4 + + // How many tiles if we round up to a multiple of tile_size ? + const size_t ntiles = (n + tile_size - 1) / tile_size; + + const size_t tile_per_part = (ntiles + nparts - 1) / nparts; + + // If all parts are the same + size_t cnt = tile_per_part * tile_size; + + // Assuming all parts have tile_per_part tiles, the last tile of the + // part starts the last tile has index (part_id + ((tile_per_part-1)* + // nparts)), and ends at the beginning of the next tile (hence +1). + const size_t last_elem = (part_id + (tile_per_part - 1) * nparts + 1) * tile_size; + + // Remove extra elements (if any) by computing what would be the last + // element for this part + if (last_elem > n) + { + size_t extra_elems = min(tile_size, last_elem - n); + cnt -= extra_elems; + } + + return cnt; + } + + using coords_t = typename mdspan_shape_t::coords_t; + + _CCCL_HOST_DEVICE coords_t index_to_coords(size_t index) const + { + // First transform from nparts and part_id to one coordinate + const size_t remain = index % tile_size; + const size_t tile_id = index / tile_size; + // Stage 2: apply original shape's transformation to stage1 + return original_shape.index_to_coords((part_id + tile_id * nparts) * tile_size + remain); + } + +private: + mdspan_shape_t original_shape; + size_t part_id; + size_t nparts; +}; + +} // end namespace reserved + +/** + * @brief Tiled partition strategy applied on a shape of mdspan + * + * @tparam `tile_size` size of the tiles + * @tparam `mdspan_shape_t` shape of a mdspan + * + * Since there is no partial template deduction on classes, we provide a + * function to implement tiled and have the other type deduced. + */ +template +auto tiled(const mdspan_shape_t s, size_t part_id, size_t nparts) +{ + return reserved::tiled_mdspan_shape(s, part_id, nparts); +} + +template +class tiled_partition +{ + static_assert(tile_size > 0); + +public: + tiled_partition() = default; + + template + static const reserved::tiled_mdspan_shape + apply(const mdspan_shape_t& in, pos4 place_position, dim4 grid_dims) + { + // TODO assert 1D ! 
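+    // Sketch (1D only): with tile_size = 128 and a grid of 4 places, the place
+    // at grid position 1 is handed elements 128..255, 640..767, ...; conversely,
+    // get_executor below maps element x back to place (x / 128) % 4.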
+ assert(grid_dims.x > 0); + return reserved::tiled_mdspan_shape(in, place_position.x, grid_dims.x); + } + + _CCCL_HOST_DEVICE static pos4 get_executor(pos4 data_coords, dim4 /*unused*/, dim4 grid_dims) + { + assert(grid_dims.x > 0); + return pos4((data_coords.x / tile_size) % grid_dims.x); + } +}; + +#ifdef UNITTESTED_FILE +UNITTEST("Composite data place equality") +{ + auto all = exec_place::all_devices(); + auto repeated_dev0 = exec_place::repeat(exec_place::device(0), 3); + + using P = tiled_partition<128>; + using P2 = tiled_partition<64>; + + /* Same partitionning operator, same execution place */ + EXPECT(data_place::composite(P(), all) == data_place::composite(P(), all)); + + /* Make sure we do not have a false positive in the test below */ + EXPECT(all != repeated_dev0); + + /* Same partitionning operator, different execution place */ + EXPECT(data_place::composite(P(), repeated_dev0) != data_place::composite(P(), all)); + + /* Different partitionning operator, same execution place */ + EXPECT(data_place::composite(P(), all) != data_place::composite(P2(), all)); +}; +#endif // UNITTESTED_FILE + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh new file mode 100644 index 00000000000..53f8190d432 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh @@ -0,0 +1,150 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +/** + * @file + * + * @brief Defines a data interface over the hashtable class, which is based on + * https://nosferalatu.com/SimpleGPUHashTable.html + */ + +#include +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief A simple example of a hashtable data interface that can be used as a logical data in the ``stream_ctx``. 
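+ *
+ * A rough usage sketch (assuming the usual `stream_ctx` / `logical_data` task
+ * API; the names below are illustrative and not defined in this header):
+ *
+ * @code
+ * stream_ctx ctx;
+ * hashtable h;
+ * auto lh = ctx.logical_data(h);
+ * ctx.task(lh.rw())->*[](cudaStream_t stream, hashtable dev_h) {
+ *   // launch kernels on `stream` that insert into `dev_h`
+ * };
+ * ctx.finalize();
+ * @endcode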
+ */ +class hashtable_stream_interface : public stream_data_interface_simple +{ +public: + using base = stream_data_interface_simple; + using base::shape_t; + + /** + * @brief Initialize from an existing hashtable + */ + hashtable_stream_interface(hashtable h) + : base(mv(h)) + {} + + /** + * @brief Initialize from a shape of hashtable (start empty) + * @overload + */ + hashtable_stream_interface(shape_t /*unused*/) + : base(hashtable(nullptr)) + {} + + void stream_data_copy( + const data_place& dst_memory_node, + instance_id_t dst_instance_id, + const data_place& src_memory_node, + instance_id_t src_instance_id, + cudaStream_t s) override + { + ::std::ignore = src_memory_node; + ::std::ignore = dst_memory_node; + assert(src_memory_node != dst_memory_node); + + cudaMemcpyKind kind = cudaMemcpyDefault; + + const hashtable& src_instance = this->instance(src_instance_id); + const hashtable& dst_instance = this->instance(dst_instance_id); + + reserved::KeyValue* src = src_instance.addr; + reserved::KeyValue* dst = dst_instance.addr; + size_t sz = this->shape.get_capacity() * sizeof(reserved::KeyValue); + + // NAIVE method ! + cuda_safe_call(cudaMemcpyAsync((void*) dst, (void*) src, sz, kind, s)); + } + + void stream_data_allocate( + backend_ctx_untyped& /*unused*/, + const data_place& memory_node, + instance_id_t instance_id, + ::std::ptrdiff_t& s, + void** /*unused*/, + cudaStream_t stream) override + { + s = this->shape.get_capacity() * sizeof(reserved::KeyValue); + hashtable& local_desc = this->instance(instance_id); + + reserved::KeyValue* base_ptr; + + if (memory_node == data_place::host) + { + // Fallback to a synchronous method + cuda_safe_call(cudaStreamSynchronize(stream)); + cuda_safe_call(cudaHostAlloc(&base_ptr, s, cudaHostAllocMapped)); + memset(base_ptr, 0xff, s); + } + else + { + cuda_safe_call(cudaMallocAsync(&base_ptr, s, stream)); + + // We also need to initialize the hashtable + static_assert(reserved::kEmpty == 0xffffffff, "memset expected kEmpty=0xffffffff"); + cuda_safe_call(cudaMemsetAsync(base_ptr, 0xff, s, stream)); + } + + local_desc.addr = base_ptr; + } + + void stream_data_deallocate( + backend_ctx_untyped& /*unused*/, + const data_place& memory_node, + instance_id_t instance_id, + void* /*unused*/, + cudaStream_t stream) override + { + hashtable& local_desc = this->instance(instance_id); + if (memory_node == data_place::host) + { + // Fallback to a synchronous method + cuda_safe_call(cudaStreamSynchronize(stream)); + cuda_safe_call(cudaFreeHost(local_desc.addr)); + } + else + { + cuda_safe_call(cudaFreeAsync(local_desc.addr, stream)); + } + local_desc.addr = nullptr; // not strictly necessary, but helps debugging + } +}; + +/// @cond +// Forward declaration +template +struct streamed_interface_of; +/// @endcond + +template <> +struct streamed_interface_of +{ + using type = hashtable_stream_interface; +}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh new file mode 100644 index 00000000000..759e4583433 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh @@ -0,0 +1,303 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** @file + * + * @brief Defines a single- and multi-dimensional span/range/view/slice for layout purposes. To be replaced or + * supplanted by `std::mdspan` in the future. + * + * Not much access functionality is defined or needed; only data layout is of importance. + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +namespace cuda::experimental::stf +{ + +/** @brief Contiguous memory interface. Supports multiple dimensions (compile-time chosen) and strides (run-time + * chosen). + */ +template +class slice_stream_interface : public stream_data_interface> +{ +public: + using base = stream_data_interface>; + using typename base::element_type; + using typename base::shape_t; + + using mutable_value_type = typename ::std::remove_const::type; + + slice_stream_interface(T* p) + : base(slice(p)) + { + static_assert(dimensions == 0, "This constructor is reserved for 0-dimensional data."); + } + + slice_stream_interface(slice s) + : base(mv(s)) + {} + + slice_stream_interface(shape_t s) + : base(mv(s)) + {} + + void stream_data_copy(const data_place&, instance_id_t, const data_place&, instance_id_t, cudaStream_t) override + { + // We are using data_copy instead of stream_data_copy in this interface + assert(!"This should not be called directly."); + } + + void data_allocate( + backend_ctx_untyped& bctx, + block_allocator_untyped& custom_allocator, + const data_place& memory_node, + instance_id_t instance_id, + ::std::ptrdiff_t& s, + void** extra_args, + event_list& prereqs) override + { + // fprintf(stderr, "SLICE data_allocate : data_place %ld - size<0>(b)T) = %ld * %d = + // %ld KB\n", + // int(memory_node), this->size<0>(common), sizeof(T), + // this->size<0>(common) * sizeof(T) / 1024); + + s = this->shape.size() * sizeof(T); + auto& local_desc = this->instance(instance_id); + + assert(memory_node != data_place::invalid); + + T* base_ptr; + + if (!memory_node.is_composite()) + { + base_ptr = static_cast(custom_allocator.allocate(bctx, memory_node, s, prereqs)); + local_desc = this->shape.create(base_ptr); + return; + } + + exec_place_grid grid = memory_node.get_grid(); + size_t total_size = this->shape.size(); + + // position (x,y,z,t) on (nx,ny,nz,nt) + // * index = x + nx*y + nx*ny*z + nx*ny*nz*t + // * index = x + nx(y + ny(z + nz*t)) + // So we can compute x, y, z, t from the index + // * x := index % nx; + // * index - x = nx(y + ny(z + nz*t)) + // * (index - x)/nx = y + ny(z + nz*t)) + // So, y := ( (index - x)/nx ) %ny + // index = x + nx(y + ny(z + nz*t)) + // (index - x)/nx = y + ny(z+nz*t) + // (index - x)/nx - y = ny(z+nz*t) + // ( (index - x)/nx - y)/ny = z+nz*t + // So, z := (( (index - x)/nx - y)/ny) % nz + // ( (index - x)/nx - y)/ny - z = nz*t + // ( ( (index - x)/nx - y)/ny - z ) / nz = t + auto delinearize = [&](size_t ind) { + static_assert(dimensions <= 4); + + size_t nx, ny, nz; + size_t x = 0, y = 0, z = 0, t = 0; + if constexpr (dimensions >= 1) + { + nx = this->shape.extent(0); + x = ind % nx; + } + if constexpr (dimensions >= 2) + { + ny = this->shape.extent(1); + y = ((ind - 
x) / nx) % ny; + } + if constexpr (dimensions >= 3) + { + nz = this->shape.extent(2); + z = (((ind - x) / nx - y) / ny) % nz; + } + + if constexpr (dimensions >= 4) + { + t = (((ind - x) / nx - y) / ny - z) / nz; + } + + return pos4(x, y, z, t); + }; + + // Get the extents stored as a dim4 + const dim4 data_dims = this->shape.get_data_dims(); + + auto array = bctx.get_composite_cache().get( + memory_node, memory_node.get_partitioner(), delinearize, total_size, sizeof(T), data_dims); + // We need to wait for its pending dependencies if any... + array->merge_into(prereqs); + base_ptr = static_cast(array->get_base_ptr()); + + // Store this localized array in the extra_args associated to the + // data instance so that we can use it later in the deallocation + // method. + *extra_args = array.release(); + local_desc = this->shape.create(base_ptr); + } + + void data_deallocate( + backend_ctx_untyped& bctx, + block_allocator_untyped& custom_allocator, + const data_place& memory_node, + instance_id_t instance_id, + void* extra_args, + event_list& prereqs) override + { + const size_t sz = this->shape.size() * sizeof(T); + auto& local_desc = this->instance(instance_id); + // We can deallocate a copy of a logical data even if it was only accessible in read only mode + auto ptr = const_cast(local_desc.data_handle()); + + // TODO erase local_desc to avoid future reuse by mistake + // local_desc = element_type(); + + if (!memory_node.is_composite()) + { + custom_allocator.deallocate(bctx, memory_node, prereqs, ptr, sz); + return; + } + + assert(extra_args); + + // To properly erase a composite data, we would need to synchronize the + // calling thread with all events because the current CUDA VMM facility + // does not have an asynchronous counterpart. Instead, we put the + // localized_array object into a cache, which will speedup the + // allocation of identical arrays, if any. + // This cached array is only usable once the prereqs of this deallocation are fulfilled. + auto* array = static_cast(extra_args); + bctx.get_composite_cache().put(::std::unique_ptr(array), prereqs); + } + + void data_copy(backend_ctx_untyped& bctx, + const data_place& dst_memory_node, + instance_id_t dst_instance_id, + const data_place& src_memory_node, + instance_id_t src_instance_id, + event_list& prereqs) override + { + assert(src_memory_node != dst_memory_node); + // We support dimensions up to 2, or higher if the slices are contiguous + // static_assert(dimensions <= 2, "unsupported yet."); + // assert(dimensions <= 2 && "unsupported yet."); + + auto decorated_s = dst_memory_node.getDataStream(bctx.async_resources()); + auto op = stream_async_op(bctx, decorated_s, prereqs); + + if (bctx.generate_event_symbols()) + { + // TODO + d->get_symbol(); + op.set_symbol("slice copy " + src_memory_node.to_string() + "->" + dst_memory_node.to_string()); + } + + cudaStream_t s = decorated_s.stream; + + // Let CUDA figure out from pointers + cudaMemcpyKind kind = cudaMemcpyDefault; + + /* Get size */ + auto& b = this->shape; + const auto& src_instance = this->instance(src_instance_id); + const auto& dst_instance = this->instance(dst_instance_id); + + /* We are copying so the destination will be changed, but from this + * might be a constant variable (when using a read-only access). Having a T + * type with a const qualifier is therefore possible, even if these API do not + * want const pointers. 
*/ + auto dst_ptr = const_cast(dst_instance.data_handle()); + auto src_ptr = src_instance.data_handle(); + assert(src_ptr); + assert(dst_ptr); + + if constexpr (dimensions == 0) + { + cuda_safe_call(cudaMemcpyAsync(dst_ptr, src_ptr, sizeof(T), kind, s)); + } + else if constexpr (dimensions == 1) + { + cuda_safe_call(cudaMemcpyAsync(dst_ptr, src_ptr, b.extent(0) * sizeof(T), kind, s)); + } + else if constexpr (dimensions == 2) + { + cuda_safe_call(cudaMemcpy2DAsync( + dst_ptr, + dst_instance.stride(1) * sizeof(T), + src_ptr, + src_instance.stride(1) * sizeof(T), + b.extent(0) * sizeof(T), + b.extent(1), + kind, + s)); + } + else + { + // We only support higher dimensions if they are contiguous ! + if ((contiguous_dims(src_instance) == dimensions) && (contiguous_dims(dst_instance) == dimensions)) + { + cuda_safe_call(cudaMemcpyAsync(dst_ptr, src_ptr, b.size() * sizeof(T), kind, s)); + } + else + { + assert(dimensions == 2 && "Higher dimensions not supported."); + } + } + + prereqs = op.end(bctx); + } + + bool pin_host_memory(instance_id_t instance_id) override + { + auto s = this->instance(instance_id); + return s.data_handle() && pin(s); + } + + void unpin_host_memory(instance_id_t instance_id) override + { + unpin(this->instance(instance_id)); + } + + ::std::optional get_memory_type(instance_id_t instance_id) override + { + auto s = this->instance(instance_id); + + cudaPointerAttributes attributes{}; + cuda_safe_call(cudaPointerGetAttributes(&attributes, s.data_handle())); + + // Implicitely converted to an optional + return attributes.type; + }; +}; + +template +struct streamed_interface_of; + +template +struct streamed_interface_of> +{ + using type = slice_stream_interface::rank()>; +}; + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh new file mode 100644 index 00000000000..c4927c99187 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh @@ -0,0 +1,227 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Reduction operators over slices in the CUDA stream backend + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +namespace cuda::experimental::stf +{ + +template +__global__ void +slice_reduction_op_kernel(const slice in, const slice inout) +{ + size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + size_t nthreads = blockDim.x * gridDim.x; + + if constexpr (dimensions == 1) + { + for (size_t i = tid; i < inout.extent(0); i += nthreads) + { + ReduxOp::op_gpu(in(i), inout(i)); + } + } + else if constexpr (dimensions == 2) + { + for (size_t j = 0; j < inout.extent(1); j++) + { + for (size_t i = tid; i < inout.extent(0); i += nthreads) + { + ReduxOp::op_gpu(in(i, j), inout(i, j)); + } + } + } + else + { + static_assert(dimensions == 1 || dimensions == 2, "Dimensionality not supported."); + } +} + +template +__global__ void slice_reduction_op_init_kernel(slice out) +{ + size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + size_t nthreads = blockDim.x * gridDim.x; + + if constexpr (dimensions == 1) + { + for (size_t i = tid; i < out.extent(0); i += nthreads) + { + ReduxOp::init_gpu(out(i)); + } + } + else if constexpr (dimensions == 2) + { + for (size_t j = 0; j < out.extent(1); j++) + { + for (size_t i = tid; i < out.extent(0); i += nthreads) + { + ReduxOp::init_gpu(out(i, j)); + } + } + } + else + { + static_assert(dimensions == 1 || dimensions == 2, "Dimensionality not supported."); + } +} + +/** + * @brief Helper class to define element-wise reduction operators applied to slices + * + * ReduxOp::init_host(element_type &out); + * ReduxOp::op_host(const element_type &in, element_type &inout); + * __device__ ReduxOp::init_gpu(element_type &out); + * __device__ ReduxOp::op_gpu(const element_type &in, element_type &inout); + * + * @extends stream_reduction_operator + */ +template +class slice_reduction_op : public stream_reduction_operator> +{ +public: + using instance_t = slice; + + slice_reduction_op() = default; + + /// Reconstruct an instance by applying the reduction operator over it and another instance + void op(const instance_t& in, instance_t& inout, const exec_place& e, cudaStream_t s) override + { + if (e.affine_data_place() == data_place::host) + { + // TODO make a callback when the situation gets better + cuda_safe_call(cudaStreamSynchronize(s)); + // slice_print(in, "in before op"); + // slice_print(inout, "inout before op"); + + if constexpr (dimensions == 1) + { + assert(in.extent(0) == inout.extent(0)); + for (size_t i = 0; i < in.extent(0); i++) + { + ReduxOp::op_host(in(i), inout(i)); + } + } + else if constexpr (dimensions == 2) + { + for (size_t j = 0; j < inout.extent(1); j++) + { + for (size_t i = 0; i < inout.extent(0); i++) + { + ReduxOp::op_host(in(i, j), inout(i, j)); + } + } + } + else + { + static_assert(dimensions == 1 || dimensions == 2, "Dimensionality not supported."); + } + + // slice_print(in, "in after op"); + // slice_print(inout, "inout after op"); + } + else + { + // this is not the host, so this has to be a device ... 
(XXX) + auto [gridsize, threadblocksize] = + reserved::compute_occupancy(slice_reduction_op_kernel); + slice_reduction_op_kernel<<>>(in, inout); + } + } + + /// Initialize an instance with an appropriate default value for the reduction operator + void init_op(instance_t& out, const exec_place& e, cudaStream_t s) override + { + if (e.affine_data_place() == data_place::host) + { + // TODO make a callback when the situation gets better + cuda_safe_call(cudaStreamSynchronize(s)); + if constexpr (dimensions == 1) + { + for (size_t i = 0; i < out.extent(0); i++) + { + ReduxOp::init_host(out(i)); + } + } + else if constexpr (dimensions == 2) + { + for (size_t j = 0; j < out.extent(1); j++) + { + for (size_t i = 0; i < out.extent(0); i++) + { + ReduxOp::init_host(out(i, j)); + } + } + } + else + { + static_assert(dimensions == 1 || dimensions == 2, "Dimensionality not supported."); + } + } + else + { + // this is not the host, so this has to be a device ... (XXX) + auto [gridsize, threadblocksize] = + reserved::compute_occupancy(slice_reduction_op_init_kernel); + + EXPECT(out.data_handle() != nullptr); + slice_reduction_op_init_kernel<<>>(out); + } + } +}; + +template +class slice_reduction_op_sum_impl +{ +public: + static void init_host(element_type& out) + { + out = element_type(0); + }; + static __device__ void init_gpu(element_type& out) + { + out = element_type(0); + }; + + static void op_host(const element_type& in, element_type& inout) + { + inout += in; + }; + static __device__ void op_gpu(const element_type& in, element_type& inout) + { + inout += in; + }; +}; + +/** + * @brief A sum reduction operator over slices + */ + +template +class slice_reduction_op_sum + : public slice_reduction_op> +{}; +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh b/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh new file mode 100644 index 00000000000..e68dca7d0bf --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh @@ -0,0 +1,501 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuda::experimental::stf +{ + +class stream_and_event; +namespace reserved +{ + +inline event join_with_stream( + backend_ctx_untyped& bctx, decorated_stream dstream, event_list& prereq_in, ::std::string string, bool record_event); + +using stream_and_event_vector = small_vector, 7>; + +/* Tag types for event counters */ +class cuda_event_tag +{ +public: + class created + {}; + class alive + {}; + class destroyed + {}; +}; + +class cuda_stream_wait_event_tag +{}; + +} // namespace reserved + +/* This event type allows to synchronize a CUDA stream with a CUDA event in + * another stream, possibly on another device. */ +class stream_and_event : public event_impl +{ +protected: + stream_and_event() = default; + stream_and_event(const stream_and_event&) = delete; + stream_and_event& operator=(const stream_and_event&) = delete; + + stream_and_event(stream_and_event&&) = delete; + + ~stream_and_event() override + { + // fprintf(stderr, "DESTROY EVENT %s (%d) - cudaEvent %p\n", this->get_symbol().c_str(), + // int(this->unique_prereq_id), cudaEvent); + if (cudaEvent) + { + cuda_safe_call(cudaEventDestroy(cudaEvent)); +#ifdef CUDASTF_DEBUG + reserved::counter::increment(); + reserved::counter::decrement(); +#endif + + // fprintf(stderr, "DESTROY EVENT %p #%d (created %d)\n", event, ++destroyed_event_cnt, + // event_cnt); + } + } + + stream_and_event(const decorated_stream& dstream, bool do_insert_event) + : dstream(dstream) + { + // fprintf(stderr, "stream_and_event %s (ID %d)\n", this->get_symbol().c_str(), int(this->unique_prereq_id)); + if (do_insert_event) + { + insert_event(); + } + } + +public: + // @@@@TODO@@@@ remove this function so that we have a mere + // insert_dep call, and add the initial test to check whether both + // streams are the same here too. + // Insert a dependency from stream s1 over s2 (s1 waits for s2) + static void insert_dependency(cudaStream_t s1, cudaStream_t s2) + { + // Create a dependency between the last stream and the current stream + + // event and stream must be on the same device + // if the stream does not belong to the current device, we + // therefore have to find in which device the stream was created, + // record the event, and restore the current device to its original + // value. 
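+    //
+    // In short: (1) use the driver API to find which device owns s2, (2) switch to
+    // that device, (3) record an event in s2 and have s1 wait on that event, and
+    // (4) destroy the event and restore the previously current device.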
+ + // Find the stream structure in the driver API + CUstream s2_driver = CUstream(s2); + CUcontext ctx; + cuda_safe_call(cuStreamGetCtx(s2_driver, &ctx)); + + // Query the context associated with a stream by using the underlying driver API + CUdevice s2_dev; + cuda_safe_call(cuCtxPushCurrent(ctx)); + cuda_safe_call(cuCtxGetDevice(&s2_dev)); + cuda_safe_call(cuCtxPopCurrent(&ctx)); + + // ::std::cout << "STREAM DEVICE = " << s2_dev << ::std::endl; + + exec_place::device(s2_dev)->*[&] { + // Disable timing to avoid implicit barriers + cudaEvent_t sync_event; + cuda_safe_call(cudaEventCreateWithFlags(&sync_event, cudaEventDisableTiming)); +#ifdef CUDASTF_DEBUG + reserved::counter::increment(); + reserved::counter::increment(); + reserved::high_water_mark::record( + reserved::counter::load()); +#endif + + cuda_safe_call(cudaEventRecord(sync_event, s2)); + + // According to documentation "event may be from a different device than stream." + cuda_safe_call(cudaStreamWaitEvent(s1, sync_event, 0)); +#ifdef CUDASTF_DEBUG + reserved::counter.increment(); +#endif + + // Asynchronously destroy event to avoid a memleak + cuda_safe_call(cudaEventDestroy(sync_event)); +#ifdef CUDASTF_DEBUG + reserved::counter::increment(); + reserved::counter::decrement(); +#endif + }; + } + + void insert_event() + { + // If needed, compute the underlying device + if (dstream.dev_id == -1) + { + dstream.dev_id = get_device_from_stream(dstream.stream); + } + + // Save the current device + exec_place::device(dstream.dev_id)->*[&] { + // Disable timing to avoid implicit barriers + cuda_safe_call(cudaEventCreateWithFlags(&cudaEvent, cudaEventDisableTiming)); + // fprintf(stderr, "CREATE EVENT %p %s\n", cudaEvent, get_symbol().c_str()); + assert(cudaEvent); +#ifdef CUDASTF_DEBUG + reserved::counter::increment(); + reserved::counter::increment(); + reserved::high_water_mark::record( + reserved::counter::load()); +#endif + cuda_safe_call(cudaEventRecord(cudaEvent, dstream.stream)); + }; + } + + void insert_dep(async_resources_handle& async_resources, const stream_and_event& from) + { + // Otherwise streams will enforce dependencies + if (dstream.stream != from.dstream.stream) + { + bool skip = async_resources.validate_sync_and_update(dstream.id, from.dstream.id, int(from.unique_prereq_id)); + if (!skip) + { + cuda_safe_call(cudaStreamWaitEvent(dstream.stream, from.cudaEvent, 0)); +#ifdef CUDASTF_DEBUG + reserved::counter.increment(); +#endif + } + } + } + + /** + * @brief Remove implicit dependencies already induced by more recent events using the same stream. + */ + bool factorize(reserved::event_vector& events) override + { + assert(events.size() >= 2); + assert([&] { + for (const auto& e : events) + { + assert(dynamic_cast(e.operator->())); + } + return true; + }()); + + static_assert(sizeof(reserved::event_vector) == sizeof(reserved::stream_and_event_vector)); + auto& proxy = reinterpret_cast(events); + + if (events.size() == 2) + { + // Specialize for two entries + auto& a = proxy[0]; + auto& b = proxy[1]; + + if (a->dstream.stream == b->dstream.stream) + { + // Must remove one element, keep the one with the largest id + if (a->unique_prereq_id < b->unique_prereq_id) + { + a = mv(b); + } + proxy.pop_back(); + } + + return true; + } + + // Sort events on stream (ascending) then id (descending) + ::std::sort(proxy.begin(), proxy.end(), [](const auto& a, const auto& b) { + return a->dstream.stream != b->dstream.stream + ? 
a->dstream.stream < b->dstream.stream + : a->unique_prereq_id > b->unique_prereq_id; + }); + + // Remove duplicates. Two events are duplicates if they have the same stream. + // Will keep the first element of each duplicate run, which is the one with the largest id. + proxy.erase(unstable_unique(proxy.begin(), + proxy.end(), + [](const auto& a, const auto& b) { + return a->dstream.stream == b->dstream.stream; + }), + proxy.end()); + + return true; + + // // For each CUDA stream, we only keep the most recent event. The `int` is the unique event ID. + // ::std::unordered_map> m; + + // // For each element, compare it with existing entries of the map, keep only the most recent ones + // for (auto& e: events) { + // auto se = reserved::handle(e, reserved::use_dynamic_cast); + // cudaStream_t stream = se->stream; + // int id = se->unique_prereq_id; + + // auto iterator = m.find(stream); + // if (iterator != m.end()) { + // int previous_id = iterator->second.first; + // // Equality is possible if we have redundant entries, but then + // // we drop redundant values + // if (id > previous_id) { + // iterator->second = ::std::make_pair(id, &e); + // } + // } else { + // m[stream] = ::std::make_pair(id, &e); + // } + // } + + // // Create a new container with the remaining elements and swap it with the old one + // ::std::vector new_events; + // new_events.reserve(m.size()); + // for (const auto& e: m) { + // new_events.push_back(mv(*e.second.second)); + // } + + // events.swap(new_events); + + // return true; + } + + void sync_with_stream(backend_ctx_untyped& bctx, event_list& prereqs, cudaStream_t stream) const override + { + reserved::join_with_stream(bctx, decorated_stream(stream), prereqs, "sync", false); + } + + cudaStream_t get_stream() const + { + return dstream.stream; + } + + decorated_stream get_decorated_stream() const + { + return dstream; + } + + ::std::ptrdiff_t get_stream_id() const + { + return dstream.id; + } + + cudaEvent_t get_cuda_event() const + { + return cudaEvent; + } + +private: + decorated_stream dstream; + cudaEvent_t cudaEvent = nullptr; +}; + +/** + * @brief This implements an asynchronous operation synchronized by the means of stream-based events + */ +class stream_async_op +{ +public: + stream_async_op() = default; + + stream_async_op(backend_ctx_untyped& bctx, decorated_stream dstream, event_list& prereq_in) + : dstream(mv(dstream)) + { + setup(bctx, prereq_in); + } + + // Async operation associated to a data place, we automatically get a stream attached to that place + stream_async_op(backend_ctx_untyped& bctx, const data_place& place, cudaStream_t* target_stream, event_list& prereq_in) + { + int stream_dev_id = device_ordinal(place); + + if (stream_dev_id >= 0) + { + // We try to look for a stream that is already on the same device as this will require less synchronization + dstream = device_lookup_in_event_list(bctx, prereq_in, stream_dev_id); + } + + if (dstream.stream == nullptr) + { + // We did not select a stream yet, so we take one in the pools in + // the async_resource_handle object associated to the context + dstream = place.getDataStream(bctx.async_resources()); + } + + // Note that if we had stream_dev_id = -1 (eg. 
host memory), the device + // id of this decorated stream will disagree, as we have taken one + // stream from any device (current device, in particular) + assert(dstream.stream); + + if (target_stream) + { + *target_stream = dstream.stream; + } + + setup(bctx, prereq_in); + } + + void set_symbol(::std::string s) + { + symbol = mv(s); + } + + // Make sure all operations are done in the stream, and insert an event. A CUDASTF event is returned + event end_as_event(backend_ctx_untyped& bctx) + { + /* Create an event that synchronize with all pending work in the CUDA stream */ + event e = event(reserved::handle(dstream, true)); + + if (!symbol.empty()) + { + e->set_symbol(bctx, mv(symbol)); + } + + auto& dot = *bctx.get_dot(); + if (dot.is_tracing_prereqs()) + { + for (int id : joined_ids) + { + dot.add_edge(id, e->unique_prereq_id, 1); + } + } + + return e; + } + + // Make sure all operations are done in the stream, and insert an event. A + // list of CUDASTF events (with a single entry) is returned. + event_list end(backend_ctx_untyped& bctx) + { + return event_list(end_as_event(bctx)); + } + +private: + // Initialize the async operation so that the stream waits for input events + void setup(backend_ctx_untyped& bctx, event_list& prereq_in) + { + // Make sure we reduce the number of resulting stream/event synchronization + // API calls to a minimum. If the list was already optimized, this will be a no-op + prereq_in.optimize(); + + // Do we have to keep track of dependencies in DOT ? + bool tracing = bctx.get_dot()->is_tracing_prereqs(); + + // Make sure any operation in the stream depends on prereq_in + for (const auto& e : prereq_in) + { + assert(dynamic_cast(e.operator->())); + auto se = reserved::handle(e, reserved::use_static_cast); + + if (dstream.stream != se->get_stream()) + { + bool skip = + bctx.async_resources().validate_sync_and_update(dstream.id, se->get_stream_id(), se->unique_prereq_id); + if (!skip) + { + cuda_safe_call(cudaStreamWaitEvent(dstream.stream, se->get_cuda_event(), 0)); +#ifdef CUDASTF_DEBUG + reserved::counter.increment(); +#endif + } + } + se->outbound_deps++; + + if (tracing) + { + joined_ids.push_back(se->unique_prereq_id); + } + } + } + + /* Find is there is already a stream associated to that device in the + * prereq list */ + static decorated_stream device_lookup_in_event_list(backend_ctx_untyped& /* bctx */, event_list& prereq_in, int devid) + { + if (reserved::cached_getenv("CUDASTF_NO_LOOKUP")) + { + return decorated_stream(nullptr); + } + + for (const auto& e : prereq_in) + { + cudaStream_t stream; + ::std::ptrdiff_t stream_id = -1; + auto se = reserved::handle(e, reserved::use_static_cast); + stream = se->get_stream(); + stream_id = se->get_stream_id(); + + // Find the stream structure in the driver API + auto stream_driver = CUstream(stream); + CUcontext ctx; + cuda_safe_call(cuStreamGetCtx(stream_driver, &ctx)); + + CUdevice stream_dev; + cuda_safe_call(cuCtxPushCurrent(ctx)); + cuda_safe_call(cuCtxGetDevice(&stream_dev)); + cuda_safe_call(cuCtxPopCurrent(&ctx)); + + if (stream_dev == devid) + { + // fprintf(stderr, "Found matching device %d with stream %p\n", devid, stream); + return decorated_stream(stream, stream_id, devid); + } + } + + return decorated_stream(); + } + + decorated_stream dstream; + ::std::string symbol; + + // Used to display dependencies in DOT + ::std::vector joined_ids; +}; + +namespace reserved +{ + +/* This creates a synchronization point between all entries of the prereq_in list, and a CUDA stream */ +inline event 
join_with_stream( + backend_ctx_untyped& bctx, decorated_stream dstream, event_list& prereq_in, ::std::string string, bool record_event) +{ + // Make sure we reduce the number of resulting stream/event synchronization + // API calls to a minimum. If the list was already optimized, this will be a no-op + prereq_in.optimize(); + + auto se = reserved::handle(mv(dstream), record_event); + se->set_symbol(bctx, mv(string)); + join(bctx, *se, prereq_in); + return se; +} + +/* Create a simple event in a CUDA stream */ +inline event record_event_in_stream(const decorated_stream& dstream) +{ + return reserved::handle(dstream, true); +} + +} // end namespace reserved + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/stream/reduction.cuh b/cudax/include/cuda/experimental/__stf/stream/reduction.cuh new file mode 100644 index 00000000000..4493672c70b --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/stream/reduction.cuh @@ -0,0 +1,168 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Interface to define reduction operators in the CUDA stream backend + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief Helper class to define a reduction operator attached to a data + * interface. + * + * Defining a new operator requires to define the virtual methods + * stream_init_op and stream_redux_op which respectively initialize a data + * instance, and apply the reduction operator over two instances. + * + * See stream_reduction_operator for a class which directly manipulates typed + * data instances, with a simpler programming interface. 
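+ *
+ * As a hedged sketch only (the operator name and the element-wise update are
+ * illustrative, not part of the library), a typed sum over 1D slices built on
+ * the simpler typed interface could look like:
+ *
+ * @code
+ * class my_slice_sum : public stream_reduction_operator<slice<double>> {
+ *   void op(const slice<double>& in, slice<double>& inout, const exec_place& e, cudaStream_t s) override {
+ *     // asynchronously accumulate inout(i) += in(i) in stream s (kernel on devices, host code after synchronizing s)
+ *   }
+ *   void init_op(slice<double>& out, const exec_place& e, cudaStream_t s) override {
+ *     // asynchronously set every out(i) to 0.0 in stream s
+ *   }
+ * };
+ * @endcode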
+ */ +class stream_reduction_operator_untyped : public reduction_operator_base +{ +public: + stream_reduction_operator_untyped(bool is_commutative = true) + : reduction_operator_base(is_commutative) + {} + + virtual void stream_redux_op( + logical_data_untyped& d, + const data_place& inout_memory_node, + instance_id_t inout_instance_id, + const data_place& in_memory_node, + instance_id_t in_instance_id, + const exec_place& e, + cudaStream_t s) = 0; + + virtual void stream_init_op( + logical_data_untyped& d, + const data_place& out_memory_node, + instance_id_t out_instance_id, + const exec_place& e, + cudaStream_t s) = 0; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // doxygen has issues with this code + void op_untyped( + logical_data_untyped& d, + const data_place& inout_memory_node, + instance_id_t inout_instance_id, + const data_place& in_memory_node, + instance_id_t in_instance_id, + const exec_place& ep, + event_list& prereqs) override + { + auto dstream = inout_memory_node.getDataStream(d.get_ctx().async_resources()); + auto async_op = stream_async_op(d.get_ctx(), dstream, prereqs); + if (d.get_ctx().generate_event_symbols()) + { + async_op.set_symbol("redux op " + d.get_symbol()); + } + +# ifdef REDUCTION_DEBUG + fprintf(stderr, "stream_redux_op inout %d in %d\n", inout_instance_id, in_instance_id); +# endif + stream_redux_op(d, inout_memory_node, inout_instance_id, in_memory_node, in_instance_id, ep, dstream.stream); + + prereqs = async_op.end(d.get_ctx()); + } + + void init_op_untyped(logical_data_untyped& d, + const data_place& out_memory_node, + instance_id_t out_instance_id, + const exec_place& ep, + event_list& prereqs) override + { + auto dstream = out_memory_node.getDataStream(d.get_ctx().async_resources()); + auto async_op = stream_async_op(d.get_ctx(), dstream, prereqs); + if (d.get_ctx().generate_event_symbols()) + { + async_op.set_symbol("redux init op " + d.get_symbol()); + } + +# ifdef REDUCTION_DEBUG + fprintf(stderr, "stream_init_op out %d\n", out_instance_id); +# endif + stream_init_op(d, out_memory_node, out_instance_id, ep, dstream.stream); + + prereqs = async_op.end(d.get_ctx()); + } +#endif // DOXYGEN_SHOULD_SKIP_THIS // doxygen has issues with this code +}; + +/** + * @class stream_reduction_operator + * + * @brief Helper class to define a reduction operator attached to a type of + * data instance. + * + * Defining a new operator requires to define the virtual + * methods op and init_op which respectively initialize a data + * instance, and apply the reduction operator over two instances. + */ +template +class stream_reduction_operator : public stream_reduction_operator_untyped +{ + /** + * @brief Apply the reduction operator over 'in' and 'inout' data instances + * asynchronously in cudaStream_t 's'. The operator is applied from + * execution place 'e' + */ + virtual void op(const T& in, T& inout, const exec_place& e, cudaStream_t s) = 0; + + /** + * @brief Initialize data instance 'out' with a neutral element with respect to the reduction operator. + * + * This is done asynchronously in cudaStream_t 's'. The operator is applied + * from execution place 'e'. 
+ */ + virtual void init_op(T& out, const exec_place& e, cudaStream_t s) = 0; + + void stream_redux_op( + logical_data_untyped& d, + const data_place& /*unused*/, + instance_id_t inout_instance_id, + const data_place&, + instance_id_t in_instance_id, + const exec_place& e, + cudaStream_t s) + { + const auto& in_instance = d.instance(in_instance_id); + auto& inout_instance = d.instance(inout_instance_id); + + op(in_instance, inout_instance, e, s); + } + + void stream_init_op( + logical_data_untyped& d, const data_place&, instance_id_t out_instance_id, const exec_place& e, cudaStream_t s) + { + auto& out_instance = d.instance(out_instance_id); + + init_op(out_instance, e, s); + } +}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh new file mode 100644 index 00000000000..c3c8ac2a82b --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh @@ -0,0 +1,1320 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Implements the stream_ctx backend that uses CUDA streams and CUDA events to synchronize tasks + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include // for unit test! +#include // For implicit logical_data_untyped constructors +#include + +namespace cuda::experimental::stf +{ + +template +struct streamed_interface_of; + +/** + * @brief Uncached allocator (used as a basis for other allocators) + * + * Any allocation/deallocation results in an actual underlying CUDA API call + * (e.g. cudaMallocAsync). This allocator should generally not be used + * directly, but used within an allocator with a more advanced strategy + * (caching, heap allocation, ...) 
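+ *
+ * A hedged usage sketch (the choice of wrapping policy here is an assumption; see
+ * the allocators under cuda/experimental/__stf/allocators for the available options):
+ *
+ * @code
+ * stream_ctx ctx;
+ * // Put a caching/pooling policy in front of the uncached CUDA allocation calls
+ * ctx.set_allocator(block_allocator<buddy_allocator>(ctx));
+ * @endcode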
+ */ +class uncached_stream_allocator : public block_allocator_interface +{ +public: + uncached_stream_allocator() = default; + + void* + allocate(backend_ctx_untyped& ctx, const data_place& memory_node, ::std::ptrdiff_t& s, event_list& prereqs) override + { + void* result = nullptr; + + // That is a miss, we need to do an allocation + if (memory_node == data_place::host) + { + cuda_safe_call(cudaMallocHost(&result, s)); + } + else if (memory_node == data_place::managed) + { + cuda_safe_call(cudaMallocManaged(&result, s)); + } + else + { + if (memory_node.is_green_ctx()) + { + fprintf(stderr, + "Pretend we use cudaMallocAsync on green context (using device %d in reality)\n", + device_ordinal(memory_node)); + } + + const int prev_dev_id = cuda_try(); + // Optimization: track the current device ID + int current_dev_id = prev_dev_id; + SCOPE(exit) + { + if (current_dev_id != prev_dev_id) + { + cuda_safe_call(cudaSetDevice(prev_dev_id)); + } + }; + + EXPECT(!memory_node.is_composite()); + + // (Note device_ordinal works with green contexts as well) + cuda_safe_call(cudaSetDevice(device_ordinal(memory_node))); + current_dev_id = device_ordinal(memory_node); + + // Last possibility : this is a device + auto dstream = memory_node.getDataStream(ctx.async_resources()); + auto op = stream_async_op(ctx, dstream, prereqs); + if (ctx.generate_event_symbols()) + { + op.set_symbol("cudaMallocAsync"); + } + cuda_safe_call(cudaMallocAsync(&result, s, dstream.stream)); + prereqs = op.end(ctx); + } + return result; + } + + void deallocate( + backend_ctx_untyped& ctx, const data_place& memory_node, event_list& prereqs, void* ptr, size_t /* sz */) override + { + auto dstream = memory_node.getDataStream(ctx.async_resources()); + auto op = stream_async_op(ctx, dstream, prereqs); + if (ctx.generate_event_symbols()) + { + op.set_symbol("cudaFreeAsync"); + } + + if (memory_node == data_place::host) + { + // XXX TODO defer to deinit (or implement a blocking policy)? + cuda_safe_call(cudaStreamSynchronize(dstream.stream)); + cuda_safe_call(cudaFreeHost(ptr)); + } + else if (memory_node == data_place::managed) + { + cuda_safe_call(cudaStreamSynchronize(dstream.stream)); + cuda_safe_call(cudaFree(ptr)); + } + else + { + const int prev_dev_id = cuda_try(); + // Optimization: track the current device ID + int current_dev_id = prev_dev_id; + SCOPE(exit) + { + if (current_dev_id != prev_dev_id) + { + cuda_safe_call(cudaSetDevice(prev_dev_id)); + } + }; + + // (Note device_ordinal works with green contexts as well) + cuda_safe_call(cudaSetDevice(device_ordinal(memory_node))); + current_dev_id = device_ordinal(memory_node); + + // Assuming device memory + cuda_safe_call(cudaFreeAsync(ptr, dstream.stream)); + } + + prereqs = op.end(ctx); + } + + // Nothing is done as all deallocation are done immediately + event_list deinit(backend_ctx_untyped& /* ctx */) override + { + return event_list(); + } + + ::std::string to_string() const override + { + return "uncached stream allocator"; + } +}; + +/** This class describes a CUDASTF execution context where CUDA streams and + * CUDA events are used as synchronization primitives. 
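+ *
+ * A minimal usage sketch (`my_kernel` is a hypothetical user kernel taking a slice):
+ *
+ * @code
+ * stream_ctx ctx;
+ * double X[1024];
+ * auto lX = ctx.logical_data(X);
+ * ctx.task(lX.rw())->*[](cudaStream_t s, auto sX) {
+ *   my_kernel<<<128, 64, 0, s>>>(sX);
+ * };
+ * ctx.finalize();
+ * @endcode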
+ * + * This class is copyable, movable, and can be passed by value + */ +class stream_ctx : public backend_ctx +{ + using base = backend_ctx; + +public: + using task_type = stream_task<>; + + /** + * @brief Definition for the underlying implementation of `data_interface` + * + * @tparam T + */ + template + using data_interface = typename streamed_interface_of::type; + + /// @brief This type is copyable, assignable, and movable. However, copies have reference semantics. + ///@{ + stream_ctx(async_resources_handle handle = async_resources_handle(nullptr)) + : backend_ctx(::std::make_shared(mv(handle))) + {} + stream_ctx(cudaStream_t user_stream, async_resources_handle handle = async_resources_handle(nullptr)) + : backend_ctx(::std::make_shared(mv(handle))) + { + // We set the user stream as the entry point after the creation of the + // context so that we can manipulate the object, not its shared_ptr + // implementation + set_user_stream(user_stream); + } + + ///@} + + void set_user_stream(cudaStream_t user_stream) + { + // TODO first introduce the user stream in our pool + auto dstream = decorated_stream(user_stream); + + this->state().user_dstream = dstream; + + // Create an event in the stream + auto start_e = reserved::record_event_in_stream(dstream); + get_stack().add_start_events(event_list(mv(start_e))); + + // When a stream is attached to the context creation, the finalize() + // semantic is non-blocking + this->state().blocking_finalize = false; + } + + // Indicate if the finalize() call should be blocking or not + bool blocking_finalize = true; + + ::std::string to_string() const + { + return "stream backend context"; + } + + using backend_ctx::task; + + /** + * @brief Creates a task on the specified execution place + */ + template + stream_task task(exec_place e_place, task_dep... deps) + { + EXPECT(state().deferred_tasks.empty(), "Mixing deferred and immediate tasks is not supported yet."); + + auto dump_hooks = reserved::get_dump_hooks(this, deps...); + + auto res = stream_task(*this, mv(e_place), mv(deps)...); + res.add_post_submission_hook(dump_hooks); + return res; + } + + template + deferred_stream_task deferred_task(exec_place e_place, task_dep... deps) + { + auto result = deferred_stream_task(*this, mv(e_place), mv(deps)...); + + int id = result.get_mapping_id(); + state().deferred_tasks.push_back(id); + state().task_map.emplace(id, result); + + return result; + } + + template + deferred_stream_task deferred_task(task_dep... deps) + { + return deferred_task(exec_place::current_device(), mv(deps)...); + } + + template + auto deferred_host_launch(task_dep... deps) + { + auto result = deferred_host_launch_scope(this, mv(deps)...); + int id = result.get_mapping_id(); + + state().deferred_tasks.push_back(id); + state().task_map.emplace(id, result); + + return result; + } + + cudaStream_t task_fence() + { + const auto& user_dstream = state().user_dstream; + // We either use the user-provided stream, or we get one stream from the pool + decorated_stream dstream = + (user_dstream.has_value()) + ? 
user_dstream.value() + : exec_place::current_device().getStream(async_resources(), true /* stream for computation */); + + auto prereqs = get_stack().insert_task_fence(*get_dot()); + + prereqs.optimize(); + + // The output event is used for the tools in practice so we can ignore it + /* auto before_e = */ reserved::join_with_stream(*this, dstream, prereqs, "task_fence", false); + + return dstream.stream; + } + + /* + * host_launch : launch a "kernel" in a callback + */ + + template + class deferred_parallel_for_scope : public deferred_stream_task<> + { + struct payload_t : public deferred_stream_task<>::payload_t + { + payload_t(stream_ctx& ctx, exec_place e_place, shape_t shape, task_dep... deps) + : task(ctx, mv(e_place), mv(shape), mv(deps)...) + {} + + reserved::parallel_for_scope task; + ::std::function&)> todo; + + void set_symbol(::std::string s) override + { + task.set_symbol(mv(s)); + } + + const ::std::string& get_symbol() const override + { + return task.get_symbol(); + } + + int get_mapping_id() const override + { + return task.get_mapping_id(); + } + + void run() override + { + todo(task); + } + + void populate_deps_scheduling_info() override + { + // Error checking copied from acquire() in acquire_release() + + int index = 0; + const auto& deps = get_task_deps(); + for (const auto& dep : deps) + { + if (!dep.get_data().is_initialized()) + { + fprintf(stderr, "Error: dependency number %d is an uninitialized logical data.\n", index); + abort(); + } + dep.set_symbol(dep.get_data().get_symbol()); + dep.set_data_footprint(dep.get_data().get_data_interface().data_footprint()); + index++; + } + } + + const task_dep_vector_untyped& get_task_deps() const override + { + return task.get_task_deps(); + } + + void set_exec_place(exec_place e_place) override + { + task.set_exec_place(mv(e_place)); + } + }; + + payload_t& my_payload() const + { + // Safe to do the cast because we've set the pointer earlier ourselves + return *static_cast(payload.get()); + } + + public: + deferred_parallel_for_scope(stream_ctx& ctx, exec_place e_place, shape_t shape, task_dep... deps) + : deferred_stream_task<>(::std::make_shared(ctx, mv(e_place), mv(shape), mv(deps)...)) + {} + + ///@{ + /** + * @name Set the symbol of the task. This is used for profiling and debugging. + * + * @param s + * @return deferred_parallel_for_scope& + */ + deferred_parallel_for_scope& set_symbol(::std::string s) & + { + payload->set_symbol(mv(s)); + return *this; + } + + deferred_parallel_for_scope&& set_symbol(::std::string s) && + { + set_symbol(mv(s)); + return mv(*this); + } + ///@} + + void populate_deps_scheduling_info() + { + payload->populate_deps_scheduling_info(); + } + + template + void operator->*(Fun fun) + { + my_payload().todo = [f = mv(fun)](reserved::parallel_for_scope& task) { + task->*f; + }; + } + }; + + template + class deferred_host_launch_scope : public deferred_stream_task<> + { + struct payload_t : public deferred_stream_task<>::payload_t + { + payload_t(stream_ctx& ctx, task_dep... deps) + : task(ctx, mv(deps)...) 
+ {} + + reserved::host_launch_scope task; + ::std::function&)> todo; + + void set_symbol(::std::string s) override + { + task.set_symbol(s); + } + + const ::std::string& get_symbol() const override + { + return task.get_symbol(); + } + + int get_mapping_id() const override + { + return task.get_mapping_id(); + } + + void run() override + { + todo(task); + } + + void populate_deps_scheduling_info() override + { + int index = 0; + const auto& deps = get_task_deps(); + for (const auto& dep : deps) + { + if (!dep.get_data().is_initialized()) + { + fprintf(stderr, "Error: dependency number %d is an uninitialized logical data.\n", index); + abort(); + } + dep.set_symbol(dep.get_data().get_symbol()); + dep.set_data_footprint(dep.get_data().get_data_interface().data_footprint()); + index++; + } + } + + const task_dep_vector_untyped& get_task_deps() const override + { + return task.get_task_deps(); + } + + void set_exec_place(exec_place) override {} + }; + + payload_t& my_payload() const + { + // Safe to do the cast because we've set the pointer earlier ourselves + return *static_cast(payload.get()); + } + + public: + deferred_host_launch_scope(stream_ctx& ctx, task_dep... deps) + : deferred_stream_task<>(::std::make_shared(ctx, mv(deps)...)) + {} + + ///@{ + /** + * @name Set the symbol of the task. This is used for profiling and debugging. + * + * @param s + * @return deferred_host_launch_scope& + */ + deferred_host_launch_scope& set_symbol(::std::string s) & + { + payload->set_symbol(mv(s)); + return *this; + } + + deferred_host_launch_scope&& set_symbol(::std::string s) && + { + set_symbol(mv(s)); + return mv(*this); + } + ///@} + + void populate_deps_scheduling_info() + { + payload->populate_deps_scheduling_info(); + } + + template + void operator->*(Fun fun) + { + my_payload().todo = [f = mv(fun)](reserved::host_launch_scope& task) { + task->*f; + }; + } + }; + + void finalize() + { + assert(get_phase() < backend_ctx_untyped::phase::finalized); + auto& state = this->state(); + if (!state.submitted_stream) + { + // Wasn't submitted yet + submit(); + assert(state.submitted_stream); + } + if (state.blocking_finalize) + { + cuda_safe_call(cudaStreamSynchronize(state.submitted_stream)); + } + state.cleanup(); + set_phase(backend_ctx_untyped::phase::finalized); + +#ifdef CUDASTF_DEBUG + // Ensure that the total number of CUDA events created corresponds to + // the number of events destroyed + const auto alive = reserved::counter.load(); + if (alive != 0) + { + fprintf(stderr, + "WARNING!!! %lu CUDA events leaked (approx %lu created vs. 
%lu destroyed).\n", + alive, + reserved::counter.load(), + reserved::counter.load()); + } + + assert(alive == 0); + + const char* display_stats_env = getenv("CUDASTF_DISPLAY_STATS"); + if (!display_stats_env || atoi(display_stats_env) == 0) + { + return; + } + + fprintf(stderr, + "[STATS CUDA EVENTS] created=%lu destroyed=%lu alive=%lu reserved::high_water_mark=%lu\n", + reserved::counter.load(), + reserved::counter.load(), + alive, + reserved::high_water_mark.load()); + fprintf(stderr, + "[STATS CUDA EVENTS] cuda_stream_wait_event=%lu\n", + reserved::counter.load()); +#endif + } + + float get_submission_time_ms() const + { + assert(state().submitted_stream); + return state().submission_time; + } + + void submit() + { + auto& state = this->state(); + assert(!state.submitted_stream); + + assert(get_phase() < backend_ctx_untyped::phase::submitted); + + cudaEvent_t startEvent = nullptr; + cudaEvent_t stopEvent = nullptr; + + ::std::unordered_map payloads; + if (reordering_tasks()) + { + build_task_graph(); + for (int id : state.deferred_tasks) + { + const auto& t = state.task_map.at(id); + payloads.emplace(id, t.get_reorderer_payload()); + } + reorder_tasks(state.deferred_tasks, payloads); + + for (auto& [id, payload] : payloads) + { + if (payload.device != -1) + { + state.task_map.at(id).set_exec_place(exec_place::device(payload.device)); + } + } + + cuda_safe_call(cudaSetDevice(0)); + cuda_safe_call(cudaStreamSynchronize(task_fence())); + cuda_safe_call(cudaEventCreate(&startEvent)); + cuda_safe_call(cudaEventCreate(&stopEvent)); + cuda_safe_call(cudaEventRecord(startEvent, task_fence())); + } + + for (int id : state.deferred_tasks) + { + auto& task = state.task_map.at(id); + task.run(); + } + + if (reordering_tasks()) + { + cuda_safe_call(cudaSetDevice(0)); + cuda_safe_call(cudaEventRecord(stopEvent, task_fence())); + cuda_safe_call(cudaEventSynchronize(stopEvent)); + cuda_safe_call(cudaEventElapsedTime(&state.submission_time, startEvent, stopEvent)); + } + + // Write-back data and erase automatically created data instances + state.erase_all_logical_data(); + state.detach_allocators(*this); + + state.submitted_stream = task_fence(); + assert(state.submitted_stream != nullptr); + + set_phase(backend_ctx_untyped::phase::submitted); + } + + // no-op : so that we can use the same code with stream_ctx and graph_ctx + void change_epoch() + { + auto& dot = *get_dot(); + if (dot.is_tracing()) + { + dot.change_epoch(); + } + } + + template + auto deferred_parallel_for(exec_place e_place, S shape, task_dep... deps) + { + auto result = deferred_parallel_for_scope(*this, mv(e_place), mv(shape), mv(deps)...); + int id = result.get_mapping_id(); + state().deferred_tasks.push_back(id); + state().task_map.emplace(id, result); + + return result; + } + + template + auto deferred_parallel_for(S shape, task_dep... 
deps) + { + return deferred_parallel_for(exec_place::current_device(), mv(shape), mv(deps)...); + } + +private: + /* This class contains all the state associated to a stream_ctx, and all states associated to every contexts (in + * `impl`) */ + class impl : public base::impl + { + public: + impl(async_resources_handle _async_resources = async_resources_handle(nullptr)) + : base::impl(mv(_async_resources)) + { + reserved::backend_ctx_setup_allocators(*this); + } + + void cleanup() + { + // Reset this object + deferred_tasks.clear(); + task_map.clear(); + submitted_stream = nullptr; + base::impl::cleanup(); + } + + // Due to circular dependencies, we need to define it here, and not in backend_ctx_untyped + void update_uncached_allocator(block_allocator_untyped custom) override + { + reserved::backend_ctx_update_uncached_allocator(*this, mv(custom)); + } + + event_list stream_to_event_list(cudaStream_t stream, ::std::string) const override + { + auto e = reserved::record_event_in_stream(decorated_stream(stream)); + /// e->set_symbol(mv(event_symbol)); + return event_list(mv(e)); + } + + ::std::vector deferred_tasks; // vector of mapping_ids + ::std::unordered_map> task_map; // maps from a mapping_id to the deferred_task + cudaStream_t submitted_stream = nullptr; // stream used in submit + float submission_time = 0.0; + + // If the context is attached to a user stream, we should use it for + // finalize() or task_fence() + ::std::optional user_dstream; + + /* By default, the finalize operation is blocking, unless user provided + * a stream when creating the context */ + bool blocking_finalize = true; + }; + + impl& state() + { + return dynamic_cast(get_state()); + } + const impl& state() const + { + return dynamic_cast(get_state()); + } + + /// @brief Build the task graph by populating the predecessor and successor lists. + /// The logic here is copied from acquire_release.h notify_access(), although this doesn't handle redux at the + /// moment + void build_task_graph() + { + auto& state = this->state(); + + // Maps from a logical data to its last writer. 
The int is the mapping_id + ::std::unordered_map<::std::string, ::std::deque> current_readers; + ::std::unordered_map<::std::string, int> current_writer, previous_writer; + + for (int id : state.deferred_tasks) + { + auto& t = state.task_map.at(id); + assert(id == t.get_mapping_id()); + + for (const auto& dep : t.get_task_deps()) + { + const access_mode mode = dep.get_access_mode(); + + const logical_data_untyped data = dep.get_data(); + const auto& symbol = data.get_symbol(); + const auto it = current_writer.find(symbol); + const bool write = mode == access_mode::rw || mode == access_mode::write; + + if (write) + { + if (it == current_writer.end()) + { // WAR + if (auto readers_it = current_readers.find(symbol); readers_it != current_readers.end()) + { + for (auto& readers_queue = readers_it->second; !readers_queue.empty();) + { + const int reader_id = readers_queue.back(); + readers_queue.pop_back(); + + auto& reader_task = state.task_map.at(reader_id); + t.add_predecessor(reader_id); + reader_task.add_successor(id); + } + } + } + else + { // WAW + const int writer_id = it->second; + auto& writer_task = state.task_map.at(writer_id); + + t.add_predecessor(writer_id); + writer_task.add_successor(id); + + previous_writer[symbol] = writer_id; + } + current_writer[symbol] = id; + } + else + { + current_readers[symbol].emplace_back(id); + if (it == current_writer.end()) + { // RAR + + auto previous_writer_it = previous_writer.find(symbol); + if (previous_writer_it != previous_writer.end()) + { + const int previous_writer_id = previous_writer_it->second; + auto& previous_writer_task = state.task_map.at(previous_writer_id); + + t.add_predecessor(previous_writer_id); + previous_writer_task.add_successor(id); + } + } + else + { // RAW + const int writer_id = it->second; + auto& writer_task = state.task_map.at(writer_id); + previous_writer[symbol] = writer_id; + current_writer.erase(symbol); + + t.add_predecessor(writer_id); + writer_task.add_successor(id); + } + } + } + } + } +}; + +#ifdef UNITTESTED_FILE +UNITTEST("copyable stream_task") +{ + stream_ctx ctx; + stream_task<> t = ctx.task(); + stream_task<> t_cpy = t; + ctx.finalize(); +}; + +UNITTEST("movable stream_task") +{ + stream_ctx ctx; + stream_task<> t = ctx.task(); + stream_task<> t_cpy = mv(t); + ctx.finalize(); +}; + +// FIXME : This test is causing some compiler errors with MSVC, so we disable +// it on MSVC for now +# if !defined(_CCCL_COMPILER_MSVC) +UNITTEST("logical_data_untyped moveable") +{ + using namespace cuda::experimental::stf; + + class scalar + { + public: + scalar(stream_ctx& ctx) + { + size_t s = sizeof(double); + double* h_addr = (double*) malloc(s); + cuda_safe_call(cudaHostRegister(h_addr, s, cudaHostRegisterPortable)); + handle = ctx.logical_data(h_addr, 1); + } + + scalar& operator=(scalar&& rhs) + { + handle = mv(rhs.handle); + return *this; + } + + logical_data_untyped handle; + }; + + stream_ctx ctx; + scalar global_res(ctx); + + for (int bid = 0; bid < 2; bid++) + { + // This variable is likely to have the same address accross loop + // iterations, which can stress the logical data management + scalar res(ctx); + auto t = ctx.task(); + t.add_deps(res.handle.write()); + t.start(); + t.end(); + + global_res = mv(res); + } + + ctx.finalize(); +}; +# endif // !_CCCL_COMPILER_MSVC + +# ifdef __CUDACC__ +namespace reserved +{ + +static __global__ void dummy() {} + +} // namespace reserved + +UNITTEST("copyable stream_ctx") +{ + stream_ctx ctx; + stream_ctx ctx2 = ctx; + auto t = ctx.task(); + auto t2 = ctx2.task(); + 
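+  // Copies have reference semantics: ctx and ctx2 share the same underlying state,
+  // so submitting and finalizing through ctx2 also covers the task created from ctx.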
+ ctx2.submit(); + ctx2.finalize(); +}; + +UNITTEST("movable stream_ctx") +{ + stream_ctx ctx; + auto t = ctx.task(); + stream_ctx ctx2 = mv(ctx); + auto t2 = ctx2.task(); + + ctx2.submit(); + ctx2.finalize(); +}; + +UNITTEST("stream context basics") +{ + stream_ctx ctx; + + double X[1024], Y[1024]; + auto handle_X = ctx.logical_data(X); + auto handle_Y = ctx.logical_data(Y); + + for (int k = 0; k < 10; k++) + { + ctx.task(handle_X.rw())->*[&](cudaStream_t s, auto /*unuased*/) { + reserved::dummy<<<1, 1, 0, s>>>(); + }; + } + + ctx.task(handle_X.read(), handle_Y.rw())->*[&](cudaStream_t s, auto /*unuased*/, auto /*unuased*/) { + reserved::dummy<<<1, 1, 0, s>>>(); + }; + + ctx.finalize(); +}; + +UNITTEST("stream context after logical data") +{ + stream_ctx ctx; + logical_data> handle_X, handle_Y; + + double X[1024], Y[1024]; + handle_X = ctx.logical_data(X); + handle_Y = ctx.logical_data(Y); + + for (int k = 0; k < 10; k++) + { + ctx.task(handle_X.rw())->*[&](cudaStream_t s, auto /*unused*/) { + reserved::dummy<<<1, 1, 0, s>>>(); + }; + } + + ctx.task(handle_X.read(), handle_Y.rw())->*[&](cudaStream_t s, auto /*unused*/, auto /*unused*/) { + reserved::dummy<<<1, 1, 0, s>>>(); + }; + + ctx.finalize(); +}; + +UNITTEST("set_symbol on stream_task and stream_task<>") +{ + stream_ctx ctx; + + double X[1024], Y[1024]; + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + stream_task<> t = ctx.task(); + t.add_deps(lX.rw(), lY.rw()); + t.set_symbol("stream_task<>"); + t.start(); + t.end(); + + stream_task, slice> t2 = ctx.task(lX.rw(), lY.rw()); + t2.set_symbol("stream_task"); + t2.start(); + t2.end(); + + ctx.finalize(); +}; + +UNITTEST("non contiguous slice") +{ + stream_ctx ctx; + + int X[32 * 32]; + + // Pinning non contiguous memory is extremelly expensive, so we do it now + cuda_safe_call(cudaHostRegister(&X[0], 32 * 32 * sizeof(int), cudaHostRegisterPortable)); + + for (size_t i = 0; i < 32 * 32; i++) + { + X[i] = 1; + } + + // Create a non-contiguous slice + auto lX = ctx.logical_data(make_slice(&X[0], ::std::tuple{24, 32}, 32)); + + ctx.host_launch(lX.rw())->*[](auto sX) { + for (size_t i = 0; i < sX.extent(0); i++) + { + for (size_t j = 0; j < sX.extent(1); j++) + { + sX(i, j) = 2; + } + } + }; + + ctx.finalize(); + + for (size_t j = 0; j < 32; j++) + { + for (size_t i = 0; i < 32; i++) + { + size_t ind = i + 32 * j; + int expected = ((i < 24) ? 2 : 1); + EXPECT(X[ind] == expected); + } + } + + cuda_safe_call(cudaHostUnregister(&X[0])); +}; + +UNITTEST("logical data from a shape of 2D slice") +{ + stream_ctx ctx; + + // Create a non-contiguous slice + auto lX = ctx.logical_data(shape_of>(24, 32)); + + // We cannot use the parallel_for construct in a UNITTEST so we here rely + // on the fact that X is likely going to be allocated in a contiguous + // fashion on devices, and we use cudaMemsetAsync. 
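+  // The memset below fills every byte with 42; the check afterwards relies on the
+  // slice elements being one byte wide, so each element then reads back as 42.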
+ ctx.task(lX.write())->*[](cudaStream_t stream, auto sX) { + assert(contiguous_dims(sX) == 2); + cudaMemsetAsync(sX.data_handle(), 42, sX.size() * sizeof(char), stream); + }; + + ctx.host_launch(lX.read())->*[](auto sX) { + for (size_t i = 0; i < sX.extent(0); i++) + { + for (size_t j = 0; j < sX.extent(1); j++) + { + EXPECT(sX(i, j) == 42); + } + } + }; + + ctx.finalize(); +}; + +UNITTEST("logical data from a shape of 3D slice") +{ + stream_ctx ctx; + + // Create a non-contiguous slice + auto lX = ctx.logical_data(shape_of>(24, 32, 16)); + + // We cannot use the parallel_for construct in a UNITTEST so we here rely + // on the fact that X is likely going to be allocated in a contiguous + // fashion on devices, and we use cudaMemsetAsync. + ctx.task(lX.write())->*[](cudaStream_t stream, auto sX) { + EXPECT(contiguous_dims(sX) == 3); + cudaMemsetAsync(sX.data_handle(), 42, sX.size() * sizeof(char), stream); + }; + + ctx.host_launch(lX.read())->*[](auto sX) { + for (size_t i = 0; i < sX.extent(0); i++) + { + for (size_t j = 0; j < sX.extent(1); j++) + { + for (size_t k = 0; k < sX.extent(2); k++) + { + EXPECT(sX(i, j, k) == 42); + } + } + } + }; + + ctx.finalize(); +}; + +UNITTEST("const class containing a stream_ctx") +{ + struct foo + { + foo() = default; + stream_ctx& get_ctx() const + { + return ctx; + } + + mutable stream_ctx ctx; + }; + + const foo f; + + // Create a non-contiguous slice + auto lX = f.get_ctx().logical_data(shape_of>(24)); + + f.get_ctx().finalize(); +}; + +# endif + +UNITTEST("stream ctx loop") +{ + for (size_t iter = 0; iter < 10; iter++) + { + stream_ctx ctx; + auto lA = ctx.logical_data(shape_of>(64)); + ctx.task(lA.write())->*[](cudaStream_t, auto) {}; + ctx.finalize(); + } +}; + +UNITTEST("stream ctx loop custom allocator") +{ + for (size_t iter = 0; iter < 10; iter++) + { + stream_ctx ctx; + ctx.set_allocator(block_allocator(ctx)); + auto lA = ctx.logical_data(shape_of>(64)); + ctx.task(lA.write())->*[](cudaStream_t, auto) {}; + ctx.finalize(); + } +}; + +// This test ensures we can convert a logical_data_untyped to a logical_data +// It is located here because we need all other classes to actually test it +UNITTEST("get logical_data from a task_dep") +{ + stream_ctx ctx; + + using T = slice; + auto lA = ctx.logical_data(shape_of(64)); + + // Create a task dependency using that logical data + auto d = lA.read(); + + logical_data_untyped ul = d.get_data(); + EXPECT(ul == lA); + + logical_data lB = d.get_data(); + EXPECT(lB == lA); + + auto lC = logical_data(d.get_data()); + EXPECT(lC == lA); + + auto lD = static_cast>(d.get_data()); + EXPECT(lD == lA); + + ctx.finalize(); +}; + +# ifdef __CUDACC__ +namespace reserved +{ +inline void unit_test_pfor() +{ + stream_ctx ctx; + SCOPE(exit) + { + ctx.finalize(); + }; + auto lA = ctx.logical_data(shape_of>(64)); + ctx.parallel_for(lA.shape(), lA.write())->*[] _CCCL_DEVICE(size_t i, slice A) { + A(i) = 2 * i; + }; + ctx.host_launch(lA.read())->*[](auto A) { + for (size_t i = 0; i < 64; i++) + { + EXPECT(A(i) == 2 * i); + } + }; +} + +UNITTEST("basic parallel_for test") +{ + unit_test_pfor(); +}; + +inline void unit_test_host_pfor() +{ + stream_ctx ctx; + auto lA = ctx.logical_data(shape_of>(64)); + ctx.parallel_for(exec_place::host, lA.shape(), lA.write())->*[](size_t i, slice A) { + A(i) = 2 * i; + }; + ctx.host_launch(lA.read())->*[](auto A) { + for (size_t i = 0; i < 64; i++) + { + EXPECT(A(i) == 2 * i); + } + }; + ctx.finalize(); +} + +UNITTEST("basic parallel_for test on host") +{ + unit_test_host_pfor(); +}; + 
+inline void unit_test_pfor_mix_host_dev() +{ + stream_ctx ctx; + + const int N = 1024; + double X[N]; + + auto lx = ctx.logical_data(X); + + ctx.parallel_for(lx.shape(), lx.write())->*[=] _CCCL_DEVICE(size_t pos, auto sx) { + sx(pos) = 17 * pos + 4; + }; + + ctx.parallel_for(exec_place::host, lx.shape(), lx.rw())->*[=](size_t pos, auto sx) { + sx(pos) = sx(pos) * sx(pos); + }; + + ctx.parallel_for(lx.shape(), lx.rw())->*[=] _CCCL_DEVICE(size_t pos, auto sx) { + sx(pos) = sx(pos) - 7; + }; + + ctx.finalize(); + + for (size_t i = 0; i < N; i++) + { + EXPECT(X[i] == (17 * i + 4) * (17 * i + 4) - 7); + } +} + +/* This test ensures that parallel_for on exec_place::host are properly + * interleaved with those on devices. */ +UNITTEST("parallel_for host and device") +{ + unit_test_pfor_mix_host_dev(); +}; + +inline void unit_test_untyped_place_pfor() +{ + stream_ctx ctx; + + exec_place where = exec_place::host; + + auto lA = ctx.logical_data(shape_of>(64)); + // We have to put both __host__ __device__ qualifiers as this is resolved + // dynamically and both host and device codes will be generated + ctx.parallel_for(where, lA.shape(), lA.write())->*[] _CCCL_HOST_DEVICE(size_t i, slice A) { +# ifdef __CUDA_ARCH__ + // Even if we do have a __device__ qualifier, we are not supposed to call it + assert(0); +# endif + A(i) = 2 * i; + }; + ctx.host_launch(lA.read())->*[](auto A) { + for (size_t i = 0; i < 64; i++) + { + EXPECT(A(i) == 2 * i); + } + }; + ctx.finalize(); +} + +UNITTEST("basic parallel_for test on host (untyped execution place)") +{ + unit_test_untyped_place_pfor(); +}; + +inline void unit_test_pfor_grid() +{ + stream_ctx ctx; + auto where = exec_place::all_devices(); + auto lA = ctx.logical_data(shape_of>(64)); + ctx.parallel_for(blocked_partition(), where, lA.shape(), lA.write())->*[] _CCCL_DEVICE(size_t i, slice A) { + A(i) = 2 * i; + }; + ctx.host_launch(lA.read())->*[](auto A) { + for (size_t i = 0; i < 64; i++) + { + EXPECT(A(i) == 2 * i); + } + }; + ctx.finalize(); +} + +UNITTEST("basic parallel_for test on grid") +{ + unit_test_pfor_grid(); +}; + +inline void unit_test_pfor_untyped_grid() +{ + stream_ctx ctx; + + exec_place where = exec_place::repeat(exec_place::current_device(), 4); + + auto lA = ctx.logical_data(shape_of>(64)); + ctx.parallel_for(blocked_partition(), where, lA.shape(), lA.write())->*[] _CCCL_HOST_DEVICE(size_t i, slice A) { + A(i) = 2 * i; + }; + ctx.host_launch(lA.read())->*[](auto A) { + for (size_t i = 0; i < 64; i++) + { + EXPECT(A(i) == 2 * i); + } + }; + ctx.finalize(); +} + +UNITTEST("basic parallel_for test on grid") +{ + unit_test_pfor_untyped_grid(); +}; + +inline void unit_test_launch() +{ + stream_ctx ctx; + SCOPE(exit) + { + ctx.finalize(); + }; + auto lA = ctx.logical_data(shape_of>(64)); + ctx.launch(lA.write())->*[] _CCCL_DEVICE(auto t, slice A) { + for (auto i : t.apply_partition(shape(A))) + { + A(i) = 2 * i; + } + }; + ctx.host_launch(lA.read())->*[](auto A) { + for (size_t i = 0; i < 64; i++) + { + EXPECT(A(i) == 2 * i); + } + }; +} + +UNITTEST("basic launch test") +{ + unit_test_launch(); +}; + +} // end namespace reserved +# endif // UNITTESTED_FILE + +#endif // __CUDACC__ + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_data_interface.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_data_interface.cuh new file mode 100644 index 00000000000..9e442f5ac15 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/stream/stream_data_interface.cuh @@ -0,0 +1,194 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Implementation of the stream_data_interface class which makes it + * possible to define data interfaces in the CUDA stream backend + * + * The stream_data_interface_simple provides a simpler interface compared to stream_data_interface + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief Data interface for the CUDA streams backend. + * + * This is similar to stream_data_interface_simple but relies on a more + * sophisticated allocator scheme that caches memory allocations. + */ +template +class stream_data_interface : public data_impl_base +{ +public: + using base = data_impl_base; + using typename base::shape_t; + + stream_data_interface(T object) + : base(mv(object)) + {} + + stream_data_interface(shape_of shape) + : base(mv(shape)) + {} + + virtual void stream_data_copy( + const data_place& dst_memory_node, + instance_id_t dst_instance_id, + const data_place& src_memory_node, + instance_id_t src_instance_id, + cudaStream_t stream) = 0; + + // Returns prereq + void data_copy(backend_ctx_untyped& bctx, + const data_place& dst_memory_node, + instance_id_t dst_instance_id, + const data_place& src_memory_node, + instance_id_t src_instance_id, + event_list& prereqs) override + { + cudaStream_t stream; + auto op = stream_async_op(bctx, dst_memory_node, &stream, prereqs); + if (bctx.generate_event_symbols()) + { + op.set_symbol("copy"); + } + + stream_data_copy(dst_memory_node, dst_instance_id, src_memory_node, src_instance_id, stream); + + prereqs = op.end(bctx); + } +}; + +/** + * @brief Simplified data interface for the CUDA streams backend. + * + * Implements all primitives of data_interface in terms of to-be-defined + * primitives that take a stream as a parameter. 
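+ *
+ * A backend-specific data interface would typically derive from this class and
+ * implement the pure virtual `stream_data_allocate`, `stream_data_deallocate`
+ * and `stream_data_copy` hooks declared below. A minimal, partial sketch
+ * (hypothetical `my_data` type; the remaining data_interface primitives are
+ * omitted):
+ * @code
+ * class my_data_interface : public stream_data_interface_simple<my_data>
+ * {
+ *   void stream_data_allocate(backend_ctx_untyped&, const data_place&, instance_id_t,
+ *                             ::std::ptrdiff_t& s, void**, cudaStream_t stream) override
+ *   {
+ *     // e.g. allocate with cudaMallocAsync(..., stream) and set `s` to the byte count
+ *   }
+ *   void stream_data_deallocate(backend_ctx_untyped&, const data_place&, instance_id_t,
+ *                               void*, cudaStream_t stream) override
+ *   {
+ *     // e.g. release with cudaFreeAsync(..., stream)
+ *   }
+ *   void stream_data_copy(const data_place&, instance_id_t, const data_place&, instance_id_t,
+ *                         cudaStream_t stream) override
+ *   {
+ *     // e.g. cudaMemcpyAsync(..., stream) between the two instances
+ *   }
+ * };
+ * @endcode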
+ */ +template +class stream_data_interface_simple : public data_impl_base +{ +public: + using base = data_impl_base; + using typename base::shape_t; + + stream_data_interface_simple(T p) + : base(mv(p)) + {} + + stream_data_interface_simple(typename base::shape_t sh) + : base(sh) + {} + + // The routine indicates how many bytes have been allocated, or were + // required if the allocated failed + virtual void stream_data_allocate( + backend_ctx_untyped& ctx, + const data_place& memory_node, + instance_id_t instance_id, + ::std::ptrdiff_t& s, + void** extra_args, + cudaStream_t stream) = 0; + + void data_allocate( + backend_ctx_untyped& ctx, + block_allocator_untyped& /*unused*/, + const data_place& memory_node, + instance_id_t instance_id, + ::std::ptrdiff_t& s, + void** extra_args, + event_list& prereqs) final override + { + cudaStream_t stream; + auto op = stream_async_op(ctx, memory_node, &stream, prereqs); + if (ctx.generate_event_symbols()) + { + op.set_symbol("alloc"); + } + + stream_data_allocate(ctx, memory_node, instance_id, s, extra_args, stream); + + prereqs = op.end(ctx); + } + + virtual void stream_data_deallocate( + backend_ctx_untyped& ctx, + const data_place& memory_node, + instance_id_t instance_id, + void* extra_args, + cudaStream_t stream) = 0; + + void data_deallocate( + backend_ctx_untyped& ctx, + block_allocator_untyped& /*custom_allocator*/, + const data_place& memory_node, + instance_id_t instance_id, + void* extra_args, + event_list& prereqs) final override + { + cudaStream_t stream; + auto op = stream_async_op(ctx, memory_node, &stream, prereqs); + if (ctx.generate_event_symbols()) + { + op.set_symbol("dealloc "); + } + + stream_data_deallocate(ctx, memory_node, instance_id, extra_args, stream); + + prereqs = op.end(ctx); + } + + virtual void stream_data_copy( + const data_place& dst_memory_node, + instance_id_t dst_instance_id, + const data_place& src_memory_node, + instance_id_t src_instance_id, + cudaStream_t stream) = 0; + + // Returns prereq + void data_copy(backend_ctx_untyped& bctx, + const data_place& dst_memory_node, + instance_id_t dst_instance_id, + const data_place& src_memory_node, + instance_id_t src_instance_id, + event_list& prereqs) override + { + cudaStream_t stream; + auto op = stream_async_op(bctx, dst_memory_node, &stream, prereqs); + if (bctx.generate_event_symbols()) + { + op.set_symbol("copy"); + } + + stream_data_copy(dst_memory_node, dst_instance_id, src_memory_node, src_instance_id, stream); + + prereqs = op.end(bctx); + } +}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh new file mode 100644 index 00000000000..cf04c370e95 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh @@ -0,0 +1,852 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Implement tasks in the CUDA stream backend (stream_ctx) + * + * @see stream_ctx + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +namespace cuda::experimental::stf +{ + +class stream_ctx; + +template +class stream_task; + +/** + * @brief Task with dynamic dependencies that uses CUDA streams (and events) to synchronize between the different tasks. + * + * `stream_task<>` automatically selects a stream from an internal pool if needed, or take a user-provided stream + * (by calling `set_stream`). All operations in a task are expected to be executed asynchronously with respect to + * that task's stream. + * + * This task type accepts dynamic dependencies, i.e. dependencies can be added at runtime by calling `add_deps()` or + * `add_deps()` prior to starting the task with `start()`. In turn, the added depdencies have dynamic types. It is the + * caller's responsibility to access the correct types for each dependency by calling `get(index)`. + */ +template <> +class stream_task<> : public task +{ +public: + stream_task(backend_ctx_untyped ctx_, exec_place e_place = exec_place::current_device()) + : task(mv(e_place)) + , ctx(mv(ctx_)) + { + ctx.increment_task_count(); + } + + stream_task(const stream_task<>&) = default; + stream_task<>& operator=(const stream_task<>&) = default; + ~stream_task() = default; + + // movable ?? + + // Returns the stream associated to that task : any asynchronous operation + // in the task body should be performed asynchronously with respect to that + // CUDA stream + cudaStream_t get_stream() + { + const auto& e_place = get_exec_place(); + if (e_place.is_grid()) + { + // Even with a grid, when we have a ctx.task construct we have not + // yet selected/activated a specific place. So we take the main + // stream associated to the whole task in that case. + ::std::ptrdiff_t current_place_id = e_place.as_grid().current_place_id(); + return (current_place_id < 0 ? 
dstream.stream : stream_grid[current_place_id].stream); + } + + return dstream.stream; + } + + // TODO use a pos4 and check that we have a grid, of the proper dimension + cudaStream_t get_stream(size_t pos) + { + const auto& e_place = get_exec_place(); + + if (e_place.is_grid()) + { + return stream_grid[pos].stream; + } + + return dstream.stream; + } + + stream_task<>& set_stream(cudaStream_t s) + { + // We don't need to find a stream from the pool in this case + automatic_stream = false; + // -1 identifies streams which are not from our internal pool + dstream = decorated_stream(s); + return *this; + } + + stream_task<>& start() + { + const auto& e_place = get_exec_place(); + + event_list prereqs = acquire(ctx); + + /* Select the stream(s) */ + if (e_place.is_grid()) + { + // We have currently no way to pass an array of per-place streams + assert(automatic_stream); + + // Note: we store grid in a variable to avoid dangling references + // because the compiler does not know we are making a refernce to + // a vector that remains valid + const auto& grid = e_place.as_grid(); + const auto& places = grid.get_places(); + for (const exec_place& p : places) + { + stream_grid.push_back(get_stream_from_pool(p)); + } + + EXPECT(stream_grid.size() > 0UL); + } + else + { + if (automatic_stream) + { + bool found = false; + auto& pool = e_place.get_stream_pool(ctx.async_resources(), true); + + // To avoid creating inter stream dependencies when this is not + // necessary, we try to reuse streams which belong to the pool, + // and which are used by events with no outbound dependency. An + // event with an existing outbound dependencies means multiple + // operations might depend on it so it might be worth using + // multiple streams. + if (!getenv("CUDASTF_DO_NOT_REUSE_STREAMS")) + { + for (auto& e : prereqs) + { + // fprintf(stderr, "outbounds %d (%s)\n", e->outbound_deps.load(), e->get_symbol().c_str()); + if (e->outbound_deps == 0) + { + auto se = reserved::handle(e, reserved::use_static_cast); + decorated_stream candidate = se->get_decorated_stream(); + + if (candidate.id != -1) + { + for (const decorated_stream& pool_s : pool) + { + if (candidate.id == pool_s.id) + { + found = true; + dstream = candidate; + // fprintf(stderr, "REUSING stream ID %ld\n", dstream.id); + break; + } + } + } + + if (found) + { + break; + } + } + } + } + + if (!found) + { + dstream = get_stream_from_pool(e_place); + // fprintf(stderr, "COULD NOT REUSE ... selected stream ID %ld\n", dstream.id); + } + } + } + + // Select one stream to sync with all prereqs + auto& s0 = e_place.is_grid() ? 
stream_grid[0] : dstream; + + /* Ensure that stream depend(s) on prereqs */ + submitted_events = stream_async_op(ctx, s0, prereqs); + if (ctx.generate_event_symbols()) + { + submitted_events.set_symbol("Submitted" + get_symbol()); + } + + /* If this is a grid, all other streams must wait on s0 too */ + if (e_place.is_grid()) + { + insert_dependencies(stream_grid); + } + + return *this; + } + + void set_current_place(pos4 p) + { + get_exec_place().as_grid().set_current_place(ctx, p); + } + + void unset_current_place() + { + return get_exec_place().as_grid().unset_current_place(ctx); + } + + const exec_place& get_current_place() + { + return get_exec_place().as_grid().get_current_place(); + } + + /* End the task, but do not clear its data structures yet */ + stream_task<>& end_uncleared() + { + assert(get_task_phase() == task::phase::running); + + event_list end_list; + + const auto& e_place = get_exec_place(); + // Create an event with this stream + + if (e_place.is_grid()) + { + // s0 depends on all other streams + for (size_t i = 1; i < stream_grid.size(); i++) + { + stream_and_event::insert_dependency(stream_grid[0].stream, stream_grid[i].stream); + } + } + + auto se = submitted_events.end_as_event(ctx); + end_list.add(se); + + release(ctx, end_list); + + return *this; + } + + stream_task<>& end() + { + end_uncleared(); + clear(); + return *this; + } + + /** + * @brief Run lambda function on the specified device + * + * @tparam Fun Type of lambda + * @param fun Lambda function taking either a `stream_task<>` or a `cudaStream_t` as the only argument + * + * The lambda must accept exactly one argument. If the type of the lambda's argument is one of + * `stream_task<>`, `stream_task<>&`, `auto`, `auto&`, or `auto&&`, then `*this` is passed to the + * lambda. Otherwise, `this->get_stream()` is passed to the lambda. Depdendencies would need to be accessed + * separately. + */ + template + void operator->*(Fun&& fun) + { + // Apply function to the stream (in the first position) and the data tuple + nvtx_range nr(get_symbol().c_str()); + start(); + SCOPE(exit) + { + end(); + }; + + auto& dot = ctx.get_dot(); + if (dot->is_tracing()) + { + dot->template add_vertex(*this); + } + + // Default for the first argument is a `cudaStream_t`. 
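+ // For instance (illustrative sketch, assuming a task `t` obtained from ctx.task()
+ // and a hypothetical `my_kernel`):
+ //   t->*[](cudaStream_t s) { my_kernel<<<1, 1, 0, s>>>(); };   // lambda receives the stream
+ //   t->*[](stream_task<>& tsk) { auto s = tsk.get_stream(); }; // lambda receives the task itself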
+ if constexpr (::std::is_invocable_v) + { + ::std::forward(fun)(get_stream()); + } + else + { + ::std::forward(fun)(*this); + } + } + + void populate_deps_scheduling_info() const + { + // Error checking copied from acquire() in acquire_release() + + int index = 0; + const auto& deps = get_task_deps(); + for (const auto& dep : deps) + { + if (!dep.get_data().is_initialized()) + { + fprintf(stderr, "Error: dependency number %d is an uninitialized logical data.\n", index); + abort(); + } + dep.set_symbol(dep.get_data().get_symbol()); + dep.set_data_footprint(dep.get_data().get_data_interface().data_footprint()); + index++; + } + } + + /** + * @brief Use the scheduler to assign a device to this task + * + * @return returns true if the task's time needs to be recorded + */ + bool schedule_task() + { + reserved::dot& dot = reserved::dot::instance(); + auto& statistics = reserved::task_statistics::instance(); + + const bool is_auto = get_exec_place().affine_data_place() == data_place::device_auto; + bool calibrate = false; + + // We need to know the data footprint if scheduling or calibrating tasks + if (is_auto || statistics.is_calibrating()) + { + populate_deps_scheduling_info(); + } + + if (is_auto) + { + auto [place, needs_calibration] = ctx.schedule_task(*this); + set_exec_place(place); + calibrate = needs_calibration; + } + + return dot.is_tracing() || (calibrate && statistics.is_calibrating()); + } + +private: + decorated_stream get_stream_from_pool(const exec_place& e_place) + { + return e_place.getStream(ctx.async_resources(), true); + } + + // Make all streams depend on streams[0] + static void insert_dependencies(::std::vector& streams) + { + if (streams.size() < 2) + { + // No synchronization needed + return; + } + + // event and stream must be on the same device + // if the stream does not belong to the current device, we + // therefore have to find in which device the stream was created, + // record the event, and restore the current device to its original + // value. + + // TODO leverage dev_id if known ? + + // Find the stream structure in the driver API + CUcontext ctx; + cuda_safe_call(cuStreamGetCtx(CUstream(streams[0].stream), &ctx)); + + // Query the context associated with a stream by using the underlying driver API + cuda_safe_call(cuCtxPushCurrent(ctx)); + const CUdevice s0_dev = cuda_try(); + cuda_safe_call(cuCtxPopCurrent(&ctx)); + + const int current_dev = cuda_try(); + + if (current_dev != s0_dev) + { + cuda_safe_call(cudaSetDevice(s0_dev)); + } + + // Create a dependency between the last stream and the current stream + cudaEvent_t sync_event; + // Disable timing to avoid implicit barriers + cuda_safe_call(cudaEventCreateWithFlags(&sync_event, cudaEventDisableTiming)); +#ifdef CUDASTF_DEBUG + reserved::counter ++; + reserved::high_water_mark.record(++reserved::counter); +#endif + + cuda_safe_call(cudaEventRecord(sync_event, streams[0].stream)); + + // According to documentation "event may be from a different device than stream." 
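+ // Make every stream in the vector (including streams[0] itself) wait on the
+ // event that was just recorded on streams[0]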
+ for (size_t i = 0; i < streams.size(); i++) + { + cuda_safe_call(cudaStreamWaitEvent(streams[i].stream, sync_event, 0)); + } + + // Asynchronously destroy event to avoid a memleak + cuda_safe_call(cudaEventDestroy(sync_event)); +#ifdef CUDASTF_DEBUG + reserved::counter.increment(); + reserved::counter.decrement(); +#endif + + if (current_dev != s0_dev) + { + // Restore current device + cuda_safe_call(cudaSetDevice(current_dev)); + } + } + + bool automatic_stream = true; // `true` if the stream is automatically fetched from the internal pool + + // Stream and their unique id (if applicable) + decorated_stream dstream; + ::std::vector stream_grid; + + // TODO rename to submitted_ops + stream_async_op submitted_events; + +protected: + backend_ctx_untyped ctx; +}; + +/** + * @brief Task running on stream with fixed, typed dependencies. + * + * @tparam Data A list of data that this task depends on + * + * This type models tasks that have known dependencies in arity and type. Dependencies are set statically at + * construction and must be statically-typed (i.e., must have type `task_dep` as opposed to `task_dep_untyped`). + * Therefore, the choice between `stream_task` and `stream_task<>` is made depending on the nature of the task's + * dependencies. + * + * Most of the time a `stream_task` object is created for the sake of using its `->*` method that effects execution. The + * execution place can be set in the constructor and also dynamically. An invocation of `->*` takes place on the last + * set execution place. + * + * It is possible to copy or move this task into a `stream_task<>` by implicit conversion. Subsequently, the + * obtained object can be used with dynamic dependencies. + */ +template +class stream_task : public stream_task<> +{ +public: + /** + * @brief Construct with an execution place and dependencies + * + * @param ctx The backend context + * @param e_place Place where execution will be carried + * @param deps A list of `task_dep` objects that this task depends on + */ + stream_task(backend_ctx_untyped ctx, exec_place e_place, task_dep... deps) + : stream_task<>(mv(ctx), mv(e_place)) + { + static_assert(sizeof(*this) == sizeof(stream_task<>), "Cannot add state - it would be lost by slicing."); + add_deps(mv(deps)...); + } + + /** + * @brief Construct with given dependencies, will execute on the current device + * + * @param ctx The stream context + * @param deps A list of `task_dep` objects that this task depends on + */ + stream_task(stream_ctx* ctx, task_dep... deps) + : stream_task(exec_place::host, ctx, mv(deps)...) + {} + + /** + * @brief Set the symbol object + * + * @param s + * @return stream_task& + */ + stream_task& set_symbol(::std::string s) & + { + stream_task<>::set_symbol(mv(s)); + return *this; + } + + stream_task&& set_symbol(::std::string s) && + { + stream_task<>::set_symbol(mv(s)); + return mv(*this); + } + + /** + * @brief Run lambda function on the specified device, automatically passing it the dependencies + * + * @tparam Fun Type of lambda + * @param fun Lambda function + * + * If `fun`'s first parameter has is declared as one of `stream_task`, `stream_task&`, `auto`, + * `auto&`, or `auto&&`, then `*this` is passed as `fun`'s first argument. Otherwise, `this->get_stream()` is passed + * as `fun`'s first argument. In either case, the first argument is followed by `slice` objects that the task + * depends on. The framework automatically binds dependencies to slices and `fun` may invoke a kernel passing it the + * slices. 
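+ *
+ * A minimal illustrative sketch (hypothetical user code, assuming a `stream_ctx ctx`
+ * and a logical data `lX` created from an array of `double`):
+ * @code
+ * ctx.task(lX.rw())->*[](cudaStream_t s, auto sX) {
+ *   my_kernel<<<16, 128, 0, s>>>(sX); // work must be asynchronous with respect to `s`
+ * };
+ * @endcode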
+ */ + template + auto operator->*(Fun&& fun) + { + // Apply function to the stream (in the first position) and the data tuple + auto& dot = ctx.get_dot(); + auto& statistics = reserved::task_statistics::instance(); + + cudaEvent_t start_event, end_event; + + bool record_time = schedule_task(); + + if (statistics.is_calibrating_to_file()) + { + record_time = true; + } + + nvtx_range nr(get_symbol().c_str()); + start(); + + if (record_time) + { + // Events must be created here to avoid issues with multi-gpu + cuda_safe_call(cudaEventCreate(&start_event)); + cuda_safe_call(cudaEventCreate(&end_event)); + cuda_safe_call(cudaEventRecord(start_event, get_stream())); + } + + SCOPE(exit) + { + end_uncleared(); + + if (record_time) + { + cuda_safe_call(cudaEventRecord(end_event, get_stream())); + cuda_safe_call(cudaEventSynchronize(end_event)); + + float milliseconds = 0; + cuda_safe_call(cudaEventElapsedTime(&milliseconds, start_event, end_event)); + + if (dot->is_tracing()) + { + dot->template add_vertex_timing(*this, milliseconds); + } + + if (statistics.is_calibrating()) + { + statistics.log_task_time(*this, milliseconds); + } + } + + clear(); + }; + + if (dot->is_tracing()) + { + dot->template add_vertex(*this); + } + + if constexpr (::std::is_invocable_v) + { + // Invoke passing this task's stream as the first argument, followed by the slices + auto t = tuple_prepend(get_stream(), typed_deps()); + return ::std::apply(::std::forward(fun), t); + } + else + { + // Invoke passing `*this` as the first argument, followed by the slices + static_assert(::std::is_invocable_v, "Incorrect lambda function signature."); + auto t = tuple_prepend(*this, typed_deps()); + return ::std::apply(::std::forward(fun), t); + } + } + +private: + auto typed_deps() + { + return make_tuple_indexwise([&](auto i) { + return this->get<::std::tuple_element_t>>(i); + }); + } +}; + +/* + * @brief Deferred tasks are tasks that are not executed immediately, but rather upon `ctx.submit()`. + */ +template +class deferred_stream_task; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // doxygen has issues with this code +/* + * Base of all deferred tasks. Stores the needed information for typed deferred tasks to run (see below). 
+ */ +template <> +class deferred_stream_task<> +{ +protected: + // Type stored + struct payload_t + { + virtual ~payload_t() = default; + virtual void set_symbol(::std::string s) = 0; + virtual const ::std::string& get_symbol() const = 0; + virtual int get_mapping_id() const = 0; + virtual void run() = 0; + virtual void populate_deps_scheduling_info() = 0; + virtual const task_dep_vector_untyped& get_task_deps() const = 0; + virtual void set_exec_place(exec_place e_place) = 0; + + void add_successor(int succ) + { + successors.insert(succ); + } + void add_predecessor(int pred) + { + predecessors.insert(pred); + } + + const ::std::unordered_set& get_successors() const + { + return successors; + } + const ::std::unordered_set& get_predecessors() const + { + return predecessors; + } + + virtual void set_cost(double c) + { + assert(c >= 0.0); + cost = c; + } + virtual double get_cost() const + { + assert(cost >= 0.0); + return cost; + } + + private: + // Sets of mapping ids + ::std::unordered_set predecessors; + ::std::unordered_set successors; + double cost = -1.0; + }; + + ::std::shared_ptr payload; + + deferred_stream_task(::std::shared_ptr payload) + : payload(mv(payload)) + {} + +public: + void run() + { + assert(payload); + payload->run(); + } + + const task_dep_vector_untyped& get_task_deps() const + { + assert(payload); + return payload->get_task_deps(); + } + + void add_successor(int succ) + { + assert(payload); + payload->add_successor(succ); + } + + void add_predecessor(int pred) + { + assert(payload); + payload->add_predecessor(pred); + } + + const ::std::unordered_set& get_successors() const + { + assert(payload); + return payload->get_successors(); + } + + const ::std::unordered_set& get_predecessors() const + { + assert(payload); + return payload->get_predecessors(); + } + + const ::std::string& get_symbol() const + { + assert(payload); + return payload->get_symbol(); + } + + void set_symbol(::std::string s) const + { + return payload->set_symbol(mv(s)); + } + + int get_mapping_id() const + { + assert(payload); + return payload->get_mapping_id(); + } + + double get_cost() const + { + assert(payload); + return payload->get_cost(); + } + + void set_cost(double cost) + { + assert(payload); + payload->set_cost(cost); + } + + auto get_reorderer_payload() const + { + assert(payload); + payload->populate_deps_scheduling_info(); + return reserved::reorderer_payload( + payload->get_symbol(), + payload->get_mapping_id(), + payload->get_successors(), + payload->get_predecessors(), + payload->get_task_deps()); + } + + void set_exec_place(exec_place e_place) + { + assert(payload); + payload->set_exec_place(e_place); + } +}; + +/** + * @brief Deferred tasks are tasks that are not executed immediately, but rather upon `ctx.submit()`. This allows + * the library to perform optimizations on the task graph before it is executed. + * + * @tparam Data The dependencies of the task + */ +template +class deferred_stream_task : public deferred_stream_task<> +{ + struct payload_t : public deferred_stream_task<>::payload_t + { + template + payload_t(backend_ctx_untyped ctx, exec_place e_place, task_dep... deps) + : task(mv(ctx), mv(e_place)) + { + task.add_deps(mv(deps)...); + } + + // Untyped task information. Will be needed later for launching the task. + stream_task<> task; + // Function that launches the task. 
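+ // (populated by operator->*() below and invoked by run() once the context
+ // submits the deferred task)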
+ ::std::function&)> todo; + // More data could go here + + void set_symbol(::std::string s) override + { + task.set_symbol(mv(s)); + } + + const ::std::string& get_symbol() const override + { + return task.get_symbol(); + } + + int get_mapping_id() const override + { + return task.get_mapping_id(); + } + + void run() override + { + todo(task); + } + + void populate_deps_scheduling_info() override + { + task.populate_deps_scheduling_info(); + } + + const task_dep_vector_untyped& get_task_deps() const override + { + return task.get_task_deps(); + } + + void set_exec_place(exec_place e_place) override + { + task.set_exec_place(e_place); + } + }; + + payload_t& my_payload() const + { + // Safe to do the cast because we've set the pointer earlier ourselves + return *static_cast(payload.get()); + } + +public: + /** + * @brief Construct a new deferred stream task object from a context, execution place, and depdenencies. + * + * @param ctx the parent context + * @param e_place the place where the task will execute + * @param deps task dependencies + */ + deferred_stream_task(backend_ctx_untyped ctx, exec_place e_place, task_dep... deps) + : deferred_stream_task<>(::std::make_shared(mv(ctx), mv(e_place), mv(deps)...)) + {} + + ///@{ + /** + * @name Set the symbol of the task. This is used for profiling and debugging. + * + * @param s + * @return deferred_stream_task& + */ + deferred_stream_task& set_symbol(::std::string s) & + { + payload->set_symbol(mv(s)); + return *this; + } + + deferred_stream_task&& set_symbol(::std::string s) && + { + set_symbol(mv(s)); + return mv(*this); + } + ///@} + + void populate_deps_scheduling_info() + { + payload->populate_deps_scheduling_info(); + } + + template + void operator->*(Fun fun) + { + my_payload().todo = [f = mv(fun)](stream_task<>& untyped_task) { + // Here we have full type info; we can downcast to typed stream_task + auto& task = static_cast&>(untyped_task); + task.operator->*(f); + }; + } +}; +#endif // DOXYGEN_SHOULD_SKIP_THIS + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/utility/cartesian_iterator.cuh b/cudax/include/cuda/experimental/__stf/utility/cartesian_iterator.cuh new file mode 100644 index 00000000000..64f87be6ffd --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/utility/cartesian_iterator.cuh @@ -0,0 +1,523 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +namespace cuda::experimental::stf::reserved +{ + +#if 0 +/** + * @brief Implementation of a cartesian product of multiple iterators + * + * This is an experimental class for now, and has no user in the code yet. + */ +template +class CartesianProduct { +public: + CartesianProduct(Iterators... begins, Iterators... ends) : begins(begins...), ends(ends...), current(begins...) 
{} + + bool is_end() const { return end_reached; } + + CartesianProduct& operator++() { + increment_helper(::std::index_sequence_for {}); + end_reached = (current == ends); + return *this; + } + + auto operator*() const { return current; } + + bool operator==(const CartesianProduct& other) const { return current == other.current; } + + bool operator!=(const CartesianProduct& other) const { return !(*this == other); } + +private: + template <::std::size_t... Is> + void increment_helper(::std::index_sequence) { + (((::std::get(current) != ::std::get(ends) && + ++diagonal > ::std::distance(::std::get(begins), ::std::get(current))) + ? (++::std::get(current), 0) + : 0), + ...); + } + + ::std::tuple begins; + ::std::tuple ends; + ::std::tuple current; + bool end_reached = false; + + // Keep track of the total "diagonal" (sum of indices) + size_t diagonal = 0; +}; + +# ifdef UNITTESTED_FILE +// This currently fails... +UNITTEST("cartesian product") { + ::std::vector numbers { 1, 2, 3 }; + ::std::vector letters { 'a', 'b', 'c' }; + ::std::vector decimals { 0.1, 0.2 }; + + auto product = + CartesianProduct<::std::vector::iterator, ::std::vector::iterator, ::std::vector::iterator>( + numbers.begin(), letters.begin(), decimals.begin(), numbers.end(), letters.end(), decimals.end()); + + while (!product.is_end()) { + auto tuple = *product; + ::std::cout << "(" << *(::std::get<0>(tuple)) << ", " << *(::std::get<1>(tuple)) << ", " << *(::std::get<2>(tuple)) + << ")" << ::std::endl; + + ++product; + } +}; + +# endif // UNITTESTED_FILE + +#endif + +#if 0 +class RangeIterator { +public: + using value_type = size_t; + _CCCL_HOST_DEVICE explicit RangeIterator(int value) : currentValue(value) {} + + _CCCL_HOST_DEVICE int operator*() const { return currentValue; } + + _CCCL_HOST_DEVICE RangeIterator& operator++() { + ++currentValue; + return *this; + } + + _CCCL_HOST_DEVICE RangeIterator operator++(int) { + RangeIterator temp = *this; + ++(*this); + return temp; + } + + _CCCL_HOST_DEVICE bool operator==(const RangeIterator& other) const { return currentValue == other.currentValue; } + + _CCCL_HOST_DEVICE bool operator!=(const RangeIterator& other) const { return !(*this == other); } + +private: + int currentValue; +}; +#endif + +class RangeIterator +{ +public: + using value_type = size_t; + using difference_type = ptrdiff_t; + using pointer = const value_type*; + using reference = const value_type&; + using iterator_category = ::std::random_access_iterator_tag; + + _CCCL_HOST_DEVICE explicit RangeIterator(int value) + : currentValue(value) + {} + + _CCCL_HOST_DEVICE value_type operator*() const + { + return currentValue; + } + + _CCCL_HOST_DEVICE RangeIterator& operator++() + { + ++currentValue; + return *this; + } + + _CCCL_HOST_DEVICE RangeIterator operator++(int) + { + RangeIterator temp = *this; + ++(*this); + return temp; + } + + _CCCL_HOST_DEVICE RangeIterator& operator--() + { + --currentValue; + return *this; + } + + _CCCL_HOST_DEVICE RangeIterator operator--(int) + { + RangeIterator temp = *this; + --(*this); + return temp; + } + + _CCCL_HOST_DEVICE RangeIterator& operator+=(difference_type n) + { + currentValue += static_cast(n); + return *this; + } + + _CCCL_HOST_DEVICE RangeIterator operator+(difference_type n) const + { + RangeIterator result = *this; + result += n; + return result; + } + + _CCCL_HOST_DEVICE RangeIterator& operator-=(difference_type n) + { + currentValue -= static_cast(n); + return *this; + } + + _CCCL_HOST_DEVICE RangeIterator operator-(difference_type n) const + { + 
RangeIterator result = *this; + result -= n; + return result; + } + + _CCCL_HOST_DEVICE difference_type operator-(const RangeIterator& other) const + { + return currentValue - other.currentValue; + } + + _CCCL_HOST_DEVICE value_type operator[](difference_type n) const + { + return *(*this + n); + } + + _CCCL_HOST_DEVICE bool operator==(const RangeIterator& other) const + { + return currentValue == other.currentValue; + } + + _CCCL_HOST_DEVICE bool operator!=(const RangeIterator& other) const + { + return !(*this == other); + } + + _CCCL_HOST_DEVICE bool operator<(const RangeIterator& other) const + { + return currentValue < other.currentValue; + } + + _CCCL_HOST_DEVICE bool operator>(const RangeIterator& other) const + { + return currentValue > other.currentValue; + } + + _CCCL_HOST_DEVICE bool operator<=(const RangeIterator& other) const + { + return currentValue <= other.currentValue; + } + + _CCCL_HOST_DEVICE bool operator>=(const RangeIterator& other) const + { + return currentValue >= other.currentValue; + } + +private: + int currentValue; +}; + +class Range +{ +public: + using value_type = size_t; + using iterator = RangeIterator; + using difference_type = iterator::difference_type; + + _CCCL_HOST_DEVICE explicit Range(int n) + : beginIterator(0) + , endIterator(n) + {} + + _CCCL_HOST_DEVICE iterator begin() const + { + return beginIterator; + } + + _CCCL_HOST_DEVICE iterator end() const + { + return endIterator; + } + + _CCCL_HOST_DEVICE value_type size() const + { + return endIterator - beginIterator; + } + + _CCCL_HOST_DEVICE value_type operator[](difference_type index) const + { + return *(beginIterator + index); + } + +private: + iterator beginIterator; + iterator endIterator; +}; + +#ifdef UNITTESTED_FILE +namespace reserved +{ +template +__global__ void unit_test_range_func(T n) +{ + Range range(n); + int sum = 0; + for (auto i : range) + { + // printf("CUDA %ld\n", i); + sum++; + } + assert(sum == n); +} + +} // namespace reserved + +UNITTEST("range") +{ + int n = 10; + Range range(n); + + ::std::vector check(n); + + for (int num : range) + { + // fprintf(stderr, "->%d\n", num); + check[num] = 1; + } + + for (int i = 0; i < n; i++) + { + EXPECT(check[i] == 1); + } + + reserved::unit_test_range_func<<<1, 1>>>(n); + cudaDeviceSynchronize(); +}; +#endif // UNITTESTED_FILE + +class StridedRangeIterator +{ +public: + using value_type = size_t; + using difference_type = ptrdiff_t; + using pointer = const value_type*; + using reference = const value_type&; + using iterator_category = ::std::random_access_iterator_tag; + + _CCCL_HOST_DEVICE explicit StridedRangeIterator(value_type value, difference_type stride) + : currentValue(value) + , stride(stride) + {} + + _CCCL_HOST_DEVICE value_type operator*() const + { + return currentValue; + } + + _CCCL_HOST_DEVICE StridedRangeIterator& operator++() + { + currentValue += stride; + return *this; + } + + _CCCL_HOST_DEVICE StridedRangeIterator operator++(int) + { + StridedRangeIterator temp = *this; + ++(*this); + return temp; + } + + _CCCL_HOST_DEVICE StridedRangeIterator& operator--() + { + currentValue -= stride; + return *this; + } + + _CCCL_HOST_DEVICE StridedRangeIterator operator--(int) + { + StridedRangeIterator temp = *this; + --(*this); + return temp; + } + + _CCCL_HOST_DEVICE StridedRangeIterator& operator+=(difference_type n) + { + currentValue += stride * n; + return *this; + } + + _CCCL_HOST_DEVICE StridedRangeIterator& operator-=(difference_type n) + { + currentValue -= stride * n; + return *this; + } + + _CCCL_HOST_DEVICE 
StridedRangeIterator operator+(difference_type n) const + { + return StridedRangeIterator(currentValue + stride * n, stride); + } + + _CCCL_HOST_DEVICE StridedRangeIterator operator-(difference_type n) const + { + return StridedRangeIterator(currentValue - stride * n, stride); + } + + _CCCL_HOST_DEVICE difference_type operator-(const StridedRangeIterator& other) const + { + return (currentValue - other.currentValue) / stride; + } + + _CCCL_HOST_DEVICE bool operator==(const StridedRangeIterator& other) const + { + return currentValue == other.currentValue; + } + + _CCCL_HOST_DEVICE bool operator!=(const StridedRangeIterator& other) const + { + return currentValue != other.currentValue; + } + + _CCCL_HOST_DEVICE bool operator<(const StridedRangeIterator& other) const + { + return currentValue < other.currentValue; + } + + _CCCL_HOST_DEVICE bool operator>(const StridedRangeIterator& other) const + { + return currentValue > other.currentValue; + } + + _CCCL_HOST_DEVICE bool operator<=(const StridedRangeIterator& other) const + { + return currentValue <= other.currentValue; + } + + _CCCL_HOST_DEVICE bool operator>=(const StridedRangeIterator& other) const + { + return currentValue >= other.currentValue; + } + +private: + value_type currentValue; + difference_type stride; +}; + +class StridedRange +{ +public: + using value_type = size_t; + using iterator = StridedRangeIterator; + using difference_type = typename iterator::difference_type; + + _CCCL_HOST_DEVICE explicit StridedRange(value_type start, value_type end, difference_type stride) + : start_(start) + , end_(end) + , stride_(stride) + { + // Adjust end to the next multiple of stride if needed + if (start > end) + { + end_ = start_; + } + else if ((end_ - start_) % stride_ != 0) + { + end_ = start_ + stride_ * ((end_ - start_) / stride_ + 1); + } + } + + _CCCL_HOST_DEVICE iterator begin() const + { + return iterator(start_, stride_); + } + + _CCCL_HOST_DEVICE iterator end() const + { + return iterator(end_, stride_); + } + +private: + value_type start_; + value_type end_; + difference_type stride_; +}; + +#ifdef UNITTESTED_FILE +UNITTEST("StridedRange") +{ + StridedRange range(12, 1024, 17); + size_t cnt = 0; + size_t expected_cnt = (1024 - 12 + 17 - 1) / 17; + + for (auto it = range.begin(); it != range.end(); ++it) + { + // ::std::cout << *it << " "; + EXPECT(cnt < expected_cnt); + cnt++; + } + + // ::std::cout << ::std::endl; + + EXPECT(cnt == expected_cnt); +}; + +UNITTEST("StridedRange loop") +{ + size_t nthreads = 16; + size_t n = 48; + size_t cnt = 0; + for (size_t tid = 0; tid < nthreads; tid++) + { + StridedRange range(tid, n, nthreads); + // ::std::cout << "Proc : " << tid << "=>"; + for (auto it = range.begin(); it != range.end(); ++it) + { + // ::std::cout << *it << " "; + EXPECT(cnt < n); + cnt++; + } + // ::std::cout << ::std::endl; + } + + EXPECT(cnt == n); +}; + +template +__global__ void unit_test_strided_range_func(T n) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + StridedRange r(tid, n, nthreads); + for (auto i : r) + { + // printf("CUDA %ld\n", i); + } +} + +UNITTEST("StridedRange CUDA") +{ + size_t n = 100; + unit_test_strided_range_func<<<4, 2>>>(n); + cudaDeviceSynchronize(); +}; +#endif // UNITTESTED_FILE + +} // end namespace cuda::experimental::stf::reserved diff --git a/cudax/include/cuda/experimental/__stf/utility/constant_logical_data.cuh b/cudax/include/cuda/experimental/__stf/utility/constant_logical_data.cuh new file mode 100644 index 
00000000000..c7e91378d6a --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/utility/constant_logical_data.cuh @@ -0,0 +1,125 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Implementation of constant_logical_data + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief A constant piece of data that relies on frozen logical data + */ +template +class constant_logical_data +{ + class impl + { + public: + // Initialize from an existing logical data + template + impl(ctx_t& ctx, logical_data orig_ld) + : ld(mv(orig_ld)) + , frozen_ld(ctx.freeze(ld, access_mode::read)) + , stream(ctx.pick_stream()) + { + // Freeze it and ensure it will be unfrozen automatically + frozen_ld.set_automatic_unfreeze(true); + + // Cache a stream : we can use the same since we block on it so there + // is no reason for taking different ones + } + + // Return an instance of the data on the specified data place. This will block if necessary. + T& get(const data_place& where = data_place::current_device()) + { + auto it = cached.find(where); + if (it != cached.end()) + { + // There is a cached value, return it + return it->second; + } + + // We need to populate the cache + + // Insert the new value and get the iterator to the newly inserted item + it = cached.emplace(where, frozen_ld.get(where, stream)).first; + + // Wait for the cache entry to be available + cuda_safe_call(cudaStreamSynchronize(stream)); + + return it->second; + } + + private: + logical_data ld; + frozen_logical_data frozen_ld; + + // cached stream used to get instances + cudaStream_t stream = {}; + + // A cache of the instances, we can use them without further + // synchronization once they are populated. If an instance is missing, we + // will get it in a blocking manner. 
+ ::std::unordered_map> cached; + }; + +public: + template + constant_logical_data(ctx_t& ctx, logical_data orig_ld) + : pimpl(::std::make_shared(ctx, mv(orig_ld))) + {} + + // So that we can have a constant data variable that is populated later + constant_logical_data() = default; + + // Copy constructor + constant_logical_data(const constant_logical_data& other) = default; + + // Move constructor + constant_logical_data(constant_logical_data&& other) noexcept = default; + + // Copy assignment + constant_logical_data& operator=(const constant_logical_data& other) = default; + + // Move assignment + constant_logical_data& operator=(constant_logical_data&& other) noexcept = default; + + T& get(const data_place& where = data_place::current_device()) + { + assert(pimpl); + return pimpl->get(where); + } + +private: + ::std::shared_ptr pimpl = nullptr; +}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/utility/core.cuh b/cudax/include/cuda/experimental/__stf/utility/core.cuh new file mode 100644 index 00000000000..575a6ac1c16 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/utility/core.cuh @@ -0,0 +1,696 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** @file + * @brief Widely used artifacts used by most of the library. + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuda::experimental::stf +{ + +// Hack setenv on Windows +#if defined(_CCCL_COMPILER_MSVC) +/** + * @brief Sets an environment variable, mimicking the behavior of `std::setenv` on Windows. + * + * This function attempts to set the value of the environment variable `name` to `value`. + * If `overwrite` is 0 and the variable already exists, the function does nothing. + * + * @param name The name of the environment variable. + * @param value The value to assign to the environment variable. + * @param overwrite If non-zero, the function will overwrite the existing value of the variable. + * @return 0 on success, or -1 on failure (invalid input or memory allocation failure). + * @note This function is designed for MSVC, which lacks a standard `setenv` function. 
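+ *
+ * A short usage sketch (hypothetical variable name):
+ * @code
+ * setenv("MY_VARIABLE", "value", 1); // overwrites any existing value
+ * setenv("MY_VARIABLE", "other", 0); // keeps "value" because overwrite is 0
+ * @endcode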
+ */ +inline int setenv(const char* name, const char* value, int overwrite) +{ + if (!name || !value || !name[0]) + { + // Invalid input: name or value is null, or name is an empty string + return -1; + } + + // Check if the variable already exists and if overwrite is allowed + if (!overwrite && ::std::getenv(name) != nullptr) + { + return 0; // Variable exists, and we're not allowed to overwrite it + } + + // Construct the string in the form "NAME=VALUE" + auto env_var = ::std::string(name) + "=" + value; + + // Use _putenv to set the environment variable in MSVC + if (_putenv(env_var.c_str()) != 0) + { + return -1; // _putenv failed + } + + return 0; // Success +} +#endif + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // FIXME Doxygen is lost with decltype(auto) +/** + * @brief Custom move function that performs checks on the argument type. + * + * @tparam T Type of the object being moved. The type should satisfy certain conditions for the move to be performed. + * @param obj The object to be moved. + * @return The moved object, ready to be passed to another owner. + * + * @pre The argument `obj` must be an lvalue, i.e., the function will fail to compile for rvalues. + * @pre The argument `obj` must not be `const`, i.e., the function will fail to compile for `const` lvalues. + */ +template +_CCCL_HOST_DEVICE constexpr decltype(auto) mv(T&& obj) +{ + static_assert(::std::is_lvalue_reference_v, "Useless move from rvalue."); + static_assert(!::std::is_const_v<::std::remove_reference_t>, "Misleading move from const lvalue."); + return ::std::move(obj); +} +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * @brief Creates a `std::shared_ptr` managing a copy of the given object. + * + * This function takes an object of any type and returns a `std::shared_ptr` + * that manages a copy of that object. If the object is an lvalue reference, + * it will be copied into the `shared_ptr`. If the object is an rvalue reference, + * it will be moved into the `shared_ptr`. + * + * The type managed by the `shared_ptr` has all references and `const`/`volatile` + * qualifiers removed from the original type. + * + * @tparam T The type of the object, deduced automatically. May be an lvalue or rvalue reference. + * @param obj The object to copy into the instance managed by the `shared_ptr`. + * @return A `std::shared_ptr` managing a new copy of the object. + * + * @note This function simplifies the creation of `std::shared_ptr`s by handling + * the type deduction and appropriate forwarding of the object. It's particularly + * useful when you want to create a `shared_ptr` from temporary objects or when + * the object's type includes references or cv-qualifiers. + * + * @code + * int value = 42; + * auto sp1 = to_shared(value); // New shared_ptr + * assert(*sp1 == 42); // sp1 points to an int valued at 42 + * @endcode + */ +template +auto to_shared(T&& obj) +{ + return ::std::make_shared<::std::remove_cv_t<::std::remove_reference_t>>(::std::forward(obj)); +} + +/** + * @brief Create an iterable range from 'from' to 'to' + * + * @tparam T The type of the start range value + * @tparam U The type of the end range value + * @param from The start value of the range + * @param to The end value of the range + * + * @return A range of values from 'from' to 'to' + * + * @note The range includes 'from' and excludes 'to'. The actual type iterated is determined as the type of the + * expression `true ? from : to`. This ensures expected behavior for iteration with different `from` and `to` types. 
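+ *
+ * A short illustrative sketch:
+ * @code
+ * for (auto i : each(2, 6))
+ * {
+ *   // visits 2, 3, 4, 5
+ * }
+ * @endcode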
+ */ +template +_CCCL_HOST_DEVICE auto each(T from, U to) +{ + using common = ::std::remove_reference_t; + + class iterator + { + common value; + + public: + _CCCL_HOST_DEVICE iterator(common value) + : value(mv(value)) + {} + + _CCCL_HOST_DEVICE common operator*() const + { + return value; + } + + _CCCL_HOST_DEVICE iterator& operator++() + { + if constexpr (::std::is_enum_v) + { + value = static_cast(static_cast<::std::underlying_type_t>(value) + 1); + } + else + { + ++value; + } + return *this; + } + + _CCCL_HOST_DEVICE bool operator!=(const iterator& other) const + { + return value != other.value; + } + }; + + class each_t + { + common begin_, end_; + + public: + _CCCL_HOST_DEVICE each_t(T begin, U end) + : begin_(mv(begin)) + , end_(mv(end)) + {} + _CCCL_HOST_DEVICE iterator begin() const + { + return iterator(begin_); + } + _CCCL_HOST_DEVICE iterator end() const + { + return iterator(end_); + } + }; + + return each_t{mv(from), mv(to)}; +} + +/** + * @brief Create an iterable range from `T(0)` to `to` + * + * @tparam T The type of the end range value + * @param to The end value of the range + * + * @return A range of values from `T(0)` to `to` + * + * @note The range includes 0 and excludes `to` + */ +template +auto each(T to) +{ + static_assert(!::std::is_pointer_v, "Use the two arguments version of each() with pointers."); + if constexpr (::std::is_signed_v) + { + assert(to >= 0 && "Attempt to iterate from 0 to a negative value."); + } + return each(T(0), mv(to)); +} + +/** + * @brief Applies a callable object `f` to each integral constant within a given range `[0, n)`. + * + * This function template takes a callable object `f` and applies it to each integral constant + * in the range `[0, n)`. The callable object is expected to take a single argument of type + * `std::integral_constant` (or `size_t`), where `i` is the current index. + * + * The important element is that the lambda can use its integral argument during compilation, e.g. + * to fetch a tuple element with `std::get(t)`. + * + * @tparam n The number of times the callable object `f` should be applied. + * @tparam F Type of the callable object. + * @tparam i... (Internal) Indices for parameter pack expansion. + * @param f The callable object to apply to each integral constant. + * + * Example usage: + * @code + * auto print_index = [](auto index) { ::std::cout << index << ' '; }; + * unroll<5>(print_index); // Output: 0 1 2 3 4 + * @endcode + * + * Note: Since this function is `constexpr`, it can be used at compile-time if `f` is a + * compile-time invocable object. + */ +template +constexpr void unroll(F&& f, ::std::index_sequence = ::std::index_sequence<>()) +{ + if constexpr (sizeof...(i) != n) + { + return unroll(::std::forward(f), ::std::make_index_sequence()); + } + else + { + using result_t = decltype(f(::std::integral_constant())); + if constexpr (::std::is_same_v) + { + (f(::std::integral_constant()), ...); + } + else + { + (f(::std::integral_constant()) && ...); + } + } +} + +/** + * @brief Prepends an element to a tuple. + * + * This function creates a new tuple by prepending the element `t` to the tuple `p`. + * + * @tparam T The type of the element to prepend. + * @tparam P The types of the elements in the tuple. + * @param prefix The element to prepend. + * @param tuple The tuple to which the element is prepended. + * + * @return std::tuple A new tuple with `t` prepended to `p`. 
+ * + * @par Example: + * @code + * int a = 1; + * std::tuple t = std::make_tuple(2, 3.0, 'c'); + * auto result = tuple_prepend(a, t); + * // result is std::tuple(1, 2, 3.0, 'c') + * @endcode + */ +template +constexpr auto tuple_prepend(T&& prefix, ::std::tuple tuple) +{ + return ::std::apply( + [&](auto&&... p) { + return ::std::tuple(::std::forward(prefix), ::std::forward(p)...); + }, + mv(tuple)); +} + +namespace reserved +{ + +// Like ::std::make_tuple, but skips all values of the same type as `::std::ignore`. +inline constexpr auto make_tuple() +{ + return ::std::tuple<>(); +} + +template +constexpr auto make_tuple([[maybe_unused]] T t, P... p) +{ + if constexpr (::std::is_same_v) + { + // Recurse skipping the first parameter + return make_tuple(mv(p)...); + } + else + { + // Keep first parameter, concatenate with recursive call + return tuple_prepend(mv(t), make_tuple(mv(p)...)); + } +} + +} // namespace reserved + +/** + * @brief Creates a `std::tuple` by applying a callable object `f` to each integral constant within a given range `[0, + * n)`. + * + * This function template takes a callable object `f` and applies it to each integral constant in the range `[0, n)`. + * The results of the calls are collected into a `std::tuple` and returned. The callable object is expected to take a + * single argument of type `std::integral_constant`, where `i` is the current index, and return a value of + * the desired type and value. + * + * If `f` returns `std::ignore` for any argument(s), the corresponding value(s) will be skipped in the resulting + * tuple. + * + * @tparam n The number of times the callable object `f` should be applied. + * @tparam F Type of the callable object. + * @tparam i... (Internal) Indices for parameter pack expansion. + * @param f The callable object to apply to each integral constant. + * @return A `std::tuple` containing the results of applying `f` to each integral constant in the range `[0, n)`. + * + * Example usage: + * @code + * auto make_double = [](auto index) { + * if constexpr (index == 2) + * return std::ignore; + * else + * return static_cast(index); + * }; + * auto result = make_tuple_indexwise<5>(make_double); + * // result is std::tuple{0.0, 1.0, 3.0, 4.0} + * @endcode + * + * Note: Since this function is `constexpr`, it can be used at compile-time if `f` is a compile-time invocable object. + */ +template +constexpr auto make_tuple_indexwise(F&& f, ::std::index_sequence = ::std::index_sequence<>()) +{ + if constexpr (sizeof...(i) != n) + { + return make_tuple_indexwise(::std::forward(f), ::std::make_index_sequence()); + } + else + { + return reserved::make_tuple(f(::std::integral_constant())...); + } +} + +/** + * @brief Iterates over the elements of a tuple, applying a given function object to each element. + * + * The function `each_in_tuple` accepts a tuple and a callable object `f`. If `f` accepts two parameters, it is invoked + * with the index as a `std::integral_constant` and the value at that index in the tuple for each element. If `f` + * accepts only one parameter, it is invoked with the value at each index in the tuple. + * + * @tparam Tuple The type of the tuple over which to iterate. + * @tparam F The type of the callable object to apply to each element in the tuple. + * @tparam i... An optional parameter pack representing indices (used internally in the recursive implementation). + * @param t The tuple over which to iterate. + * @param f The callable object to apply to each element in the tuple. 
+ * @param std::index_sequence An optional index sequence used internally in the recursive implementation
+ * (default-constructed as empty).
+ */
+template <typename Tuple, typename F>
+constexpr void each_in_tuple(Tuple&& t, F&& f)
+{
+  constexpr size_t n = ::std::tuple_size_v<::std::remove_reference_t<Tuple>>;
+  unroll<n>([&](auto j) {
+    if constexpr (::std::is_invocable_v<F, decltype(j), decltype(::std::get<j>(::std::forward<Tuple>(t)))>)
+    {
+      f(j, ::std::get<j>(::std::forward<Tuple>(t)));
+    }
+    else
+    {
+      f(::std::get<j>(::std::forward<Tuple>(t)));
+    }
+  });
+}
+
+namespace reserved
+{
+// Implementation of each_in_pack below
+template <typename F, size_t... i, typename... P>
+constexpr void each_in_pack(F&& f, ::std::index_sequence<i...>, P&&... p)
+{
+  if constexpr (::std::is_invocable_v<F,
+                                      ::std::integral_constant<size_t, 0>,
+                                      ::std::tuple_element_t<0, ::std::tuple<P...>>>)
+  {
+    (f(::std::integral_constant<size_t, i>(), ::std::forward<P>(p)), ...);
+  }
+  else
+  {
+    (f(::std::forward<P>(p)), ...);
+  }
+}
+} // namespace reserved
+
+/**
+ * @brief Applies a given function to each element in a parameter pack.
+ *
+ * If the callable object `f` accepts two parameters, `each_in_pack` calls `f(std::integral_constant<size_t, i>(),
+ * std::forward<P>(p))` for each `p` at position `i` in the parameter pack. Otherwise, `each_in_pack` calls
+ * `f(std::forward<P>(p))` for each `p` in the parameter pack.
+ *
+ * @tparam F Callable object type
+ * @tparam P Variadic template parameter pack type
+ * @param f Callable object to apply to each element in the parameter pack
+ * @param p Variadic parameter pack
+ *
+ * \code{.cpp}
+ * std::string s;
+ * cuda::experimental::stf::each_in_pack([&](auto&& p) { s += std::to_string(p) + ", "; }, 1, 2, 3, 4, 5);
+ * assert(s == "1, 2, 3, 4, 5, ");
+ * s = "";
+ * cuda::experimental::stf::each_in_pack([&](auto i, auto&& p) { s += (i ? ", " : "") + std::to_string(i) + p; }, "a",
+ * "b", "c"); assert(s == "0a, 1b, 2c");
+ * \endcode
+ */
+template <typename F, typename... P>
+constexpr void each_in_pack(F&& f, P&&... p)
+{
+  if constexpr (sizeof...(P) > 0)
+  {
+    reserved::each_in_pack(::std::forward<F>(f), ::std::make_index_sequence<sizeof...(P)>(), ::std::forward<P>
(p)...); + } +} + +/** + * @brief Casts an enum value to its underlying type. + * + * This function template takes an enum value and returns its representation + * in the underlying integral type of the enum. + * + * @tparam E The type of the enum. + * @param value The enum value to be cast to its underlying type. + * @return The underlying integral value of the given enum value. + */ +template +auto as_underlying(E value) +{ + return static_cast<::std::underlying_type_t>(value); +} + +template +constexpr V get_reserved_default() +{ + return ::std::is_floating_point_v ? -::std::numeric_limits::max() + : ::std::is_unsigned_v + ? ::std::numeric_limits::max() + : ::std::numeric_limits::min(); +} + +template +using __copy_type_t = decltype(V); + +/** + * @brief A class to represent a value that can be either static or dynamic. + * + * @tparam static_v The static value. If the value is 'reserved', it means it's dynamic. + * @tparam reserved A reserved value indicating the value is dynamic. + * + * All arithmetic and logic operators are supported (if the original type supports them). + */ +template reserved = get_reserved_default<__copy_type_t>()> +class optionally_static +{ +public: + /// The type of the underlying data + using type = decltype(static_v); + + /// Indicates whether the value is static. + static constexpr bool is_static = static_v != reserved; + + /// The reserved value for indicating dynamic state. + static constexpr auto reserved_v = reserved; + + /// Special functions. + constexpr optionally_static() = default; + /// Special functions. + constexpr optionally_static(const optionally_static&) = default; + /// Special functions. + constexpr optionally_static& operator=(const optionally_static&) = default; + + /** + * @brief Constructor that initializes a dynamic value. Works only if `static_v == reserved`. + * + * @param dynamic The dynamic value. + */ + constexpr optionally_static(type dynamic_value) + : payload(dynamic_value) + {} + + /** + * @brief Retrieves the stored value (either static or dynamic). + * + * @return The stored value. + */ + constexpr type get() const + { + if constexpr (is_static) + { + return static_v; + } + else + { + return payload; + } + } + + /** + * @brief Forwards to `get()`. + * + * @return The stored value. + */ + constexpr operator type() const + { + return get(); + } + + /** + * @brief Retrieves a reference to the stored value (assumed dynamic). + * + * @return The stored value. + */ + constexpr type& get_ref() + { + return payload; + } + + optionally_static& operator++() + { + ++get_ref(); + return *this; + } + + optionally_static operator++(int) + { + auto copy = *this; + ++*this; + return copy; + } + + optionally_static& operator--() + { + --get_ref(); + return *this; + } + + optionally_static operator--(int) + { + auto copy = *this; + --*this; + return copy; + } + + optionally_static operator+() const + { + return *this; + } + + auto operator-() const + { + if constexpr (!is_static) + { + return -get(); + } + else if constexpr (-static_v == reserved) + { + /* rare case where we convert a static value into a dynamic one */ + return reserved; + } + else + { + return optionally_static<-static_v, reserved>(); + } + } + +private: + /// The dynamic state, conditional on whether the static value is reserved. 
+ struct nonesuch + {}; + using state_t = ::std::conditional_t; + [[no_unique_address]] state_t payload = state_t(); +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +// Operator implementations +# define _3197bc91feaf98030b2cc0b441d7b0ea(op) \ + template \ + constexpr auto operator op(const optionally_static& lhs, const optionally_static& rhs) \ + { \ + if constexpr (!std::remove_reference_t::is_static \ + || !std::remove_reference_t::is_static) \ + { \ + return lhs.get() op rhs.get(); \ + } \ + else if constexpr ((v1 op v2) == r) \ + { \ + /* rare case where we convert two static values into a dynamic one */ \ + return r; \ + } \ + else \ + { \ + return optionally_static<(v1 op v2), r>(); \ + } \ + } \ + template \ + constexpr auto operator op(const optionally_static& lhs, const T& rhs) \ + { \ + return lhs.get() op rhs; \ + } \ + template \ + constexpr auto operator op(const T& lhs, const optionally_static& rhs) \ + { \ + return lhs op rhs.get(); \ + } \ + template \ + constexpr auto& operator op##=(optionally_static& lhs, const optionally_static& rhs) \ + { \ + return lhs.get_ref() op## = rhs.get(); \ + } \ + template \ + constexpr auto& operator op##=(optionally_static& lhs, const T & rhs) \ + { \ + return lhs.get_ref() op## = rhs; \ + } + +// Implement for the usual operators +_3197bc91feaf98030b2cc0b441d7b0ea(+); +_3197bc91feaf98030b2cc0b441d7b0ea(-); +_3197bc91feaf98030b2cc0b441d7b0ea(*); +_3197bc91feaf98030b2cc0b441d7b0ea(/); +_3197bc91feaf98030b2cc0b441d7b0ea(%); +_3197bc91feaf98030b2cc0b441d7b0ea(&); +_3197bc91feaf98030b2cc0b441d7b0ea(|); +_3197bc91feaf98030b2cc0b441d7b0ea(^); +//_3197bc91feaf98030b2cc0b441d7b0ea(<<); +//_3197bc91feaf98030b2cc0b441d7b0ea(>>); + +# undef _3197bc91feaf98030b2cc0b441d7b0ea + +# define _3197bc91feaf98030b2cc0b441d7b0ea(op) \ + template \ + constexpr bool operator op(const optionally_static& lhs, const optionally_static& rhs) \ + { \ + return lhs.get() op rhs.get(); \ + } \ + template >>> \ + constexpr bool operator op(const optionally_static& lhs, const T& rhs) \ + { \ + return lhs.get() op rhs; \ + } \ + template >>> \ + constexpr bool operator op(const T& lhs, const optionally_static& rhs) \ + { \ + return lhs op rhs.get(); \ + } + +_3197bc91feaf98030b2cc0b441d7b0ea(==); +_3197bc91feaf98030b2cc0b441d7b0ea(!=); +_3197bc91feaf98030b2cc0b441d7b0ea(<); +_3197bc91feaf98030b2cc0b441d7b0ea(>); +_3197bc91feaf98030b2cc0b441d7b0ea(<=); +_3197bc91feaf98030b2cc0b441d7b0ea(>=); + +# undef _3197bc91feaf98030b2cc0b441d7b0ea + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/utility/cuda_attributes.cuh b/cudax/include/cuda/experimental/__stf/utility/cuda_attributes.cuh new file mode 100644 index 00000000000..c2954212a6d --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/utility/cuda_attributes.cuh @@ -0,0 +1,29 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(__CUDACC__) && !defined(_NVHPC_CUDA) +# define CUDASTF_NO_DEVICE_STACK +#elif defined(__CUDACC__) && defined(_NVHPC_CUDA) +# define CUDASTF_NO_DEVICE_STACK _Pragma("diag_suppress no_device_stack") +#else +# define CUDASTF_NO_DEVICE_STACK +#endif diff --git a/cudax/include/cuda/experimental/__stf/utility/cuda_safe_call.cuh b/cudax/include/cuda/experimental/__stf/utility/cuda_safe_call.cuh new file mode 100644 index 00000000000..ded8bd21868 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/utility/cuda_safe_call.cuh @@ -0,0 +1,433 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @dir utility + * + * @brief Utility functions and classes + */ +/** + * @file + * @brief Facilities for error detection and error handling + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include +#include + +#if __has_include() +# include +#endif + +namespace cuda::experimental::stf +{ + +#if __has_include() +// Undocumented +inline const char* cusolverGetErrorString(const cusolverStatus_t status) +{ + switch (status) + { + default: + break; +# define _b738a2a5fe81ee876deadae4a109521c(x) \ + case x: \ + return #x + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_SUCCESS); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_NOT_INITIALIZED); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_ALLOC_FAILED); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_INVALID_VALUE); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_ARCH_MISMATCH); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_MAPPING_ERROR); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_EXECUTION_FAILED); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_INTERNAL_ERROR); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_NOT_SUPPORTED); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_ZERO_PIVOT); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_INVALID_LICENSE); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_IRS_PARAMS_INVALID); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_IRS_INTERNAL_ERROR); + 
_b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_IRS_NOT_SUPPORTED); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_IRS_OUT_OF_RANGE); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_IRS_MATRIX_SINGULAR); + _b738a2a5fe81ee876deadae4a109521c(CUSOLVER_STATUS_INVALID_WORKSPACE); +# undef _b738a2a5fe81ee876deadae4a109521c + } + return "Unknown cuSOLVER status"; +} +#endif + +/** + * @brief Exception type across CUDA, CUBLAS, and CUSOLVER. + * + * @paragraph example Example + * @snippet this cuda_exception + */ +class cuda_exception : public ::std::exception +{ +public: + cuda_exception() = delete; + // TODO (miscco): Why was this not copyable? + // cuda_exception(const cuda_exception&) = delete; + + /** + * @brief Constructs an exception object from a status value. + * + * If `status` is `0`, the exception is still created with an empty error message. Otherwise, the constructor + * initializes the error message (later accessible with `what()`) appropriately. + * + * @tparam T status type, can be `cudaError_t`, `cublasStatus_t`, or `cusolverStatus_t` + * @param status status value, usually the result of a CUDA API call + * @param loc location of the call, defaulted + */ + template + cuda_exception(const T status, const source_location loc = RESERVED_STF_SOURCE_LOCATION()) + { + // All "success" statuses are zero + static_assert(cudaSuccess == 0 && CUDA_SUCCESS == 0 +#if __has_include() + && CUBLAS_STATUS_SUCCESS == 0 +#endif +#if __has_include() + && CUSOLVER_STATUS_SUCCESS == 0 +#endif + , + "Please revise this function."); + + // Common early exit test for all cases + if (status == 0) + { + return; + } + + int dev = -1; + cudaGetDevice(&dev); + +#if __has_include() + if constexpr (::std::is_same_v) + { + format("%s(%u) [device %d] CUSOLVER error in call %s: %s.", + loc.file_name(), + loc.line(), + dev, + loc.function_name(), + cusolverGetErrorString(status)); + } + else +#endif // __has_include() +#if __has_include() + if constexpr (::std::is_same_v) + { + format("%s(%u) [device %d] CUBLAS error in %s: %s.", + loc.file_name(), + loc.line(), + dev, + loc.function_name(), + cublasGetStatusString(status)); + } + else +#endif // __has_include() + if constexpr (::std::is_same_v) + { + format("%s(%u) [device %d] CUDA OCC error in %s: %s.", + loc.file_name(), + loc.line(), + dev, + loc.function_name(), + cudaGetErrorString(cudaErrorInvalidConfiguration)); + } + else if constexpr (::std::is_same_v) + { + const char* error_string = nullptr; + cuGetErrorString(status, &error_string); + const char* error_name = nullptr; + cuGetErrorName(status, &error_name); + format("%s(%u) [device %d] CUDA DRIVER error in %s: %s (%s).", + loc.file_name(), + loc.line(), + dev, + loc.function_name(), + error_string, + error_name); + } + else + { + static_assert(::std::is_same_v, "Error: not a CUDA status."); + format("%s(%u) [device %d] CUDA error in %s: %s (%s).", + loc.file_name(), + loc.line(), + dev, + loc.function_name(), + cudaGetErrorString(status), + cudaGetErrorName(status)); + } + } + + /** + * @brief Returns a message describing the error. + * + * @return the error message + */ + const char* what() const noexcept override + { + return msg.c_str(); + } + +private: + template + void format(const char* fmt, Ps&&... 
ps) + { + // Compute the bytes to be written. + auto needed = ::std::snprintf(nullptr, 0, fmt, ps...); + // Pedantically reserve one extra character for the terminating `\0`. + msg.resize(needed + 1); + // This will write `needed` bytes plus a `\0` at the end. + ::std::snprintf(&msg[0], msg.capacity(), fmt, ::std::forward(ps)...); + // The terminating `\0` is not part of the string's length. + msg.resize(needed); + } + + ::std::string msg; +}; + +#ifdef UNITTESTED_FILE +//! [cuda_exception] +UNITTEST("cuda_exception") +{ + auto e = cuda_exception(CUDA_SUCCESS); + EXPECT(e.what()[0] == 0); +# if __has_include() + auto e1 = cuda_exception(CUSOLVER_STATUS_ZERO_PIVOT); + EXPECT(strlen(e1.what()) > 0u); +# endif +}; +//! [cuda_exception] +#endif // UNITTESTED_FILE + +namespace reserved +{ +template +struct first_param_impl; + +template +struct first_param_impl +{ + using type = P; +}; + +/* +`reserved::first_param` is an alias for the type of `fun`'s first parameter. +*/ +template +using first_param = typename first_param_impl::type; +} // namespace reserved + +#ifdef UNITTESTED_FILE +UNITTEST("first_param") +{ + extern int test1(int); + static_assert(::std::is_same_v, int>); + extern int test2(double, int); + static_assert(::std::is_same_v, double>); + extern int test3(int&&); + static_assert(::std::is_same_v, int&&>); +}; +#endif // UNITTESTED_FILE + +/** + * @brief Enforces successful call of CUDA API functions. + * + * If `status` is `0`, the function has no effect. Otherwise, the function prints pertinent error information to + * `stderr` and aborts the program. + * + * @tparam T status type, can be `cudaError_t`, `cublasStatus_t`, or `cusolverStatus_t` + * @param status status value, usually the result of a CUDA API call + * @param loc location of the call, defaulted + * + * @paragraph example Example + * @snippet this cuda_safe_call + */ +template +void cuda_safe_call(const T status, const source_location loc = RESERVED_STF_SOURCE_LOCATION()) +{ + // Common early exit test for all cases + if (status == 0) + { + return; + } + fprintf(stderr, "%s\n", cuda_exception(status, loc).what()); + abort(); +} + +#ifdef UNITTESTED_FILE +UNITTEST("cuda_safe_call") +{ + //! [cuda_safe_call] + cuda_safe_call(CUDA_SUCCESS); // no effect + int dev; + cuda_safe_call(cudaGetDevice(&dev)); // continue execution if the call is successful + if (false) + { + cuda_safe_call(CUDA_ERROR_INVALID_VALUE); // would abort application if called + } + //! [cuda_safe_call] +}; +#endif // UNITTESTED_FILE + +/** + * @brief Throws a `cuda_exception` if the given `status` is an error code + * + * @tparam Status CUDA error code type, such as `cudaError_t`, `cublasStatus_t`, or `cusolverStatus_t` + * @param status CUDA error code value, usually the result of a CUDA API call + * @param loc location of the call, defaulted + * + * The typical usage is to place a CUDA function call inside `cuda_try`, i.e. `cuda_try(cudaFunc(args))` (the + * same way `cuda_safe_call` would be called). For example, `cuda_try(cudaCreateStream(&stream))` is equivalent to + * `cudaCreateStream(&stream)`, with the note that the former call throws an exception in case of error. + * + * @paragraph example Example + * @snippet this cuda_try1 + */ +template +void cuda_try(Status status, const source_location loc = RESERVED_STF_SOURCE_LOCATION()) +{ + if (status) + { + throw cuda_exception(status, loc); + } +} + +#ifdef UNITTESTED_FILE +UNITTEST("cuda_try1") +{ + //! 
[cuda_try1] + cuda_try(CUDA_SUCCESS); // no effect, returns CUDA_SUCCESS + int dev; + cuda_try(cudaGetDevice(&dev)); // equivalent to the line above + try + { + cuda_try(CUDA_ERROR_INVALID_VALUE); // would abort application if called + } + catch (...) + { + // This point will be reached + return; + } + EXPECT(false, "Should not get here."); + //! [cuda_try1] +}; +#endif // UNITTESTED_FILE + +/** + * @brief Calls a CUDA function and throws a `cuda_exception` in case of failure. + * + * @tparam fun Name of the CUDA function to invoke, cannot be the name of an overloaded function + * @tparam Ps Parameter types to be forwarded + * @param ps Arguments to be forwarded + * @return `auto` (see below) + * + * In this overload of `cuda_try`, the function name is passed explicitly as a template argument and also the first + * argument is omitted, as in `cuda_try()`. `cuda_try` will create a temporary of the appropriate + * type internally, call the specified CUDA function with the address of that temporary, and then return the temporary. + * For example, in the call `cuda_try()`, the created stream object will be returned. That way you can + * write `auto stream = cuda_try();` instead of `cudaStream_t stream; cudaCreateStream(&stream);`. + * This invocation mode relies on the convention used by many CUDA functions with output parameters of specifying them + * in the first parameter position. + * + * Limitations: Does not work with overloaded functions. + * + * @paragraph example Example + * @snippet this cuda_try2 + */ +template +auto cuda_try(Ps&&... ps) +{ + if constexpr (::std::is_invocable_v) + { + cuda_try(fun(::std::forward(ps)...)); + } + else + { + ::std::remove_pointer_t> result{}; + cuda_try(fun(&result, ::std::forward(ps)...)); + return result; + } +} + +#ifdef UNITTESTED_FILE +UNITTEST("cuda_try2") +{ + //! [cuda_try2] + int dev = cuda_try(); // continue execution if the call is successful + cuda_try(cudaGetDevice(&dev)); // equivalent to the line above + //! [cuda_try2] +}; +#endif // UNITTESTED_FILE + +// Unused, keep for later +/// @cond DO_NOT_DOCUMENT +#define OVERLOADS_UNUSED(f) \ + ba7b8453f262e429575e23dcb2192b33( \ + a2bce6d11e8033f5c8d9c9442849656c, \ + f(::std::forward(a2bce6d11e8033f5c8d9c9442849656c)...)) +// Unused, keep for later +#define ba7b8453f262e429575e23dcb2192b33(a, fun_of_a) \ + [&](auto&&... a) noexcept(noexcept(fun_of_a)) -> decltype(fun_of_a) { \ + return fun_of_a; \ + } + +// Unused, keep for later +#define CUDATRY_UNUSED(fun) \ + a838e9c10e0ded64dff84e7b679d2342( \ + (fun), a2bce6d11e8033f5c8d9c9442849656c, cca0b395150985cb1c6ab3f8032edafa, fef8664203d67fe27b0434c87ce346fb) +// Unused, keep for later +#define a838e9c10e0ded64dff84e7b679d2342(f, a, status, result) \ + [&](auto&&... a) { \ + if constexpr (::std::is_invocable_v) \ + { \ + ::cuda::experimental::stf::cuda_try(f(::std::forward(a)...)); \ + } \ + else \ + { \ + ::std::remove_pointer_t> result; \ + if (auto status = f(&result, ::std::forward(a)...)) \ + { \ + throw ::cuda::experimental::stf::cuda_exception(status); \ + } \ + return result; \ + } \ + } CUDATRY_ACCEPTS_ONLY_FUNCTION_NAMES +// Unused, keep for later +#define CUDATRY_ACCEPTS_ONLY_FUNCTION_NAMES_UNUSED(...) 
(__VA_ARGS__) +/// @endcond + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/utility/dimensions.cuh b/cudax/include/cuda/experimental/__stf/utility/dimensions.cuh new file mode 100644 index 00000000000..63339782b7f --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/utility/dimensions.cuh @@ -0,0 +1,538 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Implementation of the pos4 class + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief pos4 class defining a position within a multidimensional object (default value 0 in each axis) + */ +class pos4 +{ +public: + constexpr pos4() = default; + + /// Create a pos4 from its coordinates + template + _CCCL_HOST_DEVICE constexpr explicit pos4(Integral x, Integral y = 0, Integral z = 0, Integral t = 0) + : x(static_cast(x)) + , y(static_cast(y)) + , z(static_cast(z)) + , t(static_cast(t)) + {} + + /// Get the position along a specific axis + _CCCL_HOST_DEVICE constexpr int get(size_t axis_id) const + { + switch (axis_id) + { + case 0: + return x; + case 1: + return y; + case 2: + return z; + default: + assert(axis_id == 3); + return t; + } + } + + /// Get the position along a specific axis + _CCCL_HOST_DEVICE constexpr int operator()(int axis_id) const + { + return get(axis_id); + } + + /// Comparision of two pos4 in lexicographical order + _CCCL_HOST_DEVICE constexpr bool operator<(const pos4& rhs) const + { + if (x != rhs.x) + { + return x < rhs.x; + } + if (y != rhs.y) + { + return y < rhs.y; + } + if (z != rhs.z) + { + return z < rhs.z; + } + return t < rhs.t; + } + + /// Equality test between two pos4 + _CCCL_HOST_DEVICE constexpr bool operator==(const pos4& rhs) const + { + return x == rhs.x && y == rhs.y && z == rhs.z && t == rhs.t; + } + + /// Convert the pos4 to a string + ::std::string to_string() const + { + return ::std::string("pos4(" + ::std::to_string(x) + "," + ::std::to_string(y) + "," + ::std::to_string(z) + "," + + ::std::to_string(t) + ")"); + } + + int x = 0; + int y = 0; + int z = 0; + int t = 0; +}; + +/** + * @brief dim4 class defining the size of a multidimensional object (default value 1 in each axis) + */ +class dim4 : public pos4 +{ +public: + dim4() = default; + + /// Create a dim4 from its extents + _CCCL_HOST_DEVICE constexpr explicit dim4(int x, int y = 1, int z = 1, int t = 1) + : pos4(x, y, z, t) + {} + + // TODO: could coords ever be negative? (if not, maybe they should be unsigned). 
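+
+  // Illustrative sketch (not part of the original patch): a dim4 with extents 4 x 2 x 3
+  // holds 4 * 2 * 3 * 1 = 24 elements, and get_index() linearizes a pos4 within it:
+  //   dim4 d(4, 2, 3);
+  //   assert(d.size() == 24);
+  //   assert(d.get_index(pos4(1, 1, 2)) == 1 + 4 * (1 + 2 * 2)); // == 21
+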
+ /// Get the total size (multiply all dimensions) + _CCCL_HOST_DEVICE constexpr size_t size() const + { + const ::std::ptrdiff_t result = ::std::ptrdiff_t(x) * y * z * t; + assert(result >= 0); + return result; + } + + /// Compute the dim4 class obtained by taking the minimum of two dim4 along each axis + _CCCL_HOST_DEVICE static constexpr dim4 min(const dim4& a, const dim4& b) + { + return dim4(::std::min(a.x, b.x), ::std::min(a.y, b.y), ::std::min(a.z, b.z), ::std::min(a.t, b.t)); + } + + /// Get the 1D index of a coordinate defined by a pos4 class within a dim4 class + _CCCL_HOST_DEVICE constexpr size_t get_index(const pos4& p) const + { + assert(p.get(0) <= x); + assert(p.get(1) <= y); + assert(p.get(2) <= z); + assert(p.get(3) <= t); + size_t index = p.get(0) + x * (p.get(1) + y * (p.get(2) + p.get(3) * z)); + return index; + } + + /// Get the maximum dimension that is not 1 + _CCCL_HOST_DEVICE constexpr size_t get_rank() const + { + if (t > 1) + { + return 3; + } + if (z > 1) + { + return 2; + } + if (y > 1) + { + return 1; + } + + return 0; + } +}; + +/** + * @brief An explicit shape is a shape or rank 'dimensions' where the bounds are explicit in each dimension. + * + * @tparam dimensions the rank of the shape + */ +template +class box +{ +public: + ///@{ @name Constructors + /// Construct an explicit shape from its lower and upper bounds (inclusive lower bounds, exclusive upper bounds) + template + _CCCL_HOST_DEVICE box(const ::std::array<::std::pair, dimensions>& s) + : s(s) + {} + + /// Construct an explicit shape from its upper bounds (exclusive upper bounds) + template + _CCCL_HOST_DEVICE box(const ::std::array& sizes) + { + for (size_t ind : each(0, dimensions)) + { + s[ind].first = 0; + s[ind].second = sizes[ind]; + if constexpr (::std::is_signed_v) + { + assert(sizes[ind] >= 0 && "Invalid shape."); + } + } + } + + /// Construct an explicit shape from its upper bounds (exclusive upper bounds) + template + _CCCL_HOST_DEVICE box(Int... args) + { + static_assert(sizeof...(Int) == dimensions, "Number of dimensions must match"); + each_in_pack( + [&](auto i, const auto& e) { + if constexpr (::std::is_arithmetic_v<::std::remove_reference_t>) + { + s[i].first = 0; + s[i].second = e; + } + else + { + // Assume a pair + s[i].first = e.first; + s[i].second = e.second; + } + }, + args...); + } + + /// Construct an explicit shape from its lower and upper bounds (inclusive lower bounds, exclusive upper bounds) + template + _CCCL_HOST_DEVICE box(::std::initializer_list... 
args) + { + static_assert(sizeof...(E) == dimensions, "Number of dimensions must match"); + each_in_pack( + [&](auto i, auto&& e) { + assert((e.size() == 1 || e.size() == 2) && "Invalid arguments for box."); + if (e.size() > 1) + { + s[i].first = *e.begin(); + s[i].second = e.begin()[1]; + } + else + { + s[i].first = 0; + s[i].second = *e.begin(); + } + }, + args...); + } + + // _CCCL_HOST_DEVICE box(const typename ::std::experimental::dextents& extents) { + // for (size_t i: each(0, dimensions)) { + // s[i].first = 0; + // s[i].second = extents[ind]; + // } + // } + + _CCCL_HOST_DEVICE void print() + { + printf("EXPLICIT SHAPE\n"); + for (size_t ind = 0; ind < dimensions; ind++) + { + assert(s[ind].first <= s[ind].second); + printf(" %ld -> %ld\n", s[ind].first, s[ind].second); + } + } + + /// Get the number of elements along a dimension + _CCCL_HOST_DEVICE ::std::ptrdiff_t get_extent(size_t dim) const + { + return s[dim].second - s[dim].first; + } + + /// Get the first coordinate (included) in a specific dimension + _CCCL_HOST_DEVICE ::std::ptrdiff_t get_begin(size_t dim) const + { + return s[dim].first; + } + + /// Get the last coordinate (excluded) in a specific dimension + _CCCL_HOST_DEVICE ::std::ptrdiff_t get_end(size_t dim) const + { + return s[dim].second; + } + + /// Get the total number of elements in this explicit shape + _CCCL_HOST_DEVICE ::std::ptrdiff_t size() const + { + if constexpr (dimensions == 1) + { + return s[0].second - s[0].first; + } + else + { + size_t res = 1; + for (size_t d = 0; d < dimensions; d++) + { + res *= get_extent(d); + } + return res; + } + } + + /// get the dimensionnality of the explicit shape + _CCCL_HOST_DEVICE constexpr size_t get_rank() const + { + return dimensions; + } + + // Iterator class for box + class iterator + { + private: + box iterated; // A copy of the box being iterated + ::std::array<::std::ptrdiff_t, dimensions> current; // Array to store the current position in each dimension + + public: + _CCCL_HOST_DEVICE iterator(const box& b, bool at_end = false) + : iterated(b) + { + if (at_end) + { + for (size_t i = 0; i < dimensions; ++i) + { + current[i] = iterated.get_end(i); + } + } + else + { + for (size_t i = 0; i < dimensions; ++i) + { + current[i] = iterated.get_begin(i); + } + } + } + + // Overload the dereference operator to get the current position + _CCCL_HOST_DEVICE auto& operator*() + { + if constexpr (dimensions == 1UL) + { + return current[0]; + } + else + { + return current; + } + } + + // Overload the pre-increment operator to move to the next position + _CCCL_HOST_DEVICE iterator& operator++() + { + if constexpr (dimensions == 1UL) + { + current[0]++; + } + else + { + // Increment current with carry to next dimension + for (size_t i : each(0, dimensions)) + { + assert(current[i] < iterated.get_end(i) && "Attempt to increment past the end."); + if (++current[i] < iterated.get_end(i)) + { + // Found the new posish, now reset all lower dimensions to "zero" + for (size_t j : each(0, i)) + { + current[j] = iterated.get_begin(j); + } + break; + } + } + } + return *this; + } + + // Overload the equality operator to check if two iterators are equal + _CCCL_HOST_DEVICE bool operator==(const iterator& rhs) const + { /*printf("EQUALITY TEST index %d %d shape equal ? 
%s\n", index, + other.index, (&shape == &other.shape)?"yes":"no"); */ + assert(iterated == rhs.iterated && "Cannot compare iterators in different boxes."); + for (auto i : each(0, dimensions)) + { + if (current[i] != rhs.current[i]) + { + return false; + } + } + return true; + } + + // Overload the inequality operator to check if two iterators are not equal + _CCCL_HOST_DEVICE bool operator!=(const iterator& other) const + { + return !(*this == other); + } + }; + + // Functions to create the begin and end iterators + _CCCL_HOST_DEVICE iterator begin() + { + return iterator(*this); + } + + _CCCL_HOST_DEVICE iterator end() + { + return iterator(*this, true); + } + + // Overload the equality operator to check if two shapes are equal + _CCCL_HOST_DEVICE bool operator==(const box& rhs) const + { + for (size_t i : each(0, dimensions)) + { + if (get_begin(i) != rhs.get_begin(i) || get_end(i) != rhs.get_end(i)) + { + return false; + } + } + return true; + } + + _CCCL_HOST_DEVICE bool operator!=(const box& rhs) const + { + return !(*this == rhs); + } + + using coords_t = array_tuple; + + // This transforms a tuple of (shape, 1D index) into a coordinate + _CCCL_HOST_DEVICE coords_t index_to_coords(size_t index) const + { + // Help the compiler which may not detect that a device lambda is calling a device lambda + CUDASTF_NO_DEVICE_STACK + return make_tuple_indexwise([&](auto i) { + // included + const ::std::ptrdiff_t begin_i = get_begin(i); + const ::std::ptrdiff_t extent_i = get_extent(i); + auto result = begin_i + (index % extent_i); + index /= extent_i; + return result; + }); + CUDASTF_NO_DEVICE_STACK + } + +private: + ::std::array<::std::pair<::std::ptrdiff_t, ::std::ptrdiff_t>, dimensions> s; +}; + +// Deduction guides +template +box(Int...) -> box; +template +box(::std::initializer_list...) 
-> box; +template +box(::std::array) -> box; + +#ifdef UNITTESTED_FILE +UNITTEST("box<3>") +{ + // Expect to iterate over Card({0, 1, 2}x{1, 2}x{10, 11, 12, 13}) = 3*2*4 = 24 items + const size_t expected_cnt = 24; + size_t cnt = 0; + auto shape = box({0, 3}, {1, 3}, {10, 14}); + static_assert(::std::is_same_v>); + for ([[maybe_unused]] const auto& pos : shape) + { + EXPECT(cnt < expected_cnt); + cnt++; + } + + EXPECT(cnt == expected_cnt); +}; + +UNITTEST("box<3> upper") +{ + // Expect to iterate over Card({0, 1, 2}x{0, 1}x{0, 1, 2, 3}) = 3*2*4 = 24 items + const size_t expected_cnt = 24; + size_t cnt = 0; + auto shape = box(3, 2, 4); + static_assert(::std::is_same_v>); + for ([[maybe_unused]] const auto& pos : shape) + { + EXPECT(cnt < expected_cnt); + cnt++; + } + + EXPECT(cnt == expected_cnt); +}; + +UNITTEST("empty box<1>") +{ + auto shape = box({7, 7}); + static_assert(::std::is_same_v>); + + auto it_end = shape.end(); + auto it_begin = shape.begin(); + if (it_end != it_begin) + { + fprintf(stderr, "Error: begin() != end()\n"); + abort(); + } + + // There should be no entry in this range + for ([[maybe_unused]] const auto& pos : shape) + { + abort(); + } +}; + +UNITTEST("mix of integrals and pairs") +{ + const size_t expected_cnt = 12; + size_t cnt = 0; + auto shape = box(3, ::std::pair(1, 2), 4); + static_assert(::std::is_same_v>); + for ([[maybe_unused]] const auto& pos : shape) + { + EXPECT(cnt < expected_cnt); + cnt++; + } + + EXPECT(cnt == expected_cnt); +}; + +#endif // UNITTESTED_FILE + +// So that we can create unordered_map of pos4 entries +template <> +struct hash +{ + ::std::size_t operator()(pos4 const& s) const noexcept + { + return hash_all(s.x, s.y, s.z, s.t); + } +}; + +// So that we can create maps of dim4 entries +template <> +struct hash : hash +{}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/utility/getenv_cache.cuh b/cudax/include/cuda/experimental/__stf/utility/getenv_cache.cuh new file mode 100644 index 00000000000..5bb472da0e0 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/utility/getenv_cache.cuh @@ -0,0 +1,66 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Cache getenv results + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +namespace cuda::experimental::stf::reserved +{ + +/** + * @brief Retrieves the value of an environment variable, caching the result for subsequent calls. + * + * This function checks a thread-local cache to see if the environment variable + * has already been retrieved. If the variable is found in the cache, its value is returned. + * Otherwise, the function calls `std::getenv` to retrieve the value from the environment, + * caches the result, and then returns it. + * + * @param name The name of the environment variable to retrieve. 
+ * @return The value of the environment variable as a C-style string, or `nullptr` if the variable is not set. + * + * @note The cache is thread-local, so each thread will have its own independent cache. + */ +inline const char* cached_getenv(const char* name) +{ + static thread_local ::std::unordered_map<::std::string, ::std::string> cache; + + if (auto it = cache.find(name); it != cache.end()) + { + return it->second.c_str(); + } + + const char* value = ::std::getenv(name); + if (value) + { + cache[name] = value; + } + + return value; +} + +} // end namespace cuda::experimental::stf::reserved diff --git a/cudax/include/cuda/experimental/__stf/utility/handle.cuh b/cudax/include/cuda/experimental/__stf/utility/handle.cuh new file mode 100644 index 00000000000..29fb7188c0f --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/utility/handle.cuh @@ -0,0 +1,292 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include // for ::std::shared_ptr + +namespace cuda::experimental::stf::reserved +{ + +/** + * @brief Provides flags for instantiating the `handle` class (below). + */ +enum handle_flags : unsigned +{ + defaults, ///< The default `handle` flags; no special features. + non_null, ///< Specifies that the pointer underlying the `handle` object cannot be null. +}; + +/// A tag for constructing `handle` objects with `static_cast`. +inline enum class use_static_cast {} use_static_cast = {}; + +/// A tag for constructing `handle` objects with `dynamic_cast`. +inline enum class use_dynamic_cast {} use_dynamic_cast = {}; + +/// @brief Performs bitwise OR operation on `handle_flags`. +/// @return The result of bitwise OR operation. +constexpr handle_flags operator|(handle_flags a, handle_flags b) +{ + return handle_flags(static_cast(a) | static_cast(b)); +} + +/// @brief Performs bitwise AND operation on `handle_flags`. +/// @return The result of bitwise AND operation. +constexpr handle_flags operator&(handle_flags a, handle_flags b) +{ + return handle_flags(static_cast(a) & static_cast(b)); +} + +/** + * @brief A handle to an object of type `T` with flags specified by `f`. + * + * The semantics of `handle` are similar to those of `std::shared_ptr` except as moted below. + * + * `handle` does not accept a `T*`argument for construction. It does accept another `handle` or a `shared_ptr`. + * Additionally, `handle` offers a constructor that forwards all arguments to `T`'s constructor. + * + * If `f` includes `handle_flags::non_null`, the constructor eagerly initializes the underlying pointer with an + * allocated and constructed object. Only moved-from objects will have a null pointer. + * + * For increased safety, `handle` enforces statically that `T` cannot be constructed outside of `handle`. 
This is + * achieved on the user side either by making `T`'s constructors protected or by making them private and making `handle` + * a friend of `T`. + */ +template +class handle +{ +public: + /// @name Defaulted constructors and assignment operators + /// @{ + handle(handle&) = default; + handle(const handle&) = default; + handle(handle&&) = default; + handle& operator=(handle&) = default; + handle& operator=(const handle&) = default; + handle& operator=(handle&&) = default; + /// @} + + /// @brief Default constructor. + handle() + { + static_assert(!::std::is_constructible_v, "T's default constructor must be protected."); + if constexpr (f & handle_flags::non_null) + { + static_assert(!::std::is_abstract_v, + "A non-nullable handle of an abstract type cannot have a default constructor."); + impl = ::std::make_shared>(); + } + } + + /// @brief Constructs a handle from another handle. + template + handle(handle rhs) + : impl(mv(rhs.impl)) + { + if constexpr (f & handle_flags::non_null) + { + static_assert(f1 & handle_flags::non_null, "Cannot initialize a non-nullable handle from a nullable one."); + } + } + + /// @brief Variadic template constructor for creating a handle. + template + handle(Args&&... args) + : impl(make(::std::forward(args)...)) + { + static_assert(!::std::is_constructible_v, "T's constructors must be protected."); + } + + /// @brief Constructs a handle from another handle with static_cast. + template + handle(handle& src, decltype(use_static_cast)) + : handle(const_cast&>(src), use_static_cast) + {} + + /// @brief Similar constructor as above but for const handle. + template + handle(const handle& src, decltype(use_static_cast)) + : handle(::std::static_pointer_cast(src.impl)) + { + if constexpr (f & handle_flags::non_null) + { + EXPECT(src.impl, "Pointer of static type ", type_name, " was null upon construction of non-null handle."); + assert(impl); + } + } + + /// @brief Constructs a handle from another handle with dynamic_cast. + template + handle(handle& src, decltype(use_dynamic_cast)) + : handle(const_cast&>(src), use_dynamic_cast) + {} + + /// @brief Similar constructor as above but for const handle. + template + handle(const handle& src, decltype(use_dynamic_cast)) + : handle(::std::dynamic_pointer_cast(src.impl)) + { + if constexpr (f & handle_flags::non_null) + { + EXPECT(src.impl, "Pointer of static type ", type_name, " was null upon construction of non-null handle."); + EXPECT(impl, "dynamic_cast<", type_name, "> failed for pointer of static type ", type_name); + } + } + + /// @brief Constructs a handle from a shared_ptr with dynamic_cast. + template + handle(const ::std::shared_ptr& src, decltype(use_dynamic_cast)) + : handle(::std::dynamic_pointer_cast(src)) + {} + + /// @brief Assignment operator for assigning one handle to another. + template + handle& operator=(handle rhs) + { + if constexpr (f & handle_flags::non_null) + { + static_assert(f1 & handle_flags::non_null, "Cannot assign a non-nullable handle from a nullable one."); + } + impl = mv(rhs.impl); + return *this; + } + + ~handle() = default; + + /// @brief Dereference operator. + T* operator->() const + { + assert(*this); + return impl.get(); + } + + /// @brief Dereference the handle to get the actual object. + T& operator*() + { + return *operator->(); + } + + /// @brief Const version of the dereference operator. + const T& operator*() const + { + return *operator->(); + } + + /// @brief Conversion operator to bool. 
+ explicit operator bool() const + { + return impl.get() != nullptr; + } + + /// @brief Conversion operator to shared_ptr. + template + operator ::std::shared_ptr() const + { + return impl; + } + + /// @brief Equality operator. + template + bool operator==(const handle& rhs) + { + return impl == rhs.impl; + } + + /// @brief Weak type corresponding to handle + using weak_t = ::std::weak_ptr; + + /// @brief Returns a weak pointer to the underlying object. + weak_t weak() const + { + return impl; + } + + /// @brief If weak_ptr is null, does nothing and returns false. Otherwise, calls the lambda and returns true. + template + static bool if_valid(const weak_t& wp, Fun&& fun) + { + if (auto p = wp.lock()) + { + handle h{mv(p)}; + ::std::forward(fun)(mv(h)); + return true; + } + return false; + } + +private: + // All instantiations of handle are friends with one another + template + friend class handle; + + // Define a derived class to access the protected ctor + template + struct Derived : public U + { + template + Derived(Args&&... args) + : U(::std::forward(args)...) + {} + }; + + template + static auto make(Arg&& arg, Args&&... args) + { + if constexpr (sizeof...(args) == 0 && ::std::is_convertible_v>) + { + return ::std::forward(arg); + } + else + { + return ::std::make_shared>(::std::forward(arg), ::std::forward(args)...); + } + } + + ::std::shared_ptr impl; +}; + +#ifdef UNITTESTED_FILE +UNITTEST("Weak handle") +{ + class test + { + protected: + test(int x) + { + a = x; + } + + public: + int a; + }; + handle h(42); + EXPECT(h->a == 42); + auto w = h.weak(); + handle::if_valid(w, [](handle x) { + x->a++; + }); + EXPECT(h->a == 43); +}; +#endif // UNITTESTED_FILE + +} // namespace cuda::experimental::stf::reserved diff --git a/cudax/include/cuda/experimental/__stf/utility/hash.cuh b/cudax/include/cuda/experimental/__stf/utility/hash.cuh new file mode 100644 index 00000000000..89e213b2081 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/utility/hash.cuh @@ -0,0 +1,225 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Implements hash_combine which updates a hash value by combining it + * with another value to form a new hash + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief We define a hash trait class in our namespace + */ +template +struct hash; + +namespace reserved +{ + +/** + * @brief Trait to check if std::hash is defined. + * + * Primary template assumes std::hash is not defined. + * + * @tparam E The type to check for std::hash definition. + */ +template +struct has_std_hash : ::std::false_type +{}; + +/** + * @brief Specialization of has_std_hash for types where std::hash is defined. + * + * Uses SFINAE to detect if std::hash can be instantiated. + * + * @tparam E The type to check for std::hash definition. 
+ */ +template +struct has_std_hash>()(::std::declval()))>> + : ::std::true_type +{}; + +/** + * @brief Helper variable template to simplify usage of has_std_hash. + * + * Provides a convenient way to check if std::hash is defined. + * + * @tparam E The type to check for std::hash definition. + */ +template +inline constexpr bool has_std_hash_v = has_std_hash::value; + +} // end namespace reserved + +/** + * @brief Update a hash value by combining it with another value to form a new + * hash + * + * For some reason, C++ does not seem to provide this ... + * Taken from WG21 P0814R0 + */ +template +void hash_combine(size_t& seed, const T& val) +{ + if constexpr (reserved::has_std_hash_v) + { + // Use std::hash if it is specialized for T + seed ^= ::std::hash()(val) + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + else + { + // Otherwise, use cuda::experimental::stf::hash + seed ^= ::cuda::experimental::stf::hash()(val) + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } +} + +template +size_t hash_all(const Ts&... vals) +{ + if constexpr (sizeof...(Ts) == 1) + { + // Special case: single value, use std::hash if possible + if constexpr (reserved::has_std_hash_v) + { + return ::std::hash()(vals...); + } + else + { + return ::cuda::experimental::stf::hash()(vals...); + } + } + else + { + static_assert(sizeof...(Ts) != 0); + size_t seed = 0; + each_in_pack( + [&](auto& val) { + hash_combine(seed, val); + }, + vals...); + return seed; + } +} + +/** + * @brief Specialization of `hash` for `std::pair`. + * + * This hash specialization combines the individual hash values of the two elements in the pair to produce a unique hash + * value for the pair. + * + * @tparam T1 The type of the first element in the pair. + * @tparam T2 The type of the second element in the pair. + */ +template +struct hash<::std::pair> +{ + /** + * @brief Computes a hash value for a given `std::pair`. + * + * This function applies a hash function to each element of the pair and combines + * these hash values into a single hash value representing the pair. + * + * @param p The pair to hash. + * @return size_t The hash value of the tuple. + */ + size_t operator()(const ::std::pair& p) const + { + return cuda::experimental::stf::hash_all(p.first, p.second); + } +}; + +/** + * @brief Specialization of hash for std::tuple. + * + * Provides a hash function for std::tuple, allowing tuples to be used + * as keys in associative containers such as std::unordered_map or std::unordered_set. + * + * @tparam Ts Types of the elements in the tuple. + */ +template +struct hash<::std::tuple> +{ + /** + * @brief Computes a hash value for a given std::tuple. + * + * This function applies a hash function to each element of the tuple and combines + * these hash values into a single hash value representing the entire tuple. + * + * @param p The tuple to hash. + * @return size_t The hash value of the tuple. + */ + size_t operator()(const ::std::tuple& p) const + { + return ::std::apply(cuda::experimental::stf::hash_all, p); + } +}; + +namespace reserved +{ +/** + * @brief Trait to check if std::hash is defined. + * + * Primary template assumes std::hash is not defined. + * + * @tparam E The type to check for std::hash definition. + */ +template +struct has_cudastf_hash : ::std::false_type +{}; + +/** + * @brief Specialization of has_std_hash for types where std::hash is defined. + * + * Uses SFINAE to detect if std::hash can be instantiated. + * + * @tparam E The type to check for std::hash definition. 
+ */ +template +struct has_cudastf_hash< + E, + ::std::void_t>()(::std::declval()))>> : ::std::true_type +{}; + +/** + * @brief Helper variable template to simplify usage of has_std_hash. + * + * Provides a convenient way to check if std::hash is defined. + * + * @tparam E The type to check for std::hash definition. + */ +template +inline constexpr bool has_cudastf_hash_v = has_cudastf_hash::value; + +} // end namespace reserved + +UNITTEST("hash for tuples") +{ + ::std::unordered_map<::std::tuple, int, ::cuda::experimental::stf::hash<::std::tuple>> m; + m[::std::tuple(1, 2)] = 42; +}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/utility/memory.cuh b/cudax/include/cuda/experimental/__stf/utility/memory.cuh new file mode 100644 index 00000000000..d179212938c --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/utility/memory.cuh @@ -0,0 +1,1088 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Reusable utilities to build asynchronous memory allocators and deal with memory pinning + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include + +namespace cuda::experimental::stf +{ + +namespace reserved +{ + +// host-allocated pool, modeled as a hashtable keyed by size +inline auto& host_pool() +{ + static ::std::unordered_multimap result; + return result; +} + +// managed memory pool, modeled as a hashtable keyed by size +inline auto& managed_pool() +{ + static ::std::unordered_multimap result; + return result; +} + +// maximum number of entries in the host-allocated pool +enum : size_t +{ + maxPoolEntries = 16 * 1024 +}; + +} // namespace reserved + +/** + * @brief Allocates memory on host and returns a pointer to it. Uses a pool of previous allocations. + * + * @param sz Number of bytes to allocate + * @return void* pointer to allocated data + */ +inline void* allocateHostMemory(size_t sz) +{ + void* result; + auto& pool = reserved::host_pool(); + if (auto i = pool.find(sz); i != pool.end()) + { + result = i->second; + pool.erase(i); + return result; + } + if (pool.size() > reserved::maxPoolEntries) + { + // Lots of unused slots, so wipe pooled memory and start anew. + for (auto& entry : pool) + { + cuda_safe_call(cudaFreeHost(entry.second)); + } + pool.clear(); + } + cuda_safe_call(cudaMallocHost(&result, sz)); + return result; +} + +/** + * @brief Allocates managed memory and returns a pointer to it. Uses a pool of previous allocations. 
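+ *
+ * Illustrative usage sketch (not part of the original patch); the buffer is later
+ * returned to the pool with `deallocateManagedMemory`, using the same size:
+ * @code
+ * void* buf = allocateManagedMemory(1024);
+ * // ... access buf from host code or CUDA kernels ...
+ * deallocateManagedMemory(buf, 1024);
+ * @endcode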
+ * + * @param sz Number of bytes to allocate + * @return void* pointer to allocated data + */ +inline void* allocateManagedMemory(size_t sz) +{ + void* result; + auto& pool = reserved::managed_pool(); + + if (auto i = pool.find(sz); i != pool.end()) + { + result = i->second; + pool.erase(i); + return result; + } + if (pool.size() > reserved::maxPoolEntries) + { + // Lots of unused slots, so wipe pooled memory and start anew. + for (auto& entry : pool) + { + cuda_safe_call(cudaFree(entry.second)); + } + pool.clear(); + } + cuda_safe_call(cudaMallocManaged(&result, sz)); + return result; +} + +/** + * @brief Deallocates memory allocated with `allocateHostMemory` immediately. + * + * @param p pointer to memory chunk + * @param sz size in bytes + * @param loc location of the call, defaulted + */ +inline void deallocateHostMemory(void* p, size_t sz, source_location loc = RESERVED_STF_SOURCE_LOCATION()) +{ + ::std::ignore = loc; + assert([&] { + auto r = reserved::host_pool().equal_range(sz); + for (auto i = r.first; i != r.second; ++i) + { + if (i->second == p) + { + fprintf(stderr, "%s(%u) WARNING: double deallocation\n", loc.file_name(), loc.line()); + return false; + } + } + return true; + }()); + reserved::host_pool().insert(::std::make_pair(sz, p)); +} + +/** + * @brief Deallocates managed memory allocated with `allocateManagedMemory` immediately. + * + * @param p pointer to memory chunk + * @param sz size in bytes + * @param loc location of the call, defaulted + */ +inline void deallocateManagedMemory(void* p, size_t sz, source_location loc = RESERVED_STF_SOURCE_LOCATION()) +{ + ::std::ignore = loc; + assert([&] { + auto r = reserved::managed_pool().equal_range(sz); + for (auto i = r.first; i != r.second; ++i) + { + if (i->second == p) + { + fprintf(stderr, "%s(%u) WARNING: double deallocation\n", loc.file_name(), loc.line()); + return false; + } + } + return true; + }()); + reserved::managed_pool().insert(::std::make_pair(sz, p)); +} + +/** + * @brief Deallocates memory allocated with `allocateHostMemory` in a stream-ordered fashion. Will perform + * deallocation when all preceding operations on `stream` have completed. + * + * @param p pointer + * @param sz size in bytes + * @param stream the stream used for ordering + */ +inline void deallocateHostMemory(void* p, size_t sz, cudaStream_t stream) +{ + cuda_safe_call(cudaLaunchHostFunc( + stream, + [](void* vp) { + auto args = static_cast<::std::pair*>(vp); + deallocateHostMemory(args->second, args->first); + delete args; + }, + new ::std::pair(sz, p))); +} + +/** + * @brief Deallocates managed memory allocated with `allocateManagedMemory` in a stream-ordered fashion. Will perform + * deallocation when all preceding operations on `stream` have completed. + * + * @param p pointer + * @param sz size in bytes + * @param stream the stream used for ordering + */ +inline void deallocateManagedMemory(void* p, size_t sz, cudaStream_t stream) +{ + cuda_safe_call(cudaLaunchHostFunc( + stream, + [](void* vp) { + auto args = static_cast<::std::pair*>(vp); + deallocateManagedMemory(args->second, args->first); + delete args; + }, + new ::std::pair(sz, p))); +} + +/** + * @brief Deallocates memory allocated with `allocateHostMemory` in a graph-ordered fashion. Will perform + * deallocation when all dependent graph operations have completed. 
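// A minimal usage sketch for the pooled host allocation helpers above. The function
// below and its arguments are illustrative assumptions; error checking is reduced to
// cuda_safe_call, as in the rest of this file.
#include <cuda/experimental/__stf/utility/memory.cuh>

#include <cstring>

inline void stage_and_upload(const void* src, void* device_dst, size_t bytes, cudaStream_t stream)
{
  using namespace cuda::experimental::stf;

  // Grab a pinned host buffer from the pool (or allocate a fresh one).
  void* staging = allocateHostMemory(bytes);
  ::std::memcpy(staging, src, bytes);

  // Copy ordered on `stream`; pinned memory allows a truly asynchronous transfer.
  cuda_safe_call(cudaMemcpyAsync(device_dst, staging, bytes, cudaMemcpyHostToDevice, stream));

  // Stream-ordered release: the buffer returns to the pool only once all preceding
  // work on `stream` (including the copy above) has completed.
  deallocateHostMemory(staging, bytes, stream);
}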
+ * + * @param p pointer + * @param sz size in byutes + * @param graph graph used for ordering + * @param pDependencies array of dependencies + * @param numDependencies number of elements in `pDependencies` + * @return cudaGraphNode_t the newly inserted node + */ +inline cudaGraphNode_t deallocateHostMemory( + void* p, size_t sz, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies) +{ + const cudaHostNodeParams params = { + .fn = + [](void* vp) { + auto args = static_cast<::std::pair*>(vp); + deallocateHostMemory(args->second, args->first); + delete args; + }, + .userData = new ::std::pair(sz, p)}; + cudaGraphNode_t result; + cuda_safe_call(cudaGraphAddHostNode(&result, graph, pDependencies, numDependencies, ¶ms)); + return result; +} + +// TODO deallocateManagedMemory graph... + +/** + * @brief Checks whether an address is pinned or not. + * + * Note that this call will erase the last CUDA error (see cudaGetLastError). + * + * @param p address to check + * @return bool value indicating if p corresponds to a piece of memory that is already pinned or not. + */ +template +bool address_is_pinned(T* p) +{ + cudaPointerAttributes attr; + cuda_safe_call(cudaPointerGetAttributes(&attr, p)); + EXPECT(attr.type != cudaMemoryTypeDevice); + return attr.type != cudaMemoryTypeUnregistered; +} + +/** + * @brief Pins host memory for efficient use with CUDA primitives + * + * @tparam T memory type + * @param p pointer to beginning of memory block + * @param n number of elements in the block + */ +template +cudaError_t pin_memory(T* p, size_t n) +{ + assert(p); + cudaError_t result = cudaSuccess; + if (!address_is_pinned(p)) + { + // We cast to (void *) because T may be a const type : we are not going + // to modify the content, so this is legit ... + using NonConstT = typename std::remove_const::type; + cudaHostRegister(const_cast(p), n * sizeof(T), cudaHostRegisterPortable); + // Fetch the result and clear the last error + result = cudaGetLastError(); + } + return result; +} + +/** + * @brief Unpins host memory previously pinned with `pin_memory` + * + * @tparam T memory type + * @param p pointer to beginning of memory block + */ +template +void unpin_memory(T* p) +{ + assert(p); + + // Make sure no one did a mistake before ignoring the one that may come ! + cuda_safe_call(cudaGetLastError()); + + // We cast to non const T * because T may be a const type : we are not going + // to modify the content, so this is legit ... + using NonConstT = typename std::remove_const::type; + if (cudaHostUnregister(const_cast(p)) == cudaErrorHostMemoryNotRegistered) + { + // Ignore that error, we probably also ignored an error about registering that buffer too ! + cudaGetLastError(); + } +} + +#ifdef UNITTESTED_FILE +UNITTEST("pin_memory") +{ + ::std::vector a(1024); + + EXPECT(!address_is_pinned(&a[0])); + EXPECT(!address_is_pinned(&a[1023])); + + pin_memory(&a[0], 1024); + EXPECT(address_is_pinned(&a[0])); + EXPECT(address_is_pinned(&a[1023])); + unpin_memory(&a[0]); + + EXPECT(!address_is_pinned(&a[0])); + EXPECT(!address_is_pinned(&a[1023])); +}; + +UNITTEST("pin_memory const") +{ + ::std::vector a(1024); + + const double* ca = &a[0]; + + EXPECT(!address_is_pinned(ca)); + + pin_memory(ca, 1024); + EXPECT(address_is_pinned(ca)); + unpin_memory(ca); + + EXPECT(!address_is_pinned(ca)); +}; + +#endif // UNITTESTED_FILE + +template +class small_vector +{ + // TODO: in some cases, uint16_t or uint8_t may be better. 
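// A short sketch of the pinning helpers above: registering an existing host buffer
// before asynchronous transfers, then unregistering it. The function and variable
// names are illustrative assumptions.
#include <cuda/experimental/__stf/utility/memory.cuh>

#include <vector>

inline void upload_pinned(::std::vector<float>& host, float* device_dst, cudaStream_t stream)
{
  using namespace cuda::experimental::stf;

  // Register the vector's storage with CUDA; this is a no-op if the address is already pinned.
  pin_memory(host.data(), host.size());

  cuda_safe_call(
    cudaMemcpyAsync(device_dst, host.data(), host.size() * sizeof(float), cudaMemcpyHostToDevice, stream));
  cuda_safe_call(cudaStreamSynchronize(stream));

  // Unregister once the buffer is no longer needed for transfers.
  unpin_memory(host.data());
}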
+ using small_size_t = uint32_t; + +public: + // Type definitions + using value_type = T; + using size_type = ::std::size_t; + using difference_type = ::std::ptrdiff_t; + using reference = value_type&; + using const_reference = const value_type&; + using pointer = value_type*; + using const_pointer = const value_type*; + using iterator = pointer; + using const_iterator = const_pointer; + using reverse_iterator = ::std::reverse_iterator; + using const_reverse_iterator = ::std::reverse_iterator; + + // Constructors + small_vector() = default; + + small_vector(const small_vector& rhs) + { + if (rhs.size() <= small_cap) + { + auto b = rhs.begin(), e = b + rhs.size(); + loop([&](auto i) { + if (b >= e) + { + return false; + } + new (small_begin() + i) T(*b++); + ++small_length; + return true; + }); + } + else + { + new (&big())::std::vector(rhs.big()); + small_length = small_size_t(-1); + } + } + + small_vector(small_vector&& rhs) noexcept + { + if (rhs.size() <= small_cap) + { + auto b = rhs.begin(), e = b + rhs.size(); + loop([&](auto i) { + if (b >= e) + { + return false; + } + new (small_begin() + i) T(mv(*b++)); + ++small_length; + return true; + }); + } + else + { + new (&big())::std::vector(mv(rhs.big())); + small_length = small_size_t(-1); + } + } + + small_vector(::std::initializer_list init) + { + if (init.size() <= small_cap) + { + loop([&](auto i) { + if (i >= init.size()) + { + return false; + } + new (small_begin() + i) T(init.begin()[i]); + ++small_length; + return true; + }); + } + else + { + new (&big())::std::vector(init); + small_length = small_size_t(-1); + } + } + + // Assignment operators + small_vector& operator=(const small_vector& rhs) + { + if (this == &rhs) + { + return *this; + } + + clear(); + reserve(rhs.size()); + + if (is_small()) + { + loop([&](auto i) { + if (i >= rhs.size()) + { + return false; + } + new (small_begin() + i) T(rhs[i]); + ++small_length; + return true; + }); + } + else + { + big().assign(rhs.begin(), rhs.end()); + } + + return *this; + } + + small_vector& operator=(small_vector&& rhs) noexcept + { + if (this == &rhs) + { + return *this; + } + + if (is_small()) + { + clear(); + if (rhs.is_small()) + { + loop([&](auto i) { + if (i >= rhs.small_length) + { + return false; + } + new (small_begin() + i) T(mv(rhs.small_begin()[i])); + ++small_length; + return true; + }); + } + else + { + new (&big())::std::vector(mv(rhs.big())); + small_length = small_size_t(-1); + } + } + else + { + if (rhs.is_small()) + { + big().assign(rhs.small_begin(), rhs.small_begin() + rhs.small_length); + } + else + { + big() = mv(rhs.big()); + } + } + + return *this; + } + + small_vector& operator=(::std::initializer_list ilist); + + ~small_vector() + { + if (is_small()) + { + loop([&](auto i) { + if (i >= small_length) + { + return false; + } + small_begin()[i].~T(); + return true; + }); + } + else + { + using goner = ::std::vector; + big().~goner(); + } + } + + // Element access + T& operator[](size_t pos) + { + if (is_small()) + { + assert(pos < small_length); + return small_begin()[pos]; + } + assert(pos < big().size()); + return big()[pos]; + } + + const T& operator[](size_t pos) const + { + if (is_small()) + { + assert(pos < small_length); + return small_begin()[pos]; + } + assert(pos < big().size()); + return big()[pos]; + } + + T& at(size_t pos); + const T& at(size_t pos) const; + T& front() + { + return *begin(); + } + const T& front() const + { + return *begin(); + } + + T& back(); + const T& back() const; + T* data() noexcept; + const T* data() const noexcept; + + 
// Iterators + iterator begin() noexcept + { + return is_small() ? small_begin() : big().data(); + } + const_iterator begin() const noexcept + { + return is_small() ? small_begin() : big().data(); + } + + const_iterator cbegin() const noexcept + { + return is_small() ? small_begin() : big().data(); + } + + iterator end() noexcept + { + return begin() + size(); + } + + const_iterator end() const noexcept + { + return begin() + size(); + } + const_iterator cend() const noexcept + { + return begin() + size(); + } + + reverse_iterator rbegin() noexcept; + const_reverse_iterator rbegin() const noexcept; + const_reverse_iterator crbegin() const noexcept; + reverse_iterator rend() noexcept; + const_reverse_iterator rend() const noexcept; + const_reverse_iterator crend() const noexcept; + + // Capacity + bool empty() const noexcept + { + return is_small() ? small_length == 0 : big().empty(); + } + + size_t size() const noexcept + { + return is_small() ? small_length : big().size(); + } + + size_t max_size() const noexcept + { + return big().max_size(); + } + + void reserve(size_t new_cap) + { + if (is_small()) + { + if (new_cap <= small_cap) + { + return; + } + if (small_length > 0) + { + // Non-empty small_begin vector is a bit tricky + ::std::vector copy; + copy.reserve(new_cap); + for (auto& e : *this) + { + copy.push_back(mv(e)); + } + clear(); + new (&big())::std::vector(mv(copy)); + small_length = small_size_t(-1); + return; + } + new (&big())::std::vector(); + small_length = small_size_t(-1); + // fall through to call reserve() + } + big().reserve(new_cap); + } + + size_t capacity() const noexcept + { + return is_small() ? small_cap : big().capacity(); + } + void shrink_to_fit(); + + // Modifiers + void clear() noexcept + { + if (is_small()) + { + loop([&](auto i) { + if (i >= small_length) + { + return false; + } + small_begin()[i].~T(); + return true; + }); + small_length = 0; + } + else + { + big().clear(); + } + } + + iterator insert(const_iterator pos, const T& value); + iterator insert(const_iterator pos, T&& value); + iterator insert(const_iterator pos, size_t count, const T& value); + + template + iterator insert(iterator pos, InputIt first, InputIt last) + { + if (is_small()) + { + // Todo: support non-random iterators + const std::size_t new_size = small_length + (last - first); + if (new_size <= small_cap) + { + auto b = small_begin(), e = b + small_length; + assert(b <= pos && pos <= e); + assert(first <= last); + auto result = ::std::move_backward(pos, e, e + (last - first)); + for (; first != last; ++first, ++pos, ++small_length) + { + new (pos) T(*first); + } + assert(size() == new_size); + return result; + } + ::std::vector copy; + copy.reserve(new_size); + copy.insert(copy.end(), ::std::move_iterator(small_begin()), ::std::move_iterator(pos)); + auto result = copy.insert(copy.end(), first, last); + copy.insert(copy.end(), ::std::move_iterator(pos), ::std::move_iterator(small_begin() + small_length)); + new (&big())::std::vector(mv(copy)); + small_length = small_size_t(-1); + assert(size() == new_size); + return big().data() + (result - big().begin()); + } + else + { + auto result = big().insert(big().begin() + (pos - big().data()), first, last); + return big().data() + (result - big().begin()); + } + } + + iterator insert(const_iterator pos, ::std::initializer_list ilist); + template + iterator emplace(const_iterator pos, Args&&... 
args); + iterator erase(const_iterator pos); + + iterator erase(iterator first, iterator last) + { + if (is_small()) + { + auto b = small_begin(), e = b + small_length; + assert(b <= first && first <= last && last <= e); + auto result = ::std::move(last, e, first); + ::std::destroy(result, e); + small_length -= static_cast(last - first); + return result; + } + else + { + auto f_ = big().begin() + (first - big().data()); + auto l_ = big().begin() + (last - big().data()); + auto result = big().erase(f_, l_); + return big().data() + (result - big().begin()); + } + } + + void push_back(const T& value) + { + push_back(T(value)); // force an rvalue + } + + void push_back(T&& value) + { + if (is_small()) + { + if (small_length < small_cap) + { + new (small_begin() + small_length) T(mv(value)); + ++small_length; + return; + } + reserve(small_cap * 2 + 1); + assert(!is_small()); + // fall through to big case + } + big().push_back(mv(value)); + } + + template + void emplace_back(Args&&... args); + + void pop_back() + { + if (is_small()) + { + assert(small_length > 0); + small_begin()[--small_length].~T(); + } + else + { + assert(big().size() > 0); + big().pop_back(); + } + } + + void resize(size_t new_size) + { + if (is_small()) + { + if (new_size <= small_length) + { + shrink_small(small_length - new_size); + } + else + { + resize(new_size, T()); + } + } + else + { + big().resize(new_size); + } + } + + void resize(size_t new_size, const value_type& value) + { + if (is_small()) + { + if (new_size <= small_length) + { + shrink_small(small_length - new_size); + } + else + { + if (new_size <= small_cap) + { + ::std::uninitialized_fill(small_begin() + small_length, small_begin() + new_size, value); + small_length = small_size_t(new_size); + } + else + { + // Small to big conversion, just create a new vector + ::std::vector copy; + copy.reserve(new_size); + copy.insert( + copy.end(), ::std::move_iterator(small_begin()), ::std::move_iterator(small_begin() + small_length)); + copy.resize(new_size, value); + clear(); // call destructors for the moved-from elements + new (&big())::std::vector(mv(copy)); + small_length = small_size_t(-1); + } + } + } + else + { + big().resize(new_size, value); + } + } + + void swap(small_vector& other) noexcept + { + if (is_small()) + { + if (other.is_small()) + { + if (small_length < other.small_length) + { + return other.swap(*this); + } + // We are longer, the other is shorter + auto b = small_begin(), e = small_begin() + small_length, ob = other.small_begin(), + oe = other.small_begin() + other.small_length; + ::std::swap_ranges(ob, oe, b); + ::std::uninitialized_move(b + other.small_length, e, oe); + ::std::destroy(b + other.small_length, e); + ::std::swap(small_length, other.small_length); + } + else + { + auto tmp = mv(other.big()); + using goner = ::std::vector; + other.big().~goner(); + other.small_length = 0; // for exception safety + new (&other) small_vector(mv(*this)); // this could theoretically throw + // nothrow code from here down + this->~small_vector(); + new (&big())::std::vector(mv(tmp)); + small_length = small_size_t(-1); + } + } + else + { + if (other.is_small()) + { + other.swap(*this); + } + else + { + big().swap(other.big()); + } + } + } + + // Non-member functions + friend bool operator==(const small_vector& lhs, const small_vector& rhs) + { + return lhs.size() == rhs.size() && ::std::equal(lhs.begin(), lhs.end(), rhs.begin()); + } + // friend bool operator!=(const small_vector& lhs, const small_vector& rhs); + // friend bool operator<(const 
small_vector& lhs, const small_vector& rhs); + // friend bool operator<=(const small_vector& lhs, const small_vector& rhs); + // friend bool operator>(const small_vector& lhs, const small_vector& rhs); + // friend bool operator>=(const small_vector& lhs, const small_vector& rhs); + // friend void swap(small_vector& lhs, small_vector& rhs) noexcept; + +private: + auto small_begin() + { + return reinterpret_cast(&small_); + } + + auto small_begin() const + { + return reinterpret_cast(&small_); + } + + auto& big() + { + return *reinterpret_cast<::std::vector*>(&big_); + } + + auto& big() const + { + return *reinterpret_cast*>(&big_); + } + + bool is_small() const + { + return small_length <= small_cap; + } + + void shrink_small(size_t delta) + { + assert(delta <= small_length); + assert(is_small()); + loop([&](auto i) { + if (i >= delta) + { + return false; + } + small_begin()[small_length - 1 - i].~T(); + return true; + }); + small_length -= delta; + } + + template + void loop(F&& f) + { + if constexpr (small_cap < 16) + { + unroll(::std::forward(f)); + } + else + { + for (auto i : each(0, small_cap)) + { + using result_t = decltype(f(::std::integral_constant())); + if constexpr (::std::is_same_v) + { + f(i); + } + else + { + if (!f(i)) + { + return; + } + } + } + } + } + + union + { + ::std::aligned_storage_t small_[small_cap]; + ::std::aligned_storage_t), alignof(::std::vector)> big_; + }; + small_size_t small_length = 0; +}; + +#ifdef UNITTESTED_FILE +UNITTEST("small_vector basics") +{ + // Construction and initialization + EXPECT(small_vector{}.empty()); + EXPECT(small_vector().capacity() == 5); + EXPECT(small_vector({1.0, 2.5, 3.14})[2] == 3.14); + + // Element access and modification + small_vector<::std::string, 1> v; + v.push_back("Hello"); + v.push_back("World"); + EXPECT(v[0] + v[1] == "HelloWorld"); + v.pop_back(); + EXPECT(v.size() == 1); + + // Capacity and memory management + small_vector v2; + v2.reserve(10); + EXPECT(v2.capacity() == 10); + v2.push_back(true); + EXPECT(v2.capacity() >= 10); // capacity may not change immediately + + // Basic algorithms + small_vector v3 = {3, 1, 4, 2}; + ::std::sort(v3.begin(), v3.end()); + EXPECT(v3[0] == 1); + EXPECT(v3[3] == 4); + + // Iteration + for (auto element : small_vector{'a', 'b', 'c'}) + { + EXPECT(element < 'd'); + } +}; +#endif // UNITTESTED_FILE + +namespace reserved +{ + +/*! + * @brief A simple object pool with linear search for managing objects of type `T`. + * + * The `linear_pool` class provides a basic mechanism for reusing objects of a + * specific type. It stores a collection of objects and allows retrieval of + * existing objects with matching parameters or creation of new objects if + * necessary. + * + * @tparam T The type of objects to be managed by the pool. + */ +template +class linear_pool +{ +public: + /*! + * @brief Constructs a new, empty linear pool. + */ + linear_pool() = default; + + /*! + * @brief Adds an object to the pool. + * + * @param p A pointer to the object to be added. Must be non-null. + */ + void put(::std::unique_ptr p) + { + EXPECT(p); // Enforce that the pointer is not null. + payload.push_back(mv(p)); + } + + /*! + * @brief Retrieves an object from the pool with matching parameters or + * creates a new one if necessary. + * + * @tparam P The types of the parameters to match. + * @param p The parameters to match. + * @return A pointer to an object with the specified parameters. + */ + template + ::std::unique_ptr get(P&&... 
p) + { + for (auto it = payload.begin(); it != payload.end(); ++it) + { + T* e = it->get(); + assert(e); + if (*e == ::std::tuple(p...)) + { + it->release(); + // Move the last element to replace the retrieved element, + // maintaining a compact pool. + if (it + 1 < payload.end()) + { + *it = mv(payload.back()); + } + payload.pop_back(); + return ::std::unique_ptr(e); + } + } + + // If no matching object is found, create a new one. + return ::std::make_unique(::std::forward
(p)...); + } + + /*! + * @brief Calls a function object on each object in the pool. + * + * @tparam F The type of the function to be called. + * @param f The function to call on each object. + */ + template + void each(F&& f) + { + for (auto& ptr : payload) + { + assert(ptr); + f(*ptr); + } + } + +private: + /*! + * @brief The collection of objects in the pool. + */ + ::std::vector<::std::unique_ptr> payload; +}; + +} // end namespace reserved + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/utility/nvtx.cuh b/cudax/include/cuda/experimental/__stf/utility/nvtx.cuh new file mode 100644 index 00000000000..689c9ce8ac4 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/utility/nvtx.cuh @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Wrappers for NVTX + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if __has_include() +# include +#endif + +namespace cuda::experimental::stf +{ + +/** + * @brief A RAII style wrapper for NVTX ranges + */ +class nvtx_range +{ +public: + nvtx_range(const char* message) + { +#if __has_include() + nvtxRangePushA(message); +#endif + } + + // Noncopyable and nonassignable to avoid multiple pops + nvtx_range(const nvtx_range&) = delete; + nvtx_range(nvtx_range&&) = delete; + nvtx_range& operator=(const nvtx_range&) = delete; + + ~nvtx_range() + { +#if __has_include() + nvtxRangePop(); +#endif + } +}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/utility/pretty_print.cuh b/cudax/include/cuda/experimental/__stf/utility/pretty_print.cuh new file mode 100644 index 00000000000..3250e537616 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/utility/pretty_print.cuh @@ -0,0 +1,266 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
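// A short usage sketch for the nvtx_range wrapper above: the constructor pushes an
// NVTX range (when NVTX is available) and the destructor pops it, so the scope below
// shows up as one named region in profilers such as Nsight Systems. The function body
// is an illustrative assumption.
#include <cuda/experimental/__stf/utility/nvtx.cuh>

inline void solver_iteration()
{
  cuda::experimental::stf::nvtx_range r("solver_iteration");

  // ... kernels and library calls issued here are grouped under "solver_iteration" ...

} // the destructor pops the range here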
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Pretty printing utilities + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include +#include +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief Convert a size into a human readable string + */ +inline ::std::string pretty_print_bytes(size_t bytes) +{ + const char* units[] = {"B", "KB", "MB", "GB", "TB"}; + size_t size = sizeof(units) / sizeof(char*); + int i = 0; + + double pretty_size = static_cast(bytes); + while (pretty_size >= 1024.0 && static_cast(i) < size - 1) + { + pretty_size /= 1024.0; + ++i; + } + + ::std::ostringstream out; + out << ::std::fixed << ::std::setprecision(2) << pretty_size << ' ' << units[i]; + return out.str(); +} + +namespace reserved +{ +/** + * A trait class to have the specifier to display the T type. + */ +template +struct FormatSpecifier +{ + static constexpr const char* value = "%f "; // Default for floating point + static constexpr const char* name = "float"; // Default for floating point +}; + +template <> +struct FormatSpecifier +{ + static constexpr const char* value = "%d "; // For int + static constexpr const char* name = "int"; +}; + +template <> +struct FormatSpecifier +{ + static constexpr const char* value = "%u "; // For unsigned int + static constexpr const char* name = "unsigned int"; +}; + +template <> +struct FormatSpecifier +{ + static constexpr const char* value = "%llu "; // For uint64_t, assuming LLP64 or LP64 model + static constexpr const char* name = "uint64_t"; +}; + +} // end namespace reserved + +/** + * @brief Writes an `mdspan` to a VTK file. + * + * This function writes an `mdspan` to a VTK file, which is useful for visualization and debugging. 
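// A quick sketch of pretty_print_bytes: successive divisions by 1024 pick the unit and
// the value is printed with two decimals. The expected strings in the comments follow
// from the implementation above; the function wrapper is an illustrative assumption.
#include <cuda/experimental/__stf/utility/pretty_print.cuh>

#include <cstdio>

inline void pretty_print_sketch()
{
  using cuda::experimental::stf::pretty_print_bytes;

  ::std::printf("%s\n", pretty_print_bytes(512).c_str());         // 512.00 B
  ::std::printf("%s\n", pretty_print_bytes(1536 * 1024).c_str()); // 1.50 MB
  ::std::printf("%s\n", pretty_print_bytes(3ULL << 30).c_str());  // 3.00 GB
}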
+ * + * @tparam P The template parameter pack for mdspan + * @param s The input mdspan to be written + * @param filename The output file name where the VTK data will be written + */ +template +void mdspan_to_vtk(mdspan_like s, const ::std::string& filename) +{ + fprintf(stderr, "Writing slice of size to file %s\n", filename.c_str()); + FILE* f = EXPECT(fopen(filename.c_str(), "w+") != nullptr); + SCOPE(exit) + { + EXPECT(fclose(f) != -1); + }; + + EXPECT(fprintf(f, "# vtk DataFile Version 2.0\noutput\nASCII\n") != -1); + + size_t nx = 1; + size_t ny = 1; + size_t nz = 1; + + if constexpr (s.rank() > 0) + { + nx = s.extent(0); + } + + if constexpr (s.rank() > 1) + { + ny = s.extent(1); + } + + if constexpr (s.rank() > 2) + { + nz = s.extent(2); + } + + EXPECT(fprintf(f, "DATASET STRUCTURED_POINTS\n") != -1); + EXPECT(fprintf(f, "DIMENSIONS %zu %zu %zu\n", nx, ny, nz) != -1); + EXPECT(fprintf(f, "ORIGIN 0 0 0\n") != -1); + EXPECT(fprintf(f, "SPACING 1 1 1\n") != -1); + EXPECT(fprintf(f, "POINT_DATA %zu\n", s.size()) != -1); + EXPECT(fprintf(f, "SCALARS value float\n") != -1); + EXPECT(fprintf(f, "LOOKUP_TABLE default\n") != -1); + + static_assert(s.rank() <= 3 && s.rank() > 0); + + if constexpr (s.rank() == 1) + { + for (size_t x = 0; x < nx; x++) + { + EXPECT(fprintf(f, "%f ", s(x)) != -1); + } + } + + if constexpr (s.rank() == 2) + { + for (size_t y = 0; y < ny; y++) + { + for (size_t x = 0; x < nx; x++) + { + EXPECT(fprintf(f, "%f ", s(x, y)) != -1); + } + EXPECT(fprintf(f, "\n") != -1); + } + } + + if constexpr (s.rank() == 3) + { + for (size_t z = 0; z < nz; z++) + { + for (size_t y = 0; y < ny; y++) + { + for (size_t x = 0; x < nx; x++) + { + EXPECT(fprintf(f, "%f ", s(x, y, z)) != -1); + } + EXPECT(fprintf(f, "\n") != -1); + } + } + } + + // for (size_t y = 0; y < 4; y++) + // { + // for (size_t x = 0; x < 4; x++) + // { + // fprintf(stderr, "%f ", s(y, x)); + // } + // fprintf(stderr, "\n"); + // } +} + +/** + * @brief Print a formatted slice to a file or standard error output. + * + * @tparam T The type of data contained in the slice + * @tparam dimensions The rank of the slice + * @param s The slice object to be printed + * @param text The descriptive text to be printed before the slice data + * @param f The file pointer where the slice data should be printed (default: standard error output) + * + * The output is prefixed by `text` followed by a newline. 
+ */ +template +void mdspan_print(mdspan_like s, const ::std::string& text, FILE* f = stderr) +{ + using T = typename mdspan_like::value_type; + + fprintf(f, "%s\n", text.c_str()); + const char* format = reserved::FormatSpecifier::value; + + size_t nx = 1; + size_t ny = 1; + size_t nz = 1; + + static constexpr size_t dimensions = s.rank(); + + if constexpr (dimensions > 0) + { + nx = s.extent(0); + } + + if constexpr (dimensions > 1) + { + ny = s.extent(1); + } + + if constexpr (dimensions > 2) + { + nz = s.extent(2); + } + + static_assert(dimensions <= 3 && dimensions > 0); + + if constexpr (dimensions == 1) + { + for (size_t x = 0; x < nx; x++) + { + EXPECT(fprintf(f, format, s(x)) != -1); + } + } + + if constexpr (dimensions == 2) + { + for (size_t y = 0; y < ny; y++) + { + for (size_t x = 0; x < nx; x++) + { + EXPECT(fprintf(f, format, s(x, y)) != -1); + } + EXPECT(fprintf(f, "\n") != -1); + } + } + + if constexpr (dimensions == 3) + { + for (size_t z = 0; z < nz; z++) + { + for (size_t y = 0; y < ny; y++) + { + for (size_t x = 0; x < nx; x++) + { + EXPECT(fprintf(f, format, s(x, y, z)) != -1); + } + EXPECT(fprintf(f, "\n") != -1); + } + } + } +} + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/utility/run_once.cuh b/cudax/include/cuda/experimental/__stf/utility/run_once.cuh new file mode 100644 index 00000000000..0a7f6008037 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/utility/run_once.cuh @@ -0,0 +1,108 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Mechanism to ensure a function is only called once (for a set of arguments) + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief A class that ensures a function is executed only once for a given set of arguments (including none). + * + * The `run_once` class is a utility that caches the result of a function call based on its arguments, + * ensuring that the function is only run once for each unique set of arguments. + * It is particularly useful in scenarios where a function call is expensive and + * the result is invariant for the same input parameters. + * + * @tparam Ts Types of the arguments used to uniquely identify the function call. + */ +template +class run_once +{ +public: + /** + * @brief Constructs the `run_once` object with given arguments. + * + * These arguments are used to uniquely identify the function call. + * + * @param val Arguments that are used to determine if the function has been run before. + */ + run_once(Ts... val) + : val(mv(val)...) {}; + + /** + * @brief Invokes the function if it has not been run with the stored arguments before. + * + * If the function has been run before with the same arguments, returns the cached result. 
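// A usage sketch for run_once, anticipating the `->*` operator defined just below: the
// lambda runs once per distinct argument tuple and its result is cached and returned by
// reference afterwards. `build_table` and `lookup` are illustrative assumptions.
#include <cuda/experimental/__stf/utility/run_once.cuh>

inline int build_table(int n)
{
  // Imagine an expensive, deterministic initialization here.
  return n * n;
}

inline int lookup(int n)
{
  using cuda::experimental::stf::run_once;

  // The first call for a given `n` executes the lambda; subsequent calls with the
  // same `n` return a reference to the cached result.
  auto& cached = run_once(n)->*[](int m) {
    return build_table(m);
  };
  return cached;
}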
+ * + * @tparam Fun The type of the function to be executed. + * @param fun The function to be executed. + * @return The result of the function call. The type of the result is determined by the return type of the function. + */ + template + auto& operator->*(Fun&& fun) && + { + using ReturnType = std::invoke_result_t; + + // Static assertions to ensure ReturnType meets the requirements + static_assert( + std::is_constructible_v(fun), std::declval()...))>, + "ReturnType must be constructible from the result of fun()"); + static_assert(std::is_move_constructible_v, "ReturnType must be MoveConstructible"); + static_assert(std::is_move_assignable_v, "ReturnType must be MoveAssignable"); + + if constexpr (sizeof...(Ts) == 0) + { + static auto result = fun(); + return result; + } + else + { + using ReturnType = ::std::invoke_result_t; + static ::std::unordered_map<::std::tuple, ReturnType, ::cuda::experimental::stf::hash<::std::tuple>> + cache; + + if (auto it = cache.find(val); it != cache.end()) + { + return it->second; + } + + // We only set the cache AFTER the end of the computation + auto result = ::std::apply(::std::forward(fun), ::std::move(val)); + + return cache[val] = mv(result); + } + } + +private: + // Stores the arguments used to invoke the function. + [[no_unique_address]] ::std::tuple val; +}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/utility/scope_guard.cuh b/cudax/include/cuda/experimental/__stf/utility/scope_guard.cuh new file mode 100644 index 00000000000..6ded0af3480 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/utility/scope_guard.cuh @@ -0,0 +1,214 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Implements the SCOPE mechanism + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief Automatically runs code when a scope is exited (`SCOPE(exit)`), exited by means of an exception + * (`SCOPE(fail)`), or exited normally (`SCOPE(success)`). + * + * The code controlled by `SCOPE(exit)` and `SCOPE(fail)` must not throw, otherwise the application will be terminated. + * The code controlled by `SCOPE(exit)` may throw. + * + * `SCOPE(exit)` runs its code at the natural termination of the current scope. Example: @snippet this SCOPE(exit) + * + * `SCOPE(fail)` runs its code if and only if the current scope is left by means of throwing an exception. Example: + * @snippet this SCOPE(fail) + * + * Finally, `SCOPE(success)` runs its code if and only if the current scope is left by normal flow (as opposed to by an + * exception). Example: @snippet this SCOPE(success) + * + * If two or more `SCOPE` declarations are present in the same scope, they will take effect in the reverse order of + * their lexical order. 
Example: @snippet this SCOPE combinations + * + * See Also: https://en.cppreference.com/w/cpp/experimental/scope_exit, + * https://en.cppreference.com/w/cpp/experimental/scope_fail, + * https://en.cppreference.com/w/cpp/experimental/scope_success + */ +///@{ +#define SCOPE(kind) \ + auto CUDASTF_UNIQUE_NAME(scope_guard) = \ + ::std::integral_constant<::cuda::experimental::stf::scope_guard_condition, \ + (::cuda::experimental::stf::scope_guard_condition::kind)>() \ + ->*[&]() +///@} + +enum class scope_guard_condition +{ + exit, + fail, + success +}; + +template +auto operator->*(::std::integral_constant, F&& f) +{ + struct result + { + result(F&& f, int threshold) + : f(::std::forward(f)) + , threshold(threshold) + {} + result(result&) = delete; + result(result&& rhs) + : f(mv(rhs.f)) + , threshold(rhs.threshold) + { + // Disable call to lambda in rhs's destructor in all cases, we don't want double calls + rhs.threshold = -2; + } + + // Destructor (i.e. user's lambda) may throw exceptions if and only if we're in `SCOPE(success)`. + ~result() noexcept(cond != scope_guard_condition::success) + { + // By convention, call always if threshold is -1, never if threshold < -1 + if (threshold == -1 || ::std::uncaught_exceptions() == threshold) + { + f(); + } + } + + private: + F f; + int threshold; + }; + + // Threshold is -1 for SCOPE(exit), the same as current exceptions count for SCOPE(success), and 1 above the current + // exception count for SCOPE(fail). + return result( + ::std::forward(f), + cond == scope_guard_condition::exit ? -1 : ::std::uncaught_exceptions() + (cond == scope_guard_condition::fail)); +} + +} // namespace cuda::experimental::stf + +#ifdef UNITTESTED_FILE +UNITTEST("SCOPE(exit)") +{ + //! [SCOPE(exit)] + // SCOPE(exit) runs the lambda upon the termination of the current scope. + bool done = false; + { + SCOPE(exit) + { + done = true; + }; + EXPECT(!done, "SCOPE_EXIT should not run early."); + } + EXPECT(done); + //! [SCOPE(exit)] +}; + +UNITTEST("SCOPE(fail)") +{ + //! [SCOPE(fail)] + bool done = false; + { + SCOPE(fail) + { + done = true; + }; + EXPECT(!done, "SCOPE_FAIL should not run early."); + } + assert(!done); + + try + { + SCOPE(fail) + { + done = true; + }; + EXPECT(!done); + throw 42; + } + catch (...) + { + EXPECT(done); + } + //! [SCOPE(fail)] +}; + +UNITTEST("SCOPE(success)") +{ + //! [SCOPE(success)] + bool done = false; + { + SCOPE(success) + { + done = true; + }; + EXPECT(!done); + } + EXPECT(done); + done = false; + + try + { + SCOPE(success) + { + done = true; + }; + EXPECT(!done); + throw 42; + } + catch (...) + { + EXPECT(!done); + } + //! [SCOPE(success)] +}; + +UNITTEST("SCOPE combinations") +{ + //! [SCOPE combinations] + int counter = 0; + { + SCOPE(exit) + { + EXPECT(counter == 2); + counter = 0; + }; + SCOPE(success) + { + EXPECT(counter == 1); + ++counter; + }; + SCOPE(exit) + { + EXPECT(counter == 0); + ++counter; + }; + EXPECT(counter == 0); + } + EXPECT(counter == 0); + //! [SCOPE combinations] +}; + +#endif // UNITTESTED_FILE diff --git a/cudax/include/cuda/experimental/__stf/utility/source_location.cuh b/cudax/include/cuda/experimental/__stf/utility/source_location.cuh new file mode 100644 index 00000000000..69553dda45c --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/utility/source_location.cuh @@ -0,0 +1,90 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Source location + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// GCC11 provides non constexpr builtins +#if defined(__cpp_lib_source_location) && (!defined(_CCCL_COMPILER_GCC) || _CCCL_GCC_VERSION >= 120000) +// C++20 version using std::source_location +# include + +namespace cuda::experimental::stf +{ +using source_location = ::std::source_location; +} +# define RESERVED_STF_SOURCE_LOCATION() ::cuda::experimental::stf::source_location::current() + +#elif __has_include() +# include +namespace cuda::experimental::stf +{ +using source_location = ::std::experimental::source_location; +} + +# define RESERVED_STF_SOURCE_LOCATION() ::cuda::experimental::stf::source_location::current() + +#else +// Custom implementation for C++17 or earlier +namespace cuda::experimental::stf +{ +class source_location +{ +public: + constexpr source_location( + const char* file = "unknown file", int line = 0, const char* func = "unknown function") noexcept + : file_(file) + , line_(line) + , func_(func) + {} + + constexpr const char* file_name() const noexcept + { + return file_; + } + constexpr int line() const noexcept + { + return line_; + } + constexpr const char* function_name() const noexcept + { + return func_; + } + + static constexpr source_location + current(const char* file = __FILE__, int line = __LINE__, const char* func = __func__) noexcept + { + return source_location(file, line, func); + } + +private: + const char* file_; + int line_; + const char* func_; +}; +} // end namespace cuda::experimental::stf + +# define RESERVED_STF_SOURCE_LOCATION() \ + ::cuda::experimental::stf::source_location::current(__FILE__, __LINE__, __func__) +#endif diff --git a/cudax/include/cuda/experimental/__stf/utility/stopwatch.cuh b/cudax/include/cuda/experimental/__stf/utility/stopwatch.cuh new file mode 100644 index 00000000000..e779f4b53a2 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/utility/stopwatch.cuh @@ -0,0 +1,134 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
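// A small sketch of the source_location shim above: a defaulted
// RESERVED_STF_SOURCE_LOCATION() argument captures the caller's position, the same
// pattern used by the deallocation helpers in memory.cuh above. `log_event` and
// `caller` are illustrative assumptions.
#include <cuda/experimental/__stf/utility/source_location.cuh>

#include <cstdio>

inline void log_event(const char* what,
                      cuda::experimental::stf::source_location loc = RESERVED_STF_SOURCE_LOCATION())
{
  ::std::fprintf(stderr, "%s(%u): %s\n", loc.file_name(), unsigned(loc.line()), what);
}

inline void caller()
{
  log_event("buffer released"); // reports this call site's file and line
}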
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +#include +#include + +namespace cuda::experimental::stf +{ + +class stopwatch +{ +public: + stopwatch(const stopwatch&) = delete; + stopwatch& operator=(const stopwatch&) = delete; + + stopwatch(stopwatch&& rhs) noexcept + : _start(rhs._start) + , _stop(rhs._stop) + , _state(rhs._state) + { + rhs._start = rhs._stop = nullptr; + rhs._state = state::invalid; + } + + stopwatch& operator=(stopwatch&& rhs) + { + ::std::swap(_start, rhs._start); + ::std::swap(_stop, rhs._stop); + ::std::swap(_state, rhs._state); + return *this; + } + + stopwatch() + : _state(state::idle) + { + cuda_safe_call(cudaEventCreate(&_start)); + cuda_safe_call(cudaEventCreate(&_stop)); + } + + static inline constexpr class autostart_t + { + } autostart = {}; + + stopwatch(autostart_t, cudaStream_t stream = nullptr) + : stopwatch() + { + start(stream); + } + + ~stopwatch() + { + if (_state != state::invalid) + { + cuda_safe_call(cudaEventDestroy(_start)); + cuda_safe_call(cudaEventDestroy(_stop)); + } + } + + void start(cudaStream_t stream = nullptr) + { + assert(_state == state::idle || _state == state::stopped); + cuda_safe_call(cudaEventRecord(_start, stream)); + _state = state::started; + } + + void stop(cudaStream_t stream = nullptr) + { + assert(_state == state::started); + cuda_safe_call(cudaEventRecord(_stop, stream)); + _state = state::stopped; + } + + ::std::chrono::duration elapsed() + { + assert(state::stopped); + float result; + cuda_safe_call(cudaEventSynchronize(_stop)); + cuda_safe_call(cudaEventElapsedTime(&result, _start, _stop)); + return ::std::chrono::duration(result); + } + + template + float bandwidth(size_t items_transferred) + { + return 1000. * sizeof(T) * items_transferred / elapsed().count(); + } + +private: + cudaEvent_t _start; + cudaEvent_t _stop; + enum class state + { + invalid, + idle, + started, + stopped + } _state = state::invalid; +}; + +#ifdef UNITTESTED_FILE +UNITTEST("stopwatch") +{ + stopwatch sw(stopwatch::autostart); + ::std::this_thread::sleep_for(::std::chrono::milliseconds(100)); + sw.stop(); + auto elapsed = sw.elapsed(); + assert(elapsed.count() >= 100); +}; +#endif // UNITTESTED_FILE + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/utility/stream_to_dev.cuh b/cudax/include/cuda/experimental/__stf/utility/stream_to_dev.cuh new file mode 100644 index 00000000000..364fcf04c5b --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/utility/stream_to_dev.cuh @@ -0,0 +1,56 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
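// A short sketch of the stopwatch above: start/stop record CUDA events on a stream,
// elapsed() synchronizes on the stop event and reports milliseconds, and bandwidth()
// derives bytes per second from an element count. `measure_copy` is an illustrative
// assumption.
#include <cuda/experimental/__stf/utility/stopwatch.cuh>

#include <cstdio>

inline void measure_copy(const float* src, float* dst, size_t n, cudaStream_t stream)
{
  using cuda::experimental::stf::stopwatch;

  stopwatch sw(stopwatch::autostart, stream); // records the start event on `stream`
  cuda_safe_call(cudaMemcpyAsync(dst, src, n * sizeof(float), cudaMemcpyDeviceToDevice, stream));
  sw.stop(stream); // records the stop event

  ::std::printf("copy: %.3f ms, ~%.2f GB/s\n",
                sw.elapsed().count(),           // milliseconds (float)
                sw.bandwidth<float>(n) * 1e-9); // bytes/s scaled to GB/s
}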
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Utility to get the id of the CUDA device associated to a CUDA stream + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief This computes the CUDA device in which the stream was created + */ +inline int get_device_from_stream(cudaStream_t stream) +{ + // Convert the runtime API stream to a driver API structure + auto stream_driver = CUstream(stream); + + CUcontext ctx; + cuda_safe_call(cuStreamGetCtx(stream_driver, &ctx)); + + // Query the context associated with a stream by using the underlying driver API + CUdevice stream_dev; + cuda_safe_call(cuCtxPushCurrent(ctx)); + cuda_safe_call(cuCtxGetDevice(&stream_dev)); + cuda_safe_call(cuCtxPopCurrent(&ctx)); + + return int(stream_dev); +} + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/utility/threads.cuh b/cudax/include/cuda/experimental/__stf/utility/threads.cuh new file mode 100644 index 00000000000..99ceafffe2a --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/utility/threads.cuh @@ -0,0 +1,198 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include +#include + +namespace cuda::experimental::stf::reserved +{ + +/** + * @brief A simple RAII-style wrapper for ensuring single-threaded access to a code section. The interface is the same + * as for `std::lock_guard`. + * + * The `single_threaded_section` class is designed to ensure, in debug (i.e., non-`NDEBUG`) mode, that only one thread + * can execute a specific code section at a time. There is no guarantee that a passing program has no related bugs, + * but there are no false positives. + * + * If `NDEBUG` is defined, all member functions are defined to do nothing. If `NDEBUG` is not defined, the constructor + * `assert`s that the mutex can be acquired with no contention. + * + * `single_threaded_section` is reentrant even if `Mutex` is not, i.e. a function defining a `single_threaded_section` + * object may call another function that in turn defines a `single_threaded_section` object on the same mutex. + * + * Logic: In a single-threaded section of code, the mutex is never contested. If another thread has the mutex locked + * (hence the `assert` fails), then this section is not single-threaded. + * + * @tparam Mutex The type of mutex to use for synchronization. 
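// A small sketch of get_device_from_stream: a stream created while device 0 is current
// reports device 0. The wrapper function is an illustrative assumption.
#include <cuda/experimental/__stf/utility/stream_to_dev.cuh>

#include <cassert>

inline void stream_device_sketch()
{
  using cuda::experimental::stf::get_device_from_stream;

  cuda_safe_call(cudaSetDevice(0));
  cudaStream_t s;
  cuda_safe_call(cudaStreamCreate(&s));

  int dev = get_device_from_stream(s);
  assert(dev == 0);
  (void) dev;

  cuda_safe_call(cudaStreamDestroy(s));
}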
+ */ +template +class single_threaded_section +{ +public: + using mutex_type = Mutex; + +#ifndef NDEBUG + + explicit single_threaded_section(mutex_type& m, source_location loc = RESERVED_STF_SOURCE_LOCATION()) + : mutex(m) + { + if (mutex.try_lock()) + { + if constexpr (!::std::is_same_v) + { + // Would not be able to reenter this mutex, so release immediately. + mutex.unlock(); + } + return; + } + fprintf(stderr, "%s(%u) Error: contested single-threaded section.\n", loc.file_name(), loc.line()); + abort(); + } + + single_threaded_section(mutex_type& m, ::std::adopt_lock_t) noexcept + : mutex(m) + {} // calling thread owns mutex + + single_threaded_section(const single_threaded_section&) = delete; + single_threaded_section& operator=(const single_threaded_section&) = delete; + + ~single_threaded_section() + { + if constexpr (::std::is_same_v) + { + // Keep the recursive mutex up until destruction. + mutex.unlock(); + } + } + +private: + mutex_type& mutex; + +#else + + explicit single_threaded_section(mutex_type&) {} + single_threaded_section(mutex_type&, ::std::adopt_lock_t) noexcept {} + single_threaded_section(const single_threaded_section&) = delete; + single_threaded_section& operator=(const single_threaded_section&) = delete; + +#endif +}; + +/** + * @brief Generates a unique `std::atomic` counter object for each literal usage. + * + * This templated operator overload allows for the creation of unique counter objects + * based on template literal strings. Each unique instantiation of this template with a + * different type results in a separate counter object. This can be used for counting + * occurrences or events associated uniquely with compile-time known values. + * + * Example usage: + * @code + * counter.increment(); + * counter.increment(); + * std::cout << "A count: " << counter.load() << std::endl; // Outputs: A count: 1 + * std::cout << "B count: " << counter.load() << std::endl; // Outputs: B count: 2 + * @endcode + * + * @tparam T tag type + */ +template +class counter +{ +public: + counter() = default; + + static auto load() + { + return count.load(); + } + + static auto increment() + { + count++; + } + + static auto decrement() + { + count--; + } + +private: + static inline ::std::atomic count{0}; +}; + +/** + * @brief Generates an object for tracking the high water mark (maximum value) recorded. + * + * This generates a unique counter for each type `T`, which allows for + * recording and querying the high water mark. The high water mark is the + * highest value recorded using the `record` method. This utility can be used + * to monitor peak usages or values in a system, such as maximum memory usage, + * maximum concurrent connections, or other metrics where the peak value is of + * interest. + * + * Each unique instantiation of this template with a different type results in + * a separate tracking context, allowing for isolated tracking based on + * compile-time known values. + * + * Example usage: + * @code + * high_water_mark.record(1024); // Record a new value + * std::cout << "Current high water mark: " << high_water_mark.load() << std::endl; + * @endcode + * + * @tparam T tag type for this counter + * + * @note The methods in this class are thread-safe, ensuring that + * concurrent updates from different threads are safely handled. 
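// A sketch of the reserved::counter utility above: each tag type gets its own
// process-wide atomic counter. The tag structs are illustrative assumptions, and the
// `reserved` namespace marks this as an internal facility.
#include <cuda/experimental/__stf/utility/threads.cuh>

#include <cstdio>

struct allocation_tag
{};
struct task_tag
{};

inline void counter_sketch()
{
  using cuda::experimental::stf::reserved::counter;

  counter<allocation_tag>::increment();
  counter<allocation_tag>::increment();
  counter<task_tag>::increment();

  ::std::printf("allocations: %ld, tasks: %ld\n",
                static_cast<long>(counter<allocation_tag>::load()), // 2
                static_cast<long>(counter<task_tag>::load()));      // 1
}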
+ */ +template +class high_water_mark +{ +public: + high_water_mark() = default; + + static void record(unsigned long v) + { + for (;;) + { + auto previous = tracker.load(); + if (previous >= v || tracker.compare_exchange_weak(previous, v)) + { + break; + } + } + } + + static unsigned long load() + { + return tracker.load(); + } + +private: + static inline ::std::atomic tracker{0}; +}; + +} // namespace cuda::experimental::stf::reserved diff --git a/cudax/include/cuda/experimental/__stf/utility/traits.cuh b/cudax/include/cuda/experimental/__stf/utility/traits.cuh new file mode 100644 index 00000000000..402737a44d3 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/utility/traits.cuh @@ -0,0 +1,583 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Utilities related to trait classes + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +#include +#include +#include +#include + +namespace cuda::experimental::stf +{ + +namespace reserved +{ + +// We use this function as a detector for what __PRETTY_FUNCTION__ looks like +template +constexpr ::std::string_view type_name_IMPL() +{ +#if defined(_CCCL_COMPILER_MSVC) + return __FUNCSIG__; +#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv + return __PRETTY_FUNCTION__; +#endif // !_CCCL_COMPILER_MSVC +} + +// Length of prefix and suffix in __PRETTY_FUNCTION__ when used with `type_name`. +inline constexpr ::std::pair type_name_affixes = [] { + const auto p = type_name_IMPL(); + const auto target = ::std::string_view("double"); + const auto len = target.size(); + // Simulate p.find() by hand because clang can't do it. + size_t i = target.npos; + for (std::size_t start = 0; start <= p.size() - len; ++start) + { + if (p.substr(start, len) == target) + { + i = start; // Found the substring, set i to the starting position + break; // Exit loop after finding the first match + } + } + auto j = p.size() - i - len; + return ::std::pair{i, j}; +}(); + +template +constexpr ::std::string_view type_name_impl() +{ +#if defined(_CCCL_COMPILER_MSVC) + constexpr ::std::string_view p = __FUNCSIG__; + // MSVC does not provide constexpr methods so we make this utility much simpler and return __FUNCSIG__ directly + return p; +#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv + ::std::string_view p = __PRETTY_FUNCTION__; + return p.substr(type_name_affixes.first, p.size() - type_name_affixes.first - type_name_affixes.second); +#endif // !_CCCL_COMPILER_MSVC +} + +} // namespace reserved + +/** + * @brief Yields a string form of type name. Exact spelling not guaranteed (e.g. `type_name` may be `"int*"`, + * `"int *"` etc). + * + * @tparam T The type to show. 
+ * + * @paragraph example Example + * @snippet unittest.h type_name + */ +template +inline constexpr ::std::string_view type_name = reserved::type_name_impl(); + +/** + * @brief Converts each element in `t` to a new value by calling `f`, then returns a tuple collecting the values thus + * obtained. + * + * @tparam Tuple Type of the tuple to convert + * @tparam Fun Type of mapping function to apply + * @param t Object to convert, must support `std::apply` + * @param f function to convert each element of the tuple, must take a single parameter + * @return constexpr auto The tuple resulting from the mapping + * + * @paragraph example Example + * @snippet unittest.h tuple2tuple + */ +template +constexpr auto tuple2tuple(const Tuple& t, Fun&& f) +{ + return ::std::apply( + [&](auto&&... x) { + return ::std::tuple(f(::std::forward(x))...); + }, + t); +} + +/* + * @brief A function that will fail to compile, and result in an error message + * with type T. Used internally for debugging. Since this uses a static_assert, + * it will break compilation even if the function is called in a path that is + * supposed to be unreachable ! + * + * @tparam T A type which we want to display. + */ +template +class print_type_name_and_fail +{ + static_assert(::std::integral_constant::value, "Type name is: "); +}; + +namespace reserved +{ + +/** + * @brief A singleton template class implementing the Meyers Singleton design pattern. + * + * @tparam T The type of the singleton object. + * + * It uses the "Construct On First Use Idiom" to prevent issues related to + * the static initialization order fiasco. + * + * Usage rules: + * - The default constructor of `T` should be protected. + * - The destructor of `T` should be protected. + * - The copy and move constructors of `T` should be disabled (implicit if you follow the rules above). + * + * Example usage: + * ```cpp + * class my_singleton : public meyers_singleton { + * protected: + * my_singleton() = default; + * ~my_singleton() = default; + * }; + * ``` + */ +template +class meyers_singleton +{ +protected: + template + struct wrapper + { + using type = U; + }; + friend typename wrapper::type; + + meyers_singleton() = default; + ~meyers_singleton() = default; + meyers_singleton(const meyers_singleton&) = delete; + meyers_singleton(meyers_singleton&&) = delete; + +public: + /** + * @brief Provides access to the single instance of the class. + * + * @return T& A reference to the singleton instance. + * + * If the instance hasn't been created yet, this function will create it. + */ + static T& instance() + { + static_assert(!::std::is_default_constructible_v, + "Make the default constructor of your Meyers singleton protected."); + static_assert(!::std::is_destructible_v, "Make the destructor of your Meyers singleton protected."); + static_assert(!::std::is_copy_constructible_v, "Disable the copy constructor of your Meyers singleton."); + static_assert(!::std::is_move_constructible_v, "Disable the move constructor of your Meyers singleton."); + struct U : T + {}; + static U instance; + return instance; + } +}; + +} // end namespace reserved + +/** + * @brief Converts an array-like object (such as an `std::array`) to an `std::tuple`. + * + * This function template takes an array-like object and returns a tuple containing the same elements. + * If the input array has a size of zero, an empty tuple is returned. + * + * @tparam Array Type of the array-like object. Must have `std::tuple_size_v` specialization. 
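// A brief sketch of type_name and tuple2tuple. The exact spelling returned by
// type_name is compiler-dependent, so it is printed rather than compared; the wrapper
// function is an illustrative assumption.
#include <cuda/experimental/__stf/utility/traits.cuh>

#include <cstdio>
#include <tuple>

inline void traits_sketch()
{
  using namespace cuda::experimental::stf;

  // type_name<T> is a compile-time string_view naming T.
  ::std::printf("%.*s\n", int(type_name<double>.size()), type_name<double>.data());

  // tuple2tuple maps a callable over every element and collects the results.
  auto doubled = tuple2tuple(::std::tuple(1, 2.5), [](auto x) {
    return x + x;
  });
  ::std::printf("%d %g\n", ::std::get<0>(doubled), ::std::get<1>(doubled)); // prints "2 5"
}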
+ * @param array The array-like object to be converted to a tuple. + * @return A tuple containing the elements of the input array. + * + * Example usage: + * @code + * std::array arr = {1, 2, 3}; + * auto t = to_tuple(arr); // t is a std::tuple + * @endcode + */ +template +auto to_tuple(Array&& array) +{ + return tuple2tuple(::std::forward(array), [](auto&& e) { + return ::std::forward(e); + }); +} + +/** + * @brief Array-like tuple with a single element type repeated `n` times. + * + * The `array_tuple` template generates a `std::tuple` with a single type `T` repeated `n` times. + * This can be used to create a tuple with consistent types and a fixed size. + * + * @tparam T The type of the elements that the tuple will contain. + * @tparam n The number of elements that the tuple will contain. + * + * ### Example + * + * ```cpp + * using my_tuple = array_tuple; // Results in std::tuple + * ``` + * + * @note The specialization `array_tuple` will result in an empty tuple (`std::tuple<>`). + */ +template +using array_tuple = decltype(to_tuple(::std::array{})); + +// Mini-unittest +static_assert(::std::is_same_v, ::std::tuple>); + +namespace reserved +{ + +/** + * @brief Converts an `std::tuple` into a `cuda::std::array`. + * + * This function takes an `std::tuple` with elements of potentially differing types, provided the first type, `T0`, is + * assignable from the rest. The resulting `std::array` will contain the elements from the tuple, where all the + * elements are of the type `T0`. + * + * @tparam T0 The type of the first element in the tuple, and the type of the elements in the resulting array. + * @tparam Ts... The types of the other elements in the tuple. They must all be convertible to `T0`. + * @param obj The `std::tuple` object to convert. + * @return An `std::array` containing the elements from the tuple, converted to type `T0`. + */ +template +::cuda::std::array to_cuda_array(const ::std::tuple& obj) +{ + ::cuda::std::array result; + each_in_tuple(obj, [&](auto index, const auto& value) { + result[index] = value; + }); + return result; +} + +/** + * @brief Converts an `std::array` to a `cuda::std::array` + */ +template +::cuda::std::array convert_to_cuda_array(const ::std::array& std_array) +{ + ::cuda::std::array result; + for (size_t i = 0; i < N; i++) + { + result[i] = std_array[i]; + } + return result; +} + +} // end namespace reserved + +/** + * @brief Extracts and returns the first argument from a parameter pack that is convertible to type `T`. + * + * This function template recursively inspects each argument in a parameter pack until it finds the first + * argument that can be converted to type `T`. It statically asserts that only one such convertible argument exists + * in the parameter pack to ensure uniqueness. + * + * @tparam T The target type to which the argument should be convertible. + * @tparam P0 The type of the first argument in the parameter pack. + * @tparam P Variadic template representing the rest of the arguments in the parameter pack. + * @param p0 The first argument in the parameter pack. + * @param p The remaining arguments in the parameter pack. + * @return `T` A copy or reference to the first argument that is convertible to type `T`. + * + * @note This function will cause a compile-time error if more than one argument in the parameter pack is convertible to + * type `T`. + * + * \code{.cpp} + * int i = 42; + * double d = 3.14; + * std::string s = "hello"; + * // The following call will return 's'. 
+ * auto result = only_convertible(i, d, s); + * \endcode + * + */ +template +T only_convertible(P0&& p0, P&&... p) +{ + if constexpr (::std::is_convertible_v) + { + ((void) p, ...); + static_assert(!(::std::is_convertible_v || ...), "Duplicate argument type found"); + return ::std::forward(p0); + } + else + { + // Ignore current head and recurse to tail + return only_convertible(::std::forward
<P>
(p)...); + } +} + +/** + * @brief Creates an `std::array` of type `T` from all convertible elements in a parameter pack. + * + * This template function inspects each element in a parameter pack and adds it to an `std::array` if it is + * convertible to the type `T`. It returns this array containing all convertible elements. The function ensures that + * only convertible elements are added by using `std::is_convertible`. It also handles exceptions and properly + * destructs already constructed elements in case of an exception during array construction. + * + * @tparam T The target type to which the elements of the parameter pack should be convertible. + * @tparam P Variadic template representing the types in the parameter pack. + * @param p The parameter pack containing elements to be checked for convertibility and potentially added to the array. + * @return `::std::array` An array of type `T` containing all elements from the parameter pack that are + * convertible to `T`. + * + * @note The size of the returned array, `N`, is determined at compile time based on the number of convertible elements + * in the parameter pack. + * @throws Any exception thrown during the construction of elements in the array will be propagated after destructing + * already constructed elements. + * + * \code{.cpp} + * int i = 42; + * double d = 3.14; + * std::string s = "hello"; + * // The following call will create an array of strings from all arguments that can be converted to a string. + * // In this case, only 's' is convertible, so the array will contain two copies of 's'. + * auto result = all_convertible(i, s, d, s); + * \endcode + */ +template +auto all_convertible(P&&... p) +{ + // We use a union here to prevent the compiler from calling the destructor of the array. + // All construction/destruction will be done manually for efficiency purposes. + static constexpr size_t size = (::std::is_convertible_v + ...); + unsigned char buffer[size * sizeof(T)]; + auto& result = *reinterpret_cast<::std::array*>(&buffer[0]); + size_t i = 0; // marks the already-constructed portion of the array + try + { + each_in_pack( + [&](auto&& e) { + if constexpr (::std::is_convertible_v) + { + new (result.data() + i) T(::std::forward(e)); + ++i; + } + }, + ::std::forward
<P>
(p)...); + return mv(result); + } + catch (...) + { + for (size_t j = 0; j < i; ++j) + { + result[j].~T(); + } + throw; + } +} + +/* + * @brief Chooses a parameter from `P...` of a type convertible to `T`. If found, it is returned. If no such parameter + * is found, returns `default_v`. + * + * For now only value semantics are supported. + * + * @tparam T Result type + * @tparam P Variadic parameter types + * @param default_v Default value + * @param p Variadic parameter values + * @return T Either the first convertible parameter, or `default_v` if no such parameter is found + */ +template +T only_convertible_or([[maybe_unused]] T default_v, P&&... p) +{ + if constexpr (!(::std::is_convertible_v || ...)) + { + ((void) p, ...); + return default_v; + } + else + { + return only_convertible(::std::forward
<P>
(p)...); + } +} + +namespace reserved +{ +/* Checks whether a collection of `DataTypes` objects can be unambiguously initialized (in some order) + from a collection of `ArgTypes` objects. Not all objects must be initialized, + e.g. `check_initialization(1)` passes. */ +template +struct check_initialization +{ + /* Yields the number of types in `Ts` to which `T` can be converted. */ + template + static constexpr int count_convertibilty = (::std::is_convertible_v + ... + 0); + + template + static constexpr void from() + { + ( + [] { + using T = ArgTypes; + static_assert(count_convertibilty > 0, + "Incompatible argument: argument type doesn't match any member type."); + static_assert(count_convertibilty == 1, + "Ambiguous argument: argument type converts to more than one member type."); + }(), + ...); // This expands ArgTypes + } +}; +} // namespace reserved + +/** + * @brief Checks the convertibility of argument types to a set of data types. + * + * This function checks if each type in `ArgTypes` is convertible to exactly one type in DataTypes. + * If a type is not convertible to exactly one type, a static assertion will fail at compile time. + * + * @tparam ArgTypes The types to check the convertibility of. + * @tparam DataTypes The types to check the convertibility to. + * @param unused The data of the types to check the convertibility to. + * + * @note A static_assert error occurs if a type is not convertible to exactly one type. + * + * \code{.cpp} + * struct A {}; + * struct B {}; + * struct C {}; + * struct D {}; + * + * A a; + * B b; + * C c; + * D d; + * + * // This will compile successfully because each type A, B, C is convertible to itself and only itself. + * shuffled_args_check(c, a, b); + * + * // This will fail to compile because type D is not provided in the DataTypes. + * // shuffled_args_check(a, b, c, d); + * + * // This will fail to compile because int is convertible to both float and double, causing ambiguity. + * // shuffled_args_check(5); + * \endcode + */ +template +void shuffled_args_check(const DataTypes&...) +{ + reserved::check_initialization::template from(); +} + +/** + * @brief Creates a tuple with arguments in any order. + * + * This function creates a tuple where each element is a value from the variadic argument list `args` that is + * convertible to the corresponding type in `DataTypes`. The function checks the convertibility of each argument type to + * the corresponding type in `DataTypes` and throws a static assertion if any argument type is not convertible to + * exactly one type in `DataTypes`. + * + * @tparam DataTypes The types to use for the tuple elements. + * @tparam ArgTypes The types of the arguments to shuffle. + * @param args The arguments to shuffle. + * + * @return std::tuple A tuple where each element is a value from `args` that is convertible to the + * corresponding type in `DataTypes`. + * + * @note A static_assert error is issued if a type is not convertible to exactly one type. + * + * \code{.cpp} + * struct A { int value; }; + * struct B { std::string text; }; + * struct C { float number; }; + * + * A a{10}; + * B b{"example"}; + * C c{3.14f}; + * + * // This will create an `std::tuple` from the provided arguments, + * // automatically assigning them based on their types. + * auto my_tuple = shuffled_tuple(b, c, a); + * // Now my_tuple is of type std::tuple, and contains a, b, c in that order. + * + * // This will not compile because there are two arguments that can fill the same tuple slot. 
+ * // auto my_tuple = shuffled_tuple(b, c, a, a); + * + * // This will not compile because the argument '5' can convert to both 'int' and 'float', + * // causing an ambiguity. + * // auto my_tuple = shuffled_tuple(b, c, 5); + * \endcode + */ +template +::std::tuple shuffled_tuple(ArgTypes... args) +{ + reserved::check_initialization::template from(); + return ::std::tuple{only_convertible_or(DataTypes(), mv(args)...)...}; +} + +/** + * @brief Creates a tuple where each element is an `std::array`, constructed from the arguments provided. + * + * This function constructs a tuple where each element is an array of a specific type. Each array in the tuple + * contains all the arguments of the corresponding type. The function checks at compile-time to ensure that each + * argument type is convertible to exactly one of the tuple's element types, avoiding ambiguity or incompatible types. + * + * @tparam DataTypes The types of the arrays in the tuple. Each type corresponds to an array in the tuple. + * @tparam ArgTypes Variadic template representing the types of the arguments. + * @param args The arguments from which the arrays in the tuple are constructed. These can be in any order. + * @return A tuple where each element is an array of one of the `DataTypes`, containing the respective convertible + * arguments. + * + * @note The function uses `all_convertible` internally to construct arrays for each data type in `DataTypes`. + * + * Example usage: + * @code + * auto result = shuffled_array_tuple('a', 5, 6.0, 'b', 7); + * // result is a tuple containing: + * // std::array {5, 7}, + * // std::array {6.0}, + * // std::array {'a', 'b'} + * @endcode + */ +template +auto shuffled_array_tuple(ArgTypes... args) +{ + reserved::check_initialization::template from(); + return ::std::tuple{all_convertible(mv(args)...)...}; +} + +namespace reserved +{ + +/** + * @brief A compile-time boolean that checks if a type supports streaming with std::ostream <<. + * + * This trait is true if the type T can be streamed using std::ostream <<, and false otherwise. + * + * @tparam T The type to check for streaming support with std::ostream <<. + */ +template +struct has_ostream_operator : ::std::false_type +{}; + +template +struct has_ostream_operator() << ::std::declval()), void())> + : ::std::true_type +{}; + +} // end namespace reserved + +} // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/utility/unique_id.cuh b/cudax/include/cuda/experimental/__stf/utility/unique_id.cuh new file mode 100644 index 00000000000..5f19c18dc18 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/utility/unique_id.cuh @@ -0,0 +1,87 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +namespace cuda::experimental::stf +{ + +namespace reserved +{ + +/* This defines an object with a unique identifier. This object is non + * copyable, but moving it transfers the unique id to the destination object. + */ +template +class unique_id +{ +public: + unique_id() = default; + constexpr unique_id(unique_id&& other) noexcept + : _value(::std::exchange(other._value, -1)) + {} + constexpr unique_id(const int val) noexcept + : _value(val) + {} + constexpr operator int() const noexcept + { + return _value; + } + + unique_id& operator=(unique_id&& rhs) + { + _value = ::std::exchange(rhs._value, -1); + return *this; + } + + unique_id(const unique_id&) = delete; + unique_id& operator=(const unique_id&) = delete; + + bool operator==(const unique_id& other) const noexcept + { + return _value == other._value; + } + +private: + static int next_id() + { + static ::std::atomic id = 0; + return id++; + } + + int _value = next_id(); +}; + +} // end namespace reserved + +template +struct hash> +{ + size_t operator()(const reserved::unique_id& id) const + { + return ::std::hash()(id); + } +}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/utility/unittest.cuh b/cudax/include/cuda/experimental/__stf/utility/unittest.cuh new file mode 100644 index 00000000000..ef7ae74981a --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/utility/unittest.cuh @@ -0,0 +1,709 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Unit testing framework used in CUDASTF + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +# include +# include + +# include + +// One level of macro indirection is required in order to resolve __COUNTER__, +// and get varname1 instead of varname__COUNTER__. +# define _55f56f4e3b45c8cf3fa50b28fed72e2a(a, b) _a56ec7069122ad2e0888a508ecdc4639(a, b) +# define _a56ec7069122ad2e0888a508ecdc4639(a, b) a##b + +/** + * @brief A macro that creates a unique identifier based on a given base name. + * + * This macro generates a unique identifier by concatenating the base name with the value of the + * `__COUNTER__` macro. The `__COUNTER__` macro is a non-standard extension that is supported by + * some compilers, such as GCC and Clang, and it expands to a unique integer value within the + * translation unit. 
+ * + * @code + * // Example usage: + * int CUDASTF_UNIQUE_NAME(my_variable) = 42; // Creates a unique variable name like my_variable0 + * int CUDASTF_UNIQUE_NAME(my_variable) = 43; // Creates a unique variable name like my_variable1 + * @endcode + * + * @note This macro is not portable as the `__COUNTER__` macro is not part of the C++ standard. + * + * @param base The base name for the unique identifier. + */ +# define CUDASTF_UNIQUE_NAME(base) _55f56f4e3b45c8cf3fa50b28fed72e2a(base, __COUNTER__) + +/** + * @brief Macro to check a variety of conditions, throws if the condition is false + * + * @param ... condition to check, followed by zero or more messages to be made part of the text of the exception thrown + * if the condition is false. Messages are concatenated. + * + * @return If condition is a comparison expression, returns the left-hand side operand, otherwise returns a copy of the + * condition + * + * The condition and all optional error messages are evaluated exactly once. + * + * The file and line of the invocation are made part of the exception message, in a format similar to compiler error + * messages (recognizable by tools such as emacs, Visual Studio Code, etc.). In case of a failing comparison, the + * exception thrown will contain information about the left-hand side and right-hand side values. For example, + * `EXPECT(a == b)` will throw an exception that makes the values of `a` and `b` part of the textual content of the + * exception object (accessible by calling `what()`). + * + * @paragraph example Example + * @snippet this EXPECT + */ +# define EXPECT(...) \ + ::cuda::experimental::stf::expecter::validate( \ + RESERVED_STF_SOURCE_LOCATION(), ::cuda::experimental::stf::expecter()->*__VA_ARGS__) + +namespace cuda::experimental::stf +{ + +/* + * @brief Entry point for all comparisons (equality, inequality, and ordering) tested. + * + */ +struct expecter +{ + /* + * @brief Exception embedding location information. + * + */ + struct failure : ::std::runtime_error + { + template + failure(source_location loc, const Msgs&... msgs) + : ::std::runtime_error(text(loc, msgs...)) + {} + + template + static ::std::string text(source_location loc, const Msgs&... msgs) + { + ::std::stringstream s; + s << loc.file_name() << '(' << loc.line() << "): "; + (stream(s, msgs), ...); + auto result = s.str(); + if (!result.empty() && result.back() != '\n') + { + result += '\n'; + } + return result; + } + + private: + template + static void stream(::std::stringstream& s, const T& x) + { + if constexpr (reserved::has_ostream_operator::value) + { + s << x; + } + else + { + s << type_name << "{?}"; + } + } + }; + + /* + * @brief Wrapper for a comparison operation using one of `==`, `!=`, `<`, `>`, `<=`, `>=`. Includes the result and + * the operands. + * + * @tparam T Type of the left-hand side operator (qualifiers and reference included) + * @tparam U Type of the right-hand side operator (qualifiers and reference included) + */ + template + struct comparison_expression + { + bool value; + L lhs; + R rhs; + const char* op; + comparison_expression(bool value, L lhs, R rhs, const char* op) + : value(value) + , lhs(::std::forward(lhs)) + , rhs(::std::forward(rhs)) + , op(op) + {} + }; + + /* + * @brief Wraps a single term (the left-hand side one) in a comparison expression. + * + * @tparam T type of the term, may include a reference or a const reference etc. 
+ */ + template + struct term + { + T value; + + template + term(U&& value) + : value(::std::forward(value)) + {} + + /* + * All comparison operators are defined assuming `const&` inputs. They are evaluated eagerly and return a + * `compariso_expression` object with information necessary for constructing an error message, if any. + */ +# define _9d10c7e37932af3c4f39a5ce7ff00b5a(op) \ + template \ + auto operator op(const U& rhs)&& \ + { \ + using T_noref = std::remove_reference_t; \ + using U_noref = std::remove_reference_t; \ + if constexpr (std::is_integral_v && std::is_integral_v \ + && std::is_signed_v != std::is_signed_v) \ + { \ + using CommonType = std::common_type_t; \ + const bool r = static_cast(value) op static_cast(rhs); \ + return comparison_expression(r, std::forward(value), rhs, #op); \ + } \ + else \ + { \ + /* Direct comparison for other cases */ \ + const bool r = value op rhs; \ + return comparison_expression(r, std::forward(value), rhs, #op); \ + } \ + } + + _9d10c7e37932af3c4f39a5ce7ff00b5a(==); + _9d10c7e37932af3c4f39a5ce7ff00b5a(!=); + _9d10c7e37932af3c4f39a5ce7ff00b5a(<); + _9d10c7e37932af3c4f39a5ce7ff00b5a(>); + _9d10c7e37932af3c4f39a5ce7ff00b5a(<=); + _9d10c7e37932af3c4f39a5ce7ff00b5a(>=); + +# undef _9d10c7e37932af3c4f39a5ce7ff00b5a + + /* + * Operators `^`, `|`, `*`, `/`, `%`, `+`, `-`, `>>`, and `<<` are defined only for carrying the actual + * computation alongside with the `term` wrapper. For example, in the expression `EXPECT(a+b==c)`, which expands + * to `expecter()->*a+b==c`, the operator+ defined below will make sure `a+b` is correctly computed before + * evaluating + * `==`. + */ +# define _9d10c7e37932af3c4f39a5ce7ff00b5a(op) \ + template \ + auto operator op(U&& rhs)&& \ + { \ + using Result = decltype(value op ::std::forward(rhs)); \ + return term(::std::forward(value) op ::std::forward(rhs)); \ + } + + _9d10c7e37932af3c4f39a5ce7ff00b5a(*); + _9d10c7e37932af3c4f39a5ce7ff00b5a(/); + _9d10c7e37932af3c4f39a5ce7ff00b5a(%); + _9d10c7e37932af3c4f39a5ce7ff00b5a(+); + _9d10c7e37932af3c4f39a5ce7ff00b5a(-); + _9d10c7e37932af3c4f39a5ce7ff00b5a(>>); + _9d10c7e37932af3c4f39a5ce7ff00b5a(<<); + _9d10c7e37932af3c4f39a5ce7ff00b5a(&); + _9d10c7e37932af3c4f39a5ce7ff00b5a(^); + _9d10c7e37932af3c4f39a5ce7ff00b5a(|); + _9d10c7e37932af3c4f39a5ce7ff00b5a(=); + _9d10c7e37932af3c4f39a5ce7ff00b5a(+=); + _9d10c7e37932af3c4f39a5ce7ff00b5a(-=); + _9d10c7e37932af3c4f39a5ce7ff00b5a(*=); + _9d10c7e37932af3c4f39a5ce7ff00b5a(/=); + _9d10c7e37932af3c4f39a5ce7ff00b5a(%=); + _9d10c7e37932af3c4f39a5ce7ff00b5a(<<=); + _9d10c7e37932af3c4f39a5ce7ff00b5a(>>=); + _9d10c7e37932af3c4f39a5ce7ff00b5a(&=); + _9d10c7e37932af3c4f39a5ce7ff00b5a(^=); + _9d10c7e37932af3c4f39a5ce7ff00b5a(|=); + +# undef _9d10c7e37932af3c4f39a5ce7ff00b5a + + /* + * Operators `&&`, `||`, `,` do not obey order of evaluation and are disallowed at top level. For example you + * cannot write `EXPECT(a && b)` - instead, you could write `EXPECT((a && b))` but you lose information about + * which side of the conjunction failed. + */ + template + auto operator&&(U&&) = delete; + template + auto operator||(U&&) = delete; + template + auto operator,(U&&) = delete; + }; + + /** + * @brief Initiates a test, use like this: `test ->* a == b;` + * + * @tparam T The type of the initial left-hand side operand + * @param lhs The laft-hand operand, will be saved. 
+ * @return auto + */ + template + term operator->*(T&& lhs) + { + return term(::std::forward(lhs)); + } + + /* + * @brief Validates a single-term expression, as in `EXPECT(v.empty())`. + * + * @tparam T the term's type including qualifiers + * @param loc source location for error reporting + * @param t term value, an exception will be thrown if non-true + */ + template + static decltype(auto) validate(source_location loc, term&& t, const Msgs&... msgs) + { + if (t.value) + { + return ::std::forward(t.value); + } + if constexpr (sizeof...(msgs) == 0) + { + using U = ::std::remove_reference_t; + throw failure( + loc, + "Tested expression of type " + ::std::string(type_name) + " is " + + (std::is_same_v ? "false" + : ::std::is_arithmetic_v ? "zero" + : "null") + + ".\n"); + } + else + { + throw failure(loc, msgs...); + } + } + + /* + * @brief Validates a comparison using `==`, `!=` etc. + * + * @tparam L type of left-hand side operand + * @tparam R type of left-hand side operand + * @param loc source code location + * @param e expression, an exception will be thrown is `e.value` is false + */ + template + static decltype(auto) validate(source_location loc, comparison_expression&& e, const Msgs&... msgs) + { + if (e.value) + { + return ::std::forward(e.lhs); + } + throw failure(loc, e.lhs, ' ', e.op, ' ', e.rhs, " is false.\n", msgs...); + } +}; + +/* + * @brief Small object that runs a unittest + * + * Typically not used directly; just use `UNITTEST` (see below) to define and run tests. + * + * Also includes static globals for test assertions and for counting pass/fail unittests. + */ +template +class unittest; + +/* + * @brief Parameterless unittest implementation + * + * Typically not used directly; just use `UNITTEST` (see below) to define and run tests. + * + * Most unittests are not parameterized. In addition, all parameterized unittests ultimately inherit `unittest<>` so + * this class has most of the unittest paraphernalia within. + */ +template <> +class unittest<> +{ +public: + /* + * @brief Constructor (to be used with the `UNITTEST` macro) taking a name and the current location. + * + * @param name name of the unittest, e.g. `UNITTEST("name goes here") { ... };` + * @param loc current location of the use of `UNITTEST`, automatically provided by the `UNITTEST` macro. + */ + unittest(const char* name, source_location loc) + : name(name) + , loc(mv(loc)) + {} + + /* + * @brief Factory method that creates a (potentially parameterized) unittest from variadic arguments. + * + * @tparam Params types of unittest parameters, if any (multiple types are allowed) + * @param name name of the unittest + * @param loc code location + * @param params unittest parameters, if any + * @return unittest the unittest object with inferred type and properly initialized + * + * This function is not used directly. Instead, the invocation `UNITTEST("name", 1, 2.2)` calls + * `unittest<>::make("name", source_location::current(), 1, 2,2)`. + */ + template + static unittest make(const char* name, source_location loc, Params&&... params) + { + return unittest(name, loc, ::std::forward(params)...); + } + +public: + /* + * @brief Unittest entry point, to be used with the `UNITTEST` macro. 
+ * + * @tparam Fun type of lambda containing the unittest + * @param fun lambda containing the unittest + * @return unittest& `*this` + */ + template + unittest& operator->*([[maybe_unused]] Fun&& fun) + { +# ifdef UNITTESTED_FILE + if (!::std::filesystem::equivalent(loc.file_name(), UNITTESTED_FILE)) + { + return *this; + } + ++total(); + try + { + fun(name); + ++passed(); + // fprintf(stdout, "%s(%zu): PASS(%s)\n", filename.c_str(), line, name); + } + catch (const expecter::failure& e) + { + fprintf(stderr, "%sUNITTEST FAILURE: %s\n", e.what(), name); + } + catch (const ::std::exception& e) + { + fprintf(stderr, "%s(%u): %s\nUNITTEST FAILURE: %s\n", loc.file_name(), loc.line(), e.what(), name); + } +# endif + return *this; + } + +public: + static size_t& passed() + { + static size_t result; + return result; + } + + static size_t& total() + { + static size_t result; + return result; + } + +protected: + const char* const name; + const source_location loc; +}; + +/* + * @brief Implementation of `unittest` for one or more parameters. + * + * @tparam Param first parameter type (may include qualifiers and adornments) + * @tparam Params other parameters, if any (may include qualifiers and adornments) (may be empty) + * + * The hierarchy is set up in a linear fashion: `unittest` inherits `unittest` which in turn + * inherits `unittest<>`. + */ +template +class unittest : public unittest +{ +public: + /** + * @brief Constructor invoked by the `UNITTEST` macro. + * + * @param name name of the unittest + * @param loc code location of the invocation + * @param param first parameter, is saved in this object and forwarded to the unittest lambda + * @param params other parameters, if any + */ + unittest(const char* name, source_location loc, Param param, Params... params) + : unittest(name, loc, ::std::forward(params)...) + , param(::std::forward(param)) + {} + + /** + * @brief Launch the given unittest lambda + * + * @tparam Fun Lambda type + * @param fun lambda object + * @return unittest& `*this` + */ + template + unittest& operator->*(Fun&& fun) + { + unittest<>::operator->*([&](const char* name) { + fun(name, ::std::forward(param)); + }); + if constexpr (sizeof...(Params) > 0) + { + unittest::operator->*(::std::forward(fun)); + } + return *this; + } + +private: + Param param; +}; + +} // namespace cuda::experimental::stf + +// Try to detect when __VA_OPT__ is available +# if defined(__cplusplus) && __cplusplus >= 202002L +# define STF_HAS_UNITTEST_WITH_ARGS 1 +# endif + +# ifdef UNITTESTED_FILE + +int main() +{ + using namespace cuda::experimental::stf; + if (unittest<>::total() != unittest<>::passed()) + { + fprintf(stderr, "FAIL: %zu of %zu\n", unittest<>::total() - unittest<>::passed(), unittest<>::total()); + return 1; + } + // fprintf(stdout, "PASS: %zu of %zu\n", passed(), total()); +} + +# ifdef STF_HAS_UNITTEST_WITH_ARGS +# define UNITTEST(name, ...) \ + [[maybe_unused]] static const auto CUDASTF_UNIQUE_NAME(unittest) = \ + ::cuda::experimental::stf::unittest<>::make(name, RESERVED_STF_SOURCE_LOCATION() __VA_OPT__(, __VA_ARGS__)) \ + ->*[]([[maybe_unused]] const char* unittest_name __VA_OPT__(, [[maybe_unused]] auto&& unittest_param)) +# else +# define UNITTEST(name) \ + [[maybe_unused]] static const auto CUDASTF_UNIQUE_NAME(unittest) = \ + ::cuda::experimental::stf::unittest<>::make(name, RESERVED_STF_SOURCE_LOCATION())->*[ \ + ]([[maybe_unused]] const char* unittest_name) +# endif + +# else + +# ifdef STF_HAS_UNITTEST_WITH_ARGS +# define UNITTEST(name, ...) 
\ + [[maybe_unused]] static const auto CUDASTF_UNIQUE_NAME(unused_unittest) = \ + []([[maybe_unused]] auto&& unittest_name __VA_OPT__(, [[maybe_unused]] auto&& unittest_param)) +# else +# define UNITTEST(name) \ + [[maybe_unused]] static const auto CUDASTF_UNIQUE_NAME(unused_unittest) = \ + []([[maybe_unused]] auto&& unittest_name) +# endif + +# endif + +// Just making sure that core functionality works +# ifdef STF_HAS_UNITTEST_WITH_ARGS +UNITTEST("Numeric NOP test.", int(), float(), double()) +{ + using T = ::std::remove_reference_t; + auto x = EXPECT(T(1) + T(1) == T(2)); + static_assert(::std::is_same_v); +}; +# endif // STF_HAS_UNITTEST_WITH_ARGS + +UNITTEST("EXPECT") +{ + //! [EXPECT] + EXPECT(1 == 1); // pass + EXPECT(1 + 1 == 2, "All we know about mathematics has been upended!"); // pass + auto p = EXPECT(malloc(100) != nullptr); // pass, put the pointer to allocated memory in p + free(p); + try + { + EXPECT(41 == 102); // fail, will throw exception + } + catch (const ::std::exception& e) + { + // The exception message will contain the actual numbers 41 and 102. + EXPECT(::std::string_view(e.what()).find("41 == 102 is false") != ::std::string_view::npos); + } + //! [EXPECT] +}; + +/* Warning! All occurrences of `babe699bbb083d2b0aac2460e75bca96` below will be replaced with `UNITTEST` in + * postprocessing. This is because we instructed doxygen to NOT generate documentation for the symbol UNITTEST, but we + * still want to have the definition of the macro show up in the documentation. So we define + * `babe699bbb083d2b0aac2460e75bca96`, document it, then replace it with `UNITTEST` in all doxygen-generated files. + */ + +/** + * @brief Macro to introduce unit tests. Use `UNITTEST("description") { ... code ... };` at namespace level or inside a + * function + * + * @param name description of the unittest, e.g. `"Widgets are copyable and moveable"` + * @param ... optional parameters in case multiple invocations of the unittest are necessary. Each parameters must be a + * C++ value of any type and is passed to the unittest code with the name `unittest_param`. + * + * The `UNITTEST` invocation must be either at namespace level or in a function body as a declaration. The invocation is + * followed by a braced compound statement ended with a semicolon. Example: + * @snippet this UNITTEST simple + * + * `UNITTEST` accepts optional arguments, which are available inside the unittest body as `unittest_param`. The body of + * the unittest will be executed once per argument. Arguments may have different types, effectively allowing a given + * unittest to run with any number of types. Example: + * @snippet this UNITTEST parameterized + * + * Unittest code will be executed if and only of the macro `UNITTESTED_FILE` is defined as a string that is equal to the + * name of the current file. A project can be unittested one source file at a time by using the + * @c -DUNITTESTED_FILE='"my_file.cpp"' flag in the project's build tool. + * + * If the unittest code throws an exception, it counts as a failure. Exception information will be output to `stderr` + * and testing will continue with the next unittest. + */ +# define babe699bbb083d2b0aac2460e75bca96(name, ...) + +# undef babe699bbb083d2b0aac2460e75bca96 + +//! [UNITTEST simple] +UNITTEST("Simple unittest") +{ + EXPECT(1 != 2); +}; +//! [UNITTEST simple] + +# ifdef STF_HAS_UNITTEST_WITH_ARGS +//! 
[UNITTEST parameterized] +UNITTEST("Parameterized unittest", 42, 42L, 42.0f, 42.0) +{ + // In case the type of the parameter is needed, it can be accessed as follows: + // using T = ::std::remove_reference_t; + EXPECT(unittest_param == 42); +}; +//! [UNITTEST parameterized] +# endif // STF_HAS_UNITTEST_WITH_ARGS + +// Documentation unittest for traits.h +UNITTEST("type_name") +{ + using namespace cuda::experimental::stf; + //! [type_name] + EXPECT(type_name == "int"); + EXPECT(type_name == "double"); + //! [type_name] +}; + +// Another documentation unittest for traits.h +UNITTEST("tuple2tuple") +{ + using namespace cuda::experimental::stf; + //! [tuple2tuple] + auto t = ::std::make_tuple(1, 2, 3); + auto t1 = tuple2tuple(t, [](auto x) { + return x + 1.0; + }); + static_assert(::std::is_same_v>); + EXPECT(t1 == ::std::make_tuple(2.0, 3.0, 4.0)); + //! [tuple2tuple] +}; + +UNITTEST("only_convertible") +{ + using namespace cuda::experimental::stf; + auto a = only_convertible(1, "hello"); + EXPECT(a == 1); +}; + +UNITTEST("all_convertible") +{ + using namespace cuda::experimental::stf; + auto a = all_convertible(1, 2, "hello", 3); + EXPECT((a == ::std::array{1, 2, 3})); +}; + +UNITTEST("shuffled arguments") +{ + using namespace cuda::experimental::stf; + auto x0 = only_convertible_or(42); + EXPECT(x0 == 42); + auto x1 = only_convertible_or(42, 23); + EXPECT(x1 == 23); + auto x2 = only_convertible_or(42, "23", ::std::tuple{1}); + EXPECT(x2 == 42); + // Note that a needs to be iniliazed because shuffled_args_check takes a const ref + int a = 0; + shuffled_args_check(a); + shuffled_args_check(a); + // shuffled_args_check(a); // "Duplicate argument: ..." + // shuffled_args_check(a, a); // "Ambiguous argument: ..." + // shuffled_args_check(a); // "Incompatible argument: ..." + auto t2 = shuffled_tuple("hello", 1); + auto [x, str] = shuffled_tuple("hello", 1); + EXPECT(str == "hello"); + EXPECT(x == 1); + + struct A + { + int value; + }; + struct B + { + ::std::string text; + }; + struct C + { + float number; + }; + + A aa{10}; + B bb{"example"}; + C cc{3.14f}; + // This will create an `::std::tuple` from the provided arguments, + // automatically assigning them based on their types. + shuffled_tuple(bb, cc, aa); + shuffled_tuple(cc, aa); + shuffled_tuple(); + // This will not compile because there are two arguments that can fill the same tuple slot. + // auto my_tuple = shuffled_tuple(bb, cc, aa, aa); + // This will not compile because the argument '5' can convert to both 'int' and 'float', + // causing an ambiguity. + // auto my_tuple = shuffled_tuple(b, c, 5); +}; + +UNITTEST("shuffled_array_tuple") +{ + using namespace cuda::experimental::stf; + auto a = shuffled_array_tuple(1, 2, "hello", 3, "world"); + EXPECT((a == ::std::tuple<::std::array, ::std::array<::std::string, 2>>{{1, 2, 3}, {"hello", "world"}})); +}; + +UNITTEST("source_location") +{ + auto test_func = [](::cuda::experimental::stf::source_location loc = RESERVED_STF_SOURCE_LOCATION()) { + // Check the source location metadata + EXPECT(loc.line() > 0); // The line number should be positive + EXPECT(loc.file_name() != nullptr); // File name should not be null + EXPECT(loc.function_name() != nullptr); // Function name should not be null + }; + + // Call the test function and validate the source location information + test_func(); +}; + +#else // DOXYGEN_SHOULD_SKIP_THIS Do not document +// Ensure these are ignored by Doxygen +# define UNITTEST(name, ...) 
+#endif // DOXYGEN_SHOULD_SKIP_THIS Do not document diff --git a/cudax/include/cuda/experimental/__stf/utility/unstable_unique.cuh b/cudax/include/cuda/experimental/__stf/utility/unstable_unique.cuh new file mode 100644 index 00000000000..99fcc0ea0b8 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/utility/unstable_unique.cuh @@ -0,0 +1,175 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +/** + * @file + * @brief Implementation of unstable_unique + */ + +#include + +#include +#include +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief Removes duplicates from a range using a custom predicate. + * + * This function operates from both sides of the range, moving elements from the right-hand side to the left to + * eliminate duplicates. The order of the elements is not preserved. + * + * @tparam iterator The type of the iterator. + * @tparam BinaryPredicate The type of the predicate. + * @param first, last The range of elements to remove duplicates from. + * @param p The predicate to use for comparing elements. + * + * @return iterator The new end of the range after duplicates have been removed. + * + * @par Example: + * @code + * ::std::vector v = {1, 1, 2, 2, 3, 3, 4, 4, 5, 5}; + * auto new_end = unstable_unique(v.begin(), v.end(), [](int a, int b) { return a == b; }); + * v.erase(new_end, v.end()); + * // v is now {1, 5, 2, 4, 3, 5} + * @endcode + */ +template +iterator unstable_unique(iterator first, iterator last, BinaryPredicate p) +{ + if (first == last || ::std::next(first) == last) + { + return last; // 0 or 1-element range + } + + ++first; // position first on the first unknown element (first element will stay by definition) + bool first_is_known_duplicate = false; // see below for description + + for (; first < last; ++first) + { + if (!first_is_known_duplicate) + { + if (!p(*first, *::std::prev(first))) + { + continue; + } + } + // Here we know that `*first` is a dupe and should be replaced. Also the range is not empty. + assert(first < last); + for (--last;; --last) + { + if (first == last) + { + return first; // just past the last unique element + } + assert(first < last); + if (!p(*last, *::std::prev(last))) + { + break; + } + } + assert(!p(*first, *last)); + // Here we know we're good to replace *first with *last. + // Complicating matter: if we do so, we "forget" whether *::std::next(first) is a duplicate of *first. + // Maintain `first_is_known_duplicate` to keep track of that. + first_is_known_duplicate = p(*first, *::std::next(first)); + *first = mv(*last); + } + + return first; +} + +/** + * @brief Removes duplicates from a range using the built-in operator==. + * + * This function operates like `unstable_unique` above with `operator==` as the predicate. + * + * @tparam iterator The type of the iterator. 
+ * @param first, last The range of elements to remove duplicates from. + * + * @return iterator The new end of the range after duplicates have been removed. + * + * @par Example: + * @code + * ::std::vector v = {1, 1, 2, 2, 3, 3, 4, 4, 5, 5}; + * auto new_end = unstable_unique(v.begin(), v.end()); + * v.erase(new_end, v.end()); + * // v is now {1, 5, 2, 4, 3, 5} + * @endcode + */ +template +iterator unstable_unique(iterator first, iterator last) +{ + return unstable_unique(first, last, [](auto& a, auto& b) { + return a == b; + }); +} + +#ifdef UNITTESTED_FILE +UNITTEST("unstable_unique") +{ + ::std::vector v1; + auto new_end1 = unstable_unique(v1.begin(), v1.end()); + EXPECT(v1.end() == new_end1); + + ::std::vector v2 = {1, 2, 3, 4, 5}; + auto new_end2 = unstable_unique(v2.begin(), v2.end()); + // ::std::cout << new_end2 - v2.begin() << '\n'; + EXPECT(v2.end() == new_end2); + // ::std::copy(v2.begin(), new_end2, ::std::ostream_iterator(::std::cout, " ")); + EXPECT(::std::vector({1, 2, 3, 4, 5}) == v2); + + ::std::vector v3 = {1, 1, 2, 3, 4, 5}; + auto new_end3 = unstable_unique(v3.begin(), v3.end()); + // ::std::cerr << new_end3 - v3.begin() << '\n'; + EXPECT(v3.begin() + 5 == new_end3); + EXPECT(::std::vector({1, 5, 2, 3, 4, 5}) == v3); + ::std::vector v4 = {1, 1, 2, 2, 3, 3, 4, 4, 5, 5}; + auto new_end4 = unstable_unique(v4.begin(), v4.end()); + // ::std::cerr << new_end4 - v4.begin() << '\n'; + EXPECT(v4.begin() + 5 == new_end4); + EXPECT(::std::vector({1, 5, 2, 4, 3, 3, 4, 4, 5, 5}) == v4); + + ::std::vector v5 = {1, 1, 1, 1, 1}; + auto new_end5 = unstable_unique(v5.begin(), v5.end()); + EXPECT(1 + v5.begin() == new_end5); + EXPECT(::std::vector({1, 1, 1, 1, 1}) == v5); + + ::std::vector v6 = {1, 1, 1, 1, 1, 2}; + auto new_end6 = unstable_unique(v6.begin(), v6.end()); + EXPECT(v6.begin() + 2 == new_end6); + EXPECT(::std::vector({1, 2, 1, 1, 1, 2}) == v6); + + ::std::vector v7 = {1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 4, 5}; + assert(v7.size() == 15); + auto new_end7 = unstable_unique(v7.begin(), v7.end(), [](int a, int b) { + return a == b; + }); + // ::std::cerr << new_end7 - v7.begin() << '\n'; + EXPECT(v7.begin() + 5 == new_end7); + EXPECT(::std::vector{1, 5, 4, 3, 2, 1, 1, 1, 1, 2, 2, 2, 3, 4, 5} == v7); +}; +#endif // UNITTESTED_FILE + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/stf.cuh b/cudax/include/cuda/experimental/stf.cuh new file mode 100644 index 00000000000..a26d873fcb5 --- /dev/null +++ b/cudax/include/cuda/experimental/stf.cuh @@ -0,0 +1,1834 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** @file + * + * @brief Main include file for the CUDASTF library. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace cuda::experimental::stf +{ + +/** + * @brief A context is an enviroment for executing cudastf tasks. 
+ * + */ +class context +{ + template + class unified_scope + { + public: + unified_scope(T1 arg) + : payload(mv(arg)) + {} + unified_scope(T2 arg) + : payload(mv(arg)) + {} + + /// Get the string attached to the task for debugging purposes + const ::std::string& get_symbol() const + { + return ::std::visit( + [&](auto& self) { + return self.get_symbol(); + }, + payload); + } + + auto& set_symbol(::std::string s) & + { + ::std::visit( + [&](auto& self) { + self.set_symbol(mv(s)); + }, + payload); + return *this; + } + + auto&& set_symbol(::std::string s) && + { + ::std::visit( + [&](auto& self) { + self.set_symbol(mv(s)); + }, + payload); + return mv(*this); + } + + template + void operator->*(Fun&& f) + { + if (payload.index() == 0) + { + ::std::get<0>(payload)->*::std::forward(f); + } + else + { + EXPECT(payload.index() == 1UL, "Uninitialized scope."); + ::std::get<1>(payload)->*::std::forward(f); + } + } + + private: + ::std::variant payload; + }; + + /* + * A task that can be either a stream task or a graph task. + */ + template + class unified_task + { + public: + unified_task(stream_task task) + : payload(mv(task)) + {} + unified_task(graph_task task) + : payload(mv(task)) + {} + + void set_symbol(::std::string s) & + { + ::std::visit( + [&](auto& self) { + self.set_symbol(mv(s)); + }, + payload); + } + + auto&& set_symbol(::std::string s) && + { + ::std::visit( + [&](auto& self) { + self.set_symbol(mv(s)); + }, + payload); + return mv(*this); + } + + /** + * @brief Add dependencies to this task. + * + * @tparam Args + * @param args + * @return stream_or_graph_dynamic_task& + */ + template + unified_task& add_deps(Args&&... args) + { + ::std::visit( + [&](auto& self) { + self.add_deps(::std::forward(args)...); + }, + payload); + return *this; + } + + /** + * @brief retrieve the data instance associated to an + * index in a task. + * + * @tparam T + * @param submitted index + * @return slice + */ + template + decltype(auto) get(size_t submitted_index) const + { + return ::std::visit( + [&](auto& self) -> decltype(auto) { + return self.template get(submitted_index); + }, + payload); + } + + template + void operator->*(Fun&& f) + { + ::std::visit( + [&](auto& self) { + self->*f; + }, + payload); + } + + private: + ::std::variant, graph_task> payload; + }; + +public: + /** + * @brief Default constructor for the context class. + */ + context() = default; + + /** + * @brief Constructs a stream context with a CUDA stream and an optional asynchronous resource handle. + * + * @param stream The CUDA stream to be used in the context. + * @param handle Optional asynchronous resource handle. + */ + context(cudaStream_t stream, async_resources_handle handle = async_resources_handle(nullptr)) + : payload(stream_ctx(stream, handle)) + { + // The default choice is stream_ctx, otherwise we should assign a graph_ctx with the appropriate parameters + } + + /** + * @brief Constructs a stream context with an asynchronous resource handle. + * + * @param handle The asynchronous resource handle. + */ + context(async_resources_handle handle) + : payload(stream_ctx(handle)) + { + // The default choice is stream_ctx, otherwise we should assign a graph_ctx with the appropriate parameters + } + + /** + * @brief Constructs a context from a stream context. + * + * @param ctx The context to be assigned. + */ + context(stream_ctx ctx) + : payload(mv(ctx)) + {} + + /** + * @brief Constructs a context from a graph context. + * + * @param ctx The context to be assigned. 
+ */ + context(graph_ctx ctx) + : payload(mv(ctx)) + {} + + /** + * @brief Assigns a specific context type to the context. + * + * @tparam Ctx The type of the context to be assigned. + * @param ctx The context to be assigned. + * @return Reference to the updated context. + */ + template + context& operator=(Ctx ctx) + { + payload = mv(ctx); + return *this; + } + + /** + * @brief Converts the context to a string representation. + * + * @return A string representation of the context. + */ + ::std::string to_string() const + { + assert(payload.index() != ::std::variant_npos && "Context is not initialized"); + return ::std::visit( + [&](auto& self) { + return self.to_string(); + }, + payload); + } + + /** + * @brief Creates logical data with specified sizes. + * + * @tparam T The type of the logical data. + * @tparam Sizes The sizes of the logical data dimensions. + * @param elements The number of elements. + * @param othersizes The sizes of other dimensions. + */ + template + auto logical_data(size_t elements, Sizes... othersizes) + { + assert(payload.index() != ::std::variant_npos && "Context is not initialized"); + return ::std::visit( + [&](auto& self) { + return self.template logical_data(elements, othersizes...); + }, + payload); + } + + /** + * @brief Creates logical data with specified parameters. + * + * @tparam P0 The type of the first parameter. + * @tparam Ps The types of the other parameters. + * @param p0 The first parameter. + * @param ps The other parameters. + */ + template + auto logical_data(P0&& p0, Ps&&... ps) + { + assert(payload.index() != ::std::variant_npos && "Context is not initialized"); + using T0 = ::std::remove_reference_t; + if constexpr (::std::is_integral_v) + { + // Assume we create an array with the given length, so forward to the previous function. + return logical_data(size_t(p0), ::std::forward(ps)...); + } + else + { + // Forward all parameters to the homonym function in the context. + return ::std::visit( + [&](auto& self) { + return self.logical_data(::std::forward(p0), ::std::forward(ps)...); + }, + payload); + } + } + + template + frozen_logical_data freeze(::cuda::experimental::stf::logical_data d, + access_mode m = access_mode::read, + data_place where = data_place::invalid) + { + return ::std::visit( + [&](auto& self) { + return self.freeze(mv(d), m, mv(where)); + }, + payload); + } + + /** + * @brief Creates logical data from a pointer and size. + * + * @tparam T The type of the logical data. + * @param p The pointer to the data. + * @param n The number of elements. + * @param dplace The data place of the logical data (default is host). + * @return The created logical data. + */ + template + auto logical_data(T* p, size_t n, data_place dplace = data_place::host) + { + EXPECT(dplace != data_place::invalid); + assert(payload.index() != ::std::variant_npos && "Context is not initialized"); + return ::std::visit( + [&](auto& self) { + return self.logical_data(make_slice(p, n), mv(dplace)); + }, + payload); + } + + template + unified_task task(exec_place e_place, task_dep... deps) + { + assert(payload.index() != ::std::variant_npos && "Context is not initialized"); + // Workaround: For some obscure reason `mv(deps)...` fails to compile + return ::std::visit( + [&](auto& self) { + return unified_task(self.task(mv(e_place), ::std::move(deps)...)); + }, + payload); + } + + template + unified_task task(task_dep... 
deps) + { + return task(default_exec_place(), mv(deps)...); + } + + /* + * parallel_for : apply an operation over a shaped index space + */ + template + auto parallel_for(exec_place e_place, S shape, task_dep... deps) + { +#ifndef __CUDACC__ + // We want a test that always fail, but is only triggered when the + // function is instantiated so the test relies on actual templated + // types. + static_assert(::std::is_same_v, "parallel_for is only supported with CUDA compilers."); +#endif + EXPECT(payload.index() != ::std::variant_npos, "Context is not initialized."); + using result_t = unified_scope, + reserved::parallel_for_scope>; + return ::std::visit( + [&](auto& self) { + return result_t(self.parallel_for(mv(e_place), mv(shape), deps...)); + }, + payload); + } + + template + auto parallel_for(partitioner_t p, exec_place e_place, S shape, task_dep... deps) + { +#ifndef __CUDACC__ + // We want a test that always fail, but is only triggered when the + // function is instantiated so the test relies on actual templated + // types. + static_assert(::std::is_same_v, "parallel_for is only supported with CUDA compilers."); +#endif + EXPECT(payload.index() != ::std::variant_npos, "Context is not initialized."); + using result_t = unified_scope, + reserved::parallel_for_scope>; + return ::std::visit( + [&](auto& self) { + return result_t(self.parallel_for(mv(p), mv(e_place), mv(shape), deps...)); + }, + payload); + } + + template + auto parallel_for(S shape, task_dep... deps) + { + return parallel_for(default_exec_place(), mv(shape), mv(deps)...); + } + + template + auto host_launch(task_dep... deps) + { + assert(payload.index() != ::std::variant_npos && "Context is not initialized"); + using result_t = unified_scope, + reserved::host_launch_scope>; + return ::std::visit( + [&](auto& self) { + return result_t(self.host_launch(deps...)); + }, + payload); + } + + template + auto cuda_kernel(task_dep... deps) + { + assert(payload.index() != ::std::variant_npos && "Context is not initialized"); + // false : we expect a single kernel descriptor in the lambda function return type + using result_t = unified_scope, + reserved::cuda_kernel_scope>; + return ::std::visit( + [&](auto& self) { + return result_t(self.cuda_kernel(deps...)); + }, + payload); + } + + template + auto cuda_kernel(exec_place e_place, task_dep... deps) + { + assert(payload.index() != ::std::variant_npos && "Context is not initialized"); + // false : we expect a single kernel descriptor in the lambda function return type + using result_t = unified_scope, + reserved::cuda_kernel_scope>; + return ::std::visit( + [&](auto& self) { + return result_t(self.cuda_kernel(e_place, deps...)); + }, + payload); + } + + template + auto cuda_kernel_chain(task_dep... deps) + { + assert(payload.index() != ::std::variant_npos && "Context is not initialized"); + // true : we expect a vector of cuda kernel descriptors in the lambda function return type + using result_t = unified_scope, + reserved::cuda_kernel_scope>; + return ::std::visit( + [&](auto& self) { + return result_t(self.cuda_kernel_chain(deps...)); + }, + payload); + } + + template + auto cuda_kernel_chain(exec_place e_place, task_dep... 
deps) + { + assert(payload.index() != ::std::variant_npos && "Context is not initialized"); + // true : we expect a vector of cuda kernel descriptors in the lambda function return type + using result_t = unified_scope, + reserved::cuda_kernel_scope>; + return ::std::visit( + [&](auto& self) { + return result_t(self.cuda_kernel_chain(e_place, deps...)); + }, + payload); + } + + template + auto launch(thread_hierarchy_spec_t spec, exec_place e_place, task_dep... deps) + { +#ifndef __CUDACC__ + // We want a test that always fail, but is only triggered when the + // function is instantiated so the test relies on actual templated + // types. + static_assert(::std::is_same_v, + "launch is only supported with CUDA compilers."); +#endif + + using result_t = unified_scope, + reserved::launch_scope>; + return ::std::visit( + [&](auto& self) { + using Self = ::std::remove_reference_t; + return result_t(self.launch(mv(spec), mv(e_place), deps...)); + }, + payload); + } + + // /* Default execution policy, explicit place */ + // default depth to avoid breaking all codes (XXX temporary) + template + auto launch(exec_place e_place, task_dep... deps) + { + return launch(par(par()), mv(e_place), (deps)...); + } + + // /* Default execution policy, on automatically selected device */ + template + auto launch(task_dep... deps) + { + return launch(default_exec_place(), mv(deps)...); + } + + template + auto launch(thread_hierarchy_spec ths, task_dep... deps) + { + return launch(mv(ths), default_exec_place(), mv(deps)...); + } + + auto repeat(size_t count) + { + using result_t = unified_scope, reserved::repeat_scope>; + return ::std::visit( + [&](auto& self) { + using Self = ::std::remove_reference_t; + return result_t(self.repeat(count)); + }, + payload); + } + + auto repeat(::std::function condition) + { + using result_t = unified_scope, reserved::repeat_scope>; + return ::std::visit( + [&](auto& self) { + using Self = ::std::remove_reference_t; + return result_t(self.repeat(mv(condition))); + }, + payload); + } + + cudaStream_t task_fence() + { + assert(payload.index() != ::std::variant_npos && "Context is not initialized"); + return ::std::visit( + [&](auto& self) { + return self.task_fence(); + }, + payload); + } + + void finalize() + { + assert(payload.index() != ::std::variant_npos && "Context is not initialized"); + ::std::visit( + [](auto& self) { + self.finalize(); + }, + payload); + } + + void submit() + { + assert(payload.index() != ::std::variant_npos && "Context is not initialized"); + ::std::visit( + [](auto& self) { + self.submit(); + }, + payload); + } + + void set_allocator(block_allocator_untyped custom_allocator) + { + assert(payload.index() != ::std::variant_npos && "Context is not initialized"); + ::std::visit( + [&](auto& self) { + self.set_allocator(mv(custom_allocator)); + }, + payload); + } + + void attach_allocator(block_allocator_untyped custom_allocator) + { + assert(payload.index() != ::std::variant_npos && "Context is not initialized"); + ::std::visit( + [&](auto& self) { + self.attach_allocator(mv(custom_allocator)); + }, + payload); + } + + void update_uncached_allocator(block_allocator_untyped custom) + { + ::std::visit( + [&](auto& self) { + self.update_uncached_allocator(mv(custom)); + }, + payload); + } + + void change_epoch() + { + assert(payload.index() != ::std::variant_npos && "Context is not initialized"); + ::std::visit( + [](auto& self) { + self.change_epoch(); + }, + payload); + } + + ::std::shared_ptr get_dot() + { + assert(payload.index() != ::std::variant_npos && 
"Context is not initialized"); + return ::std::visit( + [](auto& self) { + return self.get_dot(); + }, + payload); + } + + template + void set_parent_ctx(parent_ctx_t& parent_ctx) + { + assert(payload.index() != ::std::variant_npos && "Context is not initialized"); + reserved::per_ctx_dot::set_parent_ctx(parent_ctx.get_dot(), get_dot()); + ::std::visit( + [&](auto& self) { + self.set_parent_ctx(parent_ctx.get_dot()); + }, + payload); + } + + /* Indicates whether the underlying context is a graph context, so that we + * may specialize code to deal with the specific constraints of CUDA graphs. */ + bool is_graph_ctx() const + { + assert(payload.index() != ::std::variant_npos && "Context is not initialized"); + return (payload.index() == 1); + } + + async_resources_handle& async_resources() const + { + // if (payload.index() == 0) { + // return ::std::get<0>(payload).async_resources(); + // } + // EXPECT(payload.index() == 1, "Uninitialized context."); + // return ::std::get<1>(payload).async_resources(); + return ::std::visit( + [&](auto& self) -> async_resources_handle& { + return self.async_resources(); + }, + payload); + } + + // Shortcuts to manipulate the current affinity stored in the async_resources_handle of the ctx + void push_affinity(::std::vector<::std::shared_ptr> p) const + { + async_resources().push_affinity(mv(p)); + } + void push_affinity(::std::shared_ptr p) const + { + async_resources().push_affinity(mv(p)); + } + void pop_affinity() const + { + async_resources().pop_affinity(); + } + const ::std::vector<::std::shared_ptr>& current_affinity() const + { + return async_resources().current_affinity(); + } + const exec_place& current_exec_place() const + { + assert(current_affinity().size() > 0); + return *(current_affinity()[0]); + } + + bool has_affinity() const + { + return async_resources().has_affinity(); + } + + /** + * @brief Determines the default execution place for a given context, which + * corresponds to the execution place when no place is provided. + * + * @return execution place used by constructs where the place is implicit. + * + * By default, we select the current device, unless an affinity was set in the + * context, in which case we take the first execution place in the current + * places. + */ + exec_place default_exec_place() const + { + return has_affinity() ? current_exec_place() : exec_place::current_device(); + } + + graph_ctx to_graph_ctx() const + { + // Check if payload holds graph_ctx (index == 1) + if (auto ctx = ::std::get_if(&payload)) + { + return *ctx; + } + else + { + throw ::std::runtime_error("Payload does not hold graph_ctx"); + } + } + +private: + template + auto visit(Fun&& fun) + -> decltype(::std::visit(::std::forward(fun), ::std::declval<::std::variant&>())) + { + assert(payload.index() != ::std::variant_npos && "Context is not initialized"); + return ::std::visit(::std::forward(fun), payload); + } + +public: + ::std::variant payload; +}; + +#ifdef UNITTESTED_FILE +UNITTEST("context") +{ + context ctx; + ctx.task_fence(); + ctx.submit(); + ctx.finalize(); +}; + +UNITTEST("context from existing contexts") +{ + stream_ctx ctx; + context unified_ctx = ctx; + unified_ctx.finalize(); +}; + +UNITTEST("context to make generic code") +{ + auto f = [](context ctx) { + ctx.task_fence(); + }; + + stream_ctx ctx1; + f(ctx1); + ctx1.finalize(); + + graph_ctx ctx2; + f(ctx2); + ctx2.finalize(); +}; + +UNITTEST("context to make select backend at runtime") +{ + bool test = true; + context ctx = test ? 
context(graph_ctx()) : context(stream_ctx()); + ctx.finalize(); +}; + +UNITTEST("context to make select backend at runtime (2)") +{ + // stream_ctx by default + context ctx; + bool test = true; + if (test) + { + ctx = graph_ctx(); + } + ctx.finalize(); +}; + +UNITTEST("context is_graph_ctx") +{ + context ctx; + EXPECT(!ctx.is_graph_ctx()); + ctx.finalize(); + + context ctx2 = graph_ctx(); + EXPECT(ctx2.is_graph_ctx()); + ctx2.finalize(); +}; + +UNITTEST("context with arguments") +{ + cudaStream_t stream; + cuda_safe_call(cudaStreamCreate(&stream)); + + async_resources_handle h; + + context ctx(h); + ctx.finalize(); + + context ctx2(stream, h); + ctx2.finalize(); + + context ctx3 = graph_ctx(h); + ctx3.finalize(); + + context ctx4 = graph_ctx(stream, h); + ctx4.finalize(); + + cuda_safe_call(cudaStreamDestroy(stream)); +}; + +# ifdef __CUDACC__ +namespace reserved +{ +inline void unit_test_context_pfor() +{ + context ctx; + SCOPE(exit) + { + ctx.finalize(); + }; + auto lA = ctx.logical_data(shape_of>(64)); + ctx.parallel_for(lA.shape(), lA.write())->*[] _CCCL_DEVICE(size_t i, slice A) { + A(i) = 2 * i; + }; + ctx.host_launch(lA.read())->*[](auto A) { + for (size_t i = 0; i < 64; i++) + { + EXPECT(A(i) == 2 * i); + } + }; +} + +UNITTEST("context parallel_for") +{ + unit_test_context_pfor(); +}; + +template +inline void unit_test_context_launch() +{ + context ctx; + if constexpr (use_graph) + { + ctx = graph_ctx(); + } + + /* Statically decide the type of the spec (to avoid duplicating code) */ + auto spec = []() { + if constexpr (use_con) + { + return con(); + } + else + { + return par(); + } + }(); + + SCOPE(exit) + { + ctx.finalize(); + }; + auto lA = ctx.logical_data(shape_of>(64)); + ctx.launch(spec, lA.write())->*[] _CCCL_DEVICE(auto th, slice A) { + for (auto i : th.apply_partition(shape(A))) + { + A(i) = 2 * i; + } + }; + ctx.host_launch(lA.read())->*[](auto A) { + for (size_t i = 0; i < 64; i++) + { + EXPECT(A(i) == 2 * i); + } + }; +} + +UNITTEST("context launch") +{ + // par() (normal launch) + unit_test_context_launch(); + unit_test_context_launch(); + + // con() cooperative kernel + unit_test_context_launch(); + unit_test_context_launch(); +}; + +/* Do not provide an exec_place, but put a spec */ +inline void unit_test_context_launch_spec_noplace() +{ + context ctx; + SCOPE(exit) + { + ctx.finalize(); + }; + auto lA = ctx.logical_data(shape_of>(64)); + ctx.launch(par(), lA.write())->*[] _CCCL_DEVICE(auto th, slice A) { + for (auto i : th.apply_partition(shape(A))) + { + A(i) = 2 * i; + } + }; + ctx.host_launch(lA.read())->*[](auto A) { + for (size_t i = 0; i < 64; i++) + { + EXPECT(A(i) == 2 * i); + } + }; +} + +UNITTEST("context launch spec noplace") +{ + unit_test_context_launch_spec_noplace(); +}; + +inline void unit_test_context_launch_generic() +{ + context ctx; + SCOPE(exit) + { + ctx.finalize(); + }; + auto lA = ctx.logical_data(shape_of>(64)); + ctx.host_launch(lA.write())->*[](slice A) { + for (auto i : shape(A)) + { + A(i) = 2 * i; + } + }; + + exec_place where2 = exec_place::current_device(); + // This will not compile because launch implementation will try to generate a CUDA kernel from that non device + // lambda + ctx.launch(where2, lA.rw())->*[] _CCCL_DEVICE(auto th, slice A) { + for (auto i : th.apply_partition(shape(A))) + { + A(i) = 2 * A(i); + } + }; + + ctx.host_launch(lA.read())->*[](auto A) { + for (size_t i = 0; i < 64; i++) + { + EXPECT(A(i) == 4 * i); + } + }; +} + +UNITTEST("context launch test generic") +{ + unit_test_context_launch_generic(); +}; + 
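+/* Illustrative sketch of the cuda_kernel construct declared in the context
+ * front-end above: the task body does not launch anything itself, it returns
+ * a kernel descriptor (cuda_kernel_chain returns a vector of them instead).
+ * This sketch assumes the body receives the data instances, as host_launch
+ * does, and that the descriptor is built as cuda_kernel_desc{kernel, gridDim,
+ * blockDim, sharedMem, args...}; see examples/stf/01-axpy-cuda_kernel.cu for
+ * the reference usage. */
+template <typename T>
+__global__ void axpy_kernel(T a, slice<const T> x, slice<T> y)
+{
+  int tid      = blockIdx.x * blockDim.x + threadIdx.x;
+  int nthreads = gridDim.x * blockDim.x;
+  for (int i = tid; i < static_cast<int>(x.size()); i += nthreads)
+  {
+    y(i) += a * x(i);
+  }
+}
+
+inline void unit_test_context_cuda_kernel_sketch()
+{
+  context ctx;
+  SCOPE(exit)
+  {
+    ctx.finalize();
+  };
+  double alpha = 2.0;
+  auto lX      = ctx.logical_data(shape_of<slice<double>>(64));
+  auto lY      = ctx.logical_data(shape_of<slice<double>>(64));
+  ctx.parallel_for(lX.shape(), lX.write(), lY.write())->*[] _CCCL_DEVICE(size_t i, auto x, auto y) {
+    x(i) = 1.0 * i;
+    y(i) = 0.0;
+  };
+  // The body only describes the kernel launch; the runtime performs it.
+  ctx.cuda_kernel(lX.read(), lY.rw())->*[&](slice<const double> x, slice<double> y) {
+    return cuda_kernel_desc{axpy_kernel<double>, 16, 128, 0, alpha, x, y};
+  };
+  ctx.host_launch(lX.read(), lY.read())->*[&](auto x, auto y) {
+    for (size_t i = 0; i < 64; i++)
+    {
+      EXPECT(y(i) == alpha * x(i));
+    }
+  };
+}
+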
+inline void unit_test_context_launch_exec_places() +{ + // OK with this + // stream_ctx ctx; + + // does not compile with context + context ctx; + SCOPE(exit) + { + ctx.finalize(); + }; + auto lA = ctx.logical_data(shape_of>(64)); + ctx.host_launch(lA.write())->*[](slice A) { + for (auto i : shape(A)) + { + A(i) = 2 * i; + } + }; + + ctx.launch(exec_place::current_device(), lA.rw())->*[] _CCCL_DEVICE(auto th, slice A) { + for (auto i : th.apply_partition(shape(A))) + { + A(i) = 2 * A(i); + } + }; + + ctx.host_launch(lA.read())->*[](auto A) { + for (size_t i = 0; i < 64; i++) + { + EXPECT(A(i) == 4 * i); + } + }; +} + +UNITTEST("context launch specific exec places") +{ + unit_test_context_launch_exec_places(); +}; + +inline void unit_test_context_launch_sync() +{ + // OK with this (workaround) + stream_ctx ctx; + + // does not compile with context + // context ctx; + SCOPE(exit) + { + ctx.finalize(); + }; + auto lA = ctx.logical_data(shape_of>(64)); + + auto spec = con<1024>(); + ctx.host_launch(lA.write())->*[](slice A) { + for (auto i : shape(A)) + { + A(i) = 2 * i; + } + }; + + ctx.launch(spec, exec_place::current_device(), lA.rw())->*[] _CCCL_DEVICE(auto th, slice A) { + for (auto i : th.apply_partition(shape(A))) + { + A(i) = 2 * A(i); + } + + th.sync(); + }; + + ctx.host_launch(lA.read())->*[](auto A) { + for (size_t i = 0; i < 64; i++) + { + EXPECT(A(i) == 4 * i); + } + }; +} + +UNITTEST("context launch sync") +{ + unit_test_context_launch_sync(); +}; + +inline void unit_test_context_repeat() +{ + context ctx; + + constexpr size_t K = 10; + + // does not compile with context + // context ctx; + SCOPE(exit) + { + ctx.finalize(); + }; + auto lA = ctx.logical_data(shape_of>(64)); + + ctx.launch(lA.write())->*[] _CCCL_DEVICE(auto th, slice A) { + for (auto i : th.apply_partition(shape(A))) + { + A(i) = i; + } + }; + + // Repeat K times : A(i) = 2 * A(i) + ctx.repeat(K)->*[&](context ctx, size_t) { + ctx.launch(lA.rw())->*[] _CCCL_DEVICE(auto th, slice A) { + for (auto i : th.apply_partition(shape(A))) + { + A(i) = 2 * A(i); + } + }; + }; + + // Check that we have A(i) = 2^K * i + ctx.host_launch(lA.read())->*[](auto A) { + for (size_t i = 0; i < 64; i++) + { + EXPECT(A(i) == (1 << K) * i); + } + }; +} + +UNITTEST("context repeat") +{ + unit_test_context_repeat(); +}; + +template +inline void unit_test_context_launch_implicit_widths(spec_t spec) +{ + // OK with this (workaround) + stream_ctx ctx; + + // does not compile with context + // context ctx; + SCOPE(exit) + { + ctx.finalize(); + }; + auto lA = ctx.logical_data(shape_of>(64)); + + ctx.host_launch(lA.write())->*[](slice A) { + for (auto i : shape(A)) + { + A(i) = 2 * i; + } + }; + + ctx.launch(spec, exec_place::current_device(), lA.rw())->*[] _CCCL_DEVICE(auto th, slice A) { + for (auto i : th.apply_partition(shape(A))) + { + A(i) = 2 * A(i); + } + }; + + ctx.host_launch(lA.read())->*[](auto A) { + for (size_t i = 0; i < 64; i++) + { + EXPECT(A(i) == 4 * i); + } + }; +} + +UNITTEST("context launch implicit widths") +{ + unit_test_context_launch_implicit_widths(par()); + unit_test_context_launch_implicit_widths(par(par())); +}; + +// make sure we have the different interfaces to declare logical_data +UNITTEST("context logical_data") +{ + context ctx; + // shape of 32 double + auto lA = ctx.logical_data(32); + auto lB = ctx.logical_data(32, 128); + int array[128]; + auto lC = ctx.logical_data(array); + int array2[128]; + auto lD = ctx.logical_data(&array2[0], 128); + ctx.finalize(); +}; + +UNITTEST("context task") +{ + // stream_ctx 
ctx; + context ctx; + int a = 42; + + auto la = ctx.logical_data(&a, 1); + + auto lb = ctx.logical_data(la.shape()); + + ctx.task(la.read(), lb.write())->*[](auto s, auto a, auto b) { + // no-op + cudaMemcpyAsync(&a(0), &b(0), sizeof(int), cudaMemcpyDeviceToDevice, s); + }; + + ctx.finalize(); +}; + +inline void unit_test_recursive_apply() +{ + context ctx; + SCOPE(exit) + { + ctx.finalize(); + }; + + /* 2 level spec */ + auto lA = ctx.logical_data(shape_of>(1280)); + + /* This creates a spec with 2 levels, and applies a partitionner defined as + * the composition of blocked() in the first level, and cyclic() in the second + * level */ + auto spec = par<8>(par<16>()); + ctx.launch(spec, exec_place::current_device(), lA.write())->*[] _CCCL_DEVICE(auto th, slice A) { + for (auto i : th.apply_partition(shape(A), ::std::tuple())) + { + A(i) = 2 * i + 7; + } + }; + + ctx.host_launch(lA.read())->*[](auto A) { + for (size_t i = 0; i < 1280; i++) + { + EXPECT(A(i) == 2 * i + 7); + } + }; + + /* 3 level spec */ + auto lB = ctx.logical_data(shape_of>(1280)); + + auto spec3 = par(par<8>(par<16>())); + ctx.launch(spec3, exec_place::current_device(), lB.write())->*[] _CCCL_DEVICE(auto th, slice B) { + for (auto i : th.apply_partition(shape(B), ::std::tuple())) + { + B(i) = 2 * i + 7; + } + }; + + ctx.host_launch(lB.read())->*[](auto B) { + for (size_t i = 0; i < 1280; i++) + { + EXPECT(B(i) == 2 * i + 7); + } + }; +} + +UNITTEST("launch recursive apply") +{ + unit_test_recursive_apply(); +}; + +UNITTEST("logical data slice const") +{ + context ctx; + double A[128]; + slice cA = make_slice((const double*) &A[0], 128); + auto lA = ctx.logical_data(cA); + ctx.task(lA.read())->*[](cudaStream_t, auto A) { + static_assert(::std::is_same_v>); + }; + ctx.finalize(); +}; + +inline void unit_test_partitioner_product() +{ + context ctx; + SCOPE(exit) + { + ctx.finalize(); + }; + + // Define the combination of partitioners as a product of partitioners + auto p = ::std::tuple(); + + auto lA = ctx.logical_data(shape_of>(1280)); + + /* This creates a spec with 2 levels, and applies a partitionner defined as + * the composition of blocked() in the first level, and cyclic() in the second + * level */ + auto spec = par<8>(par<16>()); + + ctx.launch(spec, exec_place::current_device(), lA.write())->*[=] _CCCL_DEVICE(auto th, slice A) { + for (auto i : th.apply_partition(shape(A), p)) + { + A(i) = 2 * i + 7; + } + }; + + ctx.host_launch(lA.read())->*[](auto A) { + for (size_t i = 0; i < 1280; i++) + { + EXPECT(A(i) == 2 * i + 7); + } + }; +} + +UNITTEST("unit_test_partitioner_product") +{ + unit_test_partitioner_product(); +}; + +} // namespace reserved +# endif // __CUDACC__ + +UNITTEST("make_tuple_indexwise") +{ + auto t1 = make_tuple_indexwise<3>([&](auto i) { + if constexpr (i == 2) + { + return ::std::ignore; + } + else + { + return int(i); + } + }); + static_assert(::std::is_same_v>); + EXPECT(t1 == ::std::tuple(0, 1)); + + auto t2 = make_tuple_indexwise<3>([&](auto i) { + if constexpr (i == 1) + { + return ::std::ignore; + } + else + { + return int(i); + } + }); + static_assert(::std::is_same_v>); + EXPECT(t2 == ::std::tuple(0, 2)); +}; + +UNITTEST("auto_dump set/get") +{ + context ctx; + + int A[1024]; + int B[1024]; + auto lA = ctx.logical_data(A); + auto lB = ctx.logical_data(B); + + // Disable auto dump + lA.set_auto_dump(false); + EXPECT(lA.get_auto_dump() == false); + + // Enabled by default + EXPECT(lB.get_auto_dump() == true); +}; + +UNITTEST("cuda stream place") +{ + cudaStream_t user_stream; + 
cuda_safe_call(cudaStreamCreate(&user_stream)); + + context ctx; + + int A[1024]; + int B[1024]; + auto lA = ctx.logical_data(A); + auto lB = ctx.logical_data(B); + + // Make sure that a task using exec_place::cuda_stream(user_stream) does run with user_stream + ctx.task(exec_place::cuda_stream(user_stream), lA.write(), lB.write())->*[=](cudaStream_t stream, auto, auto) { + EXPECT(stream == user_stream); + }; + + ctx.finalize(); +}; + +UNITTEST("cuda stream place multi-gpu") +{ + cudaStream_t user_stream; + + // Create a CUDA stream in a different device (if available) + int ndevices = cuda_try(); + // use the last device + int target_dev_id = ndevices - 1; + + cuda_safe_call(cudaSetDevice(target_dev_id)); + cuda_safe_call(cudaStreamCreate(&user_stream)); + cuda_safe_call(cudaSetDevice(0)); + + context ctx; + + int A[1024]; + int B[1024]; + auto lA = ctx.logical_data(A); + auto lB = ctx.logical_data(B); + + // Make sure that a task using exec_place::cuda_stream(user_stream) does run with user_stream + ctx.task(exec_place::cuda_stream(user_stream), lA.write(), lB.write())->*[=](cudaStream_t stream, auto, auto) { + EXPECT(stream == user_stream); + EXPECT(target_dev_id == cuda_try()); + }; + + // Make sure we restored the device + EXPECT(0 == cuda_try()); + + ctx.finalize(); +}; + +#endif // UNITTESTED_FILE + +/** + * @brief Algorithms are a mechanism to implement reusable task sequences implemented by the means of CUDA graphs nested + * within a task. + * + * The underlying CUDA graphs are cached so that they are temptatively reused + * when the algorithm is run again. Nested algorithms are internally implemented as child graphs. + */ +class algorithm +{ +private: + template + class runner_impl + { + public: + runner_impl(context_t& _ctx, algorithm& _alg, task_dep... _deps) + : alg(_alg) + , ctx(_ctx) + , deps(::std::make_tuple(mv(_deps)...)) {}; + + template + void operator->*(Fun&& fun) + { + // We cannot use ::std::apply with a lambda function here instead + // because this would use extended lambda functions within a lambda + // function which is prohibited + call_with_tuple_impl(::std::forward(fun), ::std::index_sequence_for{}); + } + + private: + // Helper function to call fun with context and unpacked tuple arguments + template + void call_with_tuple_impl(Fun&& fun, ::std::index_sequence) + { + // We may simply execute the algorithm within the existing context + // if we do not want to generate sub-graphs (eg. to analyze the + // advantage of using such algorithms) + if (getenv("CUDASTF_ALGORITHM_INLINE")) + { + alg.run_inline(::std::forward(fun), ctx, ::std::get(deps)...); + } + else + { + alg.run_as_task(::std::forward(fun), ctx, ::std::get(deps)...); + } + } + + algorithm& alg; + context_t& ctx; + ::std::tuple...> deps; + }; + +public: + algorithm(::std::string _symbol = "algorithm") + : symbol(mv(_symbol)) + {} + + /* Inject the execution of the algorithm within a CUDA graph */ + template + void run_in_graph(Fun fun, parent_ctx_t& parent_ctx, cudaGraph_t graph, Args... 
args) + { + auto argsTuple = ::std::make_tuple(args...); + ::cuda::experimental::stf::hash hasher; + size_t hashValue = hasher(argsTuple); + + ::std::shared_ptr inner_graph; + + if (auto search = graph_cache.find(hashValue); search != graph_cache.end()) + { + inner_graph = search->second; + } + else + { + graph_ctx gctx(parent_ctx.async_resources()); + + // Useful for tools + gctx.set_parent_ctx(parent_ctx); + gctx.get_dot()->set_ctx_symbol("algo: " + symbol); + + auto current_place = gctx.default_exec_place(); + + // Transform an instance into a new logical data + auto logify = [&gctx, ¤t_place](auto x) { + // Our infrastructure currently does not like to work with + // constant types for the data interface so we pretend this is + // a modifiable data if necessary + return gctx.logical_data(rw_type_of(x), current_place.affine_data_place()); + }; + + // Transform the tuple of instances into a tuple of logical data + auto logicalArgsTuple = ::std::apply( + [&](auto&&... args) { + return ::std::tuple(logify(::std::forward(args))...); + }, + argsTuple); + + // call fun(gctx, ...logical data...) + ::std::apply(fun, ::std::tuple_cat(::std::make_tuple(gctx), logicalArgsTuple)); + + inner_graph = gctx.finalize_as_graph(); + + // TODO validate that the graph is reusable before storing it ! + // fprintf(stderr, "CACHE graph...\n"); + graph_cache[hashValue] = inner_graph; + } + + cudaGraphNode_t c; + cuda_safe_call(cudaGraphAddChildGraphNode(&c, graph, nullptr, 0, *inner_graph)); + } + + /* This simply executes the algorithm within the existing context. This + * makes it possible to observe the impact of an algorithm by disabling it in + * practice (without bloating the code with both the algorithm and the original + * code) */ + template + void run_inline(Fun fun, context_t& ctx, task_dep... deps) + { + ::std::apply(fun, + ::std::tuple_cat(::std::make_tuple(ctx), ::std::make_tuple(logical_data(deps.get_data())...))); + } + + /* Helper to run the algorithm in a stream_ctx */ + template + void run_as_task(Fun fun, stream_ctx& ctx, task_dep... deps) + { + ctx.task(deps...).set_symbol(symbol)->*[this, &fun, &ctx](cudaStream_t stream, Deps... args) { + this->run(fun, ctx, stream, args...); + }; + } + + /* Helper to run the algorithm in a graph_ctx */ + template + void run_as_task(Fun fun, graph_ctx& ctx, task_dep... deps) + { + ctx.task(deps...).set_symbol(symbol)->*[this, &fun, &ctx](cudaGraph_t g, Deps... args) { + this->run_in_graph(fun, ctx, g, args...); + }; + } + + /** + * @brief Executes `fun` within a task that takes a pack of dependencies + * + * As an alternative, the run_as_task_dynamic may take a variable number of dependencies + */ + template + void run_as_task(Fun fun, context& ctx, task_dep... deps) + { + ::std::visit( + [&](auto& actual_ctx) { + this->run_as_task(fun, actual_ctx, deps...); + }, + ctx.payload); + } + + /* Helper to run the algorithm in a stream_ctx */ + template + void run_as_task_dynamic(Fun fun, stream_ctx& ctx, const ::std::vector& deps) + { + auto t = ctx.task(); + for (auto& d : deps) + { + t.add_deps(d); + } + + t.set_symbol(symbol); + + t->*[this, &fun, &ctx, &t](cudaStream_t stream) { + this->run_dynamic(fun, ctx, stream, t); + }; + } + + /* Helper to run the algorithm in a graph_ctx */ + template + void run_as_task_dynamic(Fun /* fun */, graph_ctx& /* ctx */, const ::std::vector& /* deps */) + { + /// TODO + abort(); + } + + /** + * @brief Executes `fun` within a task that takes a vector of untyped dependencies. 
+ * + * This is an alternative for run_as_task which may take a variable number of dependencies + */ + template + void run_as_task_dynamic(Fun fun, context& ctx, const ::std::vector& deps) + { + ::std::visit( + [&](auto& actual_ctx) { + this->run_as_task_dynamic(fun, actual_ctx, deps); + }, + ctx.payload); + } + + /** + * @brief Helper to use algorithm using the ->* idiom instead of passing the implementation as an argument of + * run_as_task + * + * example: + * algorithm alg; + * alg.runner(ctx, lX.read(), lY.rw())->*[](context inner_ctx, logical_data> X, + * logical_data> Y) { inner_ctx.parallel_for(Y.shape(), X.rw(), Y.rw())->*[]__device__(size_t i, auto + * x, auto y) { y(i) = 2.0*x(i); + * }; + * }; + * + * + * Which is equivalent to: + * auto fn = [](context inner_ctx, logical_data> X, logical_data> Y) { + * inner_ctx.parallel_for(Y.shape(), X.rw(), Y.rw())->*[]__device__(size_t i, auto x, auto y) { + * y(i) = 2.0*x(i); + * } + * }; + * + * algorithm alg; + * alg.run_as_task(fn, ctx, lX.read(), lY.rw()); + */ + template + runner_impl runner(context_t& ctx, task_dep... deps) + { + return runner_impl(ctx, *this, mv(deps)...); + } + + /* Execute the algorithm as a CUDA graph and launch this graph in a CUDA + * stream */ + template + void run(Fun fun, parent_ctx_t& parent_ctx, cudaStream_t stream, Args... args) + { + auto argsTuple = ::std::make_tuple(args...); + graph_ctx gctx(parent_ctx.async_resources()); + + // Useful for tools + gctx.set_parent_ctx(parent_ctx); + gctx.get_dot()->set_ctx_symbol("algo: " + symbol); + + // This creates an adapter which "redirects" allocations to the CUDA stream API + auto wrapper = stream_adapter(gctx, stream); + + gctx.update_uncached_allocator(wrapper.allocator()); + + auto current_place = gctx.default_exec_place(); + + // Transform an instance into a new logical data + auto logify = [&gctx, ¤t_place](auto x) { + // Our infrastructure currently does not like to work with constant + // types for the data interface so we pretend this is a modifiable + // data if necessary + return gctx.logical_data(rw_type_of(x), current_place.affine_data_place()); + }; + + // Transform the tuple of instances into a tuple of logical data + auto logicalArgsTuple = ::std::apply( + [&](auto&&... args) { + return ::std::tuple(logify(::std::forward(args))...); + }, + argsTuple); + + // call fun(gctx, ...logical data...) + ::std::apply(fun, ::std::tuple_cat(::std::make_tuple(gctx), logicalArgsTuple)); + + ::std::shared_ptr gctx_graph = gctx.finalize_as_graph(); + + // Try to reuse existing exec graphs... 
+ ::std::shared_ptr eg = nullptr; + bool found = false; + for (::std::shared_ptr& pe : cached_exec_graphs[stream]) + { + found = graph_ctx::try_updating_executable_graph(*pe, *gctx_graph); + if (found) + { + eg = pe; + break; + } + } + + if (!found) + { + auto cudaGraphExecDeleter = [](cudaGraphExec_t* pGraphExec) { + cudaGraphExecDestroy(*pGraphExec); + }; + ::std::shared_ptr res(new cudaGraphExec_t, cudaGraphExecDeleter); + + dump_algorithm(gctx_graph); + + cuda_try(cudaGraphInstantiateWithFlags(res.get(), *gctx_graph, 0)); + + eg = res; + + cached_exec_graphs[stream].push_back(eg); + } + + cuda_safe_call(cudaGraphLaunch(*eg, stream)); + + // Free resources allocated through the adapter + wrapper.clear(); + } + + /* Contrary to `run`, we here have a dynamic set of dependencies for the + * task, so fun does not take a pack of data instances as a parameter */ + template + void run_dynamic(Fun fun, parent_ctx_t& parent_ctx, cudaStream_t stream, task_t& t) + { + graph_ctx gctx(parent_ctx.async_resources()); + + // Useful for tools + gctx.set_parent_ctx(parent_ctx); + gctx.get_dot()->set_ctx_symbol("algo: " + symbol); + + gctx.set_allocator(block_allocator(gctx)); + + auto current_place = gctx.default_exec_place(); + + fun(gctx, t); + + ::std::shared_ptr gctx_graph = gctx.finalize_as_graph(); + + // Try to reuse existing exec graphs... + ::std::shared_ptr eg = nullptr; + bool found = false; + for (::std::shared_ptr& pe : cached_exec_graphs[stream]) + { + found = graph_ctx::try_updating_executable_graph(*pe, *gctx_graph); + if (found) + { + eg = pe; + break; + } + } + + if (!found) + { + auto cudaGraphExecDeleter = [](cudaGraphExec_t* pGraphExec) { + cudaGraphExecDestroy(*pGraphExec); + }; + ::std::shared_ptr res(new cudaGraphExec_t, cudaGraphExecDeleter); + + dump_algorithm(gctx_graph); + + cuda_try(cudaGraphInstantiateWithFlags(res.get(), *gctx_graph, 0)); + + eg = res; + + cached_exec_graphs[stream].push_back(eg); + } + + cuda_safe_call(cudaGraphLaunch(*eg, stream)); + } + +private: + // Generate a DOT output of a CUDA graph using CUDA + void dump_algorithm(const ::std::shared_ptr& gctx_graph) + { + if (getenv("CUDASTF_DUMP_ALGORITHMS")) + { + static int print_to_dot_cnt = 0; // Warning: not thread-safe + ::std::string filename = "algo_" + symbol + "_" + ::std::to_string(print_to_dot_cnt++) + ".dot"; + cudaGraphDebugDotPrint(*gctx_graph, filename.c_str(), cudaGraphDebugDotFlags(0)); + } + } + + ::std::map>> cached_exec_graphs; + + // Cache executable graphs + ::std::unordered_map> exec_graph_cache; + + // Cache CUDA graphs + ::std::unordered_map> graph_cache; + + ::std::string symbol; +}; + +template +class for_each_batched +{ +public: + for_each_batched( + context ctx, size_t cnt, size_t batch_size, ::std::function<::std::tuple...>(size_t)> df) + : cnt(cnt) + , batch_size(batch_size) + , df(mv(df)) + , ctx(ctx) + {} + + // Create a batch operation that computes fun(start), fun(start+1), ... f(end-1) + template + void batched_iterations(Fun&& fun, size_t start, size_t end) + { + // Create "untyped" dependencies + ::std::vector deps; + for (size_t i = start; i < end; i++) + { + ::std::apply( + [&deps](auto&&... args) { + // Call the method on each tuple element + (deps.push_back(args), ...); + }, + df(i)); + } + + // templated by Fun + static algorithm batch_alg; + + auto fn = [this, start, end, &fun](context gctx, stream_task<> t) { + // How many logical data per iteration ? 
+ constexpr size_t data_per_iteration = ::std::tuple_size::value; + (void) data_per_iteration; + + auto logify = [](auto& dest_ctx, auto x) { + return dest_ctx.logical_data(rw_type_of(x), exec_place::current_device().affine_data_place()); + }; + + for (size_t i = start; i < end; i++) + { + // Compute a tuple of all instances (e.g. tuple, slice>) + + // Transform the tuple by applying a lambda to each element + auto instance_tuple = + tuple_transform(df(i), [&t, i, start, data_per_iteration](auto&& item, std::size_t arg_ind) { + // Get the arg_ind-th element of the i-th batch. + // Its type is the same as the arg_ind-th entry of + // df(i) + // + // For example : if df(i) is tuple(lX.read(), + // lY.rw()), the second entry of the batch has the + // same type as the lY interface + using arg_type = typename ::std::decay_t::data_t; + return t.template get((i - start) * data_per_iteration + arg_ind); + }); + + // Logify all these instances (create temporary aliases) + // Returns eg. a tuple>, logical_data>> + auto logified_instances_tuple = ::std::apply( + [&logify, &gctx](auto&&... args) { + return ::std::make_tuple(logify(gctx, args)...); + }, + instance_tuple); + + ::std::apply(fun, ::std::tuple_cat(::std::make_tuple(context(gctx), i), logified_instances_tuple)); + } + }; + + // Launch the fn method as a task which takes an untyped vector of dependencies + batch_alg.run_as_task_dynamic(fn, ctx, deps); + } + + template + void operator->*(Fun&& fun) + { + // Process in batches + for (size_t start = 0; start < cnt; start += batch_size) + { + size_t end = ::std::min(start + batch_size, cnt); + batched_iterations(fun, start, end); + } + } + +private: + // Helper function to apply a lambda to each element of the tuple with its index + template + auto tuple_transform_impl(Tuple&& t, F&& f, ::std::index_sequence) + { + // Apply the lambda 'f' to each element and its index + return ::std::make_tuple(f(::std::get(t), Is)...); + } + + // function to transform the tuple with a lambda + template + auto tuple_transform(Tuple&& t, F&& f) + { + constexpr size_t N = ::std::tuple_size>::value; + return tuple_transform_impl(::std::forward(t), ::std::forward(f), ::std::make_index_sequence{}); + } + + size_t cnt; + size_t batch_size; + ::std::function<::std::tuple...>(size_t)> df; + context ctx; +}; + +} // namespace cuda::experimental::stf diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index 3a220da4eda..617ff9a486c 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -1,5 +1,7 @@ cccl_get_catch2() +find_package(cudax) # already found, bring in version info. 
+ find_package(Thrust ${cudax_VERSION} EXACT CONFIG NO_DEFAULT_PATH # Only check the explicit path in HINTS: HINTS "${CCCL_SOURCE_DIR}/lib/cmake/thrust/" @@ -120,5 +122,10 @@ foreach(cn_target IN LISTS cudax_TARGETS) cudax_add_catch2_test(test_target green_context ${cn_target} green_context/green_ctx_smoke.cu ) - endforeach() + +# FIXME: Enable MSVC +if (NOT "MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") + # STF tests are handled separately: + add_subdirectory(stf) +endif() diff --git a/cudax/test/memory_resource/any_async_resource.cu b/cudax/test/memory_resource/any_async_resource.cu index 8e1c7e9dfb0..f032ac3f6b8 100644 --- a/cudax/test/memory_resource/any_async_resource.cu +++ b/cudax/test/memory_resource/any_async_resource.cu @@ -10,7 +10,7 @@ #include -#include "test_resource.h" +#include "test_resource.cuh" #include #include diff --git a/cudax/test/memory_resource/any_resource.cu b/cudax/test/memory_resource/any_resource.cu index d13459832b7..213dee61d93 100644 --- a/cudax/test/memory_resource/any_resource.cu +++ b/cudax/test/memory_resource/any_resource.cu @@ -10,7 +10,7 @@ #include -#include "test_resource.h" +#include "test_resource.cuh" #include #include diff --git a/cudax/test/memory_resource/shared_resource.cu b/cudax/test/memory_resource/shared_resource.cu index 365579de4f8..ae292e337c9 100644 --- a/cudax/test/memory_resource/shared_resource.cu +++ b/cudax/test/memory_resource/shared_resource.cu @@ -11,7 +11,7 @@ #include #include -#include "test_resource.h" +#include "test_resource.cuh" #include #include diff --git a/cudax/test/memory_resource/test_resource.h b/cudax/test/memory_resource/test_resource.cuh similarity index 100% rename from cudax/test/memory_resource/test_resource.h rename to cudax/test/memory_resource/test_resource.cuh diff --git a/cudax/test/stf/CMakeLists.txt b/cudax/test/stf/CMakeLists.txt new file mode 100644 index 00000000000..0b238da03d3 --- /dev/null +++ b/cudax/test/stf/CMakeLists.txt @@ -0,0 +1,282 @@ +set(stf_test_sources + algorithm/graph_algorithms.cu + algorithm/in_graph_ctx.cu + algorithm/nested.cu + algorithm/algorithm_with_read.cu + allocators/adapter.cu + allocators/buddy_allocator.cu + allocators/cap_tmp_buffers.cu + cpp/read_const.cu + cpp/redundant_data.cu + cpp/redundant_data_different_modes.cu + cpp/reuse_computation.cu + cpp/reuse_computation_2.cu + cpp/scoped_graph_task.cu + cpp/user_streams.cu + cpp/concurrency_test.cu + dot/basic.cu + dot/with_events.cu + dot/graph_print_to_dot.cu + tools/auto_dump/auto_dump.cu + error_checks/ctx_mismatch.cu + error_checks/data_interface_mismatch.cu + error_checks/double_finalize.cu + error_checks/erase_frozen.cu + error_checks/misformed_tasks_dbl_end.cu + error_checks/misformed_tasks_dbl_start.cu + error_checks/non_managed_data.cu + error_checks/slice_check_bounds.cu + error_checks/uninitialized_data.cu + error_checks/unsatisfiable_spec.cu + error_checks/write_frozen.cu + examples/01-axpy-launch-ranges-cg.cu + examples/01-axpy-places.cu + examples/05-stencil.cu + examples/05-stencil-no-copy.cu + examples/05-stencil-places.cu + examples/05-stencil2d-places.cu + examples/09-nbody.cu + examples/09-nbody-algorithm.cu + examples/09-nbody-blocked.cu + fhe/parse_arctyrex.cu + freeze/constant_logical_data.cu + freeze/freeze.cu + freeze/freeze_rw.cu + freeze/task_fence.cu + graph/concurrency_test.cu + graph/epoch.cu + graph/for_each_batched.cu + graph/for_each_batched_write.cu + graph/freeze_for_graph.cu + graph/graph_ctx_low_level.cu + graph/graph_composition.cu + graph/many.cu + graph/multiple_graph_ctx.cu + 
graph/static_graph_ctx.cu + graph/graph_tmp_data.cu + green_context/axpy_gc.cu + green_context/cuda_graph.cu + green_context/gc_grid.cu + gnu/include_only.cpp + hash/logical_data.cu + hash/ctx_hash.cu + hashtable/fusion.cu + hashtable/fusion_reduction.cu + hashtable/test.cu + hashtable/parallel_for.cu + hashtable/parallel_for_shape.cu + interface/mix_stream_and_graph.cu + interface/mix_stream_and_graph_2.cu + interface/data_from_device.cu + interface/data_from_device_2.cu + interface/data_from_device_wb.cu + interface/data_from_device_async.cu + interface/graph_use_device_data.cu + interface/stream_add_callback.cu + interface/move_operator.cu + interface/scal.cu + interface/scalar_div.cu + local_stf/interop_cuda.cu + local_stf/legacy_to_stf.cu + loop_dispatch/dispatch_on_streams.cu + loop_dispatch/nested_loop_dispatch.cu + loop_dispatch/loop_dispatch.cu + parallel_for/parallel_for_repeat.cu + parallel_for/test_parallel_for.cu + parallel_for/test2_parallel_for_context.cu + parallel_for/parallel_for_box.cu + parallel_for/parallel_for_all_devs.cu + parallel_for/fdtd.cu + parallel_for/tiled_loops.cu + places/cuda_stream_place.cu + places/managed.cu + places/managed_from_shape.cu + places/managed_from_user.cu + places/non_current_device.cu + places/place_partition.cu + places/recursion.cu + reclaiming/graph.cu + reclaiming/graph_2.cu + reclaiming/graph_real_oom.cu + reclaiming/stream.cu + reductions/many_inc.cu + reductions/redux_test.cu + reductions/redux_test2.cu + reductions/slice2d_reduction.cu + reductions/slice_custom_op.cu + reductions/successive_reductions.cu + reductions/successive_reductions_pfor.cu + reductions/sum.cu + reductions/sum_array.cu + reductions/sum_multiple_places.cu + reductions/sum_multiple_places_no_refvalue.cu + reductions/write_back_after_redux.cu + slice/pinning.cu + stencil/stencil-1D.cu + stress/empty_tasks.cu + stress/empty_tasks_alloc.cu + stress/kernel_chain.cu + stress/kernel_chain_fused.cu + stress/launch_overhead.cu + stress/launch_vs_parallelfor.cu + stress/many_read.cu + stress/parallel_for_overhead.cu + stress/task_bench.cu + # threads/axpy-threads.cu + threads/axpy-threads-2.cu + # threads/axpy-threads-pfor.cu # Currently has a difficult-to-reproduce concurrency problem + utility/timing_with_fences.cu +) + +# Examples using CUBLAS, CUSOLVER... 
+set(stf_test_mathlib_sources + cuda-samples/0_Introduction/vectorAdd/vectorAdd_cudastf.cu + # Reduce compilation time by not adding this (useless) example + # cuda-samples/0_Introduction/vectorAdd/vectorAdd + cuda-samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.cu + cuda-samples/3_CUDA_Features/jacobiCudaGraphs/jacobi_cudastf.cu + cuda-samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_custf.cu + cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarloMultiGPU.cu + examples/07-cholesky-redux.cu + examples/07-cholesky-unified.cu + gnu/06-pdgemm.cpp + gnu/07-cholesky.cpp +) + +set(stf_unittested_headers + cuda/experimental/stf.cuh + cuda/experimental/__stf/allocators/buddy_allocator.cuh + cuda/experimental/__stf/graph/graph_ctx.cuh + cuda/experimental/__stf/internal/async_resources_handle.cuh + cuda/experimental/__stf/internal/execution_policy.cuh + cuda/experimental/__stf/internal/interpreted_execution_policy.cuh + cuda/experimental/__stf/internal/logical_data.cuh + cuda/experimental/__stf/internal/parallel_for_scope.cuh + cuda/experimental/__stf/internal/slice.cuh + cuda/experimental/__stf/internal/thread_hierarchy.cuh + cuda/experimental/__stf/places/cyclic_shape.cuh + cuda/experimental/__stf/places/inner_shape.cuh + cuda/experimental/__stf/places/places.cuh + cuda/experimental/__stf/places/tiled_partition.cuh + cuda/experimental/__stf/stream/stream_ctx.cuh + cuda/experimental/__stf/utility/cartesian_iterator.cuh + cuda/experimental/__stf/utility/cuda_safe_call.cuh + cuda/experimental/__stf/utility/dimensions.cuh + cuda/experimental/__stf/utility/handle.cuh + cuda/experimental/__stf/utility/hash.cuh + cuda/experimental/__stf/utility/memory.cuh + cuda/experimental/__stf/utility/scope_guard.cuh + cuda/experimental/__stf/utility/stopwatch.cuh + cuda/experimental/__stf/utility/unittest.cuh + cuda/experimental/__stf/utility/unstable_unique.cuh +) + +find_package(CUDAToolkit REQUIRED) + +## cudax_add_stf_test +# +# Add an stf test executable and register it with ctest. +# +# target_name_var: Variable name to overwrite with the name of the test +# target. Useful for post-processing target information. +# source: The source file for the test. +# cn_target: The reference cudax target with configuration information. +# Additional args are passed to cudax_stf_configure_target. +function(cudax_add_stf_test target_name_var source cn_target) + cudax_get_target_property(config_dialect ${cn_target} DIALECT) + cudax_get_target_property(config_prefix ${cn_target} PREFIX) + + get_filename_component(dir ${source} DIRECTORY) + get_filename_component(filename ${source} NAME_WE) + if (dir) + set(filename "${dir}/${filename}") + endif() + string(REPLACE "/" "." test_name "${filename}") + + set(test_target ${config_prefix}.test.stf.${test_name}) + + add_executable(${test_target} ${source}) + cccl_configure_target(${test_target} DIALECT ${config_dialect}) + cudax_clone_target_properties(${test_target} ${cn_target}) + cudax_stf_configure_target(${test_target} ${ARGN}) + + set(stf_meta_target ${config_prefix}.tests.stf) + add_dependencies(${stf_meta_target} ${test_target}) + + add_test(NAME ${test_target} COMMAND "$") + + set(${target_name_var} ${test_target} PARENT_SCOPE) +endfunction() + +## cudax_add_stf_unittest_header +# +# Add an stf unittested header executable and register it with ctest. +# +# Unittested headers contain a set of tests that are enabled by including +# `unittest.cuh` and defining `UNITTESTED_FILE`. 
+# +# target_name_var: Variable name to overwrite with the name of the test +# target. Useful for post-processing target information. +# source: The source file for the test. +# cn_target: The reference cudax target with configuration information. +# Additional args are passed to cudax_stf_configure_target. +function(cudax_add_stf_unittest_header target_name_var source cn_target) + cudax_get_target_property(config_dialect ${cn_target} DIALECT) + cudax_get_target_property(config_prefix ${cn_target} PREFIX) + + get_filename_component(relative_path ${source} DIRECTORY) + get_filename_component(filename ${source} NAME_WE) + + string(REPLACE "cuda/experimental/" "" test_label "${relative_path}/${filename}") + string(REPLACE "/" "." test_label "${test_label}") + set(test_target "${config_prefix}.test.stf.unittest_headers.${test_label}") + + set(ut_template "${cudax_SOURCE_DIR}/cmake/stf_header_unittest.in.cu") + set(ut_source "${cudax_BINARY_DIR}/unittest_headers/${test_target}.cu") + configure_file(${ut_template} ${ut_source} @ONLY) + + add_executable(${test_target} ${ut_source}) + cccl_configure_target(${test_target} DIALECT ${config_dialect}) + cudax_clone_target_properties(${test_target} ${cn_target}) + cudax_stf_configure_target(${test_target} ${ARGN}) + + set(stf_unittest_headers_meta_target ${config_prefix}.tests.stf.unittest_headers) + add_dependencies(${stf_unittest_headers_meta_target} ${test_target}) + + add_test(NAME ${test_target} COMMAND ${test_target}) + + set(${target_name_var} ${test_target} PARENT_SCOPE) +endfunction() + +# Create tests for each enabled configuration: +foreach(cn_target IN LISTS cudax_TARGETS) + cudax_get_target_property(config_prefix ${cn_target} PREFIX) + + # Metatargets for the current configuration's tests: + set(config_meta_target ${config_prefix}.tests) + set(stf_test_meta_target ${config_prefix}.tests.stf) + add_custom_target(${stf_test_meta_target}) + add_dependencies(${config_meta_target} ${stf_test_meta_target}) + set(stf_unittest_headers_meta_target ${config_prefix}.tests.stf.unittest_headers) + add_custom_target(${stf_unittest_headers_meta_target}) + add_dependencies(${stf_test_meta_target} ${stf_unittest_headers_meta_target}) + + # Basic tests: + foreach(source IN LISTS stf_test_sources) + cudax_add_stf_test(test_target "${source}" ${cn_target}) + endforeach() + + # Tests with mathlib deps: + if (cudax_ENABLE_CUDASTF_MATHLIBS) + foreach(source IN LISTS stf_test_mathlib_sources) + cudax_add_stf_test(test_target "${source}" ${cn_target} LINK_MATHLIBS) + endforeach() + endif() + + # Unittested headers + foreach(source IN LISTS stf_unittested_headers) + cudax_add_stf_unittest_header(test_target "${source}" ${cn_target}) + endforeach() +endforeach() + +add_subdirectory(static_error_checks) diff --git a/cudax/test/stf/algorithm/algorithm_with_read.cu b/cudax/test/stf/algorithm/algorithm_with_read.cu new file mode 100644 index 00000000000..75210d4ab0c --- /dev/null +++ b/cudax/test/stf/algorithm/algorithm_with_read.cu @@ -0,0 +1,58 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +template +void init(context& ctx, logical_data l, int val) +{ + ctx.parallel_for(l.shape(), l.write())->*[=] __device__(size_t i, auto s) { + s(i) = val; + }; +} + +int main() +{ + context ctx; + + auto a = ctx.logical_data(10000000); + auto b = ctx.logical_data(10000000); + auto c = ctx.logical_data(10000000); + auto d = ctx.logical_data(10000000); + + init(ctx, a, 12); + init(ctx, b, 35); + init(ctx, c, 42); + init(ctx, d, 17); + + /* a += 1; a += b; */ + auto fn = [](context ctx, logical_data> a, logical_data> b) { + ctx.parallel_for(a.shape(), a.rw())->*[] __device__(size_t i, auto sa) { + sa(i) += 1; + }; + ctx.parallel_for(a.shape(), a.rw(), b.read())->*[] __device__(size_t i, auto sa, auto sb) { + sa(i) += sb(i); + }; + }; + + algorithm alg; + + for (size_t i = 0; i < 100; i++) + { + alg.run_as_task(fn, ctx, a.rw(), b.read()); + alg.run_as_task(fn, ctx, a.rw(), c.read()); + alg.run_as_task(fn, ctx, c.rw(), d.read()); + alg.run_as_task(fn, ctx, d.rw(), a.read()); + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/algorithm/graph_algorithms.cu b/cudax/test/stf/algorithm/graph_algorithms.cu new file mode 100644 index 00000000000..6b13ff05387 --- /dev/null +++ b/cudax/test/stf/algorithm/graph_algorithms.cu @@ -0,0 +1,108 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +template +void lib_call(ctx_t& ctx, logical_data a, logical_data b) +{ + nvtx_range r("lib_call"); + // b *= 2 + // a = a + b + // b = a + ctx.parallel_for(b.shape(), b.rw())->*[] __device__(size_t i, auto sb) { + sb(i) *= 2; + }; + + ctx.parallel_for(a.shape(), a.rw(), b.read())->*[] __device__(size_t i, auto sa, auto sb) { + sa(i) += sb(i); + }; + + ctx.parallel_for(a.shape(), a.read(), b.write())->*[] __device__(size_t i, auto sa, auto sb) { + sb(i) = sa(i); + }; +} + +template +void init(context& ctx, logical_data l, int val) +{ + ctx.parallel_for(l.shape(), l.write())->*[=] __device__(size_t i, auto s) { + s(i) = val; + }; +} + +int main() +{ + context ctx; + + auto a = ctx.logical_data(size_t(10000000)); + auto b = ctx.logical_data(size_t(10000000)); + auto c = ctx.logical_data(size_t(10000000)); + auto d = ctx.logical_data(size_t(10000000)); + + init(ctx, a, 12); + init(ctx, b, 35); + init(ctx, c, 42); + init(ctx, d, 42); + + auto fn = [](context ctx, logical_data> a, logical_data> b) { + ctx.parallel_for(b.shape(), b.rw())->*[] __device__(size_t i, auto sb) { + sb(i) *= 2; + }; + ctx.parallel_for(a.shape(), a.rw(), b.read())->*[] __device__(size_t i, auto sa, auto sb) { + sa(i) += sb(i); + }; + ctx.parallel_for(a.shape(), a.read(), b.write())->*[] __device__(size_t i, auto sa, auto sb) { + sb(i) = sa(i); + }; + }; + + algorithm alg; + + { + nvtx_range r("run"); + for (size_t i = 0; i < 100; i++) + { + ctx.task(a.rw(), b.rw())->*[&alg, &fn, &ctx](cudaStream_t stream, slice sa, slice sb) { + alg.run(fn, ctx, stream, sa, sb); + }; + + ctx.task(b.rw(), c.rw())->*[&alg, &fn, &ctx](cudaStream_t stream, slice sb, slice sc) { + alg.run(fn, ctx, stream, sb, sc); + }; + + ctx.task(c.rw(), d.rw())->*[&alg, &fn, &ctx](cudaStream_t stream, slice sc, slice sd) { + alg.run(fn, ctx, stream, sc, sd); + }; + + ctx.task(d.rw(), a.rw())->*[&alg, &fn, &ctx](cudaStream_t stream, slice sd, slice sa) { + alg.run(fn, ctx, stream, sd, sa); + }; + } + } + + { + nvtx_range r("run_as_task"); + for (size_t i = 0; i < 100; i++) + { + alg.run_as_task(fn, ctx, a.rw(), b.rw()); + + alg.run_as_task(fn, ctx, a.rw(), c.rw()); + + alg.run_as_task(fn, ctx, c.rw(), d.rw()); + + alg.run_as_task(fn, ctx, d.rw(), a.rw()); + } + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/algorithm/in_graph_ctx.cu b/cudax/test/stf/algorithm/in_graph_ctx.cu new file mode 100644 index 00000000000..b810a9e3875 --- /dev/null +++ b/cudax/test/stf/algorithm/in_graph_ctx.cu @@ -0,0 +1,68 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +template +void init(context_t& ctx, logical_data l, int val) +{ + ctx.parallel_for(l.shape(), l.write())->*[=] __device__(size_t i, auto s) { + s(i) = val; + }; +} + +int main() +{ + // context ctx = graph_ctx(); + graph_ctx ctx; + + auto a = ctx.logical_data(size_t(1000000)); + auto b = ctx.logical_data(size_t(1000000)); + auto c = ctx.logical_data(size_t(1000000)); + auto d = ctx.logical_data(size_t(1000000)); + + init(ctx, a, 12); + init(ctx, b, 35); + init(ctx, c, 42); + init(ctx, d, 42); + + auto fn = [](context ctx, logical_data> a, logical_data> b) { + ctx.parallel_for(a.shape(), a.rw())->*[] __device__(size_t i, auto sa) { + sa(i) *= 3; + }; + ctx.parallel_for(b.shape(), b.rw())->*[] __device__(size_t i, auto sb) { + sb(i) *= 2; + }; + + ctx.parallel_for(a.shape(), a.rw(), b.read())->*[] __device__(size_t i, auto sa, auto sb) { + sa(i) += sb(i); + }; + + ctx.parallel_for(a.shape(), a.read(), b.write())->*[] __device__(size_t i, auto sa, auto sb) { + sb(i) = sa(i); + }; + }; + + algorithm alg; + + for (size_t i = 0; i < 5; i++) + { + alg.run_as_task(fn, ctx, a.rw(), b.rw()); + alg.run_as_task(fn, ctx, a.rw(), c.rw()); + alg.run_as_task(fn, ctx, c.rw(), d.rw()); + alg.run_as_task(fn, ctx, d.rw(), a.rw()); + } + + ctx.finalize(); + + // cudaGraphDebugDotPrint(ctx.get_graph(), "pif.dot", 0); +} diff --git a/cudax/test/stf/algorithm/nested.cu b/cudax/test/stf/algorithm/nested.cu new file mode 100644 index 00000000000..04e0291e8bd --- /dev/null +++ b/cudax/test/stf/algorithm/nested.cu @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +template +void init(context& ctx, logical_data l, int val) +{ + ctx.parallel_for(l.shape(), l.write())->*[=] __device__(size_t i, auto s) { + s(i) = val; + }; +} + +int main() +{ + context ctx; + + auto a = ctx.logical_data(size_t(1000000)); + auto b = ctx.logical_data(size_t(1000000)); + auto c = ctx.logical_data(size_t(1000000)); + auto d = ctx.logical_data(size_t(1000000)); + + init(ctx, a, 12); + init(ctx, b, 35); + init(ctx, c, 42); + init(ctx, d, 17); + + auto fn1 = [](context ctx, logical_data> a) { + ctx.parallel_for(a.shape(), a.rw())->*[] __device__(size_t i, auto sa) { + sa(i) += 1; + }; + }; + + algorithm alg1; + + auto fn2 = [&alg1, &fn1](context ctx, logical_data> a, logical_data> b) { + alg1.run_as_task(fn1, ctx, a.rw()); + alg1.run_as_task(fn1, ctx, b.rw()); + ctx.parallel_for(a.shape(), a.rw(), b.read())->*[] __device__(size_t i, auto sa, auto sb) { + sa(i) += sb(i); + }; + ctx.parallel_for(a.shape(), a.read(), b.write())->*[] __device__(size_t i, auto sa, auto sb) { + sb(i) = sa(i); + }; + }; + + algorithm alg2; + + for (size_t i = 0; i < 100; i++) + { + alg2.run_as_task(fn2, ctx, a.rw(), b.rw()); + alg2.run_as_task(fn2, ctx, a.rw(), c.rw()); + alg2.run_as_task(fn2, ctx, c.rw(), d.rw()); + alg2.run_as_task(fn2, ctx, d.rw(), a.rw()); + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/allocators/adapter.cu b/cudax/test/stf/allocators/adapter.cu new file mode 100644 index 00000000000..e205883523d --- /dev/null +++ b/cudax/test/stf/allocators/adapter.cu @@ -0,0 +1,56 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +int main() +{ + double* d_ptrA; + const size_t N = 128 * 1024; + const size_t NITER = 10; + + // User allocated memory + cuda_safe_call(cudaMalloc(&d_ptrA, N * sizeof(double))); + + async_resources_handle handle; + + cudaStream_t stream; + cuda_safe_call(cudaStreamCreate(&stream)); + + for (size_t i = 0; i < NITER; i++) + { + graph_ctx ctx(stream, handle); + + auto wrapper = stream_adapter(ctx, stream); + ctx.set_allocator(block_allocator(ctx, wrapper.allocator())); + + auto A = ctx.logical_data(make_slice(d_ptrA, N), data_place::current_device()); + + for (size_t k = 0; k < 4; k++) + { + auto tmp = ctx.logical_data(A.shape()); + auto tmp2 = ctx.logical_data(A.shape()); + // Test device and managed memory + ctx.parallel_for(A.shape(), A.read(), tmp.write(), tmp2.write(data_place::managed)) + ->*[] __device__(size_t i, auto a, auto tmp, auto tmp2) { + tmp(i) = a(i); + tmp2(i) = a(i); + }; + } + + ctx.finalize(); + + wrapper.clear(); + } + cuda_safe_call(cudaStreamSynchronize(stream)); +} diff --git a/cudax/test/stf/allocators/buddy_allocator.cu b/cudax/test/stf/allocators/buddy_allocator.cu new file mode 100644 index 00000000000..5fc46920bc6 --- /dev/null +++ b/cudax/test/stf/allocators/buddy_allocator.cu @@ -0,0 +1,43 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +/** + * @brief Ensure the buddy allocation is working properly on the different backends + */ + +using namespace cuda::experimental::stf; + +template +void test_buddy() +{ + ctx_t ctx; + ctx.set_allocator(block_allocator(ctx)); + + std::vector>> data; + for (size_t i = 0; i < 10; i++) + { + size_t s = (1 + i % 8) * 1024ULL * 1024ULL; + auto l = ctx.logical_data(shape_of>(s)); + data.push_back(l); + + ctx.task(l.write())->*[](cudaStream_t, auto) {}; + } + + ctx.finalize(); +} + +int main(int, char**) +{ + test_buddy(); + test_buddy(); + test_buddy(); +} diff --git a/cudax/test/stf/allocators/cap_tmp_buffers.cu b/cudax/test/stf/allocators/cap_tmp_buffers.cu new file mode 100644 index 00000000000..11839527d99 --- /dev/null +++ b/cudax/test/stf/allocators/cap_tmp_buffers.cu @@ -0,0 +1,56 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +int main(int, char**) +{ + context ctx; + + const size_t PART_SIZE = 1024; + const size_t PART_CNT = 64; + + pooled_allocator_config config; + config.max_entries_per_place = 8; + auto fixed_alloc = block_allocator(ctx, config); + + /* Create a large device buffer which will be used part by part. 
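+ * Each part is exposed as its own logical data, and the per-part temporary
+ * is bound to the capped pooled allocator configured above
+ * (max_entries_per_place = 8), so only a bounded number of scratch buffers
+ * is kept per data place.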
*/ + double* dA; + cuda_safe_call(cudaMalloc(&dA, PART_SIZE * PART_CNT * sizeof(double))); + + for (size_t p = 0; p < PART_CNT; p++) + { + /* Create a logical data from a subset of the existing device buffer */ + auto Ap = ctx.logical_data(make_slice(&dA[p * PART_SIZE], PART_SIZE), data_place::current_device()); + + ctx.parallel_for(Ap.shape(), Ap.write()).set_symbol("init_Ap")->*[p, PART_SIZE] __device__(size_t i, auto ap) { + ap(i) = 1.0 * (i + p * PART_SIZE); + }; + + auto tmp = ctx.logical_data(Ap.shape()); + tmp.set_allocator(fixed_alloc); + + ctx.parallel_for(Ap.shape(), Ap.read(), tmp.write()).set_symbol("set_tmp")->* + [] __device__(size_t i, auto ap, auto tmp) { + tmp(i) = 2.0 * ap(i); + }; + + ctx.parallel_for(Ap.shape(), Ap.write(), tmp.read()).set_symbol("update_Ap") + ->*[] __device__(size_t i, auto ap, auto tmp) { + ap(i) = tmp(i); + }; + } + + ctx.finalize(); + + cuda_safe_call(cudaFree(dA)); +} diff --git a/cudax/test/stf/cpp/concurrency_test.cu b/cudax/test/stf/cpp/concurrency_test.cu new file mode 100644 index 00000000000..0ba4a6b875d --- /dev/null +++ b/cudax/test/stf/cpp/concurrency_test.cu @@ -0,0 +1,98 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +/* + * The goal of this test is to ensure that using read access modes actually + * results in concurrent tasks + */ + +using namespace cuda::experimental::stf; + +/** + * @brief Call `__nanosleep` (potentially repeatedly) to sleep `nanoseconds` nanoseconds. Supports sleep times longer + * than 4 billion nanoseconds (i.e. 4 seconds). 
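+ * On architectures without __nanosleep (below sm_70) the sleep is emulated
+ * with a clock()-based busy wait, so the requested duration is only
+ * approximate there.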
+ * + * @param nanoseconds how many nanoseconds to sleep + * @return void + */ +__global__ void nano_sleep(unsigned long long nanoseconds) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700) + static constexpr auto m = std::numeric_limits::max(); + for (;;) + { + if (nanoseconds > m) + { + __nanosleep(m); + nanoseconds -= m; + } + else + { + __nanosleep(static_cast(nanoseconds)); + break; + } + } +#else + const clock_t end = clock() + nanoseconds / (1000000000ULL / CLOCKS_PER_SEC); + while (clock() < end) + { + // busy wait + } +#endif +} + +void run(context& ctx, int NTASKS, int ms) +{ + int dummy[1]; + auto handle = ctx.logical_data(dummy); + + ctx.task().add_deps(handle.rw())->*[](cudaStream_t stream) { + nano_sleep<<<1, 1, 0, stream>>>(0); + }; + + for (int iter = 0; iter < 10; iter++) + { + for (int k = 0; k < NTASKS; k++) + { + ctx.task().add_deps(handle.read())->*[&](cudaStream_t stream) { + nano_sleep<<<1, 1, 0, stream>>>(ms * 1000ULL * 1000ULL); + }; + } + + ctx.task().add_deps(handle.rw())->*[&](cudaStream_t stream) { + nano_sleep<<<1, 1, 0, stream>>>(0); + }; + } + + ctx.finalize(); +} + +int main(int argc, char** argv) +{ + int NTASKS = 256; + int ms = 40; + + if (argc > 1) + { + NTASKS = atoi(argv[1]); + } + + if (argc > 2) + { + ms = atoi(argv[2]); + } + + context ctx; + run(ctx, NTASKS, ms); + ctx = graph_ctx(); + run(ctx, NTASKS, ms); +} diff --git a/cudax/test/stf/cpp/read_const.cu b/cudax/test/stf/cpp/read_const.cu new file mode 100644 index 00000000000..d3af6850165 --- /dev/null +++ b/cudax/test/stf/cpp/read_const.cu @@ -0,0 +1,83 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief This test ensures that a task can access a logical data with a const qualifier. 
+ */ + +#include + +using namespace cuda::experimental::stf; + +struct foo +{ + // Intentionally choose an odd (actually prime) size + foo(context& ctx) + { + l = ctx.logical_data(shape_of>(50867)); + } + + void set(context& ctx, int val) + { + ctx.parallel_for(l.shape(), l.write())->*[=] _CCCL_DEVICE(size_t i, auto dl) { + dl(i) = val; + }; + } + + void copy_from(context& ctx, const foo& other) + { + ctx.parallel_for(l.shape(), l.write(), other.l.read())->*[=] _CCCL_DEVICE(size_t i, auto dl, auto dotherl) { + dl(i) = dotherl(i); + }; + } + + void ensure(context& ctx, int val) + { + std::ignore = val; + ctx.parallel_for(l.shape(), l.read())->*[=] _CCCL_DEVICE(size_t i, auto dl) { + assert(dl(i) == val); + }; + } + + auto& get_l() const + { + return l; + } + + logical_data> l; +}; + +void read_only_access(context& ctx, const foo& f) +{ + ctx.parallel_for(f.l.shape(), f.l.read())->*[] _CCCL_DEVICE(size_t i, auto dl) { + // no-op + }; + + ctx.parallel_for(f.get_l().shape(), f.get_l().read())->*[] _CCCL_DEVICE(size_t i, auto dl) { + // no-op + }; +} + +int main() +{ + context ctx; + + foo A(ctx); + A.set(ctx, 42); + A.ensure(ctx, 42); + foo B(ctx); + B.copy_from(ctx, A); + B.ensure(ctx, 42); + + read_only_access(ctx, A); + + ctx.finalize(); +} diff --git a/cudax/test/stf/cpp/redundant_data.cu b/cudax/test/stf/cpp/redundant_data.cu new file mode 100644 index 00000000000..81e23f44456 --- /dev/null +++ b/cudax/test/stf/cpp/redundant_data.cu @@ -0,0 +1,86 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Ensure we can use the same logical data multiple time in a task + */ + +#include +#include + +using namespace cuda::experimental::stf; + +template +__global__ void diff_cnt(int n, T* x, T* y, int* delta) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int ind = tid; ind < n; ind += nthreads) + { + if (y[ind] != x[ind]) + { + atomicAdd(delta, 1); + } + } +} + +template +void compare_two_vectors(Ctx& ctx, logical_data& a, logical_data& b, int& delta) +{ + auto delta_cnt = ctx.logical_data(make_slice(&delta, 1)); + const auto n = a.shape().extent(0); + + // Count the number of differences + ctx.task(a.read(), b.read(), delta_cnt.rw())->*[=](cudaStream_t stream, auto da, auto db, auto ddelta) { + diff_cnt<<<16, 128, 0, stream>>>(static_cast(n), da.data_handle(), db.data_handle(), ddelta.data_handle()); + }; + + // Read that value on the host + ctx.host_launch(delta_cnt.read())->*[&](auto /*unused*/) {}; +} + +static const size_t N = 12; + +template +void run(double (&X)[N], double (&Y)[N]) +{ + Ctx ctx; + auto handle_X = ctx.logical_data(X); + auto handle_Y = ctx.logical_data(Y); + + int ret1 = 0, ret2 = 0; + + compare_two_vectors(ctx, handle_X, handle_Y, ret1); + compare_two_vectors(ctx, handle_X, handle_X, ret2); + + ctx.finalize(); + + // After sync, we can inspect the returned values. 
+ // First two vectors are different + assert(ret1 > 0); + // Other two vectors are equal + assert(ret2 == 0); +} + +int main() +{ + double X[N], Y[N]; + + for (size_t ind = 0; ind < N; ind++) + { + X[ind] = 1.0 * ind; + Y[ind] = 2.0 * ind - 3.0; + } + + run(X, Y); + run(X, Y); +} diff --git a/cudax/test/stf/cpp/redundant_data_different_modes.cu b/cudax/test/stf/cpp/redundant_data_different_modes.cu new file mode 100644 index 00000000000..d8d7dccca3a --- /dev/null +++ b/cudax/test/stf/cpp/redundant_data_different_modes.cu @@ -0,0 +1,56 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Ensure we can use the same logical data multiple times in the same + * task even with different access modes (which should be combined) + */ + +#include +#include + +using namespace cuda::experimental::stf; + +// a = b + 1; +template +__global__ void add(T* a, const T* b) +{ + *a = *b + 1; +} + +template +void run() +{ + Ctx ctx; + + int var = 42; + auto var_handle = ctx.logical_data(make_slice(&var, 1)); + + // da and db are for the same variable : we expect it to be equivalent to a RW access + ctx.task(var_handle.write(), var_handle.read())->*[](cudaStream_t stream, auto da, auto db) { + add<<<1, 1, 0, stream>>>(da.data_handle(), db.data_handle()); + }; + + // Read that value on the host + ctx.host_launch(var_handle.read())->*[](auto da) { + int result = *da.data_handle(); + (void) result; + assert(result == 43); + }; + + ctx.finalize(); +} + +int main() +{ + run(); + run(); +} diff --git a/cudax/test/stf/cpp/reuse_computation.cu b/cudax/test/stf/cpp/reuse_computation.cu new file mode 100644 index 00000000000..77831b904bd --- /dev/null +++ b/cudax/test/stf/cpp/reuse_computation.cu @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +int main() +{ + context ctx; + + const int N = 16; + size_t niter = 12; + + int A[N]; + + for (int i = 0; i < N; i++) + { + A[i] = 2 * i + 1; + } + + auto lres = ctx.logical_data(A); + + for (size_t k = 0; k < niter; k++) + { + auto ltmp = ctx.logical_data(lres.shape()); + ctx.parallel_for(ltmp.shape(), ltmp.write())->*[] __device__(size_t i, auto tmp) { + tmp(i) = i; + }; + + ctx.parallel_for(lres.shape(), ltmp.read(), lres.rw())->*[] __device__(size_t i, auto tmp, auto res) { + res(i) += tmp(i); + }; + } + + for (size_t k = 0; k < niter; k++) + { + auto ltmp = run_once()->*[&]() { + // Ensure this is only done once ! 
+ static bool done = false; + EXPECT(!done); + done = true; + + auto ltmp = ctx.logical_data(lres.shape()); + ctx.parallel_for(ltmp.shape(), ltmp.write())->*[] __device__(size_t i, auto tmp) { + tmp(i) = i; + }; + return ltmp; + }; + + auto ltmp2 = run_once(size_t(k % 4))->*[&](size_t val) { + // fprintf(stderr, "COMPUTE FOR %ld\n", val); + + auto ltmp = ctx.logical_data(lres.shape()); + ctx.parallel_for(ltmp.shape(), ltmp.write())->*[val] __device__(size_t i, auto tmp) { + tmp(i) = val; + }; + return ltmp; + }; + + ctx.parallel_for(lres.shape(), ltmp.read(), lres.rw())->*[] __device__(size_t i, auto tmp, auto res) { + res(i) += tmp(i); + }; + } + + ctx.finalize(); + + for (int i = 0; i < N; i++) + { + EXPECT(A[i] == (2 * i + 1) + 2 * i * niter); + } +} diff --git a/cudax/test/stf/cpp/reuse_computation_2.cu b/cudax/test/stf/cpp/reuse_computation_2.cu new file mode 100644 index 00000000000..840a0939948 --- /dev/null +++ b/cudax/test/stf/cpp/reuse_computation_2.cu @@ -0,0 +1,65 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +int main() +{ + context ctx; + + const int N = 16; + size_t niter = 12; + + int A[N]; + + for (int i = 0; i < N; i++) + { + A[i] = 2 * i + 1; + } + + auto lres = ctx.logical_data(A); + + for (size_t k = 0; k < niter; k++) + { + auto ltmp = ctx.logical_data(lres.shape()); + ctx.parallel_for(ltmp.shape(), ltmp.write())->*[k] __device__(size_t i, auto tmp) { + tmp(i) = (k % 2) * i; + }; + + ctx.parallel_for(lres.shape(), ltmp.read(), lres.rw())->*[] __device__(size_t i, auto tmp, auto res) { + res(i) += tmp(i); + }; + } + + for (size_t k = 0; k < niter; k++) + { + auto ltmp = run_once(k)->*[&](size_t k) { + auto out = ctx.logical_data(lres.shape()); + ctx.parallel_for(out.shape(), out.write())->*[k] __device__(size_t i, auto tmp) { + tmp(i) = (k % 2) * i; + }; + return out; + }; + + ctx.parallel_for(lres.shape(), ltmp.read(), lres.rw())->*[] __device__(size_t i, auto tmp, auto res) { + res(i) += tmp(i); + }; + } + + ctx.finalize(); + + for (int i = 0; i < N; i++) + { + EXPECT(A[i] == (2 * i + 1) + 2 * i * niter / 2); + } +} diff --git a/cudax/test/stf/cpp/scoped_graph_task.cu b/cudax/test/stf/cpp/scoped_graph_task.cu new file mode 100644 index 00000000000..91e66e441ca --- /dev/null +++ b/cudax/test/stf/cpp/scoped_graph_task.cu @@ -0,0 +1,101 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Show how we can create tasks in the CUDA graph backend by using the + * actual CUDA graph API in tasks (instead of relying on graph capture + * implicitly) + */ + +#include + +using namespace cuda::experimental::stf; + +int main(int argc, char** argv) +{ + const size_t n = 12; + + double X[n]; + double Y[n]; + + for (size_t ind = 0; ind < n; ind++) + { + X[ind] = 1.0 * ind + 42; + Y[ind] = 0.0; + } + + // We here do not assume there is a valid copy on the host and only provide + // constant parameters + graph_ctx ctx; + auto handle_X = ctx.logical_data(X); + handle_X.set_symbol("x"); + auto handle_Y = ctx.logical_data(Y); + handle_Y.set_symbol("y"); + auto handle_TMP = ctx.logical_data(n); + handle_TMP.set_symbol("tmp"); + + int NITER = 4; + for (int iter = 0; iter < NITER; iter++) + { + // We swap X and Y using TMP as temporary buffer + // TMP = X + // X = Y + // Y = TMP + ctx.task(exec_place::current_device(), handle_X.rw(), handle_Y.rw(), handle_TMP.write()) + ->*[&](cudaGraph_t child_graph, auto d_x, auto d_y, auto d_tmp) { + // TMP = X + cudaGraphNode_t cpy_tmp_to_x; + cuda_try(cudaGraphAddMemcpyNode1D( + &cpy_tmp_to_x, + child_graph, + nullptr, + 0, + d_tmp.data_handle(), + d_x.data_handle(), + n * sizeof(double), + cudaMemcpyDeviceToDevice)); + + // X = Y + cudaGraphNode_t cpy_x_to_y; + cuda_try(cudaGraphAddMemcpyNode1D( + &cpy_x_to_y, + child_graph, + &cpy_tmp_to_x, + 1, + d_x.data_handle(), + d_y.data_handle(), + n * sizeof(double), + cudaMemcpyDeviceToDevice)); + + // Y = TMP + cudaGraphNode_t cpy_tmp_to_y; + cuda_try(cudaGraphAddMemcpyNode1D( + &cpy_tmp_to_y, + child_graph, + &cpy_x_to_y, + 1, + d_y.data_handle(), + d_tmp.data_handle(), + n * sizeof(double), + cudaMemcpyDeviceToDevice)); + }; + } + + ctx.submit(); + + if (argc > 1) + { + std::cout << "Generating DOT output in " << argv[1] << std::endl; + ctx.print_to_dot(argv[1]); + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/cpp/user_streams.cu b/cudax/test/stf/cpp/user_streams.cu new file mode 100644 index 00000000000..cecae7ef839 --- /dev/null +++ b/cudax/test/stf/cpp/user_streams.cu @@ -0,0 +1,99 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +/* + * In this example, the user provides streams in which the STF model inserts the proper dependencies + */ + +static __global__ void cuda_sleep_kernel(long long int clock_cnt) +{ + long long int start_clock = clock64(); + long long int clock_offset = 0; + while (clock_offset < clock_cnt) + { + clock_offset = clock64() - start_clock; + } +} + +void cuda_sleep(double ms, cudaStream_t stream) +{ + int device; + cudaGetDevice(&device); + + // cudaDevAttrClockRate: Peak clock frequency in kilohertz; + int clock_rate; + cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, device); + + long long int clock_cnt = (long long int) (ms * clock_rate); + cuda_sleep_kernel<<<1, 1, 0, stream>>>(clock_cnt); +} + +int main() +{ + stream_ctx ctx; + double vA, vB, vC, vD; + auto A = ctx.logical_data(make_slice(&vA, 1)); + auto B = ctx.logical_data(make_slice(&vB, 1)); + auto C = ctx.logical_data(make_slice(&vC, 1)); + auto D = ctx.logical_data(make_slice(&vD, 1)); + + // We are going to submit kernels with the following data accesses, where + // K2 and K3 can be executed concurrently, after K1 and been executed, and + // before K4 is executed. + // K1(Aw); K2(Ar,Bw); K3(Ar, Cw); K4(Br,Cr,Dw); + + // User-provided streams + cudaStream_t K1_stream; + cudaStream_t K2_stream; + cudaStream_t K3_stream; + cudaStream_t K4_stream; + cudaStreamCreate(&K1_stream); + cudaStreamCreate(&K2_stream); + cudaStreamCreate(&K3_stream); + cudaStreamCreate(&K4_stream); + + // Kernel 1 : A(write) + auto k1 = ctx.task(A.rw()); + k1.set_stream(K1_stream); + k1.set_symbol("K1"); + k1.start(); + cuda_sleep(500, K1_stream); + k1.end(); + + // Kernel 2 : A(read) B(write) + auto k2 = ctx.task(A.read(), B.write()); + k2.set_stream(K2_stream); + k2.set_symbol("K2"); + k2.start(); + cuda_sleep(500, K2_stream); + k2.end(); + + // Kernel 3 : A(read) C(write) + auto k3 = ctx.task(A.read(), C.write()); + k3.set_stream(K3_stream); + k3.set_symbol("K3"); + k3.start(); + cuda_sleep(500, K3_stream); + k3.end(); + + // Kernel 4 : B(read) C(read) D(write) + auto k4 = ctx.task(B.read(), C.read(), D.write()); + k4.set_stream(K4_stream); + k4.set_symbol("K4"); + k4.start(); + cuda_sleep(500, K4_stream); + k4.end(); + + ctx.finalize(); +} diff --git a/cudax/test/stf/cuda-samples/0_Introduction/vectorAdd/README.md b/cudax/test/stf/cuda-samples/0_Introduction/vectorAdd/README.md new file mode 100644 index 00000000000..df55c9d3d87 --- /dev/null +++ b/cudax/test/stf/cuda-samples/0_Introduction/vectorAdd/README.md @@ -0,0 +1,69 @@ +# vectorAdd - Vector Addition + +## Description + +This CUDA Runtime API sample is a very basic sample that implements element by element vector addition. It is the same as the sample illustrating Chapter 3 of the programming guide with some additions like error checking. 
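For orientation, here is a minimal sketch of the element-by-element kernel and the error-checked launch this sample is built around. The kernel mirrors the one used by the STF port in this directory; the `launch_vector_add` helper is only illustrative and not part of the sample.
```
#include <cstdio>
#include <cuda_runtime.h>

__global__ void vectorAdd(const float* A, const float* B, float* C, int numElements)
{
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < numElements)
  {
    C[i] = A[i] + B[i];
  }
}

// Illustrative helper: launch the kernel and report launch errors.
void launch_vector_add(const float* dA, const float* dB, float* dC, int n, cudaStream_t stream)
{
  int threadsPerBlock = 256;
  int blocksPerGrid   = (n + threadsPerBlock - 1) / threadsPerBlock;
  vectorAdd<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(dA, dB, dC, n);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    fprintf(stderr, "Failed to launch vectorAdd kernel: %s\n", cudaGetErrorString(err));
  }
}
```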
+ +## Key Concepts + +CUDA Runtime API, Vector Addition + +## Supported SM Architectures + +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows + +## Supported CPU Architecture + +x86_64, ppc64le, armv7l + +## CUDA APIs involved + +### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) +cudaFree, cudaMalloc, cudaGetLastError, cudaMemcpy, cudaGetErrorString + +## Prerequisites + +Download and install the [CUDA Toolkit 11.6](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs.sln - for Visual Studio +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details." + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l. + By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +## References (for more details) diff --git a/cudax/test/stf/cuda-samples/0_Introduction/vectorAdd/vectorAdd.cu b/cudax/test/stf/cuda-samples/0_Introduction/vectorAdd/vectorAdd.cu new file mode 100644 index 00000000000..ea7f39cad5c --- /dev/null +++ b/cudax/test/stf/cuda-samples/0_Introduction/vectorAdd/vectorAdd.cu @@ -0,0 +1,44 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * Vector addition: C = A + B. + * + * This sample is a very basic sample that implements element by element + * vector addition. It is the same as the sample illustrating Chapter 2 + * of the programming guide with some additions like error checking. 
+ */ diff --git a/cudax/test/stf/cuda-samples/0_Introduction/vectorAdd/vectorAdd_cudastf.cu b/cudax/test/stf/cuda-samples/0_Introduction/vectorAdd/vectorAdd_cudastf.cu new file mode 100644 index 00000000000..f192542d732 --- /dev/null +++ b/cudax/test/stf/cuda-samples/0_Introduction/vectorAdd/vectorAdd_cudastf.cu @@ -0,0 +1,151 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * Vector addition: C = A + B. + * + * This sample is a very basic sample that implements element by element + * vector addition. It is the same as the sample illustrating Chapter 2 + * of the programming guide with some additions like error checking. + */ + +#include +#include + +using namespace cuda::experimental::stf; + +/** + * CUDA Kernel Device code + * + * Computes the vector addition of A and B into C. The 3 vectors have the same + * number of elements numElements. 
+ */ +__global__ void vectorAdd(const float* A, const float* B, float* C, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + C[i] = A[i] + B[i] + 0.0f; + } +} + +template +void run() +{ + Ctx ctx; + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + size_t size = numElements * sizeof(float); + // printf("[Vector addition of %d elements]\n", numElements); + + // Allocate the host input vector A + float* h_A = (float*) malloc(size); + + // Allocate the host input vector B + float* h_B = (float*) malloc(size); + + // Allocate the host output vector C + float* h_C = (float*) malloc(size); + + // Verify that allocations succeeded + if (h_A == NULL || h_B == NULL || h_C == NULL) + { + fprintf(stderr, "Failed to allocate host vectors!\n"); + exit(EXIT_FAILURE); + } + + // Initialize the host input vectors + for (int i = 0; i < numElements; ++i) + { + h_A[i] = rand() / (float) RAND_MAX; + h_B[i] = rand() / (float) RAND_MAX; + } + + auto A_handle = ctx.logical_data(h_A, numElements); + auto B_handle = ctx.logical_data(h_B, numElements); + auto C_handle = ctx.logical_data(h_C, numElements); + + auto t = ctx.task(A_handle.read(), B_handle.read(), C_handle.rw()); + t->*[&](cudaStream_t stream, auto d_A, auto d_B, auto d_C) { + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + // printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock); + vectorAdd<<>>( + d_A.data_handle(), d_B.data_handle(), d_C.data_handle(), numElements); + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + }; + + auto t_host = ctx.host_launch(A_handle.read(), B_handle.read(), C_handle.read()); + t_host->*[&](auto, auto, auto) { + // Verify that the result vector is correct + for (int i = 0; i < numElements; ++i) + { + if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) + { + fprintf(stderr, "Result verification failed at element %d!\n", i); + exit(EXIT_FAILURE); + } + } + }; + + ctx.finalize(); + + // Free host memory + free(h_A); + free(h_B); + free(h_C); +} + +/** + * Host main routine + */ +int main(void) +{ + run(); + run(); +} diff --git a/cudax/test/stf/cuda-samples/3_CUDA_Features/jacobiCudaGraphs/README.md b/cudax/test/stf/cuda-samples/3_CUDA_Features/jacobiCudaGraphs/README.md new file mode 100644 index 00000000000..ab58050488d --- /dev/null +++ b/cudax/test/stf/cuda-samples/3_CUDA_Features/jacobiCudaGraphs/README.md @@ -0,0 +1,69 @@ +# jacobiCudaGraphs - Jacobi CUDA Graphs + +## Description + +Demonstrates Instantiated CUDA Graph Update with Jacobi Iterative Method using cudaGraphExecKernelNodeSetParams() and cudaGraphExecUpdate() approach. 
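As a rough sketch of the `cudaGraphExecUpdate()` variant (condensed from the sample's `JacobiMethodGpuCudaGraphExecUpdate()`): each iteration is re-captured into a graph, the executable graph is instantiated once, and later iterations attempt an in-place update before falling back to re-instantiation. `record_iteration` is an assumed placeholder for the per-iteration memset/kernel/memcpy work and is not part of the sample.
```
#include <cuda_runtime.h>

// Assumed placeholder for the work captured each iteration
// (residual memset, Jacobi kernel, copy of the residual back to the host).
void record_iteration(int k, cudaStream_t stream);

void jacobi_with_graph_update(int max_iter, cudaStream_t stream)
{
  cudaGraph_t graph;
  cudaGraphExec_t graphExec = nullptr;

  for (int k = 0; k < max_iter; k++)
  {
    // Capture this iteration's work into a (possibly different) graph.
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
    record_iteration(k, stream);
    cudaStreamEndCapture(stream, &graph);

    if (graphExec == nullptr)
    {
      // First iteration: instantiate the executable graph once.
      cudaGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0);
    }
    else
    {
      // Later iterations: update the instantiated graph in place and only
      // re-instantiate if the update is rejected (e.g. the topology changed).
      cudaGraphExecUpdateResult updateResult;
      cudaGraphExecUpdate(graphExec, graph, nullptr, &updateResult);
      if (updateResult != cudaGraphExecUpdateSuccess)
      {
        cudaGraphExecDestroy(graphExec);
        cudaGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0);
      }
    }

    cudaGraphLaunch(graphExec, stream);
    cudaStreamSynchronize(stream);
  }
}
```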
+ +## Key Concepts + +CUDA Graphs, Stream Capture, Instantiated CUDA Graph Update, Cooperative Groups + +## Supported SM Architectures + +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows + +## Supported CPU Architecture + +x86_64, ppc64le, armv7l + +## CUDA APIs involved + +### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) +cudaGraphAddMemsetNode, cudaStreamCreateWithFlags, cudaMemcpyAsync, cudaMallocHost, cudaPitchedPtr, cudaGraphCreate, cudaMalloc, cudaPos, cudaGraphAddMemcpyNode, cudaStreamEndCapture, cudaGraphExecDestroy, cudaStreamBeginCapture, cudaGraphExecKernelNodeSetParams, cudaStreamSynchronize, cudaGraphLaunch, cudaFree, cudaGraphInstantiate, cudaExtent, cudaMemsetAsync, cudaFreeHost, cudaGraphAddKernelNode, cudaGraphExecUpdate + +## Prerequisites + +Download and install the [CUDA Toolkit 11.6](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs.sln - for Visual Studio +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details." + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l. + By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +## References (for more details) diff --git a/cudax/test/stf/cuda-samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.cu b/cudax/test/stf/cuda-samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.cu new file mode 100644 index 00000000000..4b8e8be4600 --- /dev/null +++ b/cudax/test/stf/cuda-samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.cu @@ -0,0 +1,626 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +// This sample demonstrates Instantiated CUDA Graph Update +// with Jacobi Iterative Method in 3 different methods: +// 1 - JacobiMethodGpuCudaGraphExecKernelSetParams() - CUDA Graph with +// cudaGraphExecKernelNodeSetParams() 2 - JacobiMethodGpuCudaGraphExecUpdate() - +// CUDA Graph with cudaGraphExecUpdate() 3 - JacobiMethodGpu() - Non CUDA Graph +// method + +// Jacobi method on a linear system A*x = b, +// where A is diagonally dominant and the exact solution consists +// of all ones. + +#include + +#include + +using cuda::experimental::stf::cuda_safe_call; + +#define N_ROWS 512 + +namespace cg = cooperative_groups; + +// 8 Rows of square-matrix A processed by each CTA. +// This can be max 32 and only power of 2 (i.e., 2/4/8/16/32). +#define ROWS_PER_CTA 8 + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 +#else +__device__ double atomicAdd(double* address, double val) +{ + unsigned long long int* address_as_ull = (unsigned long long int*) address; + unsigned long long int old = *address_as_ull, assumed; + + do + { + assumed = old; + old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != + // NaN) + } while (assumed != old); + + return __longlong_as_double(old); +} +#endif + +// creates N_ROWS x N_ROWS matrix A with N_ROWS+1 on the diagonal and 1 +// elsewhere. The elements of the right hand side b all equal 2*n, hence the +// exact solution x to A*x = b is a vector of ones. +void createLinearSystem(float* A, double* b) +{ + int i, j; + for (i = 0; i < N_ROWS; i++) + { + b[i] = 2.0 * N_ROWS; + for (j = 0; j < N_ROWS; j++) + { + A[i * N_ROWS + j] = 1.0; + } + A[i * N_ROWS + i] = N_ROWS + 1.0; + } +} + +static __global__ void +JacobiMethod(const float* A, const double* b, const float conv_threshold, double* x, double* x_new, double* sum) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ double x_shared[N_ROWS]; // N_ROWS == n + __shared__ double b_shared[ROWS_PER_CTA + 1]; + + for (int i = threadIdx.x; i < N_ROWS; i += blockDim.x) + { + x_shared[i] = x[i]; + } + + if (threadIdx.x < ROWS_PER_CTA) + { + int k = threadIdx.x; + for (int i = k + (blockIdx.x * ROWS_PER_CTA); (k < ROWS_PER_CTA) && (i < N_ROWS); + k += ROWS_PER_CTA, i += ROWS_PER_CTA) + { + b_shared[i % (ROWS_PER_CTA + 1)] = b[i]; + } + } + + cg::sync(cta); + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + for (int k = 0, i = blockIdx.x * ROWS_PER_CTA; (k < ROWS_PER_CTA) && (i < N_ROWS); k++, i++) + { + double rowThreadSum = 0.0; + for (int j = threadIdx.x; j < N_ROWS; j += blockDim.x) + { + rowThreadSum += (A[i * N_ROWS + j] * x_shared[j]); + } + + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) + { + rowThreadSum += tile32.shfl_down(rowThreadSum, offset); + } + + if (tile32.thread_rank() == 0) + { + atomicAdd(&b_shared[i % (ROWS_PER_CTA + 1)], -rowThreadSum); + } + } + + cg::sync(cta); + + if (threadIdx.x < ROWS_PER_CTA) + { + cg::thread_block_tile tile8 = cg::tiled_partition(cta); + double temp_sum = 0.0; + + int k = threadIdx.x; + + for (int i = k + (blockIdx.x * ROWS_PER_CTA); (k < ROWS_PER_CTA) && (i < N_ROWS); + k += ROWS_PER_CTA, i += ROWS_PER_CTA) + { + double dx = b_shared[i % (ROWS_PER_CTA + 1)]; + dx /= A[i * N_ROWS + i]; + + x_new[i] = (x_shared[i] + dx); + temp_sum += fabs(dx); + } + + for (int offset = tile8.size() / 2; offset > 0; offset /= 2) + { + temp_sum += tile8.shfl_down(temp_sum, 
offset); + } + + if (tile8.thread_rank() == 0) + { + atomicAdd(sum, temp_sum); + } + } +} + +// Thread block size for finalError kernel should be multiple of 32 +static __global__ void finalError(double* x, double* g_sum) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + extern __shared__ double warpSum[]; + double sum = 0.0; + + int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; + + for (int i = globalThreadId; i < N_ROWS; i += blockDim.x * gridDim.x) + { + double d = x[i] - 1.0; + sum += fabs(d); + } + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) + { + sum += tile32.shfl_down(sum, offset); + } + + if (tile32.thread_rank() == 0) + { + warpSum[threadIdx.x / warpSize] = sum; + } + + cg::sync(cta); + + double blockSum = 0.0; + if (threadIdx.x < (blockDim.x / warpSize)) + { + blockSum = warpSum[threadIdx.x]; + } + + if (threadIdx.x < 32) + { + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) + { + blockSum += tile32.shfl_down(blockSum, offset); + } + if (tile32.thread_rank() == 0) + { + atomicAdd(g_sum, blockSum); + } + } +} + +// Run the Jacobi method for A*x = b on GPU with CUDA Graph - +// cudaGraphExecKernelNodeSetParams(). +double JacobiMethodGpuCudaGraphExecKernelSetParams( + const float* A, + const double* b, + const float conv_threshold, + const int max_iter, + double* x, + double* x_new, + cudaStream_t stream) +{ + // CTA size + dim3 nthreads(256, 1, 1); + // grid size + dim3 nblocks((N_ROWS / ROWS_PER_CTA) + 2, 1, 1); + cudaGraph_t graph; + cudaGraphExec_t graphExec = NULL; + + double sum = 0.0; + double* d_sum = NULL; + cuda_safe_call(cudaMalloc(&d_sum, sizeof(double))); + + std::vector nodeDependencies; + cudaGraphNode_t memcpyNode, jacobiKernelNode, memsetNode; + cudaMemcpy3DParms memcpyParams; + cudaMemsetParams memsetParams; + + memsetParams.dst = (void*) d_sum; + memsetParams.value = 0; + memsetParams.pitch = 0; + // elementSize can be max 4 bytes, so we take sizeof(float) and width=2 + memsetParams.elementSize = sizeof(float); + memsetParams.width = 2; + memsetParams.height = 1; + + cuda_safe_call(cudaGraphCreate(&graph, 0)); + cuda_safe_call(cudaGraphAddMemsetNode(&memsetNode, graph, NULL, 0, &memsetParams)); + nodeDependencies.push_back(memsetNode); + + cudaKernelNodeParams NodeParams0, NodeParams1; + NodeParams0.func = (void*) JacobiMethod; + NodeParams0.gridDim = nblocks; + NodeParams0.blockDim = nthreads; + NodeParams0.sharedMemBytes = 0; + void* kernelArgs0[6] = {(void*) &A, (void*) &b, (void*) &conv_threshold, (void*) &x, (void*) &x_new, (void*) &d_sum}; + NodeParams0.kernelParams = kernelArgs0; + NodeParams0.extra = NULL; + + cuda_safe_call( + cudaGraphAddKernelNode(&jacobiKernelNode, graph, nodeDependencies.data(), nodeDependencies.size(), &NodeParams0)); + + nodeDependencies.clear(); + nodeDependencies.push_back(jacobiKernelNode); + + memcpyParams.srcArray = NULL; + memcpyParams.srcPos = make_cudaPos(0, 0, 0); + memcpyParams.srcPtr = make_cudaPitchedPtr(d_sum, sizeof(double), 1, 1); + memcpyParams.dstArray = NULL; + memcpyParams.dstPos = make_cudaPos(0, 0, 0); + memcpyParams.dstPtr = make_cudaPitchedPtr(&sum, sizeof(double), 1, 1); + memcpyParams.extent = make_cudaExtent(sizeof(double), 1, 1); + memcpyParams.kind = cudaMemcpyDeviceToHost; + + cuda_safe_call( + cudaGraphAddMemcpyNode(&memcpyNode, graph, nodeDependencies.data(), nodeDependencies.size(), &memcpyParams)); + + cuda_safe_call(cudaGraphInstantiate(&graphExec, 
graph, NULL, NULL, 0)); + + NodeParams1.func = (void*) JacobiMethod; + NodeParams1.gridDim = nblocks; + NodeParams1.blockDim = nthreads; + NodeParams1.sharedMemBytes = 0; + void* kernelArgs1[6] = {(void*) &A, (void*) &b, (void*) &conv_threshold, (void*) &x_new, (void*) &x, (void*) &d_sum}; + NodeParams1.kernelParams = kernelArgs1; + NodeParams1.extra = NULL; + + int k = 0; + for (k = 0; k < max_iter; k++) + { + cuda_safe_call( + cudaGraphExecKernelNodeSetParams(graphExec, jacobiKernelNode, ((k & 1) == 0) ? &NodeParams0 : &NodeParams1)); + cuda_safe_call(cudaGraphLaunch(graphExec, stream)); + cuda_safe_call(cudaStreamSynchronize(stream)); + + if (sum <= conv_threshold) + { + cuda_safe_call(cudaMemsetAsync(d_sum, 0, sizeof(double), stream)); + nblocks.x = (N_ROWS / nthreads.x) + 1; + size_t sharedMemSize = ((nthreads.x / 32) + 1) * sizeof(double); + if ((k & 1) == 0) + { + finalError<<>>(x_new, d_sum); + } + else + { + finalError<<>>(x, d_sum); + } + + cuda_safe_call(cudaMemcpyAsync(&sum, d_sum, sizeof(double), cudaMemcpyDeviceToHost, stream)); + cuda_safe_call(cudaStreamSynchronize(stream)); + // printf("GPU iterations : %d\n", k + 1); + // printf("GPU error: %.3e\n", sum); + break; + } + } + + cuda_safe_call(cudaFree(d_sum)); + return sum; +} + +// Run the Jacobi method for A*x = b on GPU with Instantiated CUDA Graph Update +// API - cudaGraphExecUpdate(). +double JacobiMethodGpuCudaGraphExecUpdate( + const float* A, + const double* b, + const float conv_threshold, + const int max_iter, + double* x, + double* x_new, + cudaStream_t stream) +{ + // CTA size + dim3 nthreads(256, 1, 1); + // grid size + dim3 nblocks((N_ROWS / ROWS_PER_CTA) + 2, 1, 1); + cudaGraph_t graph; + cudaGraphExec_t graphExec = NULL; + + double sum = 0.0; + double* d_sum; + cuda_safe_call(cudaMalloc(&d_sum, sizeof(double))); + + int k = 0; + for (k = 0; k < max_iter; k++) + { + cuda_safe_call(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); + cuda_safe_call(cudaMemsetAsync(d_sum, 0, sizeof(double), stream)); + if ((k & 1) == 0) + { + JacobiMethod<<>>(A, b, conv_threshold, x, x_new, d_sum); + } + else + { + JacobiMethod<<>>(A, b, conv_threshold, x_new, x, d_sum); + } + cuda_safe_call(cudaMemcpyAsync(&sum, d_sum, sizeof(double), cudaMemcpyDeviceToHost, stream)); + cuda_safe_call(cudaStreamEndCapture(stream, &graph)); + + if (graphExec == NULL) + { + cuda_safe_call(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); + } + else + { + cudaGraphExecUpdateResult updateResult_out; + cuda_safe_call(cudaGraphExecUpdate(graphExec, graph, NULL, &updateResult_out)); + if (updateResult_out != cudaGraphExecUpdateSuccess) + { + if (graphExec != NULL) + { + cuda_safe_call(cudaGraphExecDestroy(graphExec)); + } + printf("k = %d graph update failed with error - %d\n", k, updateResult_out); + cuda_safe_call(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); + } + } + cuda_safe_call(cudaGraphLaunch(graphExec, stream)); + cuda_safe_call(cudaStreamSynchronize(stream)); + + if (sum <= conv_threshold) + { + cuda_safe_call(cudaMemsetAsync(d_sum, 0, sizeof(double), stream)); + nblocks.x = (N_ROWS / nthreads.x) + 1; + size_t sharedMemSize = ((nthreads.x / 32) + 1) * sizeof(double); + if ((k & 1) == 0) + { + finalError<<>>(x_new, d_sum); + } + else + { + finalError<<>>(x, d_sum); + } + + cuda_safe_call(cudaMemcpyAsync(&sum, d_sum, sizeof(double), cudaMemcpyDeviceToHost, stream)); + cuda_safe_call(cudaStreamSynchronize(stream)); + // printf("GPU iterations : %d\n", k + 1); + // printf("GPU error: %.3e\n", sum); + break; + } + 
} + + cuda_safe_call(cudaFree(d_sum)); + return sum; +} + +// Run the Jacobi method for A*x = b on GPU without CUDA Graph. +double JacobiMethodGpu( + const float* A, + const double* b, + const float conv_threshold, + const int max_iter, + double* x, + double* x_new, + cudaStream_t stream) +{ + // CTA size + dim3 nthreads(256, 1, 1); + // grid size + dim3 nblocks((N_ROWS / ROWS_PER_CTA) + 2, 1, 1); + + double sum = 0.0; + double* d_sum; + cuda_safe_call(cudaMalloc(&d_sum, sizeof(double))); + int k = 0; + + for (k = 0; k < max_iter; k++) + { + cuda_safe_call(cudaMemsetAsync(d_sum, 0, sizeof(double), stream)); + if ((k & 1) == 0) + { + JacobiMethod<<>>(A, b, conv_threshold, x, x_new, d_sum); + } + else + { + JacobiMethod<<>>(A, b, conv_threshold, x_new, x, d_sum); + } + cuda_safe_call(cudaMemcpyAsync(&sum, d_sum, sizeof(double), cudaMemcpyDeviceToHost, stream)); + cuda_safe_call(cudaStreamSynchronize(stream)); + + if (sum <= conv_threshold) + { + cuda_safe_call(cudaMemsetAsync(d_sum, 0, sizeof(double), stream)); + nblocks.x = (N_ROWS / nthreads.x) + 1; + size_t sharedMemSize = ((nthreads.x / 32) + 1) * sizeof(double); + if ((k & 1) == 0) + { + finalError<<>>(x_new, d_sum); + } + else + { + finalError<<>>(x, d_sum); + } + + cuda_safe_call(cudaMemcpyAsync(&sum, d_sum, sizeof(double), cudaMemcpyDeviceToHost, stream)); + cuda_safe_call(cudaStreamSynchronize(stream)); + // printf("GPU iterations : %d\n", k + 1); + // printf("GPU error: %.3e\n", sum); + break; + } + } + + cuda_safe_call(cudaFree(d_sum)); + return sum; +} + +// Run the Jacobi method for A*x = b on CPU. +void JacobiMethodCPU(float* A, double* b, float conv_threshold, int max_iter, int* num_iter, double* x) +{ + double* x_new; + x_new = (double*) calloc(N_ROWS, sizeof(double)); + int k; + + for (k = 0; k < max_iter; k++) + { + double sum = 0.0; + for (int i = 0; i < N_ROWS; i++) + { + double temp_dx = b[i]; + for (int j = 0; j < N_ROWS; j++) + { + temp_dx -= A[i * N_ROWS + j] * x[j]; + } + temp_dx /= A[i * N_ROWS + i]; + x_new[i] += temp_dx; + sum += fabs(temp_dx); + } + + for (int i = 0; i < N_ROWS; i++) + { + x[i] = x_new[i]; + } + + if (sum <= conv_threshold) + { + break; + } + } + *num_iter = k + 1; + free(x_new); +} + +int main() +{ + // if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + // printf("Command line: jacobiCudaGraphs [-option]\n"); + // printf("Valid options:\n"); + // printf( + // "-gpumethod=<0,1 or 2> : 0 - [Default] " + // "JacobiMethodGpuCudaGraphExecKernelSetParams\n"); + // printf(" : 1 - JacobiMethodGpuCudaGraphExecUpdate\n"); + // printf(" : 2 - JacobiMethodGpu - Non CUDA Graph\n"); + // printf("-device=device_num : cuda device id"); + // printf("-help : Output a help message\n"); + // exit(EXIT_SUCCESS); + // } + // + int gpumethod = 0; + // if (checkCmdLineFlag(argc, (const char **)argv, "gpumethod")) { + // gpumethod = getCmdLineArgumentInt(argc, (const char **)argv, "gpumethod"); + // + // if (gpumethod < 0 || gpumethod > 2) { + // printf("Error: gpumethod must be 0 or 1 or 2, gpumethod=%d is invalid\n", + // gpumethod); + // exit(EXIT_SUCCESS); + // } + // } + + // int dev = findCudaDevice(argc, (const char **)argv); + // int dev = 0; + + double* b = NULL; + float* A = NULL; + cuda_safe_call(cudaMallocHost(&b, N_ROWS * sizeof(double))); + memset(b, 0, N_ROWS * sizeof(double)); + cuda_safe_call(cudaMallocHost(&A, N_ROWS * N_ROWS * sizeof(float))); + memset(A, 0, N_ROWS * N_ROWS * sizeof(float)); + + createLinearSystem(A, b); + double* x = NULL; + // start with array of all zeroes + x = (double*) 
calloc(N_ROWS, sizeof(double)); + + float conv_threshold = 1.0e-2; + int max_iter = 4 * N_ROWS * N_ROWS; + int cnt = 0; + + // // create timer + // StopWatchInterface *timerCPU = NULL, *timerGpu = NULL; + // sdkCreateTimer(&timerCPU); + // + // sdkStartTimer(&timerCPU); + JacobiMethodCPU(A, b, conv_threshold, max_iter, &cnt, x); + + double sum = 0.0; + // Compute error + for (int i = 0; i < N_ROWS; i++) + { + double d = x[i] - 1.0; + sum += fabs(d); + } + // sdkStopTimer(&timerCPU); + // printf("CPU iterations : %d\n", cnt); + // printf("CPU error: %.3e\n", sum); + // printf("CPU Processing time: %f (ms)\n", sdkGetTimerValue(&timerCPU)); + + float* d_A; + double *d_b, *d_x, *d_x_new; + cudaStream_t stream1; + cuda_safe_call(cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking)); + cuda_safe_call(cudaMalloc(&d_b, sizeof(double) * N_ROWS)); + cuda_safe_call(cudaMalloc(&d_A, sizeof(float) * N_ROWS * N_ROWS)); + cuda_safe_call(cudaMalloc(&d_x, sizeof(double) * N_ROWS)); + cuda_safe_call(cudaMalloc(&d_x_new, sizeof(double) * N_ROWS)); + + cuda_safe_call(cudaMemsetAsync(d_x, 0, sizeof(double) * N_ROWS, stream1)); + cuda_safe_call(cudaMemsetAsync(d_x_new, 0, sizeof(double) * N_ROWS, stream1)); + cuda_safe_call(cudaMemcpyAsync(d_A, A, sizeof(float) * N_ROWS * N_ROWS, cudaMemcpyHostToDevice, stream1)); + cuda_safe_call(cudaMemcpyAsync(d_b, b, sizeof(double) * N_ROWS, cudaMemcpyHostToDevice, stream1)); + + // sdkCreateTimer(&timerGpu); + // sdkStartTimer(&timerGpu); + + double sumGPU = 0.0; + if (gpumethod == 0) + { + sumGPU = JacobiMethodGpuCudaGraphExecKernelSetParams(d_A, d_b, conv_threshold, max_iter, d_x, d_x_new, stream1); + } + else if (gpumethod == 1) + { + sumGPU = JacobiMethodGpuCudaGraphExecUpdate(d_A, d_b, conv_threshold, max_iter, d_x, d_x_new, stream1); + } + else if (gpumethod == 2) + { + sumGPU = JacobiMethodGpu(d_A, d_b, conv_threshold, max_iter, d_x, d_x_new, stream1); + } + + // sdkStopTimer(&timerGpu); + // printf("GPU Processing time: %f (ms)\n", sdkGetTimerValue(&timerGpu)); + + cuda_safe_call(cudaFree(d_b)); + cuda_safe_call(cudaFree(d_A)); + cuda_safe_call(cudaFree(d_x)); + cuda_safe_call(cudaFree(d_x_new)); + + cuda_safe_call(cudaFreeHost(A)); + cuda_safe_call(cudaFreeHost(b)); + + // printf("&&&& jacobiCudaGraphs %s\n", (fabs(sum - sumGPU) < conv_threshold) ? "PASSED" : "FAILED"); + + return (fabs(sum - sumGPU) < conv_threshold) ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/cudax/test/stf/cuda-samples/3_CUDA_Features/jacobiCudaGraphs/jacobi_cudastf.cu b/cudax/test/stf/cuda-samples/3_CUDA_Features/jacobiCudaGraphs/jacobi_cudastf.cu new file mode 100644 index 00000000000..e409e118199 --- /dev/null +++ b/cudax/test/stf/cuda-samples/3_CUDA_Features/jacobiCudaGraphs/jacobi_cudastf.cu @@ -0,0 +1,407 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// Jacobi method on a linear system A*x = b, +// where A is diagonally dominant and the exact solution consists +// of all ones. + +#include + +#define N_ROWS 512 + +namespace cg = cooperative_groups; + +using namespace cuda::experimental::stf; + +// 8 Rows of square-matrix A processed by each CTA. +// This can be max 32 and only power of 2 (i.e., 2/4/8/16/32). +#define ROWS_PER_CTA 8 + +// creates N_ROWS x N_ROWS matrix A with N_ROWS+1 on the diagonal and 1 +// elsewhere. The elements of the right hand side b all equal 2*n, hence the +// exact solution x to A*x = b is a vector of ones. 
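// (Check: every row of A sums to (N_ROWS - 1) * 1 + (N_ROWS + 1) = 2 * N_ROWS,
// which is exactly b[i], so x = (1, ..., 1) indeed solves A*x = b; and because
// N_ROWS + 1 > N_ROWS - 1 the matrix is strictly diagonally dominant, so the
// Jacobi iteration converges to that solution.)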
+void createLinearSystem(float* A, double* b) +{ + int i, j; + for (i = 0; i < N_ROWS; i++) + { + b[i] = 2.0 * N_ROWS; + for (j = 0; j < N_ROWS; j++) + { + A[i * N_ROWS + j] = 1.0; + } + A[i * N_ROWS + i] = N_ROWS + 1.0; + } +} + +static __global__ void +JacobiMethod(const float* A, const double* b, const float conv_threshold, double* x, double* x_new, double* sum) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ double x_shared[N_ROWS]; // N_ROWS == n + __shared__ double b_shared[ROWS_PER_CTA + 1]; + + for (int i = threadIdx.x; i < N_ROWS; i += blockDim.x) + { + x_shared[i] = x[i]; + } + + if (threadIdx.x < ROWS_PER_CTA) + { + int k = threadIdx.x; + for (int i = k + (blockIdx.x * ROWS_PER_CTA); (k < ROWS_PER_CTA) && (i < N_ROWS); + k += ROWS_PER_CTA, i += ROWS_PER_CTA) + { + b_shared[i % (ROWS_PER_CTA + 1)] = b[i]; + } + } + + cg::sync(cta); + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + for (int k = 0, i = blockIdx.x * ROWS_PER_CTA; (k < ROWS_PER_CTA) && (i < N_ROWS); k++, i++) + { + double rowThreadSum = 0.0; + for (int j = threadIdx.x; j < N_ROWS; j += blockDim.x) + { + rowThreadSum += (A[i * N_ROWS + j] * x_shared[j]); + } + + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) + { + rowThreadSum += tile32.shfl_down(rowThreadSum, offset); + } + + if (tile32.thread_rank() == 0) + { + atomicAdd(&b_shared[i % (ROWS_PER_CTA + 1)], -rowThreadSum); + } + } + + cg::sync(cta); + + if (threadIdx.x < ROWS_PER_CTA) + { + cg::thread_block_tile tile8 = cg::tiled_partition(cta); + double temp_sum = 0.0; + + int k = threadIdx.x; + + for (int i = k + (blockIdx.x * ROWS_PER_CTA); (k < ROWS_PER_CTA) && (i < N_ROWS); + k += ROWS_PER_CTA, i += ROWS_PER_CTA) + { + double dx = b_shared[i % (ROWS_PER_CTA + 1)]; + dx /= A[i * N_ROWS + i]; + + x_new[i] = (x_shared[i] + dx); + temp_sum += fabs(dx); + } + + for (int offset = tile8.size() / 2; offset > 0; offset /= 2) + { + temp_sum += tile8.shfl_down(temp_sum, offset); + } + + if (tile8.thread_rank() == 0) + { + atomicAdd(sum, temp_sum); + } + } +} + +// Thread block size for finalError kernel should be multiple of 32 +static __global__ void finalError(const double* x, double* g_sum) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + extern __shared__ double warpSum[]; + double sum = 0.0; + + int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; + + for (int i = globalThreadId; i < N_ROWS; i += blockDim.x * gridDim.x) + { + double d = x[i] - 1.0; + sum += fabs(d); + } + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) + { + sum += tile32.shfl_down(sum, offset); + } + + if (tile32.thread_rank() == 0) + { + warpSum[threadIdx.x / warpSize] = sum; + } + + cg::sync(cta); + + double blockSum = 0.0; + if (threadIdx.x < (blockDim.x / warpSize)) + { + blockSum = warpSum[threadIdx.x]; + } + + if (threadIdx.x < 32) + { + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) + { + blockSum += tile32.shfl_down(blockSum, offset); + } + if (tile32.thread_rank() == 0) + { + atomicAdd(g_sum, blockSum); + } + } +} + +// Run the Jacobi method for A*x = b on GPU without CUDA Graph. 
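// In this STF port, the explicit stream/graph bookkeeping of the original sample
// is replaced by ctx.task() calls: the read/rw/write access modes on A, b, x,
// x_new and sum declare the data dependencies, and a small host-side task reads
// `sum` back to decide whether the iteration has converged.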
+// double JacobiMethodGpu(const float *A, const double *b, +// const float conv_threshold, const int max_iter, +// double *x, double *x_new) { +double JacobiMethodGpu( + stream_ctx& ctx, + logical_data>& A_handle, + logical_data>& b_handle, + const float conv_threshold, + const int max_iter, + logical_data>& x_handle, + logical_data>& x_new_handle) +{ + // CTA size + dim3 nthreads(256, 1, 1); + // grid size + dim3 nblocks((N_ROWS / ROWS_PER_CTA) + 2, 1, 1); + + double sum = 0.0; + + // Task for the whole Jacobi + auto sum_handle = ctx.logical_data(&sum, 1).set_symbol("sum"); + + int k; + for (k = 0; k < max_iter; k++) + { + auto x_mode = (k & 1) == 0 ? access_mode::read : access_mode::rw; + auto x_new_mode = (k & 1) == 0 ? access_mode::rw : access_mode::read; + + ctx.task(A_handle.read(), + b_handle.read(), + task_dep>(x_handle, x_mode), + task_dep>(x_new_handle, x_new_mode), + sum_handle.write()) + .set_symbol("JacobiMethod") + ->* + [&](cudaStream_t stream, auto A, auto b, auto x, auto x_new, auto d_sum) { + cuda_try(cudaMemsetAsync(d_sum.data_handle(), 0, sizeof(double), stream)); + + if ((k & 1) == 0) + { + JacobiMethod<<>>( + A.data_handle(), b.data_handle(), conv_threshold, x.data_handle(), x_new.data_handle(), d_sum.data_handle()); + } + else + { + JacobiMethod<<>>( + A.data_handle(), b.data_handle(), conv_threshold, x_new.data_handle(), x.data_handle(), d_sum.data_handle()); + } + }; + + bool converged; + // Check if sum is smaller than the threshold and halt the loop if this is true + auto t_host = ctx.task(sum_handle.read()); + t_host.on(exec_place::host); + t_host.set_symbol("check_converged"); + t_host->*[&](cudaStream_t stream, auto sum2) { + cuda_try(cudaStreamSynchronize(stream)); + // fprintf(stderr, "SUM %e\n", sum); + assert(*sum2.data_handle() == sum); + converged = (*sum2.data_handle() <= conv_threshold); + }; + + // read sum + if (converged) + { + break; + } + } + + auto final_x_handle = ((k & 1) == 0) ? &x_new_handle : &x_handle; + auto t = ctx.task(sum_handle.write(), final_x_handle->read()); + t.set_symbol("finalError"); + t->*[&](cudaStream_t stream, auto d_sum, auto final_x) { + cuda_try(cudaMemsetAsync(d_sum.data_handle(), 0, sizeof(double), stream)); + + nblocks.x = (N_ROWS / nthreads.x) + 1; + size_t sharedMemSize = ((nthreads.x / 32) + 1) * sizeof(double); + finalError<<>>(final_x.data_handle(), d_sum.data_handle()); + }; + + auto t_display = ctx.task(sum_handle.read()); + t_display.on(exec_place::host); + t_display.set_symbol("finalError"); + t_display->*[&](cudaStream_t stream, auto /*unused*/) { + cuda_try(cudaStreamSynchronize(stream)); + // printf("GPU iterations : %d\n", k + 1); + // printf("GPU error: %.3e\n", *h_sum.data_handle()); + }; + + return sum; +} + +// Run the Jacobi method for A*x = b on CPU. 
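// Each sweep computes dx_i = (b_i - sum_j A_ij * x_j) / A_ii and accumulates it
// into x_new_i (which still holds x_i from the previous sweep), i.e.
// x_new_i = x_i + dx_i; this is algebraically the standard Jacobi update
// x_new_i = (b_i - sum_{j != i} A_ij * x_j) / A_ii, and the per-sweep sum of
// |dx_i| is the convergence measure tested against conv_threshold.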
+void JacobiMethodCPU(float* A, double* b, float conv_threshold, int max_iter, int* num_iter, double* x) +{ + double* x_new = (double*) calloc(N_ROWS, sizeof(double)); + SCOPE(exit) + { + free(x_new); + }; + + int k = 0; + + for (; k < max_iter; k++) + { + double sum = 0.0; + for (int i = 0; i < N_ROWS; i++) + { + double temp_dx = b[i]; + for (int j = 0; j < N_ROWS; j++) + { + temp_dx -= A[i * N_ROWS + j] * x[j]; + } + temp_dx /= A[i * N_ROWS + i]; + x_new[i] += temp_dx; + sum += fabs(temp_dx); + } + + for (int i = 0; i < N_ROWS; i++) + { + x[i] = x_new[i]; + } + + if (sum <= conv_threshold) + { + break; + } + } + + *num_iter = k + 1; +} + +template +int run() +{ + Ctx ctx; + + double* b = cuda_try>(N_ROWS * sizeof(double), 0); + SCOPE(exit) + { + cuda_try(cudaFreeHost(b)); + }; + + float* A = cuda_try>(N_ROWS * N_ROWS * sizeof(float), 0); + SCOPE(exit) + { + cuda_try(cudaFreeHost(A)); + }; + + memset(b, 0, N_ROWS * sizeof(double)); + + memset(A, 0, N_ROWS * N_ROWS * sizeof(float)); + + createLinearSystem(A, b); + // start with array of all zeroes + double* x = (double*) calloc(N_ROWS, sizeof(double)); + SCOPE(exit) + { + free(x); + }; + + auto A_handle = ctx.logical_data(A, N_ROWS * N_ROWS); + auto b_handle = ctx.logical_data(b, N_ROWS); + auto x_handle = ctx.logical_data(x, N_ROWS); + auto x_new_handle = ctx.template logical_data(N_ROWS); + + A_handle.set_symbol("A"); + b_handle.set_symbol("b"); + x_handle.set_symbol("x"); + x_new_handle.set_symbol("x_new"); + + float conv_threshold = 1.0e-2; + int max_iter = 4 * N_ROWS * N_ROWS; + int cnt = 0; + + JacobiMethodCPU(A, b, conv_threshold, max_iter, &cnt, x); + + double sum = 0.0; + // Compute error + for (int i = 0; i < N_ROWS; i++) + { + double d = x[i] - 1.0; + sum += fabs(d); + } + + ctx.task(x_handle.write()).set_symbol("memset x")->*[&](cudaStream_t stream, auto d_x) { + cuda_try(cudaMemsetAsync(d_x.data_handle(), 0, sizeof(double) * N_ROWS, stream)); + }; + + ctx.task(x_new_handle.write()).set_symbol("memset x_new")->*[](cudaStream_t stream, auto d_x_new) { + cuda_try(cudaMemsetAsync(d_x_new.data_handle(), 0, sizeof(double) * N_ROWS, stream)); + }; + + double sumGPU = JacobiMethodGpu(ctx, A_handle, b_handle, conv_threshold, max_iter, x_handle, x_new_handle); + + ctx.finalize(); + + if (fabs(sum - sumGPU) > conv_threshold) + { + printf("&&&& jacobiCudaGraphs FAILED\n"); + return EXIT_FAILURE; + } + return 0; +} + +int main() +{ + return run(); + // run(); +} diff --git a/cudax/test/stf/cuda-samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_custf.cu b/cudax/test/stf/cuda-samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_custf.cu new file mode 100644 index 00000000000..0d1ec06345b --- /dev/null +++ b/cudax/test/stf/cuda-samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_custf.cu @@ -0,0 +1,521 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This sample implements a conjugate gradient solver on multiple GPU using + * Unified Memory optimized prefetching and usage hints. + * + */ + +// includes, system +#include + +#include +#include +#include +#include + +#include +#include +#include + +// Utilities and system includes +#include +#include + +#include +#include + +using namespace cuda::experimental::stf; + +namespace cg = cooperative_groups; + +const char* sSDKname = "conjugateGradientMultiDeviceCG"; + +#define ENABLE_CPU_DEBUG_CODE 0 +#define THREADS_PER_BLOCK 64 + +__device__ double grid_dot_result = 0.0; + +/* genTridiag: generate a random tridiagonal symmetric matrix */ +void genTridiag(slice I, slice J, slice val, int N, int nz) +{ + I(0) = 0, J(0) = 0, J(1) = 1; + val(0) = (float) rand() / RAND_MAX + 10.0f; + val(1) = (float) rand() / RAND_MAX; + int start; + + for (int i = 1; i < N; i++) + { + if (i > 1) + { + I(i) = I(i - 1) + 3; + } + else + { + I(1) = 2; + } + + start = (i - 1) * 3 + 2; + J(start) = i - 1; + J(start + 1) = i; + + if (i < N - 1) + { + J(start + 2) = i + 1; + } + + val(start) = val(start - 1); + val(start + 1) = (float) rand() / RAND_MAX + 10.0f; + + if (i < N - 1) + { + val(start + 2) = (float) rand() / RAND_MAX; + } + } + + I(N) = nz; +} + +// I - contains location of the given non-zero element in the row of the matrix +// J - contains location of the given non-zero element in the column of the +// matrix val - contains values of the given non-zero elements of the matrix +// inputVecX - input vector to be multiplied +// outputVecY - resultant vector +void cpuSpMV(int* I, int* J, float* val, int /*unused*/, int num_rows, float alpha, float* inputVecX, float* outputVecY) +{ + for (int i = 0; i < num_rows; i++) + { + int num_elems_this_row = I[i + 1] - I[i]; + + float output = 0.0; + for (int j = 0; j < num_elems_this_row; j++) + { + output += alpha * val[I[i] + j] * inputVecX[J[I[i] + j]]; + } + outputVecY[i] = output; + } + + return; +} + +float dotProduct(float* vecA, float* vecB, int size) +{ + float result = 0.0; + 
+ for (int i = 0; i < size; i++) + { + result = result + (vecA[i] * vecB[i]); + } + + return result; +} + +void scaleVector(float* vec, float alpha, int size) +{ + for (int i = 0; i < size; i++) + { + vec[i] = alpha * vec[i]; + } +} + +void saxpy(float* x, float* y, float a, int size) +{ + for (int i = 0; i < size; i++) + { + y[i] = a * x[i] + y[i]; + } +} + +void cpuConjugateGrad(int* I, int* J, float* val, float* x, float* Ax, float* p, float* r, int nnz, int N, float tol) +{ + int max_iter = 10000; + + float alpha = 1.0; + float alpham1 = -1.0; + float r0 = 0.0, b, a, na; + + cpuSpMV(I, J, val, nnz, N, alpha, x, Ax); + saxpy(Ax, r, alpham1, N); + + float r1 = dotProduct(r, r, N); + + int k = 1; + + while (r1 > tol * tol && k <= max_iter) + { + if (k > 1) + { + b = r1 / r0; + scaleVector(p, b, N); + + saxpy(r, p, alpha, N); + } + else + { + for (int i = 0; i < N; i++) + { + p[i] = r[i]; + } + } + + cpuSpMV(I, J, val, nnz, N, alpha, p, Ax); + + float dot = dotProduct(p, Ax, N); + a = r1 / dot; + + saxpy(p, x, a, N); + na = -a; + saxpy(Ax, r, na, N); + + r0 = r1; + r1 = dotProduct(r, r, N); + + printf("\nCPU code iteration = %3d, residual = %e\n", k, sqrt(r1)); + k++; + } +} + +template +__device__ void gpuSpMV( + slice I, + slice J, + slice val, + int nnz, + int num_rows, + float alpha, + slice inputVecX, + slice outputVecY, + const thread_hierarchy_t& t) +{ + for (int i = t.rank(); i < num_rows; i += t.size()) + { + int row_elem = I(i); + int next_row_elem = I(i + 1); + int num_elems_this_row = next_row_elem - row_elem; + + float output = 0.0; + for (int j = 0; j < num_elems_this_row; j++) + { + output += alpha * val(row_elem + j) * inputVecX(J(row_elem + j)); + } + + outputVecY(i) = output; + } +} + +template +__device__ void gpuSaxpy(slice x, slice y, float a, int size, const thread_hierarchy_t& t) +{ + for (int i = t.rank(); i < size; i += t.size()) + { + y(i) = a * x(i) + y(i); + } +} + +template +__device__ double +gpuDotProduct(slice vecA, slice vecB, int size, double* dot_result, thread_hierarchy_t& t) +{ + slice tmp = t.template storage(1); + + cg::thread_block cta = cooperative_groups::this_thread_block(); + + double temp_sum = 0.0; + + for (int i = t.rank(); i < size; i += t.size()) + { + temp_sum += (double) (vecA(i) * vecB(i)); + } + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + temp_sum = cg::reduce(tile32, temp_sum, cg::plus()); + + if (tile32.thread_rank() == 0) + { + tmp[tile32.meta_group_rank()] = temp_sum; + } + + cta.sync(); + + if (tile32.meta_group_rank() == 0) + { + temp_sum = tile32.thread_rank() < tile32.meta_group_size() ? 
tmp[tile32.thread_rank()] : 0.0; + temp_sum = cg::reduce(tile32, temp_sum, cg::plus()); + + if (tile32.thread_rank() == 0) + { + atomicAdd(&grid_dot_result, temp_sum); + } + } + + t.sync(); + + if (t.rank(0, -1) == 0) + { + atomicAdd_system(dot_result, grid_dot_result); + grid_dot_result = 0.0; + } + + t.sync(); + return *dot_result; +} + +template +__device__ void gpuCopyVector(slice srcA, slice destB, int size, const thread_hierarchy_t& t) +{ + for (int i = t.rank(); i < size; i += t.size()) + { + destB(i) = srcA(i); + } +} + +template +__device__ void +gpuScaleVectorAndSaxpy(slice x, slice y, float a, float scale, int size, const thread_hierarchy_t& t) +{ + for (int i = t.rank(); i < size; i += t.size()) + { + y(i) = a * x(i) + scale * y(i); + } +} + +template +__device__ void multiGpuConjugateGradient( + thread_hierarchy_t t, + slice I, + slice J, + slice val, + slice x, + slice Ax, + slice p, + slice r, + double* dot_result, + int nnz, + int N, + float tol) +{ + const int max_iter = 10000; + + float alpha = 1.0; + float alpham1 = -1.0; + float r0 = 0.0, r1, b, a, na; + + for (int i = t.rank(); i < N; i += t.size()) + { + r[i] = 1.0; + x[i] = 0.0; + } + + gpuSpMV(I, J, val, nnz, N, alpha, x, Ax, t); + + gpuSaxpy(Ax, r, alpham1, N, t); + + r1 = gpuDotProduct(r, r, N, dot_result, t); + + int k = 1; + while (r1 > tol * tol && k <= max_iter) + { + if (k > 1) + { + b = r1 / r0; + gpuScaleVectorAndSaxpy(r, p, alpha, b, N, t); + } + else + { + gpuCopyVector(r, p, N, t); + } + + gpuSpMV(I, J, val, nnz, N, alpha, p, Ax, t); + + if (t.rank() == 0) + { + *dot_result = 0.0; + } + + a = r1 / gpuDotProduct(p, Ax, N, dot_result, t); + + gpuSaxpy(p, x, a, N, t); + + na = -a; + + gpuSaxpy(Ax, r, na, N, t); + + r0 = r1; + + if (t.rank() == 0) + { + *dot_result = 0.0; + } + + r1 = gpuDotProduct(r, r, N, dot_result, t); + + k++; + } +} + +int main() +{ + stream_ctx ctx; +#if 0 + constexpr size_t kNumGpusRequired = 8; +#else + constexpr size_t kNumGpusRequired = 1; +#endif + int N = 0, nz = 0, *I = NULL, *J = NULL; + float* val = NULL; + const float tol = 1e-5f; + float* x; + float rhs = 1.0; + float r1; + float *r, *p, *Ax; + + // printf("Starting [%s]...\n", sSDKname); + + /* Generate a random tridiagonal symmetric matrix in CSR format */ + N = 10485760 * 2; + nz = (N - 2) * 3 + 4; + + I = (int*) malloc(sizeof(int) * (N + 1)); + J = (int*) malloc(sizeof(int) * nz); + val = (float*) malloc(sizeof(float) * nz); + float* val_cpu = (float*) malloc(sizeof(float) * nz); + + auto handle_I = ctx.logical_data(I, {(unsigned) (N + 1)}); + auto handle_J = ctx.logical_data(J, {(unsigned) nz}); + auto handle_val = ctx.logical_data(val, {(unsigned) nz}); + + ctx.host_launch(handle_I.write(), handle_J.write(), handle_val.write())->*[=](auto I, auto J, auto val) { + genTridiag(I, J, val, N, nz); + memcpy(val_cpu, val.data_handle(), sizeof(float) * nz); + }; + + double* dot_result = (double*) malloc(sizeof(double)); + dot_result[0] = 0.0; + + x = (float*) malloc(sizeof(float) * N); + r = (float*) malloc(sizeof(float) * N); + p = (float*) malloc(sizeof(float) * N); + Ax = (float*) malloc(sizeof(float) * N); + + auto handle_r = ctx.logical_data(r, {(unsigned) N}); + auto handle_p = ctx.logical_data(p, {(unsigned) N}); + auto handle_Ax = ctx.logical_data(Ax, {(unsigned) N}); + auto handle_x = ctx.logical_data(x, {(unsigned) N}); + auto handle_dot_result = ctx.logical_data(dot_result, {(unsigned) 1}); + + // std::cout << "\nRunning on GPUs = " << kNumGpusRequired << std::endl; + const int sMemSize = sizeof(double) * 
((THREADS_PER_BLOCK / 32) + 1); + + // auto all_devs = exec_place::repeat(exec_place::device(0), kNumGpusRequired); + auto all_devs = exec_place::n_devices(kNumGpusRequired); + + /* The grid size is 0 and will be computed upon launch */ + auto spec = con(con(THREADS_PER_BLOCK, mem(sMemSize))); + ctx.launch( + spec, + all_devs, + handle_I.read(), + handle_J.read(), + handle_val.read(), + handle_x.write(), + handle_Ax.write(), + handle_p.write(), + handle_r.write(), + handle_dot_result.write()) + ->*[=] + _CCCL_DEVICE(auto t, + slice I, + slice J, + slice val, + slice x, + slice Ax, + slice p, + slice r, + slice dot_result) { + multiGpuConjugateGradient(t, I, J, val, x, Ax, p, r, dot_result.data_handle(), nz, N, tol); + }; + + ctx.finalize(); + + r1 = dot_result[0]; + + printf("GPU Final, residual = %e \n ", sqrt(r1)); + +#if ENABLE_CPU_DEBUG_CODE + float* Ax_cpu = (float*) malloc(sizeof(float) * N); + float* r_cpu = (float*) malloc(sizeof(float) * N); + float* p_cpu = (float*) malloc(sizeof(float) * N); + float* x_cpu = (float*) malloc(sizeof(float) * N); + + for (int i = 0; i < N; i++) + { + r_cpu[i] = 1.0; + Ax_cpu[i] = x_cpu[i] = 0.0; + } + cpuConjugateGrad(I, J, val, x_cpu, Ax_cpu, p_cpu, r_cpu, nz, N, tol); +#endif + + float rsum, diff, err = 0.0; + + for (int i = 0; i < N; i++) + { + rsum = 0.0; + + for (int j = I[i]; j < I[i + 1]; j++) + { + rsum += val_cpu[j] * x[J[j]]; + } + + diff = fabs(rsum - rhs); + + if (diff > err) + { + err = diff; + } + } + +#if ENABLE_CPU_DEBUG_CODE + free(Ax_cpu); + free(r_cpu); + free(p_cpu); + free(x_cpu); +#endif + + printf("Test Summary: Error amount = %f \n", err); + fprintf(stdout, "&&&& conjugateGradientMultiDeviceCG %s\n", (sqrt(r1) < tol) ? "PASSED" : "FAILED"); + exit((sqrt(r1) < tol) ? EXIT_SUCCESS : EXIT_FAILURE); +} diff --git a/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarloMultiGPU.cu b/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarloMultiGPU.cu new file mode 100644 index 00000000000..3cae7cb6bde --- /dev/null +++ b/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarloMultiGPU.cu @@ -0,0 +1,488 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This sample evaluates fair call price for a + * given set of European options using Monte Carlo approach. + * See supplied whitepaper for more explanations. + */ + +#include "MonteCarlo_gold.cu" +#include "MonteCarlo_kernel.cu" + +int* pArgc = NULL; +char** pArgv = NULL; + +#ifdef WIN32 +# define strcasecmp _strcmpi +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Common functions +//////////////////////////////////////////////////////////////////////////////// +float randFloat(float low, float high) +{ + float t = (float) rand() / (float) RAND_MAX; + return (1.0f - t) * low + t * high; +} + +/// Utility function to tweak problem size for small GPUs +int adjustProblemSize(int GPU_N, int default_nOptions) +{ + int nOptions = default_nOptions; + + // select problem size + for (int i = 0; i < GPU_N; i++) + { + cudaDeviceProp deviceProp; + cuda_safe_call(cudaGetDeviceProperties(&deviceProp, i)); + // int cudaCores = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * + // deviceProp.multiProcessorCount; + int cudaCores = 80; + + if (cudaCores <= 32) + { + nOptions = (nOptions < cudaCores / 2 ? nOptions : cudaCores / 2); + } + } + + return nOptions; +} + +int adjustGridSize(int GPUIndex, int defaultGridSize) +{ + cudaDeviceProp deviceProp; + cuda_safe_call(cudaGetDeviceProperties(&deviceProp, GPUIndex)); + int maxGridSize = deviceProp.multiProcessorCount * 40; + return ((defaultGridSize > maxGridSize) ? 
maxGridSize : defaultGridSize); +} + +/////////////////////////////////////////////////////////////////////////////// +// CPU reference functions +/////////////////////////////////////////////////////////////////////////////// +extern "C" void MonteCarloCPU(TOptionValue& callValue, TOptionData optionData, float* h_Random, int pathN); + +// Black-Scholes formula for call options +extern "C" void BlackScholesCall(float& CallResult, TOptionData optionData); + +//////////////////////////////////////////////////////////////////////////////// +// GPU-driving host thread +//////////////////////////////////////////////////////////////////////////////// +// Timer +// StopWatchInterface **hTimer = NULL; + +static void* solverThread(void* solver_args) +{ + TOptionPlan* plan = (TOptionPlan*) solver_args; + + // Init GPU + cuda_safe_call(cudaSetDevice(plan->device)); + + stream_ctx ctx; + + cudaDeviceProp deviceProp; + cuda_safe_call(cudaGetDeviceProperties(&deviceProp, plan->device)); + + // // Start the timer + // sdkStartTimer(&hTimer[plan->device]); + + // Allocate intermediate memory for MC integrator and initialize + // RNG states + initMonteCarloGPU(ctx, plan); + + // Main computation + MonteCarloGPU(ctx, plan); + + cuda_safe_call(cudaDeviceSynchronize()); + + // // Stop the timer + // sdkStopTimer(&hTimer[plan->device]); + + // Shut down this GPU + closeMonteCarloGPU(ctx, plan); + + cudaStreamSynchronize(0); + + // printf("solverThread() finished - GPU Device %d: %s\n", plan->device, deviceProp.name); + + return NULL; +} + +static void multiSolver(TOptionPlan* plan, int nPlans) +{ + // Init Each GPU + // In CUDA 4.0 we can call cudaSetDevice multiple times to target each device + // Set the device desired, then perform initializations on that device + stream_ctx ctx; + + for (int i = 0; i < nPlans; i++) + { + // set the target device to perform initialization on + cuda_safe_call(cudaSetDevice(plan[i].device)); + + cudaDeviceProp deviceProp; + cuda_safe_call(cudaGetDeviceProperties(&deviceProp, plan[i].device)); + + // Allocate intermediate memory for MC integrator + // and initialize RNG state + initMonteCarloGPU(ctx, &plan[i]); + + // Main computations + MonteCarloGPU(ctx, &plan[i]); + + closeMonteCarloGPU(ctx, &plan[i]); + } + + ctx.finalize(); +} + +/////////////////////////////////////////////////////////////////////////////// +// Main program +/////////////////////////////////////////////////////////////////////////////// +#define DO_CPU +#undef DO_CPU + +#define PRINT_RESULTS +#undef PRINT_RESULTS + +void usage() +{ + printf("--method=[threaded,streamed] --scaling=[strong,weak] [--help]\n"); + printf("Method=threaded: 1 CPU thread for each GPU [default]\n"); + printf(" streamed: 1 CPU thread handles all GPUs (requires CUDA 4.0 or " + "newer)\n"); + printf("Scaling=strong : constant problem size\n"); + printf(" weak : problem size scales with number of available GPUs " + "[default]\n"); +} + +int main(int argc, char** argv) +{ + char* multiMethodChoice = NULL; + char* scalingChoice = NULL; + bool use_threads = true; + bool bqatest = false; + bool strongScaling = false; + + pArgc = &argc; + pArgv = argv; + + // printf("%s Starting...\n\n", argv[0]); + + // if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) { + // bqatest = true; + // } + // + // getCmdLineArgumentString(argc, (const char **)argv, "method", + // &multiMethodChoice); + // getCmdLineArgumentString(argc, (const char **)argv, "scaling", + // &scalingChoice); + // + // if (checkCmdLineFlag(argc, (const char **)argv, "h") 
|| + // checkCmdLineFlag(argc, (const char **)argv, "help")) { + // usage(); + // exit(EXIT_SUCCESS); + // } + // + if (multiMethodChoice == NULL) + { + use_threads = false; + } + else + { + if (!strcasecmp(multiMethodChoice, "threaded")) + { + use_threads = true; + } + else + { + use_threads = false; + } + } + + if (use_threads == false) + { + // printf("Using single CPU thread for multiple GPUs\n"); + } + + if (scalingChoice == NULL) + { + strongScaling = false; + } + else + { + if (!strcasecmp(scalingChoice, "strong")) + { + strongScaling = true; + } + else + { + strongScaling = false; + } + } + + // GPU number present in the system + int GPU_N; + cuda_safe_call(cudaGetDeviceCount(&GPU_N)); + int nOptions = 8 * 1024; + + nOptions = adjustProblemSize(GPU_N, nOptions); + + // select problem size + int scale = (strongScaling) ? 1 : GPU_N; + int OPT_N = nOptions * scale; + int PATH_N = 262144; + + // // initialize the timers + // hTimer = new StopWatchInterface *[GPU_N]; + // + // for (int i = 0; i < GPU_N; i++) { + // sdkCreateTimer(&hTimer[i]); + // sdkResetTimer(&hTimer[i]); + // } + + // Input data array + TOptionData* optionData = new TOptionData[OPT_N]; + // Final GPU MC results + TOptionValue* callValueGPU = new TOptionValue[OPT_N]; + //"Theoretical" call values by Black-Scholes formula + float* callValueBS = new float[OPT_N]; + // Solver config + TOptionPlan* optionSolver = new TOptionPlan[GPU_N]; + // OS thread ID + pthread_t* threadID = new pthread_t[GPU_N]; + + int gpuBase, gpuIndex; + int i; + + double delta, ref, sumDelta, sumRef, sumReserve; + + // printf("MonteCarloMultiGPU\n"); + // printf("==================\n"); + // printf("Parallelization method = %s\n", use_threads ? "threaded" : "streamed"); + // printf("Problem scaling = %s\n", strongScaling ? 
"strong" : "weak"); + // printf("Number of GPUs = %d\n", GPU_N); + // printf("Total number of options = %d\n", OPT_N); + // printf("Number of paths = %d\n", PATH_N); + + // printf("main(): generating input data...\n"); + srand(123); + + for (i = 0; i < OPT_N; i++) + { + optionData[i].S = randFloat(5.0f, 50.0f); + optionData[i].X = randFloat(10.0f, 25.0f); + optionData[i].T = randFloat(1.0f, 5.0f); + optionData[i].R = 0.06f; + optionData[i].V = 0.10f; + callValueGPU[i].Expected = -1.0f; + callValueGPU[i].Confidence = -1.0f; + } + + // printf("main(): starting %i host threads...\n", GPU_N); + + // Get option count for each GPU + for (i = 0; i < GPU_N; i++) + { + optionSolver[i].optionCount = OPT_N / GPU_N; + } + + // Take into account cases with "odd" option counts + for (i = 0; i < (OPT_N % GPU_N); i++) + { + optionSolver[i].optionCount++; + } + + // Assign GPU option ranges + gpuBase = 0; + + for (i = 0; i < GPU_N; i++) + { + optionSolver[i].device = i; + optionSolver[i].optionData = optionData + gpuBase; + optionSolver[i].callValue = callValueGPU + gpuBase; + optionSolver[i].pathN = PATH_N; + optionSolver[i].gridSize = adjustGridSize(optionSolver[i].device, optionSolver[i].optionCount); + gpuBase += optionSolver[i].optionCount; + } + + if (use_threads || bqatest) + { + // Start CPU thread for each GPU + for (gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++) + { + pthread_create(&threadID[gpuIndex], NULL, solverThread, &optionSolver[gpuIndex]); + } + + // printf("main(): waiting for GPU results...\n"); + for (gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++) + { + pthread_join(threadID[gpuIndex], NULL); + } + + // printf("main(): GPU statistics, threaded\n"); + + for (i = 0; i < GPU_N; i++) + { + cudaDeviceProp deviceProp; + cuda_safe_call(cudaGetDeviceProperties(&deviceProp, optionSolver[i].device)); + // printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name); + // printf("Options : %i\n", optionSolver[i].optionCount); + // printf("Simulation paths: %i\n", optionSolver[i].pathN); + // time = sdkGetTimerValue(&hTimer[i]); + // printf("Total time (ms.): %f\n", time); + // printf("Options per sec.: %f\n", OPT_N / (time * 0.001)); + } + + // printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); + sumDelta = 0; + sumRef = 0; + sumReserve = 0; + + for (i = 0; i < OPT_N; i++) + { + BlackScholesCall(callValueBS[i], optionData[i]); + delta = fabs(callValueBS[i] - callValueGPU[i].Expected); + ref = callValueBS[i]; + sumDelta += delta; + sumRef += fabs(ref); + + if (delta > 1e-6) + { + sumReserve += callValueGPU[i].Confidence / delta; + } + +#ifdef PRINT_RESULTS + printf("BS: %f; delta: %E\n", callValueBS[i], delta); +#endif + } + + sumReserve /= OPT_N; + } + + if (!use_threads || bqatest) + { + multiSolver(optionSolver, GPU_N); + + // printf("main(): GPU statistics, streamed\n"); + + for (i = 0; i < GPU_N; i++) + { + cudaDeviceProp deviceProp; + cuda_safe_call(cudaGetDeviceProperties(&deviceProp, optionSolver[i].device)); + // printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name); + // printf("Options : %i\n", optionSolver[i].optionCount); + // printf("Simulation paths: %i\n", optionSolver[i].pathN); + } + + // time = sdkGetTimerValue(&hTimer[0]); + // printf("\nTotal time (ms.): %f\n", time); + // printf("\tNote: This is elapsed time for all to compute.\n"); + // printf("Options per sec.: %f\n", OPT_N / (time * 0.001)); + + // printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); + sumDelta = 0; + sumRef = 0; + sumReserve = 0; + + for (i = 
0; i < OPT_N; i++) + { + BlackScholesCall(callValueBS[i], optionData[i]); + delta = fabs(callValueBS[i] - callValueGPU[i].Expected); + ref = callValueBS[i]; + sumDelta += delta; + sumRef += fabs(ref); + + if (delta > 1e-6) + { + sumReserve += callValueGPU[i].Confidence / delta; + } + +#ifdef PRINT_RESULTS + printf("BS: %f; delta: %E\n", callValueBS[i], delta); +#endif + } + + sumReserve /= OPT_N; + } + +#ifdef DO_CPU + // printf("main(): running CPU MonteCarlo...\n"); + TOptionValue callValueCPU; + sumDelta = 0; + sumRef = 0; + + for (i = 0; i < OPT_N; i++) + { + MonteCarloCPU(callValueCPU, optionData[i], NULL, PATH_N); + delta = fabs(callValueCPU.Expected - callValueGPU[i].Expected); + ref = callValueCPU.Expected; + sumDelta += delta; + sumRef += fabs(ref); + // printf("Exp : %f | %f\t", callValueCPU.Expected, callValueGPU[i].Expected); + // printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence); + } + + // printf("L1 norm: %E\n", sumDelta / sumRef); +#endif + + // printf("Shutting down...\n"); + + for (int i = 0; i < GPU_N; i++) + { + // sdkStartTimer(&hTimer[i]); + cuda_safe_call(cudaSetDevice(i)); + } + + delete[] optionSolver; + delete[] callValueBS; + delete[] callValueGPU; + delete[] optionData; + delete[] threadID; + // delete[] hTimer; + + // printf("Test Summary...\n"); + // printf("L1 norm : %E\n", sumDelta / sumRef); + // printf("Average reserve: %f\n", sumReserve); + // printf("\nNOTE: The CUDA Samples are not meant for performance measurements. " + // "Results may vary when GPU Boost is enabled.\n\n"); + if (sumReserve <= 1.0f) + { + printf("Test failed!\n"); + return EXIT_FAILURE; + } +} diff --git a/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarlo_common.cuh b/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarlo_common.cuh new file mode 100644 index 00000000000..f24399b5353 --- /dev/null +++ b/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarlo_common.cuh @@ -0,0 +1,127 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef MONTECARLO_COMMON_H +#define MONTECARLO_COMMON_H +#include + +#include "curand_kernel.h" +#include "realtype.cuh" + +using namespace cuda::experimental::stf; + +//////////////////////////////////////////////////////////////////////////////// +// Global types +//////////////////////////////////////////////////////////////////////////////// +typedef struct +{ + float S; + float X; + float T; + float R; + float V; +} TOptionData; + +typedef struct +// #ifdef __CUDACC__ +//__align__(8) +// #endif +{ + float Expected; + float Confidence; +} TOptionValue; + +// Preprocessed input option data +typedef struct +{ + real S; + real X; + real MuByT; + real VBySqrtT; +} __TOptionData; + +// GPU outputs before CPU postprocessing +typedef struct +{ + real Expected; + real Confidence; +} __TOptionValue; + +typedef struct +{ + // Device ID for multi-GPU version + int device; + // Option count for this plan + int optionCount; + + // Host-side data source and result destination + TOptionData* optionData; + TOptionValue* callValue; + logical_data> preproc_optionData_handle; + logical_data> callValue_handle; + + // Temporary Host-side pinned memory for async + faster data transfers + __TOptionValue* h_CallValue; + + // Device- and host-side option data + // void* d_OptionData; + void* h_OptionData; + + // Device-side option values + // void* d_CallValue; + + // Intermediate device-side buffers + // void* d_Buffer; + + // random number generator states + curandState* rngStates; + logical_data> rngStates_handle; + + // Pseudorandom samples count + int pathN; + + // Time stamp + float time; + + int gridSize; +} TOptionPlan; + +// extern "C" void initMonteCarloGPU(TOptionPlan *plan); +// extern "C" void MonteCarloGPU(TOptionPlan *plan, cudaStream_t stream = 0); +// extern "C" void closeMonteCarloGPU(TOptionPlan *plan); + +#endif diff --git a/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarlo_gold.cu b/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarlo_gold.cu new file mode 100644 index 00000000000..eab2d497014 --- /dev/null +++ b/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarlo_gold.cu @@ -0,0 +1,160 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "MonteCarlo_common.cuh" + +//////////////////////////////////////////////////////////////////////////////// +// Black-Scholes formula for Monte Carlo results validation +//////////////////////////////////////////////////////////////////////////////// +#define A1 0.31938153 +#define A2 -0.356563782 +#define A3 1.781477937 +#define A4 -1.821255978 +#define A5 1.330274429 +#define RSQRT2PI 0.39894228040143267793994605993438 + +// Polynomial approximation of +// cumulative normal distribution function +double CND(double d) +{ + double K = 1.0 / (1.0 + 0.2316419 * fabs(d)); + + double cnd = RSQRT2PI * exp(-0.5 * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); + + if (d > 0) + { + cnd = 1.0 - cnd; + } + + return cnd; +} + +// Black-Scholes formula for call value +extern "C" void BlackScholesCall(float& callValue, TOptionData optionData) +{ + double S = optionData.S; + double X = optionData.X; + double T = optionData.T; + double R = optionData.R; + double V = optionData.V; + + double sqrtT = sqrt(T); + double d1 = (log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT); + double d2 = d1 - V * sqrtT; + double CNDD1 = CND(d1); + double CNDD2 = CND(d2); + double expRT = exp(-R * T); + + callValue = (float) (S * CNDD1 - X * expRT * CNDD2); +} + +//////////////////////////////////////////////////////////////////////////////// +// CPU Monte Carlo +//////////////////////////////////////////////////////////////////////////////// +static double CPU_endCallValue(double S, double X, double r, double MuByT, double VBySqrtT) +{ + double callValue = S * exp(MuByT + VBySqrtT * r) - X; + return (callValue > 0) ? 
callValue : 0; +} + +#define CURAND_SAFE_CALL(call) \ + do \ + { \ + const curandStatus_t err = (call); \ + if (CURAND_STATUS_SUCCESS != err) \ + { \ + int dev = -1; \ + cudaGetDevice(&dev); \ + fprintf(stderr, "%s:%u [device %d] CURAND error in call %s.\n", __FILE__, __LINE__, dev, #call); \ + abort(); \ + } \ + } while (0) + +extern "C" void MonteCarloCPU(TOptionValue& callValue, TOptionData optionData, float* h_Samples, int pathN) +{ + const double S = optionData.S; + const double X = optionData.X; + const double T = optionData.T; + const double R = optionData.R; + const double V = optionData.V; + const double MuByT = (R - 0.5 * V * V) * T; + const double VBySqrtT = V * sqrt(T); + + float* samples; + curandGenerator_t gen; + + CURAND_SAFE_CALL(curandCreateGeneratorHost(&gen, CURAND_RNG_PSEUDO_DEFAULT)); + unsigned long long seed = 1234ULL; + CURAND_SAFE_CALL(curandSetPseudoRandomGeneratorSeed(gen, seed)); + + if (h_Samples != NULL) + { + samples = h_Samples; + } + else + { + samples = (float*) malloc(pathN * sizeof(float)); + CURAND_SAFE_CALL(curandGenerateNormal(gen, samples, pathN, 0.0, 1.0)); + } + + // for(int i=0; i<10; i++) printf("CPU sample = %f\n", samples[i]); + + double sum = 0, sum2 = 0; + + for (int pos = 0; pos < pathN; pos++) + { + double sample = samples[pos]; + double callValue = CPU_endCallValue(S, X, sample, MuByT, VBySqrtT); + sum += callValue; + sum2 += callValue * callValue; + } + + if (h_Samples == NULL) + { + free(samples); + } + + CURAND_SAFE_CALL(curandDestroyGenerator(gen)); + + // Derive average from the total sum and discount by riskfree rate + callValue.Expected = (float) (exp(-R * T) * sum / (double) pathN); + // Standard deviation + double stdDev = sqrt(((double) pathN * sum2 - sum * sum) / ((double) pathN * (double) (pathN - 1))); + // Confidence width; in 95% of all cases theoretical value lies within these + // borders + callValue.Confidence = (float) (exp(-R * T) * 1.96 * stdDev / sqrt((double) pathN)); +} diff --git a/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarlo_kernel.cu b/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarlo_kernel.cu new file mode 100644 index 00000000000..c6fea7e9fcf --- /dev/null +++ b/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarlo_kernel.cu @@ -0,0 +1,264 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +//////////////////////////////////////////////////////////////////////////////// +// Global types +//////////////////////////////////////////////////////////////////////////////// +#include "MonteCarlo_reduction.cuh" + +// This will output the proper error string when calling cudaGetLastError +#define getLastCudaError(msg) __getLastCudaError((msg), __FILE__, __LINE__) + +namespace cg = cooperative_groups; + +inline void __getLastCudaError(const char* errorMessage, const char* file, const int line) +{ + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) + { + fprintf(stderr, + "%s(%i) : getLastCudaError() CUDA error:" + " %s : (%d) %s.\n", + file, + line, + errorMessage, + static_cast(err), + cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Internal GPU-side data structures +//////////////////////////////////////////////////////////////////////////////// +#define MAX_OPTIONS (1024 * 1024) + +//////////////////////////////////////////////////////////////////////////////// +// Overloaded shortcut payoff functions for different precision modes +//////////////////////////////////////////////////////////////////////////////// +__device__ inline float endCallValue(float S, float X, float r, float MuByT, float VBySqrtT) +{ + float callValue = S * __expf(MuByT + VBySqrtT * r) - X; + return (callValue > 0.0F) ? callValue : 0.0F; +} + +__device__ inline double endCallValue(double S, double X, double r, double MuByT, double VBySqrtT) +{ + double callValue = S * exp(MuByT + VBySqrtT * r) - X; + return (callValue > 0.0) ? callValue : 0.0; +} + +#define THREAD_N 256 + +//////////////////////////////////////////////////////////////////////////////// +// This kernel computes the integral over all paths using a single thread block +// per option. It is fastest when the number of thread blocks times the work per +// block is high enough to keep the GPU busy. 
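+// Each thread walks a strided share of the pathN samples, accumulating the
+// payoff sum and the sum of squared payoffs into s_SumCall / s_Sum2Call;
+// sumReduce() then collapses these per-thread partials into a single
+// (sum, sum-of-squares) pair per option, from which the host later derives
+// the expected value and the 95% confidence interval.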
+//////////////////////////////////////////////////////////////////////////////// +static __global__ void MonteCarloOneBlockPerOption( + curandState* __restrict rngStates, + const __TOptionData* __restrict d_OptionData, + __TOptionValue* __restrict d_CallValue, + int pathN, + int optionN) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + const int SUM_N = THREAD_N; + __shared__ real s_SumCall[SUM_N]; + __shared__ real s_Sum2Call[SUM_N]; + + // determine global thread id + int tid = threadIdx.x + blockIdx.x * blockDim.x; + + // Copy random number state to local memory for efficiency + curandState localState = rngStates[tid]; + for (int optionIndex = blockIdx.x; optionIndex < optionN; optionIndex += gridDim.x) + { + const real S = d_OptionData[optionIndex].S; + const real X = d_OptionData[optionIndex].X; + const real MuByT = d_OptionData[optionIndex].MuByT; + const real VBySqrtT = d_OptionData[optionIndex].VBySqrtT; + + // Cycle through the entire samples array: + // derive end stock price for each path + // accumulate partial integrals into intermediate shared memory buffer + for (int iSum = threadIdx.x; iSum < SUM_N; iSum += blockDim.x) + { + __TOptionValue sumCall = {0, 0}; + +#pragma unroll 8 + for (int i = iSum; i < pathN; i += SUM_N) + { + real r = curand_normal(&localState); + real callValue = endCallValue(S, X, r, MuByT, VBySqrtT); + sumCall.Expected += callValue; + sumCall.Confidence += callValue * callValue; + } + + s_SumCall[iSum] = sumCall.Expected; + s_Sum2Call[iSum] = sumCall.Confidence; + } + + // Reduce shared memory accumulators + // and write final result to global memory + cg::sync(cta); + sumReduce(s_SumCall, s_Sum2Call, cta, tile32, &d_CallValue[optionIndex]); + } +} + +static __global__ void rngSetupStates(curandState* rngState, int device_id) +{ + // determine global thread id + int tid = threadIdx.x + blockIdx.x * blockDim.x; + // Each threadblock gets different seed, + // Threads within a threadblock get different sequence numbers + curand_init(blockIdx.x + gridDim.x * device_id, threadIdx.x, 0, &rngState[tid]); +} + +//////////////////////////////////////////////////////////////////////////////// +// Host-side interface to GPU Monte Carlo +//////////////////////////////////////////////////////////////////////////////// + +template +void initMonteCarloGPU(Ctx& ctx, TOptionPlan* plan) +{ + plan->h_OptionData = new __TOptionData[plan->optionCount]; + plan->h_CallValue = new __TOptionValue[plan->optionCount]; + cuda_safe_call( + cudaHostRegister(plan->h_OptionData, plan->optionCount * sizeof(__TOptionData), cudaHostRegisterPortable)); + cuda_safe_call( + cudaHostRegister(plan->h_CallValue, plan->optionCount * sizeof(__TOptionValue), cudaHostRegisterPortable)); + + // Register this vector + plan->preproc_optionData_handle = ctx.logical_data((__TOptionData*) plan->h_OptionData, plan->optionCount); + plan->callValue_handle = ctx.logical_data((__TOptionValue*) plan->h_CallValue, plan->optionCount); + plan->rngStates_handle = ctx.logical_data(shape_of>(plan->gridSize * THREAD_N)); + + plan->preproc_optionData_handle.set_symbol("preproc_optionData"); + plan->callValue_handle.set_symbol("callValue"); + plan->rngStates_handle.set_symbol("rngStates"); + + cuda_safe_call(cudaSetDevice(plan->device)); + + // Allocate states for pseudo random number generators + auto t = ctx.task(plan->rngStates_handle.write()); + t.set_symbol("rngSetupStates"); + t->*[&](cudaStream_t stream, 
auto rngStates) { + // fprintf(stderr, "MEMSET on %zu\n", plan->gridSize * THREAD_N * sizeof(curandState)); + cuda_safe_call( + cudaMemsetAsync(rngStates.data_handle(), 0, plan->gridSize * THREAD_N * sizeof(curandState), stream)); + getLastCudaError("cudaMemsetAsync failed.\n"); + + // place each device pathN random numbers apart on the random number sequence + // fprintf(stderr, "rngSetupStates => rngStates %p\n", rngStates.data_handle()); + rngSetupStates<<gridSize, THREAD_N, 0, stream>>>(rngStates.data_handle(), plan->device); + getLastCudaError("rngSetupStates kernel failed.\n"); + }; +} + +// Compute statistics and deallocate internal device memory +template +void closeMonteCarloGPU(Ctx& ctx, TOptionPlan* plan) +{ + auto t = ctx.task(exec_place::host, plan->callValue_handle.rw()); + t.set_symbol("compute_stats"); + t->*[&](cudaStream_t stream, auto h_CallValue) { + cuda_safe_call(cudaStreamSynchronize(stream)); + for (int i = 0; i < plan->optionCount; i++) + { + const double RT = plan->optionData[i].R * plan->optionData[i].T; + const double sum = h_CallValue.data_handle()[i].Expected; + const double sum2 = h_CallValue.data_handle()[i].Confidence; + const double pathN = plan->pathN; + // Derive average from the total sum and discount by riskfree rate + plan->callValue[i].Expected = (float) (exp(-RT) * sum / pathN); + // Standard deviation + double stdDev = sqrt((pathN * sum2 - sum * sum) / (pathN * (pathN - 1))); + // Confidence width; in 95% of all cases theoretical value lies within these + // borders + plan->callValue[i].Confidence = (float) (exp(-RT) * 1.96 * stdDev / sqrt(pathN)); + } + }; +} + +// Main computations +template +void MonteCarloGPU(Ctx& ctx, TOptionPlan* plan, cudaStream_t /*unused*/ = 0) +{ + if (plan->optionCount <= 0 || plan->optionCount > MAX_OPTIONS) + { + printf("MonteCarloGPU(): bad option count.\n"); + return; + } + + // Preprocess computations on the host + auto t_host = ctx.task(exec_place::host, plan->preproc_optionData_handle.rw()); + t_host.set_symbol("preprocess"); + t_host->*[&](cudaStream_t stream, auto h_preproc_OptionData) { + cuda_safe_call(cudaStreamSynchronize(stream)); + + for (int i = 0; i < plan->optionCount; i++) + { + const double T = plan->optionData[i].T; + const double R = plan->optionData[i].R; + const double V = plan->optionData[i].V; + const double MuByT = (R - 0.5 * V * V) * T; + const double VBySqrtT = V * sqrt(T); + h_preproc_OptionData.data_handle()[i].S = (real) plan->optionData[i].S; + h_preproc_OptionData.data_handle()[i].X = (real) plan->optionData[i].X; + h_preproc_OptionData.data_handle()[i].MuByT = (real) MuByT; + h_preproc_OptionData.data_handle()[i].VBySqrtT = (real) VBySqrtT; + } + }; + + auto t = + ctx.task(plan->preproc_optionData_handle.read(), plan->callValue_handle.write(), plan->rngStates_handle.rw()); + t.set_symbol("MonteCarloOneBlockPerOption"); + t->*[&](cudaStream_t stream, auto preproc_optionData, auto callValue_handle, auto rngStates) { + MonteCarloOneBlockPerOption<<gridSize, THREAD_N, 0, stream>>>( + rngStates.data_handle(), + preproc_optionData.data_handle(), + callValue_handle.data_handle(), + plan->pathN, + plan->optionCount); + getLastCudaError("MonteCarloOneBlockPerOption() execution failed\n"); + }; +} diff --git a/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarlo_reduction.cuh b/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarlo_reduction.cuh new file mode 100644 index 00000000000..f83d2492375 --- /dev/null +++ 
b/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/MonteCarlo_reduction.cuh @@ -0,0 +1,84 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef MONTECARLO_REDUCTION_CUH +#define MONTECARLO_REDUCTION_CUH + +#include "MonteCarlo_common.cuh" +#include + +namespace cg = cooperative_groups; + +//////////////////////////////////////////////////////////////////////////////// +// This function calculates total sum for each of the two input arrays. +// SUM_N must be power of two +// Unrolling provides a bit of a performance improvement for small +// to medium path counts. 
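+// Each warp first tree-reduces its 32-element slice of the shared buffers in
+// place; thread 0 of the block then adds up one surviving element per
+// 32-element chunk and stores the resulting (sum, sum of squares) pair in
+// *d_CallValue. It is invoked from MonteCarloOneBlockPerOption() as
+// sumReduce(s_SumCall, s_Sum2Call, cta, tile32, &d_CallValue[optionIndex]).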
+//////////////////////////////////////////////////////////////////////////////// + +template +__device__ void +sumReduce(T* sum, T* sum2, cg::thread_block& cta, cg::thread_block_tile<32>& tile32, __TOptionValue* d_CallValue) +{ + const int VEC = 32; + const int tid = cta.thread_rank(); + + T beta = sum[tid]; + T beta2 = sum2[tid]; + T temp, temp2; + + for (int i = VEC / 2; i > 0; i >>= 1) + { + if (tile32.thread_rank() < i) + { + temp = sum[tid + i]; + temp2 = sum2[tid + i]; + beta += temp; + beta2 += temp2; + sum[tid] = beta; + sum2[tid] = beta2; + } + cg::sync(tile32); + } + cg::sync(cta); + + if (tid == 0) + { + beta = 0; + beta2 = 0; + for (int i = 0; i < blockDim.x; i += VEC) + { + beta += sum[i]; + beta2 += sum2[i]; + } + __TOptionValue t = {beta, beta2}; + *d_CallValue = t; + } + cg::sync(cta); +} + +#endif diff --git a/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/README.md b/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/README.md new file mode 100644 index 00000000000..7a0f77cb6bb --- /dev/null +++ b/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/README.md @@ -0,0 +1,75 @@ +# MonteCarloMultiGPU - Monte Carlo Option Pricing with Multi-GPU support + +## Description + +This sample evaluates fair call price for a given set of European options using the Monte Carlo approach, taking advantage of all CUDA-capable GPUs installed in the system. This sample use double precision hardware if a GTX 200 class GPU is present. The sample also takes advantage of CUDA 4.0 capability to supporting using a single CPU thread to control multiple GPUs + +## Key Concepts + +Random Number Generator, Computational Finance, CURAND Library + +## Supported SM Architectures + +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows + +## Supported CPU Architecture + +x86_64, ppc64le, armv7l + +## CUDA APIs involved + +### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) +cudaMemset, cudaFree, cudaStreamDestroy, cudaEventRecord, cudaMallocHost, cudaStreamCreate, cudaEventCreate, cudaGetDeviceCount, cudaDeviceSynchronize, cudaEventSynchronize, cudaFreeHost, cudaMalloc, cudaEventDestroy, cudaSetDevice, cudaMemcpyAsync, cudaStreamSynchronize, cudaGetDeviceProperties + +## Dependencies needed to build/run +[CURAND](../../../README.md#curand) + +## Prerequisites + +Download and install the [CUDA Toolkit 11.6](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Make sure the dependencies mentioned in [Dependencies]() section above are installed. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. 
Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
+```
+*_vs<version>.sln - for Visual Studio <version>
+```
+Each individual sample has its own set of solution files in its directory:
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples makefiles can take advantage of certain options:
+* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
+  By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+  ```
+  $ make TARGET_ARCH=x86_64
+  $ make TARGET_ARCH=ppc64le
+  $ make TARGET_ARCH=armv7l
+  ```
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +## References (for more details) + +[whitepaper](./doc/MonteCarlo.pdf) diff --git a/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/realtype.cuh b/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/realtype.cuh new file mode 100644 index 00000000000..284bdd038d5 --- /dev/null +++ b/cudax/test/stf/cuda-samples/5_Domain_Specific/MonteCarloMultiGPU_cudastf/realtype.cuh @@ -0,0 +1,49 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef REALTYPE_H +#define REALTYPE_H + +// #define DOUBLE_PRECISION + +#ifndef DOUBLE_PRECISION +typedef float real; +#else +typedef double real; +#endif + +#endif diff --git a/cudax/test/stf/dot/basic.cu b/cudax/test/stf/dot/basic.cu new file mode 100644 index 00000000000..dce79545a71 --- /dev/null +++ b/cudax/test/stf/dot/basic.cu @@ -0,0 +1,48 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief This test makes sure we can generate a dot file + */ + +#include + +using namespace cuda::experimental::stf; + +int main() +{ +// TODO (miscco): Make it work for windows +#if !defined(_CCCL_COMPILER_MSVC) + // Generate a random filename + int r = rand(); + + char filename[64]; + snprintf(filename, 64, "output_%d.dot", r); + // fprintf(stderr, "filename %s\n", filename); + setenv("CUDASTF_DOT_FILE", filename, 1); + + stream_ctx ctx; + + auto lA = ctx.logical_data(shape_of>(64)); + ctx.task(lA.write())->*[](cudaStream_t, auto) {}; + ctx.task(lA.rw())->*[](cudaStream_t, auto) {}; + ctx.finalize(); + + // Call this explicitely for the purpose of the test + reserved::dot::instance().finish(); + + // Make sure the file exists, and erase it + // fprintf(stderr, "ERASE. ...\n"); + EXPECT(access(filename, F_OK) != -1); + + EXPECT(unlink(filename) == 0); +#endif // !_CCCL_COMPILER_MSVC +} diff --git a/cudax/test/stf/dot/graph_print_to_dot.cu b/cudax/test/stf/dot/graph_print_to_dot.cu new file mode 100644 index 00000000000..62e5abfd193 --- /dev/null +++ b/cudax/test/stf/dot/graph_print_to_dot.cu @@ -0,0 +1,50 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief This test makes sure we can generate a dot file + */ + +#include + +using namespace cuda::experimental::stf; + +__global__ void dummy() {} + +int main() +{ +// TODO (miscco): Make it work for windows +#if !defined(_CCCL_COMPILER_MSVC) + // Generate a random filename + int r = rand(); + + char filename[64]; + snprintf(filename, 64, "output_%d.dot", r); + // fprintf(stderr, "filename %s\n", filename); + + graph_ctx ctx; + + auto lA = ctx.logical_data(shape_of>(64)); + ctx.task(lA.write())->*[](cudaStream_t s, auto) { + dummy<<<1, 1, 0, s>>>(); + }; + ctx.task(lA.rw())->*[](cudaStream_t s, auto) { + dummy<<<1, 1, 0, s>>>(); + }; + ctx.print_to_dot(filename, cudaGraphDebugDotFlagsVerbose); + ctx.finalize(); + + // Make sure the file exists, and erase it + EXPECT(access(filename, F_OK) != -1); + + EXPECT(unlink(filename) == 0); +#endif // !_CCCL_COMPILER_MSVC +} diff --git a/cudax/test/stf/dot/with_events.cu b/cudax/test/stf/dot/with_events.cu new file mode 100644 index 00000000000..a03425024b8 --- /dev/null +++ b/cudax/test/stf/dot/with_events.cu @@ -0,0 +1,49 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief This test makes sure we can generate a dot file with events + */ + +#include + +using namespace cuda::experimental::stf; + +int main() +{ +// TODO (miscco): Make it work for windows +#if !defined(_CCCL_COMPILER_MSVC) + // Generate a random filename + int r = rand(); + + char filename[64]; + snprintf(filename, 64, "output_%d.dot", r); + // fprintf(stderr, "filename %s\n", filename); + setenv("CUDASTF_DOT_FILE", filename, 1); + setenv("CUDASTF_DOT_IGNORE_PREREQS", "0", 1); + + stream_ctx ctx; + + auto lA = ctx.logical_data(shape_of>(64)); + ctx.task(lA.write())->*[](cudaStream_t, auto) {}; + ctx.task(lA.rw())->*[](cudaStream_t, auto) {}; + ctx.finalize(); + + // Call this explicitely for the purpose of the test + reserved::dot::instance().finish(); + + // Make sure the file exists, and erase it + // fprintf(stderr, "ERASE. ...\n"); + EXPECT(access(filename, F_OK) != -1); + + EXPECT(unlink(filename) == 0); +#endif // !_CCCL_COMPILER_MSVC +} diff --git a/cudax/test/stf/error_checks/ctx_mismatch.cu b/cudax/test/stf/error_checks/ctx_mismatch.cu new file mode 100644 index 00000000000..cafa6873dec --- /dev/null +++ b/cudax/test/stf/error_checks/ctx_mismatch.cu @@ -0,0 +1,92 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Ensure an error is detected when a task uses a logical data from a + * different context + */ + +#include + +#include + +using namespace cuda::experimental::stf; + +bool should_abort = false; + +void cleanupRoutine(int /*unused*/) +{ + if (should_abort) + { + exit(EXIT_SUCCESS); + } + else + { + fprintf(stderr, "Unexpected SIGABRT !\n"); + exit(EXIT_FAILURE); + } +} + +template +void run(double (&X)[n]) +{ + Ctx ctx1; + auto lX = ctx1.logical_data(X); + + // We are now using lX in the wrong context + should_abort = true; + + Ctx ctx2; + ctx2.task(lX.rw())->*[&](cudaStream_t /*unused*/, auto /*unused*/) {}; + + assert(0 && "This should not be reached"); +} + +int main() +{ + /* Setup an handler to catch the SIGABRT signal during the programming error */ +#if defined(_CCCL_COMPILER_MSVC) + signal(SIGABRT, &cleanupRoutine); +#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC + struct sigaction sigabrt_action + {}; + memset(&sigabrt_action, 0, sizeof(sigabrt_action)); + sigabrt_action.sa_handler = &cleanupRoutine; + + if (sigaction(SIGABRT, &sigabrt_action, nullptr) != 0) + { + perror("sigaction SIGABRT"); + exit(EXIT_FAILURE); + } +#endif // !_CCCL_COMPILER_MSVC + + const int n = 12; + double X[n]; + + for (int ind = 0; ind < n; ind++) + { + X[ind] = 1.0 * ind; + } + + // We can't run both stream and graph tests because either will abort the program. So choose one at random. + srand(static_cast(time(nullptr))); + if (rand() % 2 == 0) + { + run(X); + } + else + { + run(X); + } + + assert(0 && "This should not be reached"); + return EXIT_FAILURE; +} diff --git a/cudax/test/stf/error_checks/data_interface_mismatch.cu b/cudax/test/stf/error_checks/data_interface_mismatch.cu new file mode 100644 index 00000000000..79969f390ba --- /dev/null +++ b/cudax/test/stf/error_checks/data_interface_mismatch.cu @@ -0,0 +1,102 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Ensure an error is detected dynamically if we access a data instance + * with the wrong interface type + */ + +#include +#include + +#include + +using namespace cuda::experimental::stf; + +bool should_abort = false; + +void cleanupRoutine(int /*unused*/) +{ + if (should_abort) + { + exit(EXIT_SUCCESS); + } + else + { + fprintf(stderr, "Unexpected SIGABRT !\n"); + exit(EXIT_FAILURE); + } +} + +template +void run(double (&X)[n]) +{ + Ctx ctx; + // This creates an untyped logical data that is implicitly a vector of size + // n. Had the code used `auto` instead of `logical_data_untyped`, errors + // would have been rejected statically. We want to disable static checking + // for the purposes of this test. + logical_data_untyped handle_X = ctx.logical_data(X); + + // Here we create a dynamically-typed task, again to go around static typechecking. + auto t = ctx.task(); + t.add_deps(handle_X.rw()); + + t->*[&](auto&) { + should_abort = true; + // We have a programming error here with a vector of `double` accessad as a vector of `float`. 
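+    // handle_X was created from an array of double, so the data instance stored
+    // in the context uses the slice-of-double interface; requesting it with a
+    // slice-of-float type should fail the dynamic type check and raise SIGABRT,
+    // which the handler installed above turns into a successful test exit.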
+ handle_X.instance>(t); + should_abort = false; + }; + + assert(0 && "This should not be reached"); +} + +int main() +{ + /* Setup an handler to catch the SIGABRT signal during the programming error */ +#if defined(_CCCL_COMPILER_MSVC) + signal(SIGABRT, &cleanupRoutine); +#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC + struct sigaction sigabrt_action + {}; + memset(&sigabrt_action, 0, sizeof(sigabrt_action)); + sigabrt_action.sa_handler = &cleanupRoutine; + + if (sigaction(SIGABRT, &sigabrt_action, nullptr) != 0) + { + perror("sigaction SIGABRT"); + exit(EXIT_FAILURE); + } +#endif // !_CCCL_COMPILER_MSVC + + const int n = 12; + double X[n]; + + for (int ind = 0; ind < n; ind++) + { + X[ind] = 1.0 * ind; + } + + // We can't run both stream and graph tests because either will abort the program. So choose one at random. + srand(static_cast(time(nullptr))); + if (rand() % 2 == 0) + { + run(X); + } + else + { + run(X); + } + + assert(0 && "This should not be reached"); + return EXIT_FAILURE; +} diff --git a/cudax/test/stf/error_checks/double_finalize.cu b/cudax/test/stf/error_checks/double_finalize.cu new file mode 100644 index 00000000000..6de61c0c2b3 --- /dev/null +++ b/cudax/test/stf/error_checks/double_finalize.cu @@ -0,0 +1,81 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Ensure an error is detected if we can finalize more than once + */ + +#include + +#include + +using namespace cuda::experimental::stf; + +bool should_abort = false; + +void cleanupRoutine(int /*unused*/) +{ + if (should_abort) + { + exit(EXIT_SUCCESS); + } + else + { + fprintf(stderr, "Unexpected SIGABRT !\n"); + exit(EXIT_FAILURE); + } +} + +int main() +{ + // This test only works when assert() is enabled in +#ifndef NDEBUG + /* Setup an handler to catch the SIGABRT signal during the programming error */ +# if defined(_CCCL_COMPILER_MSVC) + signal(SIGABRT, &cleanupRoutine); +# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC + struct sigaction sigabrt_action + {}; + memset(&sigabrt_action, 0, sizeof(sigabrt_action)); + sigabrt_action.sa_handler = &cleanupRoutine; + + if (sigaction(SIGABRT, &sigabrt_action, nullptr) != 0) + { + perror("sigaction SIGABRT"); + exit(EXIT_FAILURE); + } +# endif // !_CCCL_COMPILER_MSVC + + context ctx; + + const int n = 12; + double X[n]; + + for (int ind = 0; ind < n; ind++) + { + X[ind] = 1.0 * ind; + } + + // This creates a handle that is implicitly a vector of size n + auto lX = ctx.logical_data(X); + + ctx.task(lX.rw())->*[](cudaStream_t, auto) { /* no-op */ }; + + ctx.finalize(); + + should_abort = true; + // We cannot call sync twice + ctx.finalize(); + + assert(0 && "This should not be reached"); + return EXIT_FAILURE; +#endif +} diff --git a/cudax/test/stf/error_checks/erase_frozen.cu b/cudax/test/stf/error_checks/erase_frozen.cu new file mode 100644 index 00000000000..3e99c360aa2 --- /dev/null +++ b/cudax/test/stf/error_checks/erase_frozen.cu @@ -0,0 +1,78 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, 
+// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Ensure temporary data are destroyed + * + */ + +#include +#include + +#include + +using namespace cuda::experimental::stf; + +bool should_abort = false; + +void cleanupRoutine(int /*unused*/) +{ + if (should_abort) + { + exit(EXIT_SUCCESS); + } + else + { + fprintf(stderr, "Unexpected SIGABRT !\n"); + exit(EXIT_FAILURE); + } +} + +int main() +{ + /* Setup an handler to catch the SIGABRT signal during the programming error */ +#if defined(_CCCL_COMPILER_MSVC) + signal(SIGABRT, &cleanupRoutine); +#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC + struct sigaction sigabrt_action + {}; + memset(&sigabrt_action, 0, sizeof(sigabrt_action)); + sigabrt_action.sa_handler = &cleanupRoutine; + + if (sigaction(SIGABRT, &sigabrt_action, nullptr) != 0) + { + perror("sigaction SIGABRT"); + exit(EXIT_FAILURE); + } +#endif // !_CCCL_COMPILER_MSVC + + stream_ctx ctx; + const int N = 16; + int X[N]; + + for (int i = 0; i < N; i++) + { + X[i] = i; + } + + auto lX = ctx.logical_data(X); + + lX.freeze(access_mode::rw, data_place::current_device()); + + // This should cause an error because lX is frozen while the context is finalized + should_abort = true; + + ctx.finalize(); + + assert(0 && "This should not be reached"); + return EXIT_FAILURE; +} diff --git a/cudax/test/stf/error_checks/misformed_tasks_dbl_end.cu b/cudax/test/stf/error_checks/misformed_tasks_dbl_end.cu new file mode 100644 index 00000000000..b91a8d0aabb --- /dev/null +++ b/cudax/test/stf/error_checks/misformed_tasks_dbl_end.cu @@ -0,0 +1,79 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Ensure that an error is detected if we end a task twice + */ + +#include + +#include + +using namespace cuda::experimental::stf; + +bool should_abort = false; + +void cleanupRoutine(int /*unused*/) +{ + if (should_abort) + { + exit(EXIT_SUCCESS); + } + else + { + fprintf(stderr, "Unexpected SIGABRT !\n"); + exit(EXIT_FAILURE); + } +} + +int main() +{ + // This test only works when assert() is enabled in +#ifndef NDEBUG + /* Setup an handler to catch the SIGABRT signal during the programming error */ +# if defined(_CCCL_COMPILER_MSVC) + signal(SIGABRT, &cleanupRoutine); +# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC + struct sigaction sigabrt_action + {}; + memset(&sigabrt_action, 0, sizeof(sigabrt_action)); + sigabrt_action.sa_handler = &cleanupRoutine; + + if (sigaction(SIGABRT, &sigabrt_action, nullptr) != 0) + { + perror("sigaction SIGABRT"); + exit(EXIT_FAILURE); + } +# endif // !_CCCL_COMPILER_MSVC + + stream_ctx ctx; + + const int n = 12; + double X[n]; + + for (int ind = 0; ind < n; ind++) + { + X[ind] = 1.0 * ind; + } + + // This creates a handle that is implicitly a vector of size n + auto lX = ctx.logical_data(X); + + auto t = ctx.task(lX.rw()); + t.start(); + t.end(); + should_abort = true; + t.end(); + + assert(0 && "This should not be reached"); + return EXIT_FAILURE; +#endif +} diff --git a/cudax/test/stf/error_checks/misformed_tasks_dbl_start.cu b/cudax/test/stf/error_checks/misformed_tasks_dbl_start.cu new file mode 100644 index 00000000000..3f783773b0e --- /dev/null +++ b/cudax/test/stf/error_checks/misformed_tasks_dbl_start.cu @@ -0,0 +1,76 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Ensure that an error is detected if we start a task twice + */ + +#include + +#include + +using namespace cuda::experimental::stf; + +bool should_abort = false; + +void cleanupRoutine(int /*unused*/) +{ + if (should_abort) + { + exit(EXIT_SUCCESS); + } + else + { + fprintf(stderr, "Unexpected SIGABRT !\n"); + exit(EXIT_FAILURE); + } +} + +int main() +{ + /* Setup an handler to catch the SIGABRT signal during the programming error */ +#if defined(_CCCL_COMPILER_MSVC) + signal(SIGABRT, &cleanupRoutine); +#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC + struct sigaction sigabrt_action + {}; + memset(&sigabrt_action, 0, sizeof(sigabrt_action)); + sigabrt_action.sa_handler = &cleanupRoutine; + + if (sigaction(SIGABRT, &sigabrt_action, nullptr) != 0) + { + perror("sigaction SIGABRT"); + exit(EXIT_FAILURE); + } +#endif // !_CCCL_COMPILER_MSVC + + stream_ctx ctx; + + const int n = 12; + double X[n]; + + for (int ind = 0; ind < n; ind++) + { + X[ind] = 1.0 * ind; + } + + // This creates a handle that is implicitly a vector of size n + auto lX = ctx.logical_data(X); + + auto t = ctx.task(lX.rw()); + t.start(); + should_abort = true; + t.start(); + t.end(); + + assert(0 && "This should not be reached"); + return EXIT_FAILURE; +} diff --git a/cudax/test/stf/error_checks/non_managed_data.cu b/cudax/test/stf/error_checks/non_managed_data.cu new file mode 100644 index 00000000000..387322a0912 --- /dev/null +++ b/cudax/test/stf/error_checks/non_managed_data.cu @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Ensure an error is detected when trying to declare a logical data + * with a managed memory data place while the data is not in managed + * memory + */ + +#include +#include + +#include + +using namespace cuda::experimental::stf; + +bool should_abort = false; + +void cleanupRoutine(int /*unused*/) +{ + if (should_abort) + { + exit(EXIT_SUCCESS); + } + else + { + fprintf(stderr, "Unexpected SIGABRT !\n"); + exit(EXIT_FAILURE); + } +} + +int main() +{ + /* Setup an handler to catch the SIGABRT signal during the programming error */ +#ifndef NDEBUG +# if defined(_CCCL_COMPILER_MSVC) + signal(SIGABRT, &cleanupRoutine); +# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC + struct sigaction sigabrt_action + {}; + memset(&sigabrt_action, 0, sizeof(sigabrt_action)); + sigabrt_action.sa_handler = &cleanupRoutine; + + if (sigaction(SIGABRT, &sigabrt_action, nullptr) != 0) + { + perror("sigaction SIGABRT"); + exit(EXIT_FAILURE); + } +# endif // !_CCCL_COMPILER_MSVC + + stream_ctx ctx; + + logical_data> lX; + + should_abort = true; + + int X[128]; + lX = ctx.logical_data(X, data_place::managed); + + assert(0 && "This should not be reached"); + return EXIT_FAILURE; +#endif +} diff --git a/cudax/test/stf/error_checks/slice_check_bounds.cu b/cudax/test/stf/error_checks/slice_check_bounds.cu new file mode 100644 index 00000000000..f27cebdd722 --- /dev/null +++ b/cudax/test/stf/error_checks/slice_check_bounds.cu @@ -0,0 +1,84 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Ensure that out of bound accesses on slices are detected with the + * CUDASTF_BOUNDSCHECK option set + */ + +/* + * We are forcing this option by defining this value to be set. 
+ */ +#ifndef NDEBUG +# ifndef CUDASTF_BOUNDSCHECK +# define CUDASTF_BOUNDSCHECK +# endif // CUDASTF_BOUNDSCHECK +#endif // NDEBUG + +#include + +#include + +using namespace cuda::experimental::stf; + +bool should_abort = false; + +void cleanupRoutine(int /*unused*/) +{ + if (should_abort) + { + exit(EXIT_SUCCESS); + } + else + { + fprintf(stderr, "Unexpected SIGABRT !\n"); + exit(EXIT_FAILURE); + } +} + +int main() +{ + /* Setup an handler to catch the SIGABRT signal during the programming error */ +#ifndef NDEBUG +# if defined(_CCCL_COMPILER_MSVC) + signal(SIGABRT, &cleanupRoutine); +# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC + struct sigaction sigabrt_action + {}; + memset(&sigabrt_action, 0, sizeof(sigabrt_action)); + sigabrt_action.sa_handler = &cleanupRoutine; + + if (sigaction(SIGABRT, &sigabrt_action, nullptr) != 0) + { + perror("sigaction SIGABRT"); + exit(EXIT_FAILURE); + } +# endif // !_CCCL_COMPILER_MSVC + + context ctx; + + int X[128]; + logical_data> lX; + lX = ctx.logical_data(X); + + should_abort = true; + + // The last access will be out of bounds + ctx.parallel_for(lX.shape(), lX.rw())->*[] __device__(size_t i, auto X) { + X(i + 1) = 42; + }; + + ctx.finalize(); + + assert(0 && "This should not be reached"); + return EXIT_FAILURE; +#endif +} diff --git a/cudax/test/stf/error_checks/uninitialized_data.cu b/cudax/test/stf/error_checks/uninitialized_data.cu new file mode 100644 index 00000000000..efd45db4d1b --- /dev/null +++ b/cudax/test/stf/error_checks/uninitialized_data.cu @@ -0,0 +1,73 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Ensure an error is detected if we use an uninitialized logical data in a task + */ + +#include +#include + +#include + +using namespace cuda::experimental::stf; + +bool should_abort = false; + +void cleanupRoutine(int /*unused*/) +{ + if (should_abort) + { + exit(EXIT_SUCCESS); + } + else + { + fprintf(stderr, "Unexpected SIGABRT !\n"); + exit(EXIT_FAILURE); + } +} + +int main() +{ + /* Setup an handler to catch the SIGABRT signal during the programming error */ +#ifndef NDEBUG +# if defined(_CCCL_COMPILER_MSVC) + signal(SIGABRT, &cleanupRoutine); +# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC + struct sigaction sigabrt_action + {}; + memset(&sigabrt_action, 0, sizeof(sigabrt_action)); + sigabrt_action.sa_handler = &cleanupRoutine; + + if (sigaction(SIGABRT, &sigabrt_action, nullptr) != 0) + { + perror("sigaction SIGABRT"); + exit(EXIT_FAILURE); + } +# endif // !_CCCL_COMPILER_MSVC + + stream_ctx ctx; + + logical_data> lX; + logical_data> lY; + + int X[128]; + lX = ctx.logical_data(X); + + should_abort = true; + + // We did not initialize lY, so this task should not be able to use it. 
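+  // lY was only default-constructed and never associated with a shape or any
+  // backing memory, so submitting a task that depends on it is a programming
+  // error; the expected SIGABRT is caught by the handler installed above.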
+ ctx.task(lX.rw(), lY.rw())->*[](cudaStream_t, auto, auto) {}; + + assert(0 && "This should not be reached"); + return EXIT_FAILURE; +#endif +} diff --git a/cudax/test/stf/error_checks/unsatisfiable_spec.cu b/cudax/test/stf/error_checks/unsatisfiable_spec.cu new file mode 100644 index 00000000000..ee3c10ad9cf --- /dev/null +++ b/cudax/test/stf/error_checks/unsatisfiable_spec.cu @@ -0,0 +1,73 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Ensure an error is raised if we try to ask for an unreasonnable + * amount of resources in a thread hierarchy spec + */ + +#include + +#include + +using namespace cuda::experimental::stf; + +bool should_abort = false; + +void cleanupRoutine(int /*unused*/) +{ + if (should_abort) + { + exit(EXIT_SUCCESS); + } + else + { + fprintf(stderr, "Unexpected SIGABRT !\n"); + exit(EXIT_FAILURE); + } +} + +int main() +{ + /* Setup an handler to catch the SIGABRT signal during the programming error */ +#ifndef NDEBUG +# if defined(_CCCL_COMPILER_MSVC) + signal(SIGABRT, &cleanupRoutine); +# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC + struct sigaction sigabrt_action + {}; + memset(&sigabrt_action, 0, sizeof(sigabrt_action)); + sigabrt_action.sa_handler = &cleanupRoutine; + + if (sigaction(SIGABRT, &sigabrt_action, nullptr) != 0) + { + perror("sigaction SIGABRT"); + exit(EXIT_FAILURE); + } +# endif // !_CCCL_COMPILER_MSVC + + context ctx; + + int X[128]; + auto lX = ctx.logical_data(X); + + should_abort = true; + + // We are asking an unreasonnable amount of threads per block + auto spec = con(con<128000>()); + ctx.launch(spec, lX.rw())->*[] __device__(auto th, auto X) { + X[th.rank()] = th.rank(); + }; + + assert(0 && "This should not be reached"); + return EXIT_FAILURE; +#endif +} diff --git a/cudax/test/stf/error_checks/write_frozen.cu b/cudax/test/stf/error_checks/write_frozen.cu new file mode 100644 index 00000000000..1d46c702c0f --- /dev/null +++ b/cudax/test/stf/error_checks/write_frozen.cu @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Ensure temporary data are destroyed + * + */ + +#include +#include + +#include + +using namespace cuda::experimental::stf; + +bool should_abort = false; + +void cleanupRoutine(int /*unused*/) +{ + if (should_abort) + { + exit(EXIT_SUCCESS); + } + else + { + fprintf(stderr, "Unexpected SIGABRT !\n"); + exit(EXIT_FAILURE); + } +} + +int main() +{ + /* Setup an handler to catch the SIGABRT signal during the programming error */ +#if defined(_CCCL_COMPILER_MSVC) + signal(SIGABRT, &cleanupRoutine); +#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC + struct sigaction sigabrt_action + {}; + memset(&sigabrt_action, 0, sizeof(sigabrt_action)); + sigabrt_action.sa_handler = &cleanupRoutine; + + if (sigaction(SIGABRT, &sigabrt_action, nullptr) != 0) + { + perror("sigaction SIGABRT"); + exit(EXIT_FAILURE); + } +#endif // !_CCCL_COMPILER_MSVC + + stream_ctx ctx; + const int N = 16; + int X[N]; + + for (int i = 0; i < N; i++) + { + X[i] = i; + } + + auto lX = ctx.logical_data(X); + + lX.freeze(access_mode::rw, data_place::current_device()); + + // This is an illegal access because we cannot make a write access on a frozen data + should_abort = true; + + ctx.task(lX.rw())->*[](cudaStream_t, auto) {}; + + ctx.finalize(); + + assert(0 && "This should not be reached"); + return EXIT_FAILURE; +} diff --git a/cudax/test/stf/examples/01-axpy-launch-ranges-cg.cu b/cudax/test/stf/examples/01-axpy-launch-ranges-cg.cu new file mode 100644 index 00000000000..82f31d06c53 --- /dev/null +++ b/cudax/test/stf/examples/01-axpy-launch-ranges-cg.cu @@ -0,0 +1,75 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +using namespace cuda::experimental::stf; + +double X0(int i) +{ + return sin((double) i); +} + +double Y0(int i) +{ + return cos((double) i); +} + +int main() +{ + stream_ctx ctx; + + const int N = 128; + double X[N], Y[N]; + + for (int ind = 0; ind < N; ind++) + { + X[ind] = X0(ind); + Y[ind] = Y0(ind); + } + + const double alpha = 3.14; + + auto handle_X = ctx.logical_data(X, {N}); + auto handle_Y = ctx.logical_data(Y, {N}); + + auto number_devices = 4; + auto all_devs = exec_place::repeat(exec_place::device(0), number_devices); + + auto spec = par(16 * 4, par(4)); + ctx.launch(spec, all_devs, handle_X.read(), handle_Y.rw())->*[=] _CCCL_DEVICE(auto th, auto x, auto y) { + // Blocked partition among elements in the outer most level + auto outer_sh = blocked_partition::apply(shape(x), pos4(th.rank(0)), dim4(th.size(0))); + + // Cyclic partition among elements in the remaining levels + auto inner_sh = cyclic_partition::apply(outer_sh, pos4(th.inner().rank()), dim4(th.inner().size())); + + for (auto ind : inner_sh) + { + y(ind) += alpha * x(ind); + } + }; + + ctx.host_launch(handle_X.read(), handle_Y.read())->*[=](auto X, auto Y) { + for (int ind = 0; ind < N; ind++) + { + // Y should be Y0 + alpha X0 + // fprintf(stderr, "Y[%ld] = %lf - expect %lf\n", ind, Y(ind), (Y0(ind) + alpha * X0(ind))); + EXPECT(fabs(Y(ind) - (Y0(ind) + alpha * X0(ind))) < 0.0001); + + // X should be X0 + EXPECT(fabs(X(ind) - X0(ind)) < 0.0001); + } + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/examples/01-axpy-places.cu b/cudax/test/stf/examples/01-axpy-places.cu new file mode 100644 index 00000000000..8656c95b26c --- /dev/null +++ b/cudax/test/stf/examples/01-axpy-places.cu @@ -0,0 +1,123 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief This example illustrates how to use the task construct with grids of + * places and composite data places + */ + +#include +#include +#include + +using namespace cuda::experimental::stf; + +template +__global__ void axpy(size_t start, size_t cnt, T a, const T* x, T* y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int ind = tid; ind < cnt; ind += nthreads) + { + y[ind + start] += a * x[ind + start]; + } +} + +double X0(size_t i) +{ + return sin((double) i); +} + +double Y0(size_t i) +{ + return cos((double) i); +} + +template +void run() +{ + Ctx ctx; + + const int N = 1024 * 1024 * 32; + double *X, *Y; + + X = new double[N]; + Y = new double[N]; + SCOPE(exit) + { + delete[] X; + delete[] Y; + }; + + for (size_t ind = 0; ind < N; ind++) + { + X[ind] = X0(ind); + Y[ind] = Y0(ind); + } + + // std::shared_ptr all_devs = exec_place::all_devices(); + // use grid [ 0 0 0 0 ] for debugging purpose + auto all_devs = exec_place::repeat(exec_place::device(0), 4); + + // 512k doubles = 4MB (2 pages) + // A 1D blocking strategy over all devices with a block size of 32 and a round robin distribution of blocks accross + // devices + // data_place cdp = data_place(exec_place::all_devices().as_grid().get_grid(), + // [](dim4 grid_dim, pos4 index_pos) { return pos4((index_pos.x / (512 * 1024ULL)) % grid_dim.x); }); + + data_place cdp = data_place::composite(tiled_partition<512 * 1024ULL>(), all_devs); + + auto handle_X = ctx.logical_data(X, {N}); + auto handle_Y = ctx.logical_data(Y, {N}); + + double alpha = 3.14; + + /* Compute Y = Y + alpha X */ + auto t = ctx.task(all_devs, handle_X.read(cdp), handle_Y.rw(cdp)); + t->*[&](auto stream, auto sX, auto sY) { + // should be nullptr as we did not set a place + // fprintf(stderr, "t.get_stream() = %p\n", t.get_stream()); + size_t grid_size = t.grid_dims().size(); + + assert(N % grid_size == 0); + + for (size_t i = 0; i < grid_size; i++) + { + t.set_current_place(pos4(i)); + // fprintf(stderr, "t.get_stream(%ld) = %p\n", i, t.get_stream()); + axpy<<<16, 128, 0, stream>>>(i * N / grid_size, N / grid_size, alpha, sX.data_handle(), sY.data_handle()); + t.unset_current_place(); + } + }; + + /* Check the result on the host */ + ctx.host_launch(handle_X.read(), handle_Y.read())->*[&](auto sX, auto sY) { + for (size_t ind = 0; ind < N; ind++) + { + // Y should be Y0 + alpha X0 + EXPECT(fabs(sY(ind) - (Y0(ind) + alpha * X0(ind))) < 0.0001); + + // X should be X0 + EXPECT(fabs(sX(ind) - X0(ind)) < 0.0001); + } + }; + + ctx.finalize(); +} + +int main() +{ + run(); + // Disabled until composite data places are implemented with graphs + // run(); +} diff --git a/cudax/test/stf/examples/05-stencil-no-copy.cu b/cudax/test/stf/examples/05-stencil-no-copy.cu new file mode 100644 index 00000000000..f800ce481cc --- /dev/null +++ b/cudax/test/stf/examples/05-stencil-no-copy.cu @@ -0,0 +1,256 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +/* + * DATA BLOCKS + * | GHOSTS | DATA | GHOSTS | + */ +template +class data_block +{ +public: + data_block(stream_ctx& ctx, size_t beg, size_t end, size_t GHOST_SIZE) + : beg(beg) + , end(end) + , block_size(end - beg) + , ghost_size(GHOST_SIZE) + , array(std::vector(block_size + 2 * ghost_size)) + , handle(ctx.logical_data(&array[0], block_size + 2 * ghost_size)) + {} + +public: + size_t beg; + size_t end; + size_t block_size; + size_t ghost_size; + int dev_id; + +private: + std::vector array; + +public: + // HANDLE = whole data + boundaries + logical_data> handle; +}; + +template +T check_sum(stream_ctx& ctx, data_block& bn) +{ + T sum = 0.0; + + auto t = ctx.task(exec_place::host, bn.handle.read()); + t->*[&](cudaStream_t stream, auto h_center) { + cuda_safe_call(cudaStreamSynchronize(stream)); + for (size_t offset = bn.ghost_size; offset < bn.ghost_size + bn.block_size; offset++) + { + sum += h_center.data_handle()[offset]; + } + }; + + return sum; +} + +// array and array1 have a size of (cnt + 2*ghost_size) +template +__global__ void stencil_kernel(size_t cnt, size_t ghost_size, T* array, const T* array1) +{ + for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < cnt; idx += blockDim.x * gridDim.x) + { + size_t idx2 = idx + ghost_size; + array[idx2] = 0.9 * array1[idx2] + 0.05 * array1[idx2 - 1] + 0.05 * array1[idx2 + 1]; + } +} + +template +void stencil(stream_ctx& ctx, data_block& bn, data_block& bn1) +{ + int dev = bn.dev_id; + + auto t = ctx.task(exec_place::device(dev), bn.handle.rw(), bn1.handle.read()); + t->*[&](cudaStream_t stream, auto bn_array, auto bn1_array) { + stencil_kernel + <<<32, 64, 0, stream>>>(bn.block_size, bn.ghost_size, bn_array.data_handle(), bn1_array.data_handle()); + }; +} + +template +__global__ void copy_kernel(size_t cnt, T* dst, const T* src) +{ + for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < cnt; idx += blockDim.x * gridDim.x) + { + dst[idx] = src[idx]; + } +} + +template +void copy_task( + stream_ctx& ctx, + size_t cnt, + logical_data>& dst, + size_t offset_dst, + int dst_dev, + logical_data>& src, + size_t offset_src, + int src_dev) +{ + auto t = ctx.task(exec_place::device(dst_dev), dst.rw(), src.read(data_place::device(src_dev))); + t->*[&](cudaStream_t stream, auto dst_array, auto src_array) { + int nblocks = (cnt > 64) ? 
32 : 1; + copy_kernel + <<>>(cnt, dst_array.data_handle() + offset_dst, src_array.data_handle() + offset_src); + }; +} + +// Copy left/right handles from neighbours to the array +template +void update_halo(stream_ctx& ctx, data_block& bn, data_block& left, data_block& right) +{ + size_t gs = bn.ghost_size; + size_t bs = bn.block_size; + + // Copy the bn.ghost_size last computed items in "left" (outside the halo) + copy_task(ctx, gs, bn.handle, 0, bn.dev_id, left.handle, bs, left.dev_id); + + // Copy the bn.ghost_size first computed items (outside the halo) + copy_task(ctx, gs, bn.handle, gs + bs, bn.dev_id, right.handle, gs, right.dev_id); +} + +// Copy inner part of bn into bn1 +template +void copy_inner(stream_ctx& ctx, data_block& bn1, data_block& bn) +{ + size_t gs = bn.ghost_size; + size_t bs = bn.block_size; + + int dev_id = bn.dev_id; + + // Copy the bn.ghost_size last computed items in "left" (outside the halo) + copy_task(ctx, bs, bn1.handle, gs, dev_id, bn.handle, gs, dev_id); +} + +int main(int argc, char** argv) +{ + int ndevs; + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + + stream_ctx ctx; + + int NITER = 500; + size_t NBLOCKS = 4 * ndevs; + size_t BLOCK_SIZE = 2048 * 1024; + + if (argc > 1) + { + NITER = atoi(argv[1]); + } + + if (argc > 2) + { + NBLOCKS = atoi(argv[2]); + } + + const size_t GHOST_SIZE = 1; + + size_t TOTAL_SIZE = NBLOCKS * BLOCK_SIZE; + + double* U0 = new double[NBLOCKS * BLOCK_SIZE]; + for (size_t idx = 0; idx < NBLOCKS * BLOCK_SIZE; idx++) + { + U0[idx] = (idx == 0) ? 1.0 : 0.0; + } + + std::vector> Un; + std::vector> Un1; + + // Create blocks and allocates host data + for (size_t b = 0; b < NBLOCKS; b++) + { + size_t beg = b * BLOCK_SIZE; + size_t end = (b + 1) * BLOCK_SIZE; + + Un.emplace_back(ctx, beg, end, 1ull); + Un1.emplace_back(ctx, beg, end, 1ull); + } + + for (size_t b = 0; b < NBLOCKS; b++) + { + Un[b].dev_id = b % ndevs; + Un1[b].dev_id = b % ndevs; + } + + // Fill blocks with initial values. For the sake of simplicity, we are + // using a synchronization primitive and host code, but this could have + // been written asynchronously using host callbacks. 
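+  // A minimal sketch of the asynchronous alternative mentioned above, using
+  // ctx.host_launch (as in the axpy examples of this test suite) so that, inside
+  // the per-block loop below, the fill runs as a deferred host callback instead
+  // of blocking on cudaStreamSynchronize:
+  //
+  //   ctx.host_launch(Un[b].handle.rw(), Un1[b].handle.rw())->*[=](auto un, auto un1) {
+  //     for (size_t local_idx = 0; local_idx < BLOCK_SIZE; local_idx++)
+  //     {
+  //       double val = U0[(beg + local_idx + TOTAL_SIZE) % TOTAL_SIZE];
+  //       un1(local_idx + GHOST_SIZE) = val;
+  //       un(local_idx + GHOST_SIZE)  = val;
+  //     }
+  //   };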
+ for (size_t b = 0; b < NBLOCKS; b++) + { + size_t beg = b * BLOCK_SIZE; + + auto t = ctx.task(exec_place::host, Un[b].handle.rw(), Un1[b].handle.rw()); + t->*[&](cudaStream_t stream, auto Un_vals, auto Un1_vals) { + cuda_safe_call(cudaStreamSynchronize(stream)); + for (size_t local_idx = 0; local_idx < BLOCK_SIZE; local_idx++) + { + double val = U0[(beg + local_idx + TOTAL_SIZE) % TOTAL_SIZE]; + Un1_vals.data_handle()[local_idx + GHOST_SIZE] = val; + Un_vals.data_handle()[local_idx + GHOST_SIZE] = val; + } + }; + } + + for (int iter = 0; iter < NITER; iter++) + { + for (size_t b = 0; b < NBLOCKS; b++) + { + update_halo(ctx, Un1[b], Un[(b - 1 + NBLOCKS) % NBLOCKS], Un[(b + 1) % NBLOCKS]); + } + + // UPDATE Un from Un1 + for (size_t b = 0; b < NBLOCKS; b++) + { + stencil(ctx, Un[b], Un1[b]); + } + +#if 0 + // We make sure that the total sum of elements remains constant + if (iter % 250 == 0) + { + double sum = 0.0; + for (size_t b = 0; b < NBLOCKS; b++) + { + sum += check_sum(ctx, Un[b]); + } + + // fprintf(stderr, "iter %d : CHECK SUM = %e\n", iter, sum); + } +#endif + + for (size_t b = 0; b < NBLOCKS; b++) + { + // Copy inner part of Un into Un1 + copy_inner(ctx, Un[b], Un1[b]); + } + } + + // In this stencil, the sum of the elements is supposed to be a constant + double sum = 0.0; + for (size_t b = 0; b < NBLOCKS; b++) + { + sum += check_sum(ctx, Un[b]); + } + + double err = fabs(sum - 1.0); + EXPECT(err < 0.0001); + + ctx.finalize(); +} diff --git a/cudax/test/stf/examples/05-stencil-places.cu b/cudax/test/stf/examples/05-stencil-places.cu new file mode 100644 index 00000000000..384d9ca3c40 --- /dev/null +++ b/cudax/test/stf/examples/05-stencil-places.cu @@ -0,0 +1,92 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +template +__global__ void stencil_kernel(slice Un, slice Un1) +{ + size_t N = Un.extent(0); + for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += blockDim.x * gridDim.x) + { + Un(i) = 0.9 * Un1(i) + 0.05 * Un1((i + N - 1) % N) + 0.05 * Un1((i + 1) % N); + } +} + +int main(int argc, char** argv) +{ + stream_ctx ctx; + + int NITER = 500; + int NBLOCKS = 20; + const size_t BLOCK_SIZE = 2048 * 1024; + + if (argc > 1) + { + NITER = atoi(argv[1]); + } + + if (argc > 2) + { + NBLOCKS = atoi(argv[2]); + } + + const size_t TOTAL_SIZE = NBLOCKS * BLOCK_SIZE; + + double* Un = new double[TOTAL_SIZE]; + double* Un1 = new double[TOTAL_SIZE]; + + for (size_t idx = 0; idx < TOTAL_SIZE; idx++) + { + Un[idx] = (idx == 0) ? 
1.0 : 0.0; + Un1[idx] = Un[idx]; + } + + auto lUn = ctx.logical_data(make_slice(Un, TOTAL_SIZE)); + auto lUn1 = ctx.logical_data(make_slice(Un1, TOTAL_SIZE)); + + // std::shared_ptr all_devs = exec_place::all_devices(); + // use grid [ 0 0 0 0 ] for debugging purpose + auto all_devs = exec_place::repeat(exec_place::device(0), 4); + + data_place cdp = data_place::composite(tiled_partition(), all_devs); + + for (int iter = 0; iter < NITER; iter++) + { + // UPDATE Un from Un1 + ctx.task(lUn.rw(cdp), lUn1.read(cdp))->*[&](auto stream, auto sUn, auto sUn1) { + stencil_kernel<<<32, 128, 0, stream>>>(sUn, sUn1); + }; + + // We make sure that the total sum of elements remains constant + if (iter % 250 == 0) + { + double sum = 0.0; + + ctx.task(exec_place::host, lUn.read())->*[&](auto stream, auto sUn) { + cuda_safe_call(cudaStreamSynchronize(stream)); + for (size_t offset = 0; offset < TOTAL_SIZE; offset++) + { + sum += sUn(offset); + } + }; + + // TODO add an assertion to check whether sum is close enough to 1.0 + // fprintf(stderr, "iter %d : CHECK SUM = %e\n", iter, sum); + } + + std::swap(lUn, lUn1); + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/examples/05-stencil.cu b/cudax/test/stf/examples/05-stencil.cu new file mode 100644 index 00000000000..b97ad5940d3 --- /dev/null +++ b/cudax/test/stf/examples/05-stencil.cu @@ -0,0 +1,265 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +static stream_ctx ctx; + +/* + * DATA BLOCKS + * | GHOSTS | DATA | GHOSTS | + */ +template +class data_block +{ +public: + data_block(size_t beg, size_t end, size_t GHOST_SIZE) + : beg(beg) + , end(end) + , block_size(end - beg) + , ghost_size(GHOST_SIZE) + , array(std::vector(block_size + 2 * ghost_size)) + , left_interface(std::vector(ghost_size)) + , right_interface(std::vector(ghost_size)) + , handle(ctx.logical_data(&array[0], block_size + 2 * ghost_size)) + , left_handle(ctx.logical_data(&left_interface[0], ghost_size)) + , right_handle(ctx.logical_data(&right_interface[0], ghost_size)) + {} + + T check_sum() + { + T sum = 0.0; + + ctx.task(exec_place::host, handle.read())->*[&](cudaStream_t stream, auto sn) { + cuda_safe_call(cudaStreamSynchronize(stream)); + const T* h_center = sn.data_handle(); + for (size_t offset = ghost_size; offset < ghost_size + block_size; offset++) + { + sum += h_center[offset]; + } + }; + + return sum; + } + +public: + size_t beg; + size_t end; + size_t block_size; + size_t ghost_size; + int preferred_device; + +private: + std::vector array; + std::vector left_interface; + std::vector right_interface; + +public: + // HANDLE = whole data + boundaries + logical_data> handle; + // A piece of data to store the left part of the block + logical_data> left_handle; + // A piece of data to store the right part of the block + logical_data> right_handle; +}; + +// array and array1 have a size of (cnt + 2*ghost_size) +template +__global__ void stencil_kernel(size_t cnt, size_t ghost_size, T* array, const T* array1) +{ + for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < cnt; idx += 
blockDim.x * gridDim.x) + { + size_t idx2 = idx + ghost_size; + array[idx2] = 0.9 * array1[idx2] + 0.05 * array1[idx2 - 1] + 0.05 * array1[idx2 + 1]; + } +} + +// bn1.array = bn.array +template +void stencil(data_block& bn, data_block& bn1) +{ + int dev = bn.preferred_device; + + ctx.task(exec_place::device(dev), bn.handle.rw(), bn1.handle.read())->*[&](cudaStream_t stream, auto sN, auto sN1) { + stencil_kernel<<<32, 64, 0, stream>>>(bn.block_size, bn.ghost_size, sN.data_handle(), sN1.data_handle()); + }; +} + +template +__global__ void copy_kernel(size_t cnt, T* dst, const T* src) +{ + for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < cnt; idx += blockDim.x * gridDim.x) + { + dst[idx] = src[idx]; + } +} + +template +void copy_task( + size_t cnt, logical_data>& dst, size_t offset_dst, logical_data>& src, size_t offset_src, int dev) +{ + ctx.task(exec_place::device(dev), dst.rw(), src.read())->*[&](cudaStream_t stream, auto dstS, auto srcS) { + int nblocks = (cnt > 64) ? 32 : 1; + copy_kernel<<>>(cnt, dstS.data_handle() + offset_dst, srcS.data_handle() + offset_src); + }; +} + +template +void update_inner_interfaces(data_block& bn) +{ + // LEFT + copy_task(bn.ghost_size, bn.left_handle, 0, bn.handle, bn.ghost_size, bn.preferred_device); + + // RIGHT + copy_task(bn.ghost_size, bn.right_handle, 0, bn.handle, bn.block_size, bn.preferred_device); +} + +// Copy left/right handles from neighbours to the array +template +void update_outer_interfaces(data_block& bn, data_block& left, data_block& right) +{ + // update_outer_interface_left + copy_task(bn.ghost_size, bn.handle, 0, left.right_handle, 0, bn.preferred_device); + + // update_outer_interface_right + copy_task(bn.ghost_size, bn.handle, bn.ghost_size + bn.block_size, right.left_handle, 0, bn.preferred_device); +} + +// bn1.array = bn.array +template +void copy_array(data_block& bn, data_block& bn1) +{ + assert(bn.preferred_device == bn1.preferred_device); + copy_task(bn.block_size + 2 * bn.ghost_size, bn1.handle, 0, bn.handle, 0, bn.preferred_device); +} + +int main(int argc, char** argv) +{ + int NITER = 500; + size_t NBLOCKS = 4; + size_t BLOCK_SIZE = 2048 * 1024; + + if (argc > 1) + { + NITER = atoi(argv[1]); + } + + if (argc > 2) + { + NBLOCKS = atoi(argv[2]); + } + + const size_t GHOST_SIZE = 1; + + size_t TOTAL_SIZE = NBLOCKS * BLOCK_SIZE; + + int ndevs; + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + + // fprintf(stderr, "GOT %d devices\n", ndevs); + + double* U0 = new double[NBLOCKS * BLOCK_SIZE]; + for (size_t idx = 0; idx < NBLOCKS * BLOCK_SIZE; idx++) + { + U0[idx] = (idx == 0) ? 1.0 : 0.0; + } + + std::vector> Un; + std::vector> Un1; + + // Create blocks and allocates host data + for (size_t b = 0; b < NBLOCKS; b++) + { + size_t beg = b * BLOCK_SIZE; + size_t end = (b + 1) * BLOCK_SIZE; + + Un.emplace_back(beg, end, 1ull); + Un1.emplace_back(beg, end, 1ull); + } + + for (size_t b = 0; b < NBLOCKS; b++) + { + Un[b].preferred_device = b % ndevs; + Un1[b].preferred_device = b % ndevs; + } + + // Fill blocks with initial values. For the sake of simplicity, we are + // using a synchronization primitive and host code, but this could have + // been written asynchronously using host callbacks. 
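+  // An asynchronous variant could express this fill with ctx.host_launch and a
+  // host lambda over the rw() slice, rather than synchronizing the stream
+  // explicitly inside a host task.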
+ for (size_t b = 0; b < NBLOCKS; b++) + { + size_t beg = b * BLOCK_SIZE; + + ctx.task(exec_place::host, Un1[b].handle.rw())->*[&](cudaStream_t stream, auto sUn1) { + cuda_safe_call(cudaStreamSynchronize(stream)); + double* Un1_vals = sUn1.data_handle(); + + for (size_t local_idx = 0; local_idx < BLOCK_SIZE; local_idx++) + { + Un1_vals[local_idx + GHOST_SIZE] = U0[(beg + local_idx + TOTAL_SIZE) % TOTAL_SIZE]; + } + }; + } + + for (int iter = 0; iter < NITER; iter++) + { + for (size_t b = 0; b < NBLOCKS; b++) + { + // Update the internal copies of the left and right boundaries + update_inner_interfaces(Un1[b]); + } + + for (size_t b = 0; b < NBLOCKS; b++) + { + // Apply ghost cells from neighbours to put then in the "center" array + update_outer_interfaces(Un1[b], Un1[(b - 1 + NBLOCKS) % NBLOCKS], Un1[(b + 1) % NBLOCKS]); + } + + // UPDATE Un from Un1 + for (size_t b = 0; b < NBLOCKS; b++) + { + stencil(Un[b], Un1[b]); + } + + for (size_t b = 0; b < NBLOCKS; b++) + { + // Save Un into Un1 + copy_array(Un[b], Un1[b]); + } + +#if 0 + // We make sure that the total sum of elements remains constant + if (iter % 250 == 0) + { + double check_sum = 0.0; + for (size_t b = 0; b < NBLOCKS; b++) + { + check_sum += Un[b].check_sum(); + } + + // fprintf(stderr, "iter %d : CHECK SUM = %e\n", iter, check_sum); + } +#endif + } + + // In this stencil, the sum of the elements is supposed to be a constant + double check_sum = 0.0; + for (size_t b = 0; b < NBLOCKS; b++) + { + check_sum += Un[b].check_sum(); + } + + double err = fabs(check_sum - 1.0); + EXPECT(err < 0.0001); + + ctx.finalize(); +} diff --git a/cudax/test/stf/examples/05-stencil2d-places.cu b/cudax/test/stf/examples/05-stencil2d-places.cu new file mode 100644 index 00000000000..2fde20dd7b9 --- /dev/null +++ b/cudax/test/stf/examples/05-stencil2d-places.cu @@ -0,0 +1,114 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +using namespace cuda::experimental::stf; + +template +__global__ void stencil2D_kernel(slice sUn, slice sUn1) +{ + size_t N = sUn.extent(0); + for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += blockDim.x * gridDim.x) + { + for (size_t j = 0; j < N; j++) + { + sUn(j, i) = 0.8 * sUn1(j, i) + 0.05 * sUn1(j, (i + 1) % N) + 0.05 * sUn1(j, (i - 1 + N) % N) + + 0.05 * sUn1((j + 1) % N, i) + 0.05 * sUn1((j - 1 + N) % N, i); + } + } +} + +int main(int argc, char** argv) +{ + stream_ctx ctx; + + size_t NITER = 500; + size_t N = 1000; + bool vtk_dump = false; + + if (argc > 1) + { + NITER = atoi(argv[1]); + } + + if (argc > 2) + { + N = atoi(argv[2]); + } + + if (argc > 3) + { + int val = atoi(argv[3]); + vtk_dump = (val == 1); + } + + size_t TOTAL_SIZE = N * N; + + double* Un = new double[TOTAL_SIZE]; + double* Un1 = new double[TOTAL_SIZE]; + + for (size_t idx = 0; idx < TOTAL_SIZE; idx++) + { + Un[idx] = (idx == 0) ? 
1.0 : 0.0; + Un1[idx] = Un[idx]; + } + + auto lUn = ctx.logical_data(make_slice(Un, std::tuple{N, N}, N)); + auto lUn1 = ctx.logical_data(make_slice(Un1, std::tuple{N, N}, N)); + + // std::shared_ptr all_devs = exec_place::all_devices(); + // use grid [ 0 0 0 0 ] for debugging purpose + auto all_devs = exec_place::repeat(exec_place::device(0), 4); + + // Partition over the vector of processor along the y-axis of the data domain + // TODO implement the proper tiled_partitioning along y ! + data_place cdp = data_place::composite(tiled_partition<128>(), all_devs); + + for (size_t iter = 0; iter < NITER; iter++) + { + // UPDATE Un from Un1 + ctx.task(lUn.rw(cdp), lUn1.read(cdp))->*[&](auto stream, auto sUn, auto sUn1) { + stencil2D_kernel<<<32, 128, 0, stream>>>(sUn, sUn1); + }; + + // We make sure that the total sum of elements remains constant + if (iter % 250 == 0) + { + double sum = 0.0; + + ctx.task(exec_place::host, lUn.read())->*[&](auto stream, auto sUn) { + cuda_safe_call(cudaStreamSynchronize(stream)); + for (size_t j = 0; j < N; j++) + { + for (size_t i = 0; i < N; i++) + { + sum += sUn(j, i); + } + } + + if (vtk_dump) + { + char str[32]; + snprintf(str, 32, "Un_%05zu.vtk", iter); + mdspan_to_vtk(sUn, std::string(str)); + } + }; + + // fprintf(stderr, "iter %d : CHECK SUM = %e\n", iter, sum); + } + + std::swap(lUn, lUn1); + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/examples/07-cholesky-redux.cu b/cudax/test/stf/examples/07-cholesky-redux.cu new file mode 100644 index 00000000000..765047c8669 --- /dev/null +++ b/cudax/test/stf/examples/07-cholesky-redux.cu @@ -0,0 +1,751 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief This example implements a Cholesky decomposition over multiple devices using CUBLAS and CUSOLVER + * + * It also illustrates how we can use CUDASTF to allocate temporary data for CUSOLVER in CUDASTF tasks + */ + +#include +#include + +#include + +#define TILED + +using namespace cuda::experimental::stf; + +// Global for the sake of simplicity ! 
+stream_ctx ctx; + +/* Get a CUBLAS handle valid on the current device, or initialize it lazily */ +cublasHandle_t& get_cublas_handle() +{ + int dev; + cuda_safe_call(cudaGetDevice(&dev)); + + static std::unordered_map cublas_handles; + auto& result = cublas_handles[dev]; + if (result == cublasHandle_t()) + { // not found, default value inserted + // Lazy initialization, and save the handle for future use + cuda_safe_call(cublasCreate(&result)); + } + return result; +} + +/* Get a CUSOLVER handle valid on the current device, or initialize it lazily */ +cusolverDnHandle_t& get_cusolver_handle() +{ + int dev; + cuda_safe_call(cudaGetDevice(&dev)); + + static std::unordered_map cusolver_handles; + auto& result = cusolver_handles[dev]; + if (result == cusolverDnHandle_t()) + { // not found, default value inserted + // Lazy initialization, and save the handle for future use + cuda_safe_call(cusolverDnCreate(&result)); + } + return result; +} + +template +class matrix +{ +public: + matrix(int NROWS, int NCOLS, int BLOCKSIZE_ROWS, int BLOCKSIZE_COLS, bool is_sym, const char* _symbol = "matrix") + { + symbol = _symbol; + + sym_matrix = is_sym; + + m = NROWS; + mb = BLOCKSIZE_ROWS; + + n = NCOLS; + nb = BLOCKSIZE_COLS; + + assert(m % mb == 0); + assert(n % nb == 0); + + // cuda_safe_call(cudaMallocHost(&h_array, m*n*sizeof(T))); + // fprintf(stderr, "Allocating %ld x %ld x %ld = %ld bytes (%f GB) on host for %s\n", m, n, sizeof(T), s, + // s / (1024.0 * 1024.0 * 1024.0), _symbol); + h_array.resize(m * n); + cuda_safe_call(cudaHostRegister(&h_array[0], h_array.size() * sizeof(T), cudaHostRegisterPortable)); + + // Compute the number of blocks + mt = m / mb; + nt = n / nb; + + handles.resize(mt * nt); + + for (int colb = 0; colb < nt; colb++) + { + int low_rowb = sym_matrix ? 
colb : 0; + for (int rowb = low_rowb; rowb < mt; rowb++) + { + T* addr_h = get_block_h(rowb, colb); + auto& h = handle(rowb, colb); + +#ifdef TILED + // tiles are stored contiguously + size_t ld = mb; +#else + size_t ld = m; +#endif + std::ignore = ld; // work around bug in compiler + h = ctx.logical_data(make_slice(addr_h, std::tuple{mb, nb}, ld)); + h.set_symbol(std::string(symbol) + "_" + std::to_string(rowb) + "_" + std::to_string(colb)); + h.set_write_back(false); + } + } + + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + for (int a = 1; a * a <= ndevs; a++) + { + if (ndevs % a == 0) + { + grid_p = a; + grid_q = ndevs / a; + } + } + + assert(grid_p * grid_q == ndevs); + + // std::cout << "FOUND " << ndevs << " DEVICES " + // << "p=" << grid_p << " q=" << grid_q << std::endl; + } + + int get_preferred_devid(int row, int col) + { + return (row % grid_p) + (col % grid_q) * grid_p; + } + + auto& handle(int row, int col) + { + return handles[row + col * mt]; + } + + size_t get_index(size_t row, size_t col) + { +#ifdef TILED + // Find which tile contains this element + int tile_row = row / mb; + int tile_col = col / nb; + + size_t tile_size = mb * nb; + + // Look for the index of the begining of the tile + size_t tile_start = (tile_row + mt * tile_col) * tile_size; + + // Offset within the tile + size_t offset = (row % mb) + (col % nb) * mb; + + return tile_start + offset; +#else + return row + col * m; +#endif + } + + T* get_block_h(int brow, int bcol) + { + size_t index = get_index(brow * mb, bcol * nb); + return &h_array[index]; + } + + // Fill with func(Matrix*,row, col) + template + void fill(Fun&& fun) + { + nvtxRangePushA("FILL"); + // Fill blocks by blocks + for (int colb = 0; colb < nt; colb++) + { + int low_rowb = sym_matrix ? colb : 0; + for (int rowb = low_rowb; rowb < mt; rowb++) + { + // Each task fills a block + auto& h = handle(rowb, colb); + int devid = get_preferred_devid(rowb, colb); + + ctx.parallel_for(exec_place::device(devid), h.shape(), h.write()).set_symbol("INIT")->* + [=] __device__(size_t lrow, size_t lcol, auto sA) { + size_t row = lrow + rowb * sA.extent(0); + size_t col = lcol + colb * sA.extent(1); + sA(lrow, lcol) = fun(row, col); + }; + } + } + nvtxRangePop(); + } + + std::vector h_array; + size_t m; // nrows + size_t n; // ncols + + // Is this a sym matrix ? (lower assumed) + bool sym_matrix; + + size_t mb; // block size (rows) + size_t nb; // block size (cols) + + size_t mt; // number of column blocks + size_t nt; // number of row blocks + + // abstract data handles + std::vector>> handles; + + const char* symbol; + + // for the mapping + int ndevs; + int grid_p, grid_q; +}; + +void DPOTRF(cublasFillMode_t uplo, class matrix& A, int A_row, int A_col) +{ + auto& Akk = A.handle(A_row, A_col); + size_t m_akk = Akk.shape().extent(0); + // Note that the handle may be different from the actual handle... 
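+  // Workspace-query pattern: ask cuSOLVER how much scratch space DPOTRF needs,
+  // then expose that workspace and the devInfo status word as temporary
+  // logical data so CUDASTF allocates and reclaims them along with the task.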
+ int Lwork_expected; + cuda_safe_call(cusolverDnDpotrf_bufferSize(get_cusolver_handle(), uplo, m_akk, nullptr, 0, &Lwork_expected)); + + auto potrf_buffer = ctx.logical_data(Lwork_expected); + auto devInfo = ctx.logical_data(shape_of>(1)); + + auto t = + ctx.task(exec_place::device(A.get_preferred_devid(A_row, A_col)), Akk.rw(), potrf_buffer.write(), devInfo.write()); + t.set_symbol("DPOTRF"); + t->*[uplo](cudaStream_t s, auto sAkk, auto buffer, auto info) { + auto& h = get_cusolver_handle(); + cuda_safe_call(cusolverDnSetStream(h, s)); + + cuda_safe_call(cusolverDnDpotrf( + h, + uplo, + sAkk.extent(0), + sAkk.data_handle(), + sAkk.stride(1), + buffer.data_handle(), + buffer.extent(0), + info.data_handle())); + }; +} + +void DGEMM( + cublasOperation_t transa, + cublasOperation_t transb, + double alpha, + class matrix& A, + int A_row, + int A_col, + class matrix& B, + int B_row, + int B_col, + double beta, + class matrix& C, + int C_row, + int C_col) +{ + auto redux_op = std::make_shared>(); + + // If beta == 1.0 (we assume this is exactly 1.0), then this operation is + // an accumulation with the add operator + auto dep_c = (beta == 1.0) ? C.handle(C_row, C_col).redux(redux_op) : C.handle(C_row, C_col).rw(); + auto t = ctx.task(exec_place::device(A.get_preferred_devid(C_row, C_col)), + A.handle(A_row, A_col).read(), + B.handle(B_row, B_col).read(), + dep_c); + t.set_symbol("DGEMM"); + t->*[transa, transb, alpha, beta](cudaStream_t s, auto sA, auto sB, auto sC) { + EXPECT(sC.data_handle() != nullptr); + auto& h = get_cublas_handle(); + cuda_safe_call(cublasSetStream(h, s)); + + auto k = (transa == CUBLAS_OP_N) ? sA.extent(1) : sA.extent(0); + cuda_safe_call(cublasDgemm( + h, + transa, + transb, + sC.extent(0), + sC.extent(1), + k, + &alpha, + sA.data_handle(), + sA.stride(1), + sB.data_handle(), + sB.stride(1), + &beta, + sC.data_handle(), + sC.stride(1))); + }; +} + +void DSYRK( + cublasFillMode_t uplo, + cublasOperation_t trans, + double alpha, + class matrix& A, + int A_row, + int A_col, + double beta, + class matrix& C, + int C_row, + int C_col) +{ + auto t = ctx.task(exec_place::device(A.get_preferred_devid(C_row, C_col)), + A.handle(A_row, A_col).read(), + C.handle(C_row, C_col).rw()); + t.set_symbol("DSYRK"); + t->*[uplo, trans, alpha, beta](cudaStream_t s, auto sA, auto sC) { + auto& h = get_cublas_handle(); + cuda_safe_call(cublasSetStream(h, s)); + + // number of rows of matrix op(A) and C + auto n = sC.extent(0); + + // number of columns of matrix op(A) + auto k = (trans == CUBLAS_OP_N) ? 
sA.extent(1) : sA.extent(0); + + cuda_safe_call( + cublasDsyrk(h, uplo, trans, n, k, &alpha, sA.data_handle(), sA.stride(1), &beta, sC.data_handle(), sC.stride(1))); + }; +} + +void DTRSM( + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t transa, + cublasDiagType_t diag, + double alpha, + class matrix& A, + int A_row, + int A_col, + class matrix& B, + int B_row, + int B_col) +{ + auto t = ctx.task(exec_place::device(A.get_preferred_devid(B_row, B_col)), + A.handle(A_row, A_col).read(), + B.handle(B_row, B_col).rw()); + t.set_symbol("DTRSM"); + t->*[side, uplo, transa, diag, alpha](cudaStream_t s, auto sA, auto sB) { + auto& h = get_cublas_handle(); + cuda_safe_call(cublasSetStream(h, s)); + + cuda_safe_call(cublasDtrsm( + h, + side, + uplo, + transa, + diag, + sB.extent(0), + sB.extent(1), + &alpha, + sA.data_handle(), + sA.stride(1), + sB.data_handle(), + sB.stride(1))); + }; +} + +void PDNRM2_HOST(matrix* A, double* result) +{ +#ifdef HAVE_DOT + reserved::dot::set_current_color("red"); +#endif + + for (int rowb = 0; rowb < A->mt; rowb++) + { + for (int colb = 0; colb < A->nt; colb++) + { + ctx.host_launch(A->handle(rowb, colb).read())->*[=](auto sA) { + double res2 = 0.0; + for (size_t col = 0; col < sA.extent(1); col++) + { + for (size_t row = 0; row < sA.extent(0); row++) + { + double v = sA(row, col); + res2 += v * v; + } + } + *result += res2; + }; + } + } +} + +void PDPOTRF(matrix& A) +{ +#ifdef HAVE_DOT + reserved::dot::set_current_color("yellow"); +#endif + + assert(A.m == A.n); + assert(A.mt == A.nt); + + int NBLOCKS = A.mt; + assert(A.mb == A.nb); + + cuda_safe_call(cudaSetDevice(0)); + + nvtxRangePushA("SUBMIT_PDPOTRF"); + for (int K = 0; K < NBLOCKS; K++) + { + int dev_akk = A.get_preferred_devid(K, K); + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(K, K))); + DPOTRF(CUBLAS_FILL_MODE_LOWER, A, K, K); + + for (int row = K + 1; row < NBLOCKS; row++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, K))); + DTRSM(CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, 1.0, A, K, K, A, row, K); + + for (int col = K + 1; col < row; col++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col))); + DGEMM(CUBLAS_OP_N, CUBLAS_OP_T, -1.0, A, row, K, A, col, K, 1.0, A, row, col); + } + + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, row))); + DSYRK(CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, -1.0, A, row, K, 1.0, A, row, row); + } + } + cuda_safe_call(cudaSetDevice(0)); + + nvtxRangePop(); +} + +// Algorithm from PLASMA +void PDTRSM(cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + double alpha, + class matrix& A, + class matrix& B) +{ + // std::cout << "[PDTRSM] START B MT " << B.mt << " NT " << B.nt << std::endl; + + if (side == CUBLAS_SIDE_LEFT) + { + if (uplo == CUBLAS_FILL_MODE_UPPER) + { + // TODO + assert(0); + abort(); + } + else + { + //=========================================== + // CUBLAS_SIDE_LEFT / CUBLAS_FILL_MODE_LOWER / CUBLAS_OP_N + //=========================================== + if (trans == CUBLAS_OP_N) + { + for (int k = 0; k < B.mt; k++) + { + double lalpha = k == 0 ? 
alpha : 1.0; + for (int n = 0; n < B.nt; n++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(k, k))); + DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n); + } + for (int m = k + 1; m < B.mt; m++) + { + for (int n = 0; n < B.nt; n++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(m, k))); + DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n); + } + } + } + } + //================================================ + // CUBLAS_SIDE_LEFT / CUBLAS_FILL_MODE_LOWER / CUBLAS_OP_[C|T] + //================================================ + else + { + for (int k = 0; k < B.mt; k++) + { + double lalpha = k == 0 ? alpha : 1.0; + for (int n = 0; n < B.nt; n++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1))); + DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n); + } + for (int m = k + 1; m < B.mt; m++) + { + for (int n = 0; n < B.nt; n++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m))); + DGEMM( + trans, CUBLAS_OP_N, -1.0, A, B.mt - k - 1, B.mt - 1 - m, B, B.mt - k - 1, n, lalpha, B, B.mt - 1 - m, n); + } + } + } + } + } + } + else + { + // TODO + abort(); + } + cuda_safe_call(cudaSetDevice(0)); + // std::cout << "[PDTRSM] END" << std::endl; +} + +void PDPOTRS(matrix& A, class matrix& B, cublasFillMode_t uplo) +{ +#ifdef HAVE_DOT + reserved::dot::set_current_color("green"); +#endif + + // std::cout << "[PDPOTRS] START" << std::endl; + // Call the parallel functions. + PDTRSM( + CUBLAS_SIDE_LEFT, uplo, uplo == CUBLAS_FILL_MODE_UPPER ? CUBLAS_OP_T : CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, 1.0, A, B); + +#ifdef HAVE_DOT + reserved::dot::set_current_color("darkgreen"); +#endif + + PDTRSM( + CUBLAS_SIDE_LEFT, uplo, uplo == CUBLAS_FILL_MODE_UPPER ? CUBLAS_OP_N : CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, 1.0, A, B); + // std::cout << "[PDPOTRS] END" << std::endl; +} + +/***************************************************************************** + * Parallel tile matrix-matrix + *multiplication. + * @see plasma_omp_dgemm + ******************************************************************************/ +void PDGEMM(cublasOperation_t transa, + cublasOperation_t transb, + double alpha, + class matrix& A, + class matrix& B, + double beta, + class matrix& C) +{ +#ifdef HAVE_DOT + reserved::dot::set_current_color("blue"); +#endif + + for (int m = 0; m < C.mt; m++) + { + for (int n = 0; n < C.nt; n++) + { + //========================================= + // alpha*A*B does not contribute; scale C + //========================================= + int inner_k = transa == CUBLAS_OP_N ? A.n : A.m; + if (alpha == 0.0 || inner_k == 0) + { + DGEMM(transa, transb, alpha, A, 0, 0, B, 0, 0, beta, C, m, n); + } + else if (transa == CUBLAS_OP_N) + { + //================================ + // CUBLAS_OP_N / CUBLAS_OP_N + //================================ + if (transb == CUBLAS_OP_N) + { + for (int k = 0; k < A.nt; k++) + { + double zbeta = k == 0 ? beta : 1.0; + DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); + } + } + //===================================== + // CUBLAS_OP_N / CUBLAS_OP_T + //===================================== + else + { + for (int k = 0; k < A.nt; k++) + { + double zbeta = k == 0 ? 
beta : 1.0; + DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); + } + } + } + else + { + //===================================== + // CUBLAS_OP_T / CUBLAS_OP_N + //===================================== + if (transb == CUBLAS_OP_N) + { + for (int k = 0; k < A.mt; k++) + { + double zbeta = k == 0 ? beta : 1.0; + DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); + } + } + //========================================== + // CUBLAS_OP_T / CUBLAS_OP_T + //========================================== + else + { + for (int k = 0; k < A.mt; k++) + { + double zbeta = k == 0 ? beta : 1.0; + DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); + } + } + } + } + } +} + +int main(int argc, char** argv) +{ + int N = 1024; + int NB = 128; + + if (argc > 1) + { + N = atoi(argv[1]); + } + + if (argc > 2) + { + NB = atoi(argv[2]); + } + + int check_result = 1; + if (getenv("CHECK_RESULT")) + { + check_result = atoi(getenv("CHECK_RESULT")); + } + + assert(N % NB == 0); + + // Use pools of preallocated blocks + auto fixed_alloc = block_allocator(ctx, NB * NB * sizeof(double)); + ctx.set_allocator(fixed_alloc); + + // Set up CUBLAS and CUSOLVER + int ndevs; + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + + for (size_t d = 0; d < ndevs; d++) + { + auto lX = ctx.logical_data(shape_of>(1)); + ctx.parallel_for(exec_place::device(d), lX.shape(), lX.write())->*[] __device__(size_t, auto) {}; + cuda_safe_call(cudaSetDevice(d)); + get_cublas_handle(); + get_cusolver_handle(); + } + + cuda_safe_call(cudaSetDevice(0)); + + matrix A(N, N, NB, NB, true, "A"); + matrix Aref(N, N, NB, NB, false, "Aref"); + + // (Hilbert matrix + 2*N*Id) to have a diagonal dominant matrix + auto hilbert = [=] __host__ __device__(size_t row, size_t col) { + return 1.0 / (col + row + 1.0) + 2.0 * N * (col == row); + }; + + if (check_result) + { + Aref.fill(hilbert); + } + + A.fill(hilbert); + + /* Right-hand side */ + matrix B_potrs(N, 1, NB, 1, false, "B"); + matrix Bref_potrs(N, 1, NB, 1, false, "Bref"); + + if (check_result) + { + auto rhs_vals = [] __host__ __device__(size_t row, size_t /*unused*/) { return 1.0 * (row + 1); }; + B_potrs.fill(rhs_vals); + Bref_potrs.fill(rhs_vals); + } + + // // Compute ||Bref|| + double Bref_nrm2 = 0.0; + double res_nrm2 = 0.0; + + if (check_result) + { + PDNRM2_HOST(&Bref_potrs, &Bref_nrm2); + } + + cudaEvent_t startEvent_pdpotrf, stopEvent_pdpotrf; + float milliseconds_pdpotrf = 0; + + // for (int row = 0; row < A.mt; row++) + // { + // for (int col = 0; col <= row; col++) + // { + // cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col))); + // NOOP(A, row, col); + // } + // } + + cuda_safe_call(cudaSetDevice(0)); + + cuda_safe_call(cudaStreamSynchronize(ctx.task_fence())); + + cuda_safe_call(cudaEventCreate(&startEvent_pdpotrf)); + cuda_safe_call(cudaEventCreate(&stopEvent_pdpotrf)); + + cuda_safe_call(cudaEventRecord(startEvent_pdpotrf, ctx.task_fence())); + + PDPOTRF(A); + + cuda_safe_call(cudaSetDevice(0)); + cuda_safe_call(cudaEventRecord(stopEvent_pdpotrf, ctx.task_fence())); + + /* + * POTRS + */ + + if (check_result) + { + // Solve AX = B and put the result in B + PDPOTRS(A, B_potrs, CUBLAS_FILL_MODE_LOWER); + + // Compute (AX - B) + // Bref = (Aref*B - Bref) + PDGEMM(CUBLAS_OP_N, CUBLAS_OP_N, 1.0, Aref, B_potrs, -1.0, Bref_potrs); + + // Compute ||AX - B|| = ||Bref|| + PDNRM2_HOST(&Bref_potrs, &res_nrm2); + } + + ctx.finalize(); + + cuda_safe_call(cudaEventElapsedTime(&milliseconds_pdpotrf, startEvent_pdpotrf, stopEvent_pdpotrf)); + + double 
gflops_pdpotrf = 1.0 / 3.0 * ((double) N * (double) N * (double) N) / (1000000000.0); + std::cout << "[PDPOTRF] ELAPSED: " << milliseconds_pdpotrf + << " ms, GFLOPS: " << gflops_pdpotrf / (milliseconds_pdpotrf / 1000.0) << std::endl; + + if (check_result) + { + if (double residual = sqrt(res_nrm2) / sqrt(Bref_nrm2); residual >= 0.01) + { + std::cerr << "[POTRS] ||AX - B|| : " << sqrt(res_nrm2) << std::endl; + std::cerr << "[POTRS] ||B|| : " << sqrt(Bref_nrm2) << std::endl; + std::cerr << "[POTRS] RESIDUAL (||AX - B||/||B||) : " << residual << std::endl; + assert(!"Algorithm did not converge."); + } + } +} diff --git a/cudax/test/stf/examples/07-cholesky-unified.cu b/cudax/test/stf/examples/07-cholesky-unified.cu new file mode 100644 index 00000000000..7a52ed5ac41 --- /dev/null +++ b/cudax/test/stf/examples/07-cholesky-unified.cu @@ -0,0 +1,693 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#define TILED + +using namespace cuda::experimental::stf; + +// The backend used in this example only depends on that type +using backend_type = stream_ctx; +// using backend_type = graph_ctx; + +// Global for the sake of simplicity ! +backend_type ctx; + +/* Get a CUBLAS handle valid on the current device, or initialize it lazily */ +cublasHandle_t& get_cublas_handle() +{ + int dev; + cuda_safe_call(cudaGetDevice(&dev)); + + static std::unordered_map cublas_handles; + auto& result = cublas_handles[dev]; + if (result == cublasHandle_t()) + { // not found, default value inserted + // Lazy initialization, and save the handle for future use + cuda_safe_call(cublasCreate(&result)); + } + return result; +} + +/* Get a CUSOLVER handle valid on the current device, or initialize it lazily */ +cusolverDnHandle_t& get_cusolver_handle() +{ + int dev; + cuda_safe_call(cudaGetDevice(&dev)); + + static std::unordered_map cusolver_handles; + auto& result = cusolver_handles[dev]; + if (result == cusolverDnHandle_t()) + { // not found, default value inserted + // Lazy initialization, and save the handle for future use + cuda_safe_call(cusolverDnCreate(&result)); + } + return result; +} + +template +class matrix +{ +public: + matrix(int NROWS, int NCOLS, int BLOCKSIZE_ROWS, int BLOCKSIZE_COLS, bool is_sym, const char* _symbol = "matrix") + { + symbol = _symbol; + + sym_matrix = is_sym; + + m = NROWS; + mb = BLOCKSIZE_ROWS; + + n = NCOLS; + nb = BLOCKSIZE_COLS; + + assert(m % mb == 0); + assert(n % nb == 0); + + // cuda_safe_call(cudaMallocHost(&h_array, m*n*sizeof(T))); + // fprintf(stderr, "Allocating %ld x %ld x %ld = %ld bytes (%f GB) on host for %s\n", m, n, sizeof(T), s, + // s / (1024.0 * 1024.0 * 1024.0), _symbol); + h_array.resize(m * n); + cuda_safe_call(cudaHostRegister(&h_array[0], h_array.size() * sizeof(T), cudaHostRegisterPortable)); + + // Compute the number of blocks + mt = m / mb; + nt = n / nb; + + handles.resize(mt * nt); + + for (int colb = 0; colb < nt; colb++) + { + int low_rowb = sym_matrix ? 
colb : 0; + for (int rowb = low_rowb; rowb < mt; rowb++) + { + T* addr_h = get_block_h(rowb, colb); + auto& h = handle(rowb, colb); + +#ifdef TILED + // tiles are stored contiguously + size_t ld = mb; +#else + size_t ld = m; +#endif + std::ignore = ld; // work around bug in compiler + h = ctx.logical_data(make_slice(addr_h, std::tuple{mb, nb}, ld)); + h.set_symbol(std::string(symbol) + "_" + std::to_string(rowb) + "_" + std::to_string(colb)); + } + } + + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + for (int a = 1; a * a <= ndevs; a++) + { + if (ndevs % a == 0) + { + grid_p = a; + grid_q = ndevs / a; + } + } + + assert(grid_p * grid_q == ndevs); + + // std::cout << "FOUND " << ndevs << " DEVICES " + // << "p=" << grid_p << " q=" << grid_q << std::endl; + } + + int get_preferred_devid(int row, int col) + { + return (row % grid_p) + (col % grid_q) * grid_p; + } + + auto& handle(int row, int col) + { + return handles[row + col * mt]; + } + + size_t get_index(size_t row, size_t col) + { +#ifdef TILED + // Find which tile contains this element + int tile_row = row / mb; + int tile_col = col / nb; + + size_t tile_size = mb * nb; + + // Look for the index of the begining of the tile + size_t tile_start = (tile_row + mt * tile_col) * tile_size; + + // Offset within the tile + size_t offset = (row % mb) + (col % nb) * mb; + + return tile_start + offset; +#else + return row + col * m; +#endif + } + + T* get_block_h(int brow, int bcol) + { + size_t index = get_index(brow * mb, bcol * nb); + return &h_array[index]; + } + + // Fill with func(Matrix*,row, col) + template + void fill(Fun&& fun) + { + // Fill blocks by blocks + for (int colb = 0; colb < nt; colb++) + { + int low_rowb = sym_matrix ? colb : 0; + for (int rowb = low_rowb; rowb < mt; rowb++) + { + // Each task fills a block + ctx.host_launch(handle(rowb, colb).write())->*[=, self = this](auto sA) { + for (int lcol = 0; lcol < sA.extent(1); lcol++) + { + size_t col = lcol + colb * sA.extent(1); + for (int lrow = 0; lrow < sA.extent(0); lrow++) + { + size_t row = lrow + rowb * sA.extent(0); + sA(lrow, lcol) = fun(*self, row, col); + } + } + }; + } + } + } + + std::vector h_array; + size_t m; // nrows + size_t n; // ncols + + // Is this a sym matrix ? (lower assumed) + bool sym_matrix; + + size_t mb; // block size (rows) + size_t nb; // block size (cols) + + size_t mt; // number of column blocks + size_t nt; // number of row blocks + + // abstract data handles + std::vector>> handles; + + const char* symbol; + + // for the mapping + int ndevs; + int grid_p, grid_q; +}; + +void DPOTRF(cublasFillMode_t uplo, class matrix& A, int A_row, int A_col) +{ + auto& Akk = A.handle(A_row, A_col); + size_t m_akk = Akk.shape().extent(0); + // Note that the handle may be different from the actual handle... 
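+  // Same workspace-query pattern as the redux variant: the scratch buffer and
+  // devInfo are temporary logical data created from their shapes, so the
+  // backend provides the allocations; no explicit execution place is passed
+  // to the task below.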
+ int Lwork_expected; + cuda_safe_call(cusolverDnDpotrf_bufferSize(get_cusolver_handle(), uplo, m_akk, nullptr, 0, &Lwork_expected)); + + auto potrf_buffer = ctx.logical_data(Lwork_expected); + auto devInfo = ctx.logical_data(shape_of>(1)); + + auto t = ctx.task(Akk.rw(), potrf_buffer.write(), devInfo.write()); + // t.set_symbol("DPOTRF"); + t->*[&](cudaStream_t s, auto sAkk, auto buffer, auto info) { + auto& h = get_cusolver_handle(); + cuda_safe_call(cusolverDnSetStream(h, s)); + + cuda_safe_call(cusolverDnDpotrf( + h, + uplo, + sAkk.extent(0), + sAkk.data_handle(), + sAkk.stride(1), + buffer.data_handle(), + buffer.extent(0), + info.data_handle())); + }; +} + +void DGEMM( + cublasOperation_t transa, + cublasOperation_t transb, + double alpha, + class matrix& A, + int A_row, + int A_col, + class matrix& B, + int B_row, + int B_col, + double beta, + class matrix& C, + int C_row, + int C_col) +{ + auto ignored = get_cublas_handle(); + auto t = ctx.task(A.handle(A_row, A_col).read(), B.handle(B_row, B_col).read(), C.handle(C_row, C_col).rw()); + // t.set_symbol("DGEMM"); + t->*[&](cudaStream_t s, auto sA, auto sB, auto sC) { + auto& h = get_cublas_handle(); + cuda_safe_call(cublasSetStream(h, s)); + + auto k = (transa == CUBLAS_OP_N) ? sA.extent(1) : sA.extent(0); + cuda_safe_call(cublasDgemm( + h, + transa, + transb, + sC.extent(0), + sC.extent(1), + k, + &alpha, + sA.data_handle(), + sA.stride(1), + sB.data_handle(), + sB.stride(1), + &beta, + sC.data_handle(), + sC.stride(1))); + }; +} + +void DSYRK( + cublasFillMode_t uplo, + cublasOperation_t trans, + double alpha, + class matrix& A, + int A_row, + int A_col, + double beta, + class matrix& C, + int C_row, + int C_col) +{ + auto ignored = get_cublas_handle(); + auto t = ctx.task(A.handle(A_row, A_col).read(), C.handle(C_row, C_col).rw()); + // t.set_symbol("DSYRK"); + t->*[&](cudaStream_t s, auto sA, auto sC) { + auto& h = get_cublas_handle(); + cuda_safe_call(cublasSetStream(h, s)); + + // number of rows of matrix op(A) and C + auto n = sC.extent(0); + + // number of columns of matrix op(A) + auto k = (trans == CUBLAS_OP_N) ? 
sA.extent(1) : sA.extent(0); + + cuda_safe_call( + cublasDsyrk(h, uplo, trans, n, k, &alpha, sA.data_handle(), sA.stride(1), &beta, sC.data_handle(), sC.stride(1))); + }; +} + +void DTRSM( + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t transa, + cublasDiagType_t diag, + double alpha, + class matrix& A, + int A_row, + int A_col, + class matrix& B, + int B_row, + int B_col) +{ + auto ignored = get_cublas_handle(); + auto t = ctx.task(A.handle(A_row, A_col).read(), B.handle(B_row, B_col).rw()); + // t.set_symbol("DTRSM"); + t->*[&](cudaStream_t s, auto sA, auto sB) { + auto& h = get_cublas_handle(); + cuda_safe_call(cublasSetStream(h, s)); + + cuda_safe_call(cublasDtrsm( + h, + side, + uplo, + transa, + diag, + sB.extent(0), + sB.extent(1), + &alpha, + sA.data_handle(), + sA.stride(1), + sB.data_handle(), + sB.stride(1))); + }; +} + +void PDNRM2_HOST(matrix* A, double* result) +{ +#ifdef HAVE_DOT + reserved::dot::set_current_color("red"); +#endif + + for (int rowb = 0; rowb < A->mt; rowb++) + { + for (int colb = 0; colb < A->nt; colb++) + { + ctx.host_launch(A->handle(rowb, colb).read())->*[=](auto sA) { + double res2 = 0.0; + for (size_t col = 0; col < sA.extent(1); col++) + { + for (size_t row = 0; row < sA.extent(0); row++) + { + double v = sA(row, col); + res2 += v * v; + } + } + *result += res2; + }; + } + } +} + +void PDPOTRF(matrix& A) +{ +#ifdef HAVE_DOT + reserved::dot::set_current_color("yellow"); +#endif + + assert(A.m == A.n); + assert(A.mt == A.nt); + + int NBLOCKS = A.mt; + assert(A.mb == A.nb); + + cuda_safe_call(cudaSetDevice(0)); + + nvtxRangePushA("SUBMIT_PDPOTRF"); + for (int K = 0; K < NBLOCKS; K++) + { + int dev_akk = A.get_preferred_devid(K, K); + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(K, K))); + DPOTRF(CUBLAS_FILL_MODE_LOWER, A, K, K); + + for (int row = K + 1; row < NBLOCKS; row++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, K))); + DTRSM(CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, 1.0, A, K, K, A, row, K); + + for (int col = K + 1; col < row; col++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col))); + DGEMM(CUBLAS_OP_N, CUBLAS_OP_T, -1.0, A, row, K, A, col, K, 1.0, A, row, col); + } + + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, row))); + DSYRK(CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, -1.0, A, row, K, 1.0, A, row, row); + } + } + cuda_safe_call(cudaSetDevice(0)); + + nvtxRangePop(); +} + +// Algorithm from PLASMA +void PDTRSM(cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + double alpha, + class matrix& A, + class matrix& B) +{ + // std::cout << "[PDTRSM] START B MT " << B.mt << " NT " << B.nt << std::endl; + + if (side == CUBLAS_SIDE_LEFT) + { + if (uplo == CUBLAS_FILL_MODE_UPPER) + { + // TODO + assert(0); + abort(); + } + else + { + //=========================================== + // CUBLAS_SIDE_LEFT / CUBLAS_FILL_MODE_LOWER / CUBLAS_OP_N + //=========================================== + if (trans == CUBLAS_OP_N) + { + for (int k = 0; k < B.mt; k++) + { + double lalpha = k == 0 ? 
alpha : 1.0; + for (int n = 0; n < B.nt; n++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(k, k))); + DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n); + } + for (int m = k + 1; m < B.mt; m++) + { + for (int n = 0; n < B.nt; n++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(m, k))); + DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n); + } + } + } + } + //================================================ + // CUBLAS_SIDE_LEFT / CUBLAS_FILL_MODE_LOWER / CUBLAS_OP_[C|T] + //================================================ + else + { + for (int k = 0; k < B.mt; k++) + { + double lalpha = k == 0 ? alpha : 1.0; + for (int n = 0; n < B.nt; n++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1))); + DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n); + } + for (int m = k + 1; m < B.mt; m++) + { + for (int n = 0; n < B.nt; n++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m))); + DGEMM( + trans, CUBLAS_OP_N, -1.0, A, B.mt - k - 1, B.mt - 1 - m, B, B.mt - k - 1, n, lalpha, B, B.mt - 1 - m, n); + } + } + } + } + } + } + else + { + // TODO + abort(); + } + cuda_safe_call(cudaSetDevice(0)); + // std::cout << "[PDTRSM] END" << std::endl; +} + +void PDPOTRS(matrix& A, class matrix& B, cublasFillMode_t uplo) +{ +#ifdef HAVE_DOT + reserved::dot::set_current_color("green"); +#endif + + // std::cout << "[PDPOTRS] START" << std::endl; + // Call the parallel functions. + PDTRSM( + CUBLAS_SIDE_LEFT, uplo, uplo == CUBLAS_FILL_MODE_UPPER ? CUBLAS_OP_T : CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, 1.0, A, B); + +#ifdef HAVE_DOT + reserved::dot::set_current_color("darkgreen"); +#endif + + PDTRSM( + CUBLAS_SIDE_LEFT, uplo, uplo == CUBLAS_FILL_MODE_UPPER ? CUBLAS_OP_N : CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, 1.0, A, B); + // std::cout << "[PDPOTRS] END" << std::endl; +} + +/***************************************************************************** + * Parallel tile matrix-matrix + *multiplication. + * @see plasma_omp_dgemm + ******************************************************************************/ +void PDGEMM(cublasOperation_t transa, + cublasOperation_t transb, + double alpha, + class matrix& A, + class matrix& B, + double beta, + class matrix& C) +{ +#ifdef HAVE_DOT + reserved::dot::set_current_color("blue"); +#endif + + for (int m = 0; m < C.mt; m++) + { + for (int n = 0; n < C.nt; n++) + { + //========================================= + // alpha*A*B does not contribute; scale C + //========================================= + int inner_k = transa == CUBLAS_OP_N ? A.n : A.m; + if (alpha == 0.0 || inner_k == 0) + { + DGEMM(transa, transb, alpha, A, 0, 0, B, 0, 0, beta, C, m, n); + } + else if (transa == CUBLAS_OP_N) + { + //================================ + // CUBLAS_OP_N / CUBLAS_OP_N + //================================ + if (transb == CUBLAS_OP_N) + { + for (int k = 0; k < A.nt; k++) + { + double zbeta = k == 0 ? beta : 1.0; + DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); + } + } + //===================================== + // CUBLAS_OP_N / CUBLAS_OP_T + //===================================== + else + { + for (int k = 0; k < A.nt; k++) + { + double zbeta = k == 0 ? 
beta : 1.0; + DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); + } + } + } + else + { + //===================================== + // CUBLAS_OP_T / CUBLAS_OP_N + //===================================== + if (transb == CUBLAS_OP_N) + { + for (int k = 0; k < A.mt; k++) + { + double zbeta = k == 0 ? beta : 1.0; + DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); + } + } + //========================================== + // CUBLAS_OP_T / CUBLAS_OP_T + //========================================== + else + { + for (int k = 0; k < A.mt; k++) + { + double zbeta = k == 0 ? beta : 1.0; + DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); + } + } + } + } + } +} + +int main(int argc, char** argv) +{ + int N = 1024; + int NB = 128; + + if (argc > 1) + { + N = atoi(argv[1]); + } + + if (argc > 2) + { + NB = atoi(argv[2]); + } + + assert(N % NB == 0); + + // Set up CUBLAS and CUSOLVER + int ndevs; + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + + cuda_safe_call(cudaSetDevice(0)); + + matrix A(N, N, NB, NB, true, "A"); + matrix Aref(N, N, NB, NB, false, "Aref"); + + // (Hilbert matrix + 2*N*Id) to have a diagonal dominant matrix + auto hilbert = [](matrix& mat, int row, int col) { + return 1.0 / (col + row + 1.0) + 2.0 * mat.n * (col == row); + }; + + Aref.fill(hilbert); + A.fill(hilbert); + + /* Right-hand side */ + matrix B_potrs(N, 1, NB, 1, false, "B"); + matrix Bref_potrs(N, 1, NB, 1, false, "Bref"); + + auto rhs_vals = [](matrix& /*unused*/, int row, int /*unused*/) { + return 1.0 * (row + 1); + }; + B_potrs.fill(rhs_vals); + Bref_potrs.fill(rhs_vals); + + int check_result = 1; + if (getenv("CHECK_RESULT")) + { + check_result = atoi(getenv("CHECK_RESULT")); + } + + // // Compute ||Bref|| + double Bref_nrm2 = 0.0; + double res_nrm2 = 0.0; + + if (check_result) + { + PDNRM2_HOST(&Bref_potrs, &Bref_nrm2); + } + + // for (int row = 0; row < A.mt; row++) + // { + // for (int col = 0; col <= row; col++) + // { + // cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col))); + // NOOP(A, row, col); + // } + // } + + PDPOTRF(A); + + /* + * POTRS + */ + + if (check_result) + { + // Solve AX = B and put the result in B + PDPOTRS(A, B_potrs, CUBLAS_FILL_MODE_LOWER); + + // Compute (AX - B) + // Bref = (Aref*B - Bref) + PDGEMM(CUBLAS_OP_N, CUBLAS_OP_N, 1.0, Aref, B_potrs, -1.0, Bref_potrs); + + // Compute ||AX - B|| = ||Bref|| + PDNRM2_HOST(&Bref_potrs, &res_nrm2); + } + + ctx.finalize(); + + if (check_result) + { + double residual = sqrt(res_nrm2) / sqrt(Bref_nrm2); + // std::cout << "[POTRS] ||AX - B|| : " << sqrt(res_nrm2) << std::endl; + // std::cout << "[POTRS] ||B|| : " << sqrt(Bref_nrm2) << std::endl; + // std::cout << "[POTRS] RESIDUAL (||AX - B||/||B||) : " << residual << std::endl; + assert(residual < 0.01); + } + + return 0; +} diff --git a/cudax/test/stf/examples/09-nbody-algorithm.cu b/cudax/test/stf/examples/09-nbody-algorithm.cu new file mode 100644 index 00000000000..376fbed3aa8 --- /dev/null +++ b/cudax/test/stf/examples/09-nbody-algorithm.cu @@ -0,0 +1,129 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +#include + +using namespace cuda::experimental::stf; + +struct body +{ + // mass + double mass; + // position + double pos[3]; + // speed + double vel[3]; + // acceleration + double acc[3]; +}; + +int main() +{ + constexpr double kSofteningSquared = 1e-3; + constexpr double kG = 6.67259e-11; + + size_t BODY_CNT = 4096; + + double dt = 0.1; + size_t NITER = 25; + + context ctx; + + std::vector particles; + particles.resize(BODY_CNT); + + // Initialize particles + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<> dis(-1.0, 1.0); + for (auto& p : particles) + { + p.mass = 1.0; + + p.pos[0] = dis(gen); + p.pos[1] = dis(gen); + p.pos[2] = dis(gen); + + p.vel[0] = dis(gen); + p.vel[1] = dis(gen); + p.vel[2] = dis(gen); + + p.acc[0] = 0.0; + p.acc[1] = 0.0; + p.acc[2] = 0.0; + } + + auto h_particles = ctx.logical_data(make_slice(&particles[0], BODY_CNT)); + + auto fn = [dt](context ctx, logical_data> h_particles) { + // Compute accelerations + ctx.parallel_for(h_particles.shape(), h_particles.rw())->*[=] _CCCL_DEVICE __host__(size_t i, slice p) { + double acc[3]; + for (size_t k = 0; k < 3; k++) + { + acc[k] = p(i).acc[k]; + } + + for (size_t j = 0; j < p.extent(0); j++) + { + if (i != j) + { + double d[3]; + for (size_t k = 0; k < 3; k++) + { + d[k] = p(j).pos[k] - p(i).pos[k]; + } + + double dist = d[0] * d[0] + d[1] * d[1] + d[2] * d[2] + kSofteningSquared; + double dist_inv = 1.0 / sqrt(dist); + + for (size_t k = 0; k < 3; k++) + { + acc[k] += d[k] * kG * p(j).mass * dist_inv * dist_inv * dist_inv; + } + } + } + + for (size_t k = 0; k < 3; k++) + { + p(i).acc[k] = acc[k]; + } + }; + + // Update velocity and positions + ctx.parallel_for(h_particles.shape(), h_particles.rw())->*[=] __host__ __device__(size_t i, slice p) { + for (size_t k = 0; k < 3; k++) + { + p(i).vel[k] += p(i).acc[k] * dt; + } + + for (size_t k = 0; k < 3; k++) + { + p(i).pos[k] += p(i).vel[k] * dt; + } + + for (size_t k = 0; k < 3; k++) + { + p(i).acc[k] = 0.0; + } + }; + }; + + algorithm one_iter; + for (size_t iter = 0; iter < NITER; iter++) + { + // fprintf(stderr, "ITER %ld\n", iter); + one_iter.run_as_task(fn, ctx, h_particles.rw()); + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/examples/09-nbody-blocked-graph.cu b/cudax/test/stf/examples/09-nbody-blocked-graph.cu new file mode 100644 index 00000000000..41f90886a24 --- /dev/null +++ b/cudax/test/stf/examples/09-nbody-blocked-graph.cu @@ -0,0 +1,320 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * This file is WIP : it is not compiled automatically yet + */ + +#include +#include + +#include + +using namespace cuda::experimental::stf; + +struct body +{ + // mass + double mass; + // position + double pos[3]; + // speed + double vel[3]; +}; + +// Function to write VTK file for a single time step +void writeVTKFile(stream_ctx& ctx, + const std::string& filename, + size_t BLOCK_SIZE, + size_t BODY_CNT, + std::vector>> parts) +{ + std::ofstream outfile(filename); + + if (!outfile) + { + std::cerr << "Error opening file: " << filename << std::endl; + return; + } + + outfile << "# vtk DataFile Version 4.2\n"; + outfile << "Position Data\n"; + outfile << "ASCII\n"; + outfile << "DATASET UNSTRUCTURED_GRID\n"; + outfile << "POINTS " << BODY_CNT << " float\n"; + + std::vector dump(3 * BODY_CNT); + + for (size_t b = 0; b < parts.size(); b++) + { + ctx.task(exec_place::host, parts[b].read())->*[&](cudaStream_t s, slice p) { + cuda_safe_call(cudaStreamSynchronize(s)); + for (size_t i = 0; i < p.size(); i++) + { + for (size_t k = 0; k < 3; k++) + { + dump[3 * (i + b * BLOCK_SIZE) + k] = p(i).pos[k]; + } + } + }; + } + + for (size_t p = 0; p < BODY_CNT; p++) + { + outfile << dump[3 * p] << " " << dump[3 * p + 1] << " " << dump[3 * p + 2] << "\n"; + } + + outfile.close(); +} + +void load_input_file(std::string filename, std::vector& particles) +{ + std::ifstream infile(filename); + if (!infile) + { + std::cerr << "Error opening file." << std::endl; + abort(); + return; + } + + double mass, posX, posY, posZ, velX, velY, velZ; + size_t ind = 0; + + // Loop until we reach the end of the file + while (infile >> mass >> posX >> posY >> posZ >> velX >> velY >> velZ) + { + body p; + p.mass = mass; + + p.pos[0] = posX; + p.pos[1] = posY; + p.pos[2] = posZ; + + p.vel[0] = velX; + p.vel[1] = velY; + p.vel[2] = velZ; + + // // Display first bodies + // if (ind < 10) { + // fprintf(stderr, "body xyz %e %e %e dxyz %e %e %e m %e\n", p.pos[0], p.pos[1], p.pos[2], p.vel[0], + // p.vel[1], p.vel[2], p.mass); + //} + + ind++; + particles.push_back(p); + } + + fprintf(stderr, "Loaded %ld bodies from %s...\n", ind, filename.c_str()); +} + +int main(int argc, char** argv) +{ + constexpr double kSofteningSquared = 1e-9; + // constexpr double kG = 6.67259e-11; + constexpr double kG = 1.0; + + size_t BODY_CNT = 128ULL * 1024ULL; + size_t BLOCK_SIZE = 16 * 1024ULL; + + std::vector particles; + + size_t NITER = 7; // 7000; + if (argc > 1) + { + NITER = atoi(argv[1]); + } + + // Initialize particles + if (argc > 2) + { + // Get dataset from file + std::string filename = argv[2]; + + load_input_file(filename, particles); + + BODY_CNT = particles.size(); + BLOCK_SIZE = (BODY_CNT + 7) / 8; + } + else + { + // Random distribution + BODY_CNT = 32ULL * 1024ULL; + particles.resize(BODY_CNT); + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<> dis(-1.0, 1.0); + for (auto& p : particles) + { + p.mass = 1.0; + + p.pos[0] = dis(gen); + p.pos[1] = dis(gen); + p.pos[2] = dis(gen); + + p.vel[0] = dis(gen); + p.vel[1] = dis(gen); + p.vel[2] = dis(gen); + } + } + + cuda_safe_call(cudaHostRegister(&particles[0], BODY_CNT * sizeof(body), cudaHostRegisterPortable)); + + double dt = 0.005; + + stream_ctx ctx; + + std::vector>> parts_outer; + + size_t block_cnt = (BODY_CNT + BLOCK_SIZE - 1) / BLOCK_SIZE; + for (size_t i = 0; i < block_cnt; i++) + { + size_t first = i * BLOCK_SIZE; + size_t last = std::min((i + 1) * 
BLOCK_SIZE, BODY_CNT); + auto p_i = ctx.logical_data(make_slice(&particles[first], last - first)); + parts_outer.push_back(p_i); + } + + int ngpus; + cuda_safe_call(cudaGetDeviceCount(&ngpus)); + + cudaEvent_t start; + cuda_safe_call(cudaEventCreate(&start)); + cuda_safe_call(cudaEventRecord(start, ctx.task_fence())); + + cudaGraphExec_t exec_graph = nullptr; + + for (size_t iter = 0; iter < NITER; iter++) + { + auto iteration_task = ctx.task(); + + for (size_t b = 0; b < block_cnt; b++) + { + iteration_task.add_deps(parts_outer[b].rw(data_place::device(b % ngpus))); + } + + iteration_task.set_symbol("iteration"); + + iteration_task->*[&](cudaStream_t stream) { + graph_ctx gctx; + + // Accelerations + std::vector>> acc_parts; + std::vector>> parts; + + size_t block_cnt = (BODY_CNT + BLOCK_SIZE - 1) / BLOCK_SIZE; + for (size_t i = 0; i < block_cnt; i++) + { + size_t first = i * BLOCK_SIZE; + size_t last = std::min((i + 1) * BLOCK_SIZE, BODY_CNT); + + auto acc_p_i = gctx.logical_data(shape_of>(last - first, 3)); + acc_parts.push_back(acc_p_i); + + auto p_i = gctx.logical_data(iteration_task.get>(i), data_place::device(i % ngpus)); + parts.push_back(p_i); + } + + // Initialize acceleration to 0 + for (size_t b = 0; b < block_cnt; b++) + { + gctx.launch(exec_place::device(b % ngpus), acc_parts[b].write()).set_symbol("init_acc") + ->*[=] _CCCL_DEVICE(auto t, slice acc) { + for (size_t i = t.rank(); i < acc.extent(0); i += t.size()) + { + for (size_t k = 0; k < 3; k++) + { + acc(i, k) = 0.0; + } + } + }; + } + + // Compute accelerations + for (size_t b = 0; b < block_cnt; b++) + { + for (size_t b_other = 0; b_other < block_cnt; b_other++) + { + gctx.launch(exec_place::device(b % ngpus), parts[b].read(), parts[b_other].read(), acc_parts[b].rw()) + .set_symbol("compute_acc") + ->*[=] _CCCL_DEVICE(auto t, slice p, slice p_other, slice acc) { + for (size_t i = t.rank(); i < p.extent(0); i += t.size()) + { + for (size_t j = 0; j < p_other.extent(0); j++) + { + if ((b * BLOCK_SIZE + i) != (b_other * BLOCK_SIZE + j)) + { + double d[3]; + for (size_t k = 0; k < 3; k++) + { + d[k] = p_other(j).pos[k] - p(i).pos[k]; + } + + double dist = d[0] * d[0] + d[1] * d[1] + d[2] * d[2] + kSofteningSquared; + double dist_inv = 1.0 / sqrt(dist); + + for (size_t k = 0; k < 3; k++) + { + acc(i, k) += d[k] * kG * p_other(j).mass * dist_inv * dist_inv * dist_inv; + } + } + } + } + }; + } + } + + for (size_t b = 0; b < block_cnt; b++) + { + // Update velocity and positions + gctx.launch(exec_place::device(b % ngpus), parts[b].rw(), acc_parts[b].read()).set_symbol("update")->* + [=] _CCCL_DEVICE(auto t, slice p, slice acc) { + for (size_t i = t.rank(); i < p.extent(0); i += t.size()) + { + for (size_t k = 0; k < 3; k++) + { + p(i).vel[k] += acc(i, k) * dt; + } + + for (size_t k = 0; k < 3; k++) + { + p(i).pos[k] += p(i).vel[k] * dt; + } + } + }; + } + + gctx.submit(stream, exec_graph); // no sync + exec_graph = gctx.get_exec_graph(); + }; + + // Write the VTK file for this time step + const char* dump_freq_str = getenv("DUMP_FREQ"); + if (dump_freq_str && iter % atoi(dump_freq_str) == 0) + { + std::string filename = "time_step_" + std::to_string(iter) + ".vtk"; + writeVTKFile(ctx, filename, BLOCK_SIZE, BODY_CNT, parts_outer); + } + } + + cudaEvent_t stop; + cuda_safe_call(cudaEventCreate(&stop)); + cuda_safe_call(cudaEventRecord(stop, ctx.task_fence())); + + ctx.finalize(); + + float elapsed; + cuda_safe_call(cudaEventElapsedTime(&elapsed, start, stop)); + + // rough approximation ! 
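+  // ~21 flops per pairwise interaction, over all BODY_CNT x BODY_CNT pairs and
+  // NITER iterations; elapsed is in milliseconds, so dividing by 1e6 below
+  // reports GFLOP/s.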
+ double FLOP_COUNT = 21.0 * (1.0 * BODY_CNT) * (1.0 * BODY_CNT) * NITER; + + printf("NBODY: elapsed %f ms, %f GFLOPS\n", elapsed, FLOP_COUNT / elapsed / 1000000.0); +} diff --git a/cudax/test/stf/examples/09-nbody-blocked.cu b/cudax/test/stf/examples/09-nbody-blocked.cu new file mode 100644 index 00000000000..eca18b2df60 --- /dev/null +++ b/cudax/test/stf/examples/09-nbody-blocked.cu @@ -0,0 +1,283 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#include + +using namespace cuda::experimental::stf; + +struct body +{ + // mass + double mass; + // position + double pos[3]; + // speed + double vel[3]; +}; + +// Function to write VTK file for a single time step +void writeVTKFile(context& ctx, + const std::string& filename, + size_t BLOCK_SIZE, + size_t BODY_CNT, + std::vector>> parts) +{ + std::ofstream outfile(filename); + + if (!outfile) + { + std::cerr << "Error opening file: " << filename << std::endl; + return; + } + + outfile << "# vtk DataFile Version 4.2\n"; + outfile << "Position Data\n"; + outfile << "ASCII\n"; + outfile << "DATASET UNSTRUCTURED_GRID\n"; + outfile << "POINTS " << BODY_CNT << " float\n"; + + std::vector dump(3 * BODY_CNT); + + for (size_t b = 0; b < parts.size(); b++) + { + ctx.task(exec_place::host, parts[b].read())->*[&](cudaStream_t s, slice p) { + cuda_safe_call(cudaStreamSynchronize(s)); + for (size_t i = 0; i < p.size(); i++) + { + for (size_t k = 0; k < 3; k++) + { + dump[3 * (i + b * BLOCK_SIZE) + k] = p(i).pos[k]; + } + } + }; + } + + for (size_t p = 0; p < BODY_CNT; p++) + { + outfile << dump[3 * p] << " " << dump[3 * p + 1] << " " << dump[3 * p + 2] << "\n"; + } + + outfile.close(); +} + +void load_input_file(std::string filename, std::vector& particles) +{ + std::ifstream infile(filename); + if (!infile) + { + std::cerr << "Error opening file." 
<< std::endl; + abort(); + return; + } + + double mass, posX, posY, posZ, velX, velY, velZ; + size_t ind = 0; + + // Loop until we reach the end of the file + while (infile >> mass >> posX >> posY >> posZ >> velX >> velY >> velZ) + { + body p; + p.mass = mass; + + p.pos[0] = posX; + p.pos[1] = posY; + p.pos[2] = posZ; + + p.vel[0] = velX; + p.vel[1] = velY; + p.vel[2] = velZ; + + // // Display first bodies + // if (ind < 10) { + // fprintf(stderr, "body xyz %e %e %e dxyz %e %e %e m %e\n", p.pos[0], p.pos[1], p.pos[2], p.vel[0], + // p.vel[1], p.vel[2], p.mass); + //} + + ind++; + particles.push_back(p); + } + + fprintf(stderr, "Loaded %zu bodies from %s...\n", ind, filename.c_str()); +} + +int main(int argc, char** argv) +{ + constexpr double kSofteningSquared = 1e-9; + // constexpr double kG = 6.67259e-11; + constexpr double kG = 1.0; + + size_t BODY_CNT = 128ULL * 1024ULL; + size_t BLOCK_SIZE = 16 * 1024ULL; + + std::vector particles; + + // Initialize particles + if (argc > 1) + { + // Get dataset from file + std::string filename = argv[1]; + + load_input_file(filename, particles); + + BODY_CNT = particles.size(); + BLOCK_SIZE = (BODY_CNT + 7) / 8; + } + else + { + // Random distribution + BODY_CNT = 32ULL * 1024ULL; + particles.resize(BODY_CNT); + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<> dis(-1.0, 1.0); + for (auto& p : particles) + { + p.mass = 1.0; + + p.pos[0] = dis(gen); + p.pos[1] = dis(gen); + p.pos[2] = dis(gen); + + p.vel[0] = dis(gen); + p.vel[1] = dis(gen); + p.vel[2] = dis(gen); + } + } + + cuda_safe_call(cudaHostRegister(&particles[0], BODY_CNT * sizeof(body), cudaHostRegisterPortable)); + + double dt = 0.005; + size_t NITER = 7; // 7000; + + context ctx; + + std::vector>> parts; + + // Accelerations + std::vector>> acc_parts; + + size_t block_cnt = (BODY_CNT + BLOCK_SIZE - 1) / BLOCK_SIZE; + for (size_t i = 0; i < block_cnt; i++) + { + size_t first = i * BLOCK_SIZE; + size_t last = std::min((i + 1) * BLOCK_SIZE, BODY_CNT); + auto p_i = ctx.logical_data(make_slice(&particles[first], last - first)); + parts.push_back(p_i); + + auto acc_p_i = ctx.logical_data(shape_of>(last - first, 3)); + acc_parts.push_back(acc_p_i); + } + + int ngpus; + cuda_safe_call(cudaGetDeviceCount(&ngpus)); + + cudaEvent_t start; + cuda_safe_call(cudaEventCreate(&start)); + cuda_safe_call(cudaEventRecord(start, ctx.task_fence())); + + for (size_t iter = 0; iter < NITER; iter++) + { + // Initialize acceleration to 0 + for (size_t b = 0; b < block_cnt; b++) + { + ctx.launch(exec_place::device(b % ngpus), acc_parts[b].write()) + //.set_symbol("init_acc") + ->*[=] _CCCL_DEVICE(auto t, slice acc) { + for (size_t i = t.rank(); i < acc.extent(0); i += t.size()) + { + for (size_t k = 0; k < 3; k++) + { + acc(i, k) = 0.0; + } + } + }; + } + + // Compute accelerations + for (size_t b = 0; b < block_cnt; b++) + { + for (size_t b_other = 0; b_other < block_cnt; b_other++) + { + ctx.launch(exec_place::device(b % ngpus), parts[b].read(), parts[b_other].read(), acc_parts[b].rw()) + //.set_symbol("compute_acc") + ->*[=] _CCCL_DEVICE(auto t, slice p, slice p_other, slice acc) { + for (size_t i = t.rank(); i < p.extent(0); i += t.size()) + { + for (size_t j = 0; j < p_other.extent(0); j++) + { + if ((b * BLOCK_SIZE + i) != (b_other * BLOCK_SIZE + j)) + { + double d[3]; + for (size_t k = 0; k < 3; k++) + { + d[k] = p_other(j).pos[k] - p(i).pos[k]; + } + + double dist = d[0] * d[0] + d[1] * d[1] + d[2] * d[2] + kSofteningSquared; + double dist_inv = 1.0 / sqrt(dist); + + for 
(size_t k = 0; k < 3; k++) + { + acc(i, k) += d[k] * kG * p_other(j).mass * dist_inv * dist_inv * dist_inv; + } + } + } + } + }; + } + } + + for (size_t b = 0; b < block_cnt; b++) + { + // Update velocity and positions + ctx.launch(exec_place::device(b % ngpus), parts[b].rw(), acc_parts[b].read()) + //.set_symbol("update") + ->*[=] _CCCL_DEVICE(auto t, slice p, slice acc) { + for (size_t i = t.rank(); i < p.extent(0); i += t.size()) + { + for (size_t k = 0; k < 3; k++) + { + p(i).vel[k] += acc(i, k) * dt; + } + + for (size_t k = 0; k < 3; k++) + { + p(i).pos[k] += p(i).vel[k] * dt; + } + } + }; + } + + // Write the VTK file for this time step + const char* dump_freq_str = getenv("DUMP_FREQ"); + if (dump_freq_str && iter % atoi(dump_freq_str) == 0) + { + std::string filename = "time_step_" + std::to_string(iter) + ".vtk"; + writeVTKFile(ctx, filename, BLOCK_SIZE, BODY_CNT, parts); + } + } + + cudaEvent_t stop; + cuda_safe_call(cudaEventCreate(&stop)); + cuda_safe_call(cudaEventRecord(stop, ctx.task_fence())); + + ctx.finalize(); + + float elapsed; + cuda_safe_call(cudaEventElapsedTime(&elapsed, start, stop)); + + // rough approximation ! + double FLOP_COUNT = 21.0 * (1.0 * BODY_CNT) * (1.0 * BODY_CNT) * NITER; + + printf("NBODY: elapsed %f ms, %f GFLOPS\n", elapsed, FLOP_COUNT / elapsed / 1000000.0); +} diff --git a/cudax/test/stf/examples/09-nbody.cu b/cudax/test/stf/examples/09-nbody.cu new file mode 100644 index 00000000000..63b1de2e3fe --- /dev/null +++ b/cudax/test/stf/examples/09-nbody.cu @@ -0,0 +1,122 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +#include + +using namespace cuda::experimental::stf; + +struct body +{ + // mass + double mass; + // position + double pos[3]; + // speed + double vel[3]; + // acceleration + double acc[3]; +}; + +int main() +{ + constexpr double kSofteningSquared = 1e-3; + constexpr double kG = 6.67259e-11; + + size_t BODY_CNT = 4096; + + double dt = 0.1; + size_t NITER = 25; + + context ctx = graph_ctx(); + + std::vector particles; + particles.resize(BODY_CNT); + + // Initialize particles + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<> dis(-1.0, 1.0); + for (auto& p : particles) + { + p.mass = 1.0; + + p.pos[0] = dis(gen); + p.pos[1] = dis(gen); + p.pos[2] = dis(gen); + + p.vel[0] = dis(gen); + p.vel[1] = dis(gen); + p.vel[2] = dis(gen); + + p.acc[0] = 0.0; + p.acc[1] = 0.0; + p.acc[2] = 0.0; + } + + auto h_particles = ctx.logical_data(make_slice(&particles[0], BODY_CNT)); + + ctx.repeat(NITER)->*[&](context ctx, size_t) { + // Compute accelerations + ctx.parallel_for(h_particles.shape(), h_particles.rw())->*[=] _CCCL_DEVICE __host__(size_t i, slice p) { + double acc[3]; + for (size_t k = 0; k < 3; k++) + { + acc[k] = p(i).acc[k]; + } + + for (size_t j = 0; j < p.extent(0); j++) + { + if (i != j) + { + double d[3]; + for (size_t k = 0; k < 3; k++) + { + d[k] = p(j).pos[k] - p(i).pos[k]; + } + + double dist = d[0] * d[0] + d[1] * d[1] + d[2] * d[2] + kSofteningSquared; + double dist_inv = 1.0 / sqrt(dist); + + for (size_t k = 0; k < 3; k++) + { + acc[k] += d[k] * kG * p(j).mass * dist_inv * dist_inv * dist_inv; + } + } + } + + for (size_t k = 0; k < 3; k++) + { + p(i).acc[k] = acc[k]; + } + }; + + // Update velocity and positions + ctx.parallel_for(h_particles.shape(), h_particles.rw())->*[=] __host__ __device__(size_t i, slice p) { + for (size_t k = 0; k < 3; k++) + { + p(i).vel[k] += p(i).acc[k] * dt; + } + + for (size_t k = 0; k < 3; k++) + { + p(i).pos[k] += p(i).vel[k] * dt; + } + + for (size_t k = 0; k < 3; k++) + { + p(i).acc[k] = 0.0; + } + }; + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/fhe/batcher_sort_interpreted_cufhe.bool.ir b/cudax/test/stf/fhe/batcher_sort_interpreted_cufhe.bool.ir new file mode 100755 index 00000000000..e362d09ad09 --- /dev/null +++ b/cudax/test/stf/fhe/batcher_sort_interpreted_cufhe.bool.ir @@ -0,0 +1,355 @@ +package my_package + +fn conditional_swap(x: bits[16], y: bits[16], tmp: bits[16]) -> bits[16] { + bit_slice.936: bits[1] = bit_slice(y, start=15, width=1, id=936) + bit_slice.935: bits[1] = bit_slice(y, start=14, width=1, id=935) + bit_slice.952: bits[1] = bit_slice(x, start=15, width=1, id=952) + not.953: bits[1] = not(bit_slice.936, id=953) + bit_slice.951: bits[1] = bit_slice(x, start=14, width=1, id=951) + not.954: bits[1] = not(bit_slice.935, id=954) + bit_slice.934: bits[1] = bit_slice(y, start=13, width=1, id=934) + and.968: bits[1] = and(bit_slice.952, not.953, id=968) + and.971: bits[1] = and(bit_slice.951, not.954, id=971) + bit_slice.950: bits[1] = bit_slice(x, start=13, width=1, id=950) + not.955: bits[1] = not(bit_slice.934, id=955) + bit_slice.933: bits[1] = bit_slice(y, start=12, width=1, id=933) + not.969: bits[1] = not(and.968, id=969) + not.970: bits[1] = not(bit_slice.951, id=970) + not.972: bits[1] = not(and.971, id=972) + and.974: bits[1] = and(bit_slice.950, not.955, id=974) + bit_slice.949: bits[1] = bit_slice(x, start=12, width=1, id=949) + not.956: bits[1] = not(bit_slice.933, id=956) + 
bit_slice.932: bits[1] = bit_slice(y, start=11, width=1, id=932) + or.1014: bits[1] = or(bit_slice.952, not.953, id=1014) + and.1016: bits[1] = and(not.969, not.970, id=1016) + and.1018: bits[1] = and(not.969, not.972, id=1018) + not.973: bits[1] = not(bit_slice.950, id=973) + and.1021: bits[1] = and(not.969, not.972, id=1021) + not.975: bits[1] = not(and.974, id=975) + and.1025: bits[1] = and(not.969, not.972, id=1025) + and.977: bits[1] = and(bit_slice.949, not.956, id=977) + and.1030: bits[1] = and(not.969, not.972, id=1030) + bit_slice.948: bits[1] = bit_slice(x, start=11, width=1, id=948) + not.957: bits[1] = not(bit_slice.932, id=957) + and.1036: bits[1] = and(not.969, not.972, id=1036) + bit_slice.931: bits[1] = bit_slice(y, start=10, width=1, id=931) + and.1043: bits[1] = and(not.969, not.972, id=1043) + and.1051: bits[1] = and(not.969, not.972, id=1051) + and.1060: bits[1] = and(not.969, not.972, id=1060) + and.1070: bits[1] = and(not.969, not.972, id=1070) + and.1081: bits[1] = and(not.969, not.972, id=1081) + and.1093: bits[1] = and(not.969, not.972, id=1093) + and.1106: bits[1] = and(not.969, not.972, id=1106) + and.1120: bits[1] = and(not.969, not.972, id=1120) + and.1135: bits[1] = and(not.969, not.972, id=1135) + not.1015: bits[1] = not(or.1014, id=1015) + and.1017: bits[1] = and(and.1016, bit_slice.935, id=1017) + and.1019: bits[1] = and(and.1018, not.973, id=1019) + and.1022: bits[1] = and(and.1021, not.975, id=1022) + not.976: bits[1] = not(bit_slice.949, id=976) + and.1026: bits[1] = and(and.1025, not.975, id=1026) + not.978: bits[1] = not(and.977, id=978) + and.1031: bits[1] = and(and.1030, not.975, id=1031) + and.980: bits[1] = and(bit_slice.948, not.957, id=980) + and.1037: bits[1] = and(and.1036, not.975, id=1037) + bit_slice.947: bits[1] = bit_slice(x, start=10, width=1, id=947) + not.958: bits[1] = not(bit_slice.931, id=958) + and.1044: bits[1] = and(and.1043, not.975, id=1044) + bit_slice.930: bits[1] = bit_slice(y, start=9, width=1, id=930) + and.1052: bits[1] = and(and.1051, not.975, id=1052) + and.1061: bits[1] = and(and.1060, not.975, id=1061) + and.1071: bits[1] = and(and.1070, not.975, id=1071) + and.1082: bits[1] = and(and.1081, not.975, id=1082) + and.1094: bits[1] = and(and.1093, not.975, id=1094) + and.1107: bits[1] = and(and.1106, not.975, id=1107) + and.1121: bits[1] = and(and.1120, not.975, id=1121) + and.1136: bits[1] = and(and.1135, not.975, id=1136) + or.1153: bits[1] = or(not.1015, and.1017, id=1153) + and.1020: bits[1] = and(and.1019, bit_slice.934, id=1020) + and.1023: bits[1] = and(and.1022, not.976, id=1023) + and.1027: bits[1] = and(and.1026, not.978, id=1027) + not.979: bits[1] = not(bit_slice.948, id=979) + and.1032: bits[1] = and(and.1031, not.978, id=1032) + not.981: bits[1] = not(and.980, id=981) + and.1038: bits[1] = and(and.1037, not.978, id=1038) + and.983: bits[1] = and(bit_slice.947, not.958, id=983) + and.1045: bits[1] = and(and.1044, not.978, id=1045) + bit_slice.946: bits[1] = bit_slice(x, start=9, width=1, id=946) + not.959: bits[1] = not(bit_slice.930, id=959) + and.1053: bits[1] = and(and.1052, not.978, id=1053) + bit_slice.929: bits[1] = bit_slice(y, start=8, width=1, id=929) + and.1062: bits[1] = and(and.1061, not.978, id=1062) + and.1072: bits[1] = and(and.1071, not.978, id=1072) + and.1083: bits[1] = and(and.1082, not.978, id=1083) + and.1095: bits[1] = and(and.1094, not.978, id=1095) + and.1108: bits[1] = and(and.1107, not.978, id=1108) + and.1122: bits[1] = and(and.1121, not.978, id=1122) + and.1137: bits[1] = 
and(and.1136, not.978, id=1137) + or.1154: bits[1] = or(or.1153, and.1020, id=1154) + and.1024: bits[1] = and(and.1023, bit_slice.933, id=1024) + and.1028: bits[1] = and(and.1027, not.979, id=1028) + and.1033: bits[1] = and(and.1032, not.981, id=1033) + not.982: bits[1] = not(bit_slice.947, id=982) + and.1039: bits[1] = and(and.1038, not.981, id=1039) + not.984: bits[1] = not(and.983, id=984) + and.1046: bits[1] = and(and.1045, not.981, id=1046) + and.986: bits[1] = and(bit_slice.946, not.959, id=986) + and.1054: bits[1] = and(and.1053, not.981, id=1054) + bit_slice.945: bits[1] = bit_slice(x, start=8, width=1, id=945) + not.960: bits[1] = not(bit_slice.929, id=960) + and.1063: bits[1] = and(and.1062, not.981, id=1063) + bit_slice.928: bits[1] = bit_slice(y, start=7, width=1, id=928) + and.1073: bits[1] = and(and.1072, not.981, id=1073) + and.1084: bits[1] = and(and.1083, not.981, id=1084) + and.1096: bits[1] = and(and.1095, not.981, id=1096) + and.1109: bits[1] = and(and.1108, not.981, id=1109) + and.1123: bits[1] = and(and.1122, not.981, id=1123) + and.1138: bits[1] = and(and.1137, not.981, id=1138) + or.1155: bits[1] = or(or.1154, and.1024, id=1155) + and.1029: bits[1] = and(and.1028, bit_slice.932, id=1029) + and.1034: bits[1] = and(and.1033, not.982, id=1034) + and.1040: bits[1] = and(and.1039, not.984, id=1040) + not.985: bits[1] = not(bit_slice.946, id=985) + and.1047: bits[1] = and(and.1046, not.984, id=1047) + not.987: bits[1] = not(and.986, id=987) + and.1055: bits[1] = and(and.1054, not.984, id=1055) + and.989: bits[1] = and(bit_slice.945, not.960, id=989) + and.1064: bits[1] = and(and.1063, not.984, id=1064) + bit_slice.944: bits[1] = bit_slice(x, start=7, width=1, id=944) + not.961: bits[1] = not(bit_slice.928, id=961) + and.1074: bits[1] = and(and.1073, not.984, id=1074) + bit_slice.927: bits[1] = bit_slice(y, start=6, width=1, id=927) + and.1085: bits[1] = and(and.1084, not.984, id=1085) + and.1097: bits[1] = and(and.1096, not.984, id=1097) + and.1110: bits[1] = and(and.1109, not.984, id=1110) + and.1124: bits[1] = and(and.1123, not.984, id=1124) + and.1139: bits[1] = and(and.1138, not.984, id=1139) + or.1156: bits[1] = or(or.1155, and.1029, id=1156) + and.1035: bits[1] = and(and.1034, bit_slice.931, id=1035) + and.1041: bits[1] = and(and.1040, not.985, id=1041) + and.1048: bits[1] = and(and.1047, not.987, id=1048) + not.988: bits[1] = not(bit_slice.945, id=988) + and.1056: bits[1] = and(and.1055, not.987, id=1056) + not.990: bits[1] = not(and.989, id=990) + and.1065: bits[1] = and(and.1064, not.987, id=1065) + and.992: bits[1] = and(bit_slice.944, not.961, id=992) + and.1075: bits[1] = and(and.1074, not.987, id=1075) + bit_slice.943: bits[1] = bit_slice(x, start=6, width=1, id=943) + not.962: bits[1] = not(bit_slice.927, id=962) + and.1086: bits[1] = and(and.1085, not.987, id=1086) + bit_slice.926: bits[1] = bit_slice(y, start=5, width=1, id=926) + and.1098: bits[1] = and(and.1097, not.987, id=1098) + and.1111: bits[1] = and(and.1110, not.987, id=1111) + and.1125: bits[1] = and(and.1124, not.987, id=1125) + and.1140: bits[1] = and(and.1139, not.987, id=1140) + or.1157: bits[1] = or(or.1156, and.1035, id=1157) + and.1042: bits[1] = and(and.1041, bit_slice.930, id=1042) + and.1049: bits[1] = and(and.1048, not.988, id=1049) + and.1057: bits[1] = and(and.1056, not.990, id=1057) + not.991: bits[1] = not(bit_slice.944, id=991) + and.1066: bits[1] = and(and.1065, not.990, id=1066) + not.993: bits[1] = not(and.992, id=993) + and.1076: bits[1] = and(and.1075, not.990, id=1076) + 
and.995: bits[1] = and(bit_slice.943, not.962, id=995) + and.1087: bits[1] = and(and.1086, not.990, id=1087) + bit_slice.942: bits[1] = bit_slice(x, start=5, width=1, id=942) + not.963: bits[1] = not(bit_slice.926, id=963) + and.1099: bits[1] = and(and.1098, not.990, id=1099) + bit_slice.925: bits[1] = bit_slice(y, start=4, width=1, id=925) + and.1112: bits[1] = and(and.1111, not.990, id=1112) + and.1126: bits[1] = and(and.1125, not.990, id=1126) + and.1141: bits[1] = and(and.1140, not.990, id=1141) + or.1158: bits[1] = or(or.1157, and.1042, id=1158) + and.1050: bits[1] = and(and.1049, bit_slice.929, id=1050) + and.1058: bits[1] = and(and.1057, not.991, id=1058) + and.1067: bits[1] = and(and.1066, not.993, id=1067) + not.994: bits[1] = not(bit_slice.943, id=994) + and.1077: bits[1] = and(and.1076, not.993, id=1077) + not.996: bits[1] = not(and.995, id=996) + and.1088: bits[1] = and(and.1087, not.993, id=1088) + and.998: bits[1] = and(bit_slice.942, not.963, id=998) + and.1100: bits[1] = and(and.1099, not.993, id=1100) + bit_slice.941: bits[1] = bit_slice(x, start=4, width=1, id=941) + not.964: bits[1] = not(bit_slice.925, id=964) + and.1113: bits[1] = and(and.1112, not.993, id=1113) + bit_slice.924: bits[1] = bit_slice(y, start=3, width=1, id=924) + and.1127: bits[1] = and(and.1126, not.993, id=1127) + and.1142: bits[1] = and(and.1141, not.993, id=1142) + or.1159: bits[1] = or(or.1158, and.1050, id=1159) + and.1059: bits[1] = and(and.1058, bit_slice.928, id=1059) + and.1068: bits[1] = and(and.1067, not.994, id=1068) + and.1078: bits[1] = and(and.1077, not.996, id=1078) + not.997: bits[1] = not(bit_slice.942, id=997) + and.1089: bits[1] = and(and.1088, not.996, id=1089) + not.999: bits[1] = not(and.998, id=999) + and.1101: bits[1] = and(and.1100, not.996, id=1101) + and.1001: bits[1] = and(bit_slice.941, not.964, id=1001) + and.1114: bits[1] = and(and.1113, not.996, id=1114) + bit_slice.940: bits[1] = bit_slice(x, start=3, width=1, id=940) + not.965: bits[1] = not(bit_slice.924, id=965) + and.1128: bits[1] = and(and.1127, not.996, id=1128) + bit_slice.923: bits[1] = bit_slice(y, start=2, width=1, id=923) + and.1143: bits[1] = and(and.1142, not.996, id=1143) + or.1160: bits[1] = or(or.1159, and.1059, id=1160) + and.1069: bits[1] = and(and.1068, bit_slice.927, id=1069) + and.1079: bits[1] = and(and.1078, not.997, id=1079) + and.1090: bits[1] = and(and.1089, not.999, id=1090) + not.1000: bits[1] = not(bit_slice.941, id=1000) + and.1102: bits[1] = and(and.1101, not.999, id=1102) + not.1002: bits[1] = not(and.1001, id=1002) + and.1115: bits[1] = and(and.1114, not.999, id=1115) + and.1004: bits[1] = and(bit_slice.940, not.965, id=1004) + and.1129: bits[1] = and(and.1128, not.999, id=1129) + bit_slice.939: bits[1] = bit_slice(x, start=2, width=1, id=939) + not.966: bits[1] = not(bit_slice.923, id=966) + and.1144: bits[1] = and(and.1143, not.999, id=1144) + bit_slice.922: bits[1] = bit_slice(y, start=1, width=1, id=922) + or.1161: bits[1] = or(or.1160, and.1069, id=1161) + and.1080: bits[1] = and(and.1079, bit_slice.926, id=1080) + and.1091: bits[1] = and(and.1090, not.1000, id=1091) + and.1103: bits[1] = and(and.1102, not.1002, id=1103) + not.1003: bits[1] = not(bit_slice.940, id=1003) + and.1116: bits[1] = and(and.1115, not.1002, id=1116) + not.1005: bits[1] = not(and.1004, id=1005) + and.1130: bits[1] = and(and.1129, not.1002, id=1130) + and.1007: bits[1] = and(bit_slice.939, not.966, id=1007) + and.1145: bits[1] = and(and.1144, not.1002, id=1145) + bit_slice.938: bits[1] = bit_slice(x, start=1, 
width=1, id=938) + not.967: bits[1] = not(bit_slice.922, id=967) + or.1162: bits[1] = or(or.1161, and.1080, id=1162) + and.1092: bits[1] = and(and.1091, bit_slice.925, id=1092) + and.1104: bits[1] = and(and.1103, not.1003, id=1104) + and.1117: bits[1] = and(and.1116, not.1005, id=1117) + not.1006: bits[1] = not(bit_slice.939, id=1006) + and.1131: bits[1] = and(and.1130, not.1005, id=1131) + not.1008: bits[1] = not(and.1007, id=1008) + and.1146: bits[1] = and(and.1145, not.1005, id=1146) + and.1010: bits[1] = and(bit_slice.938, not.967, id=1010) + or.1163: bits[1] = or(or.1162, and.1092, id=1163) + and.1105: bits[1] = and(and.1104, bit_slice.924, id=1105) + and.1118: bits[1] = and(and.1117, not.1006, id=1118) + and.1132: bits[1] = and(and.1131, not.1008, id=1132) + not.1009: bits[1] = not(bit_slice.938, id=1009) + and.1147: bits[1] = and(and.1146, not.1008, id=1147) + not.1011: bits[1] = not(and.1010, id=1011) + bit_slice.937: bits[1] = bit_slice(x, start=0, width=1, id=937) + or.1164: bits[1] = or(or.1163, and.1105, id=1164) + and.1119: bits[1] = and(and.1118, bit_slice.923, id=1119) + and.1133: bits[1] = and(and.1132, not.1009, id=1133) + and.1148: bits[1] = and(and.1147, not.1011, id=1148) + not.1012: bits[1] = not(bit_slice.937, id=1012) + not.1013: bits[1] = not(bit_slice.952, id=1013) + or.1165: bits[1] = or(or.1164, and.1119, id=1165) + and.1134: bits[1] = and(and.1133, bit_slice.922, id=1134) + and.1149: bits[1] = and(and.1148, not.1012, id=1149) + bit_slice.921: bits[1] = bit_slice(y, start=0, width=1, id=921) + and.1151: bits[1] = and(not.1013, bit_slice.936, id=1151) + or.1166: bits[1] = or(or.1165, and.1134, id=1166) + and.1150: bits[1] = and(and.1149, bit_slice.921, id=1150) + or.1168: bits[1] = or(not.1013, bit_slice.936, id=1168) + not.1152: bits[1] = not(and.1151, id=1152) + or.1167: bits[1] = or(or.1166, and.1150, id=1167) + not.1169: bits[1] = not(or.1168, id=1169) + and.1170: bits[1] = and(not.1152, or.1167, id=1170) + bit_slice.1186: bits[1] = bit_slice(tmp, start=15, width=1, id=1186) + or.1187: bits[1] = or(not.1169, and.1170, id=1187) + bit_slice.1185: bits[1] = bit_slice(tmp, start=14, width=1, id=1185) + bit_slice.1184: bits[1] = bit_slice(tmp, start=13, width=1, id=1184) + bit_slice.1183: bits[1] = bit_slice(tmp, start=12, width=1, id=1183) + bit_slice.1182: bits[1] = bit_slice(tmp, start=11, width=1, id=1182) + bit_slice.1181: bits[1] = bit_slice(tmp, start=10, width=1, id=1181) + bit_slice.1180: bits[1] = bit_slice(tmp, start=9, width=1, id=1180) + bit_slice.1179: bits[1] = bit_slice(tmp, start=8, width=1, id=1179) + bit_slice.1178: bits[1] = bit_slice(tmp, start=7, width=1, id=1178) + bit_slice.1177: bits[1] = bit_slice(tmp, start=6, width=1, id=1177) + bit_slice.1176: bits[1] = bit_slice(tmp, start=5, width=1, id=1176) + bit_slice.1175: bits[1] = bit_slice(tmp, start=4, width=1, id=1175) + bit_slice.1174: bits[1] = bit_slice(tmp, start=3, width=1, id=1174) + bit_slice.1173: bits[1] = bit_slice(tmp, start=2, width=1, id=1173) + bit_slice.1172: bits[1] = bit_slice(tmp, start=1, width=1, id=1172) + bit_slice.1171: bits[1] = bit_slice(tmp, start=0, width=1, id=1171) + and.1189: bits[1] = and(bit_slice.952, bit_slice.1186, id=1189) + and.1190: bits[1] = and(bit_slice.952, or.1187, id=1190) + not.1188: bits[1] = not(or.1187, id=1188) + and.1192: bits[1] = and(bit_slice.951, bit_slice.1185, id=1192) + and.1193: bits[1] = and(bit_slice.951, or.1187, id=1193) + and.1195: bits[1] = and(bit_slice.950, bit_slice.1184, id=1195) + and.1196: bits[1] = and(bit_slice.950, 
or.1187, id=1196) + and.1198: bits[1] = and(bit_slice.949, bit_slice.1183, id=1198) + and.1199: bits[1] = and(bit_slice.949, or.1187, id=1199) + and.1201: bits[1] = and(bit_slice.948, bit_slice.1182, id=1201) + and.1202: bits[1] = and(bit_slice.948, or.1187, id=1202) + and.1204: bits[1] = and(bit_slice.947, bit_slice.1181, id=1204) + and.1205: bits[1] = and(bit_slice.947, or.1187, id=1205) + and.1207: bits[1] = and(bit_slice.946, bit_slice.1180, id=1207) + and.1208: bits[1] = and(bit_slice.946, or.1187, id=1208) + and.1210: bits[1] = and(bit_slice.945, bit_slice.1179, id=1210) + and.1211: bits[1] = and(bit_slice.945, or.1187, id=1211) + and.1213: bits[1] = and(bit_slice.944, bit_slice.1178, id=1213) + and.1214: bits[1] = and(bit_slice.944, or.1187, id=1214) + and.1216: bits[1] = and(bit_slice.943, bit_slice.1177, id=1216) + and.1217: bits[1] = and(bit_slice.943, or.1187, id=1217) + and.1219: bits[1] = and(bit_slice.942, bit_slice.1176, id=1219) + and.1220: bits[1] = and(bit_slice.942, or.1187, id=1220) + and.1222: bits[1] = and(bit_slice.941, bit_slice.1175, id=1222) + and.1223: bits[1] = and(bit_slice.941, or.1187, id=1223) + and.1225: bits[1] = and(bit_slice.940, bit_slice.1174, id=1225) + and.1226: bits[1] = and(bit_slice.940, or.1187, id=1226) + and.1228: bits[1] = and(bit_slice.939, bit_slice.1173, id=1228) + and.1229: bits[1] = and(bit_slice.939, or.1187, id=1229) + and.1231: bits[1] = and(bit_slice.938, bit_slice.1172, id=1231) + and.1232: bits[1] = and(bit_slice.938, or.1187, id=1232) + and.1234: bits[1] = and(bit_slice.937, bit_slice.1171, id=1234) + and.1235: bits[1] = and(bit_slice.937, or.1187, id=1235) + or.1237: bits[1] = or(and.1189, and.1190, id=1237) + and.1191: bits[1] = and(bit_slice.1186, not.1188, id=1191) + or.1239: bits[1] = or(and.1192, and.1193, id=1239) + and.1194: bits[1] = and(bit_slice.1185, not.1188, id=1194) + or.1241: bits[1] = or(and.1195, and.1196, id=1241) + and.1197: bits[1] = and(bit_slice.1184, not.1188, id=1197) + or.1243: bits[1] = or(and.1198, and.1199, id=1243) + and.1200: bits[1] = and(bit_slice.1183, not.1188, id=1200) + or.1245: bits[1] = or(and.1201, and.1202, id=1245) + and.1203: bits[1] = and(bit_slice.1182, not.1188, id=1203) + or.1247: bits[1] = or(and.1204, and.1205, id=1247) + and.1206: bits[1] = and(bit_slice.1181, not.1188, id=1206) + or.1249: bits[1] = or(and.1207, and.1208, id=1249) + and.1209: bits[1] = and(bit_slice.1180, not.1188, id=1209) + or.1251: bits[1] = or(and.1210, and.1211, id=1251) + and.1212: bits[1] = and(bit_slice.1179, not.1188, id=1212) + or.1253: bits[1] = or(and.1213, and.1214, id=1253) + and.1215: bits[1] = and(bit_slice.1178, not.1188, id=1215) + or.1255: bits[1] = or(and.1216, and.1217, id=1255) + and.1218: bits[1] = and(bit_slice.1177, not.1188, id=1218) + or.1257: bits[1] = or(and.1219, and.1220, id=1257) + and.1221: bits[1] = and(bit_slice.1176, not.1188, id=1221) + or.1259: bits[1] = or(and.1222, and.1223, id=1259) + and.1224: bits[1] = and(bit_slice.1175, not.1188, id=1224) + or.1261: bits[1] = or(and.1225, and.1226, id=1261) + and.1227: bits[1] = and(bit_slice.1174, not.1188, id=1227) + or.1263: bits[1] = or(and.1228, and.1229, id=1263) + and.1230: bits[1] = and(bit_slice.1173, not.1188, id=1230) + or.1265: bits[1] = or(and.1231, and.1232, id=1265) + and.1233: bits[1] = and(bit_slice.1172, not.1188, id=1233) + or.1267: bits[1] = or(and.1234, and.1235, id=1267) + and.1236: bits[1] = and(bit_slice.1171, not.1188, id=1236) + or.1238: bits[1] = or(or.1237, and.1191, id=1238) + or.1240: bits[1] = or(or.1239, 
and.1194, id=1240) + or.1242: bits[1] = or(or.1241, and.1197, id=1242) + or.1244: bits[1] = or(or.1243, and.1200, id=1244) + or.1246: bits[1] = or(or.1245, and.1203, id=1246) + or.1248: bits[1] = or(or.1247, and.1206, id=1248) + or.1250: bits[1] = or(or.1249, and.1209, id=1250) + or.1252: bits[1] = or(or.1251, and.1212, id=1252) + or.1254: bits[1] = or(or.1253, and.1215, id=1254) + or.1256: bits[1] = or(or.1255, and.1218, id=1256) + or.1258: bits[1] = or(or.1257, and.1221, id=1258) + or.1260: bits[1] = or(or.1259, and.1224, id=1260) + or.1262: bits[1] = or(or.1261, and.1227, id=1262) + or.1264: bits[1] = or(or.1263, and.1230, id=1264) + or.1266: bits[1] = or(or.1265, and.1233, id=1266) + or.1268: bits[1] = or(or.1267, and.1236, id=1268) + literal.916: bits[1] = literal(value=1, id=916) + literal.917: bits[1] = literal(value=0, id=917) + ret concat.1269: bits[16] = concat(or.1238, or.1240, or.1242, or.1244, or.1246, or.1248, or.1250, or.1252, or.1254, or.1256, or.1258, or.1260, or.1262, or.1264, or.1266, or.1268, id=1269) +} diff --git a/cudax/test/stf/fhe/kernel_sharpen_interpreted_cufhe.bool.ir b/cudax/test/stf/fhe/kernel_sharpen_interpreted_cufhe.bool.ir new file mode 100644 index 00000000000..593ea8062ff --- /dev/null +++ b/cudax/test/stf/fhe/kernel_sharpen_interpreted_cufhe.bool.ir @@ -0,0 +1,556 @@ +package my_package + +fn kernel_sharpen(window: bits[8][9]) -> bits[8] { + literal.2853: bits[4] = literal(value=5, id=2853) + literal.2873: bits[4] = literal(value=7, id=2873) + array_index.2854: bits[8] = array_index(window, indices=[literal.2853], id=2854) + array_index.2874: bits[8] = array_index(window, indices=[literal.2873], id=2874) + literal.2813: bits[4] = literal(value=1, id=2813) + literal.2833: bits[4] = literal(value=3, id=2833) + bit_slice.2857: bits[1] = bit_slice(array_index.2854, start=2, width=1, id=2857) + bit_slice.2877: bits[1] = bit_slice(array_index.2874, start=2, width=1, id=2877) + array_index.2814: bits[8] = array_index(window, indices=[literal.2813], id=2814) + array_index.2834: bits[8] = array_index(window, indices=[literal.2833], id=2834) + and.2894: bits[1] = and(bit_slice.2857, bit_slice.2877, id=2894) + bit_slice.2817: bits[1] = bit_slice(array_index.2814, start=2, width=1, id=2817) + bit_slice.2837: bits[1] = bit_slice(array_index.2834, start=2, width=1, id=2837) + or.2893: bits[1] = or(bit_slice.2857, bit_slice.2877, id=2893) + not.2895: bits[1] = not(and.2894, id=2895) + and.2897: bits[1] = and(bit_slice.2817, bit_slice.2837, id=2897) + and.2907: bits[1] = and(bit_slice.2857, bit_slice.2877, id=2907) + and.2900: bits[1] = and(or.2893, not.2895, id=2900) + bit_slice.2856: bits[1] = bit_slice(array_index.2854, start=1, width=1, id=2856) + or.2896: bits[1] = or(bit_slice.2817, bit_slice.2837, id=2896) + not.2898: bits[1] = not(and.2897, id=2898) + or.2909: bits[1] = or(bit_slice.2857, bit_slice.2877, id=2909) + not.2908: bits[1] = not(and.2907, id=2908) + bit_slice.2876: bits[1] = bit_slice(array_index.2874, start=1, width=1, id=2876) + and.2901: bits[1] = and(and.2900, bit_slice.2856, id=2901) + and.2912: bits[1] = and(bit_slice.2817, bit_slice.2837, id=2912) + and.2904: bits[1] = and(or.2896, not.2898, id=2904) + bit_slice.2816: bits[1] = bit_slice(array_index.2814, start=1, width=1, id=2816) + and.2910: bits[1] = and(or.2909, not.2908, id=2910) + and.2911: bits[1] = and(bit_slice.2856, bit_slice.2876, id=2911) + and.2902: bits[1] = and(and.2901, bit_slice.2876, id=2902) + or.2914: bits[1] = or(bit_slice.2817, bit_slice.2837, id=2914) + not.2913: bits[1] 
= not(and.2912, id=2913) + bit_slice.2836: bits[1] = bit_slice(array_index.2834, start=1, width=1, id=2836) + bit_slice.2858: bits[1] = bit_slice(array_index.2854, start=3, width=1, id=2858) + bit_slice.2878: bits[1] = bit_slice(array_index.2874, start=3, width=1, id=2878) + and.2905: bits[1] = and(and.2904, bit_slice.2816, id=2905) + or.2925: bits[1] = or(and.2910, and.2911, id=2925) + not.2926: bits[1] = not(and.2902, id=2926) + and.2915: bits[1] = and(or.2914, not.2913, id=2915) + and.2916: bits[1] = and(bit_slice.2816, bit_slice.2836, id=2916) + and.2918: bits[1] = and(bit_slice.2858, bit_slice.2878, id=2918) + and.2935: bits[1] = and(bit_slice.2858, bit_slice.2878, id=2935) + bit_slice.2818: bits[1] = bit_slice(array_index.2814, start=3, width=1, id=2818) + bit_slice.2838: bits[1] = bit_slice(array_index.2834, start=3, width=1, id=2838) + and.2906: bits[1] = and(and.2905, bit_slice.2836, id=2906) + and.2945: bits[1] = and(or.2925, not.2926, id=2945) + or.2927: bits[1] = or(and.2915, and.2916, id=2927) + bit_slice.2859: bits[1] = bit_slice(array_index.2854, start=4, width=1, id=2859) + bit_slice.2879: bits[1] = bit_slice(array_index.2874, start=4, width=1, id=2879) + or.2917: bits[1] = or(bit_slice.2858, bit_slice.2878, id=2917) + not.2919: bits[1] = not(and.2918, id=2919) + and.2899: bits[1] = and(bit_slice.2857, bit_slice.2877, id=2899) + or.2937: bits[1] = or(bit_slice.2858, bit_slice.2878, id=2937) + not.2936: bits[1] = not(and.2935, id=2936) + and.2939: bits[1] = and(bit_slice.2818, bit_slice.2838, id=2939) + not.2928: bits[1] = not(and.2906, id=2928) + and.2946: bits[1] = and(and.2945, or.2927, id=2946) + and.2922: bits[1] = and(bit_slice.2818, bit_slice.2838, id=2922) + and.2949: bits[1] = and(bit_slice.2859, bit_slice.2879, id=2949) + and.2930: bits[1] = and(or.2917, not.2919, id=2930) + or.2920: bits[1] = or(and.2899, and.2902, id=2920) + and.2976: bits[1] = and(bit_slice.2859, bit_slice.2879, id=2976) + bit_slice.2819: bits[1] = bit_slice(array_index.2814, start=4, width=1, id=2819) + bit_slice.2839: bits[1] = bit_slice(array_index.2834, start=4, width=1, id=2839) + and.2938: bits[1] = and(or.2937, not.2936, id=2938) + or.2941: bits[1] = or(bit_slice.2818, bit_slice.2838, id=2941) + not.2940: bits[1] = not(and.2939, id=2940) + and.2943: bits[1] = and(or.2925, not.2926, id=2943) + and.2944: bits[1] = and(or.2927, not.2928, id=2944) + and.2947: bits[1] = and(and.2946, not.2928, id=2947) + or.2921: bits[1] = or(bit_slice.2818, bit_slice.2838, id=2921) + not.2923: bits[1] = not(and.2922, id=2923) + and.2903: bits[1] = and(bit_slice.2817, bit_slice.2837, id=2903) + bit_slice.2860: bits[1] = bit_slice(array_index.2854, start=5, width=1, id=2860) + bit_slice.2880: bits[1] = bit_slice(array_index.2874, start=5, width=1, id=2880) + or.2948: bits[1] = or(bit_slice.2859, bit_slice.2879, id=2948) + not.2950: bits[1] = not(and.2949, id=2950) + and.2929: bits[1] = and(bit_slice.2858, bit_slice.2878, id=2929) + and.2931: bits[1] = and(and.2930, or.2920, id=2931) + or.2978: bits[1] = or(bit_slice.2859, bit_slice.2879, id=2978) + not.2977: bits[1] = not(and.2976, id=2977) + and.2980: bits[1] = and(bit_slice.2819, bit_slice.2839, id=2980) + or.2956: bits[1] = or(and.2938, and.2899, id=2956) + and.2942: bits[1] = and(or.2941, not.2940, id=2942) + or.2962: bits[1] = or(and.2943, and.2944, id=2962) + not.2963: bits[1] = not(and.2947, id=2963) + and.2953: bits[1] = and(bit_slice.2819, bit_slice.2839, id=2953) + and.2933: bits[1] = and(or.2921, not.2923, id=2933) + or.2924: bits[1] = or(and.2903, 
and.2906, id=2924) + and.2995: bits[1] = and(bit_slice.2860, bit_slice.2880, id=2995) + and.2971: bits[1] = and(or.2948, not.2950, id=2971) + or.2951: bits[1] = or(and.2929, and.2931, id=2951) + and.3017: bits[1] = and(bit_slice.2860, bit_slice.2880, id=3017) + bit_slice.2820: bits[1] = bit_slice(array_index.2814, start=5, width=1, id=2820) + bit_slice.2840: bits[1] = bit_slice(array_index.2834, start=5, width=1, id=2840) + and.2979: bits[1] = and(or.2978, not.2977, id=2979) + or.2982: bits[1] = or(bit_slice.2819, bit_slice.2839, id=2982) + not.2981: bits[1] = not(and.2980, id=2981) + or.2957: bits[1] = or(or.2956, and.2902, id=2957) + not.2958: bits[1] = not(and.2931, id=2958) + or.2959: bits[1] = or(and.2942, and.2903, id=2959) + and.2989: bits[1] = and(or.2962, not.2963, id=2989) + or.2964: bits[1] = or(bit_slice.2856, bit_slice.2876, id=2964) + and.2965: bits[1] = and(bit_slice.2856, bit_slice.2876, id=2965) + or.2952: bits[1] = or(bit_slice.2819, bit_slice.2839, id=2952) + not.2954: bits[1] = not(and.2953, id=2954) + and.2932: bits[1] = and(bit_slice.2818, bit_slice.2838, id=2932) + and.2934: bits[1] = and(and.2933, or.2924, id=2934) + bit_slice.2861: bits[1] = bit_slice(array_index.2854, start=6, width=1, id=2861) + bit_slice.2881: bits[1] = bit_slice(array_index.2874, start=6, width=1, id=2881) + or.2994: bits[1] = or(bit_slice.2860, bit_slice.2880, id=2994) + not.2996: bits[1] = not(and.2995, id=2996) + and.2970: bits[1] = and(bit_slice.2859, bit_slice.2879, id=2970) + and.2972: bits[1] = and(and.2971, or.2951, id=2972) + or.3019: bits[1] = or(bit_slice.2860, bit_slice.2880, id=3019) + not.3018: bits[1] = not(and.3017, id=3018) + and.3021: bits[1] = and(bit_slice.2820, bit_slice.2840, id=3021) + or.3002: bits[1] = or(and.2979, and.2929, id=3002) + and.2983: bits[1] = and(or.2982, not.2981, id=2983) + and.2986: bits[1] = and(or.2957, not.2958, id=2986) + or.2960: bits[1] = or(or.2959, and.2906, id=2960) + and.2990: bits[1] = and(and.2989, or.2964, id=2990) + not.2966: bits[1] = not(and.2965, id=2966) + and.2999: bits[1] = and(bit_slice.2820, bit_slice.2840, id=2999) + and.2974: bits[1] = and(or.2952, not.2954, id=2974) + or.2955: bits[1] = or(and.2932, and.2934, id=2955) + and.3033: bits[1] = and(bit_slice.2861, bit_slice.2881, id=3033) + and.3012: bits[1] = and(or.2994, not.2996, id=3012) + or.2997: bits[1] = or(and.2970, and.2972, id=2997) + and.3055: bits[1] = and(bit_slice.2861, bit_slice.2881, id=3055) + bit_slice.2821: bits[1] = bit_slice(array_index.2814, start=6, width=1, id=2821) + bit_slice.2841: bits[1] = bit_slice(array_index.2834, start=6, width=1, id=2841) + and.3020: bits[1] = and(or.3019, not.3018, id=3020) + or.3023: bits[1] = or(bit_slice.2820, bit_slice.2840, id=3023) + not.3022: bits[1] = not(and.3021, id=3022) + or.3003: bits[1] = or(or.3002, and.2931, id=3003) + not.3004: bits[1] = not(and.2972, id=3004) + or.3005: bits[1] = or(and.2983, and.2932, id=3005) + not.2961: bits[1] = not(and.2934, id=2961) + and.2987: bits[1] = and(and.2986, or.2960, id=2987) + and.2991: bits[1] = and(and.2990, not.2966, id=2991) + or.2967: bits[1] = or(bit_slice.2816, bit_slice.2836, id=2967) + and.2968: bits[1] = and(bit_slice.2816, bit_slice.2836, id=2968) + or.2998: bits[1] = or(bit_slice.2820, bit_slice.2840, id=2998) + not.3000: bits[1] = not(and.2999, id=3000) + and.2973: bits[1] = and(bit_slice.2819, bit_slice.2839, id=2973) + and.2975: bits[1] = and(and.2974, or.2955, id=2975) + bit_slice.2862: bits[1] = bit_slice(array_index.2854, start=7, width=1, id=2862) + 
bit_slice.2882: bits[1] = bit_slice(array_index.2874, start=7, width=1, id=2882) + or.3032: bits[1] = or(bit_slice.2861, bit_slice.2881, id=3032) + not.3034: bits[1] = not(and.3033, id=3034) + and.3011: bits[1] = and(bit_slice.2860, bit_slice.2880, id=3011) + and.3013: bits[1] = and(and.3012, or.2997, id=3013) + or.3057: bits[1] = or(bit_slice.2861, bit_slice.2881, id=3057) + not.3056: bits[1] = not(and.3055, id=3056) + and.3059: bits[1] = and(bit_slice.2821, bit_slice.2841, id=3059) + or.3040: bits[1] = or(and.3020, and.2970, id=3040) + and.3024: bits[1] = and(or.3023, not.3022, id=3024) + and.3027: bits[1] = and(or.3003, not.3004, id=3027) + or.3006: bits[1] = or(or.3005, and.2934, id=3006) + and.2984: bits[1] = and(or.2957, not.2958, id=2984) + and.2985: bits[1] = and(or.2960, not.2961, id=2985) + and.2988: bits[1] = and(and.2987, not.2961, id=2988) + and.2992: bits[1] = and(and.2991, or.2967, id=2992) + not.2969: bits[1] = not(and.2968, id=2969) + and.3037: bits[1] = and(bit_slice.2821, bit_slice.2841, id=3037) + and.3015: bits[1] = and(or.2998, not.3000, id=3015) + or.3001: bits[1] = or(and.2973, and.2975, id=3001) + and.3087: bits[1] = and(bit_slice.2862, bit_slice.2882, id=3087) + and.3071: bits[1] = and(bit_slice.2862, bit_slice.2882, id=3071) + and.3050: bits[1] = and(or.3032, not.3034, id=3050) + or.3035: bits[1] = or(and.3011, and.3013, id=3035) + bit_slice.2822: bits[1] = bit_slice(array_index.2814, start=7, width=1, id=2822) + bit_slice.2842: bits[1] = bit_slice(array_index.2834, start=7, width=1, id=2842) + and.3058: bits[1] = and(or.3057, not.3056, id=3058) + or.3061: bits[1] = or(bit_slice.2821, bit_slice.2841, id=3061) + not.3060: bits[1] = not(and.3059, id=3060) + or.3041: bits[1] = or(or.3040, and.2972, id=3041) + not.3042: bits[1] = not(and.3013, id=3042) + or.3043: bits[1] = or(and.3024, and.2973, id=3043) + not.3007: bits[1] = not(and.2975, id=3007) + and.3028: bits[1] = and(and.3027, or.3006, id=3028) + or.3008: bits[1] = or(and.2984, and.2985, id=3008) + not.3009: bits[1] = not(and.2988, id=3009) + and.2993: bits[1] = and(and.2992, not.2969, id=2993) + or.3036: bits[1] = or(bit_slice.2821, bit_slice.2841, id=3036) + not.3038: bits[1] = not(and.3037, id=3038) + and.3014: bits[1] = and(bit_slice.2820, bit_slice.2840, id=3014) + and.3016: bits[1] = and(and.3015, or.3001, id=3016) + or.3089: bits[1] = or(bit_slice.2862, bit_slice.2882, id=3089) + not.3088: bits[1] = not(and.3087, id=3088) + or.3070: bits[1] = or(bit_slice.2862, bit_slice.2882, id=3070) + not.3072: bits[1] = not(and.3071, id=3072) + and.3049: bits[1] = and(bit_slice.2861, bit_slice.2881, id=3049) + and.3051: bits[1] = and(and.3050, or.3035, id=3051) + and.3093: bits[1] = and(bit_slice.2822, bit_slice.2842, id=3093) + or.3078: bits[1] = or(and.3058, and.3011, id=3078) + and.3062: bits[1] = and(or.3061, not.3060, id=3062) + and.3065: bits[1] = and(or.3041, not.3042, id=3065) + or.3044: bits[1] = or(or.3043, and.2975, id=3044) + and.3025: bits[1] = and(or.3003, not.3004, id=3025) + and.3026: bits[1] = and(or.3006, not.3007, id=3026) + and.3029: bits[1] = and(and.3028, not.3007, id=3029) + and.3030: bits[1] = and(or.3008, not.3009, id=3030) + or.3010: bits[1] = or(and.2947, and.2993, id=3010) + and.3075: bits[1] = and(bit_slice.2822, bit_slice.2842, id=3075) + and.3053: bits[1] = and(or.3036, not.3038, id=3053) + or.3039: bits[1] = or(and.3014, and.3016, id=3039) + and.3090: bits[1] = and(or.3089, not.3088, id=3090) + and.3091: bits[1] = and(or.3070, not.3072, id=3091) + or.3073: bits[1] = or(and.3049, 
and.3051, id=3073) + or.3095: bits[1] = or(bit_slice.2822, bit_slice.2842, id=3095) + not.3094: bits[1] = not(and.3093, id=3094) + or.3079: bits[1] = or(or.3078, and.3013, id=3079) + not.3080: bits[1] = not(and.3051, id=3080) + or.3081: bits[1] = or(and.3062, and.3014, id=3081) + not.3045: bits[1] = not(and.3016, id=3045) + and.3066: bits[1] = and(and.3065, or.3044, id=3066) + or.3046: bits[1] = or(and.3025, and.3026, id=3046) + not.3047: bits[1] = not(and.3029, id=3047) + and.3031: bits[1] = and(and.3030, or.3010, id=3031) + or.3074: bits[1] = or(bit_slice.2822, bit_slice.2842, id=3074) + not.3076: bits[1] = not(and.3075, id=3076) + and.3052: bits[1] = and(bit_slice.2821, bit_slice.2841, id=3052) + and.3054: bits[1] = and(and.3053, or.3039, id=3054) + or.3121: bits[1] = or(and.3090, and.3049, id=3121) + and.3092: bits[1] = and(and.3091, or.3073, id=3092) + and.3096: bits[1] = and(or.3095, not.3094, id=3096) + and.3101: bits[1] = and(or.3079, not.3080, id=3101) + or.3082: bits[1] = or(or.3081, and.3016, id=3082) + and.3063: bits[1] = and(or.3041, not.3042, id=3063) + and.3064: bits[1] = and(or.3044, not.3045, id=3064) + and.3067: bits[1] = and(and.3066, not.3045, id=3067) + and.3068: bits[1] = and(or.3046, not.3047, id=3068) + or.3048: bits[1] = or(and.2988, and.3031, id=3048) + and.3097: bits[1] = and(or.3074, not.3076, id=3097) + or.3077: bits[1] = or(and.3052, and.3054, id=3077) + or.3122: bits[1] = or(or.3121, and.3051, id=3122) + not.3123: bits[1] = not(and.3092, id=3123) + or.3124: bits[1] = or(and.3096, and.3052, id=3124) + not.3083: bits[1] = not(and.3054, id=3083) + and.3102: bits[1] = and(and.3101, or.3082, id=3102) + or.3084: bits[1] = or(and.3063, and.3064, id=3084) + not.3085: bits[1] = not(and.3067, id=3085) + and.3069: bits[1] = and(and.3068, or.3048, id=3069) + and.3098: bits[1] = and(and.3097, or.3077, id=3098) + and.3147: bits[1] = and(or.3122, not.3123, id=3147) + or.3125: bits[1] = or(or.3124, and.3054, id=3125) + and.3099: bits[1] = and(or.3079, not.3080, id=3099) + and.3100: bits[1] = and(or.3082, not.3083, id=3100) + and.3103: bits[1] = and(and.3102, not.3083, id=3103) + and.3104: bits[1] = and(or.3084, not.3085, id=3104) + or.3086: bits[1] = or(and.3029, and.3069, id=3086) + not.3126: bits[1] = not(and.3098, id=3126) + and.3148: bits[1] = and(and.3147, or.3125, id=3148) + or.3127: bits[1] = or(and.3099, and.3100, id=3127) + not.3128: bits[1] = not(and.3103, id=3128) + and.3105: bits[1] = and(and.3104, or.3086, id=3105) + and.3120: bits[1] = and(or.3084, not.3085, id=3120) + and.3106: bits[1] = and(or.3046, not.3047, id=3106) + and.3107: bits[1] = and(or.3008, not.3009, id=3107) + and.3109: bits[1] = and(or.2964, not.2966, id=3109) + and.3143: bits[1] = and(bit_slice.2862, bit_slice.2882, id=3143) + and.3144: bits[1] = and(bit_slice.2822, bit_slice.2842, id=3144) + and.3145: bits[1] = and(or.3122, not.3123, id=3145) + and.3146: bits[1] = and(or.3125, not.3126, id=3146) + and.3149: bits[1] = and(and.3148, not.3126, id=3149) + and.3150: bits[1] = and(or.3127, not.3128, id=3150) + or.3129: bits[1] = or(and.3067, and.3105, id=3129) + or.3140: bits[1] = or(and.3120, and.3029, id=3140) + or.3130: bits[1] = or(and.3106, and.2988, id=3130) + or.3133: bits[1] = or(and.3107, and.2947, id=3133) + and.3110: bits[1] = and(and.3109, or.2967, id=3110) + and.3112: bits[1] = and(bit_slice.2856, bit_slice.2876, id=3112) + and.3116: bits[1] = and(bit_slice.2816, bit_slice.2836, id=3116) + or.3183: bits[1] = or(and.3143, and.3092, id=3183) + or.3158: bits[1] = or(and.3143, and.3092, 
id=3158) + or.3159: bits[1] = or(and.3144, and.3098, id=3159) + or.3160: bits[1] = or(and.3145, and.3146, id=3160) + not.3161: bits[1] = not(and.3149, id=3161) + and.3151: bits[1] = and(and.3150, or.3129, id=3151) + or.3141: bits[1] = or(or.3140, and.3069, id=3141) + not.3142: bits[1] = not(and.3105, id=3142) + or.3131: bits[1] = or(or.3130, and.3031, id=3131) + not.3132: bits[1] = not(and.3069, id=3132) + or.3134: bits[1] = or(or.3133, and.2993, id=3134) + not.3135: bits[1] = not(and.3031, id=3135) + and.3108: bits[1] = and(or.2962, not.2963, id=3108) + and.3111: bits[1] = and(and.3110, not.2969, id=3111) + or.3114: bits[1] = or(bit_slice.2856, bit_slice.2876, id=3114) + not.3113: bits[1] = not(and.3112, id=3113) + or.3118: bits[1] = or(bit_slice.2816, bit_slice.2836, id=3118) + not.3117: bits[1] = not(and.3116, id=3117) + or.3184: bits[1] = or(or.3183, and.3144, id=3184) + and.3171: bits[1] = and(or.3158, or.3159, id=3171) + and.3172: bits[1] = and(or.3160, not.3161, id=3172) + or.3162: bits[1] = or(and.3103, and.3151, id=3162) + and.3152: bits[1] = and(or.3127, not.3128, id=3152) + and.3157: bits[1] = and(or.3141, not.3142, id=3157) + and.3153: bits[1] = and(or.3131, not.3132, id=3153) + and.3154: bits[1] = and(or.3134, not.3135, id=3154) + or.3136: bits[1] = or(and.3108, and.3111, id=3136) + not.3137: bits[1] = not(and.2993, id=3137) + and.3115: bits[1] = and(or.3114, not.3113, id=3115) + and.3119: bits[1] = and(or.3118, not.3117, id=3119) + or.3185: bits[1] = or(or.3184, and.3098, id=3185) + not.3186: bits[1] = not(and.3171, id=3186) + and.3173: bits[1] = and(and.3172, or.3162, id=3173) + and.3174: bits[1] = and(or.3160, not.3161, id=3174) + or.3163: bits[1] = or(and.3152, and.3067, id=3163) + not.3170: bits[1] = not(and.3157, id=3170) + not.3166: bits[1] = not(and.3153, id=3166) + not.3167: bits[1] = not(and.3154, id=3167) + and.3155: bits[1] = and(or.3136, not.3137, id=3155) + or.3138: bits[1] = or(and.3115, and.3119, id=3138) + not.3139: bits[1] = not(and.3111, id=3139) + and.3194: bits[1] = and(or.3185, not.3186, id=3194) + and.3195: bits[1] = and(or.3185, not.3186, id=3195) + or.3187: bits[1] = or(and.3149, and.3173, id=3187) + or.3188: bits[1] = or(and.3174, and.3103, id=3188) + or.3164: bits[1] = or(or.3163, and.3105, id=3164) + not.3165: bits[1] = not(and.3151, id=3165) + and.3179: bits[1] = and(not.3170, not.3166, id=3179) + and.3176: bits[1] = and(not.3166, not.3167, id=3176) + not.3168: bits[1] = not(and.3155, id=3168) + and.3156: bits[1] = and(or.3138, not.3139, id=3156) + or.3208: bits[1] = or(and.3194, and.3149, id=3208) + and.3196: bits[1] = and(and.3195, or.3187, id=3196) + or.3189: bits[1] = or(or.3188, and.3151, id=3189) + not.3190: bits[1] = not(and.3173, id=3190) + and.3175: bits[1] = and(or.3164, not.3165, id=3175) + and.3180: bits[1] = and(and.3179, not.3167, id=3180) + literal.2843: bits[4] = literal(value=4, id=2843) + and.3177: bits[1] = and(and.3176, not.3168, id=3177) + not.3169: bits[1] = not(and.3156, id=3169) + or.3209: bits[1] = or(or.3208, and.3173, id=3209) + not.3210: bits[1] = not(and.3196, id=3210) + and.3197: bits[1] = and(or.3189, not.3190, id=3197) + not.3191: bits[1] = not(and.3175, id=3191) + and.3181: bits[1] = and(and.3180, not.3168, id=3181) + array_index.2844: bits[8] = array_index(window, indices=[literal.2843], id=2844) + and.3178: bits[1] = and(and.3177, not.3169, id=3178) + and.3218: bits[1] = and(or.3209, not.3210, id=3218) + not.3211: bits[1] = not(and.3197, id=3211) + and.3198: bits[1] = and(not.3191, not.3170, id=3198) + and.3182: 
bits[1] = and(and.3181, not.3169, id=3182) + bit_slice.2846: bits[1] = bit_slice(array_index.2844, start=1, width=1, id=2846) + or.3192: bits[1] = or(not.3170, and.3178, id=3192) + not.3232: bits[1] = not(and.3218, id=3232) + and.3219: bits[1] = and(not.3211, not.3191, id=3219) + and.3199: bits[1] = and(and.3198, not.3166, id=3199) + not.3193: bits[1] = not(and.3182, id=3193) + and.3204: bits[1] = and(bit_slice.2846, or.3192, id=3204) + and.3238: bits[1] = and(not.3232, not.3211, id=3238) + and.3220: bits[1] = and(and.3219, not.3170, id=3220) + and.3200: bits[1] = and(and.3199, not.3167, id=3200) + and.3203: bits[1] = and(or.3192, not.3193, id=3203) + and.3205: bits[1] = and(and.3204, not.3193, id=3205) + and.3239: bits[1] = and(and.3238, not.3191, id=3239) + and.3221: bits[1] = and(and.3220, not.3166, id=3221) + and.3201: bits[1] = and(and.3200, not.3168, id=3201) + or.3214: bits[1] = or(bit_slice.2846, and.3203, id=3214) + not.3215: bits[1] = not(and.3205, id=3215) + and.3206: bits[1] = and(not.3167, not.3168, id=3206) + and.3240: bits[1] = and(and.3239, not.3170, id=3240) + and.3222: bits[1] = and(and.3221, not.3167, id=3222) + and.3202: bits[1] = and(and.3201, not.3169, id=3202) + bit_slice.2847: bits[1] = bit_slice(array_index.2844, start=2, width=1, id=2847) + or.3212: bits[1] = or(not.3191, and.3182, id=3212) + and.3228: bits[1] = and(or.3214, not.3215, id=3228) + bit_slice.2845: bits[1] = bit_slice(array_index.2844, start=0, width=1, id=2845) + and.3207: bits[1] = and(and.3206, not.3169, id=3207) + and.3241: bits[1] = and(and.3240, not.3166, id=3241) + and.3223: bits[1] = and(and.3222, not.3168, id=3223) + not.3213: bits[1] = not(and.3202, id=3213) + and.3226: bits[1] = and(bit_slice.2847, or.3212, id=3226) + and.3229: bits[1] = and(and.3228, bit_slice.2845, id=3229) + or.3216: bits[1] = or(not.3166, and.3207, id=3216) + and.3242: bits[1] = and(and.3241, not.3167, id=3242) + and.3224: bits[1] = and(and.3223, not.3169, id=3224) + bit_slice.2848: bits[1] = bit_slice(array_index.2844, start=3, width=1, id=2848) + or.3233: bits[1] = or(not.3211, and.3202, id=3233) + and.3225: bits[1] = and(or.3212, not.3213, id=3225) + and.3227: bits[1] = and(and.3226, not.3213, id=3227) + and.3230: bits[1] = and(and.3229, or.3216, id=3230) + not.3217: bits[1] = not(and.3178, id=3217) + and.3243: bits[1] = and(and.3242, not.3168, id=3243) + not.3234: bits[1] = not(and.3224, id=3234) + and.3246: bits[1] = and(bit_slice.2848, or.3233, id=3246) + or.3235: bits[1] = or(bit_slice.2847, and.3225, id=3235) + not.3236: bits[1] = not(and.3227, id=3236) + and.3231: bits[1] = and(and.3230, not.3217, id=3231) + and.3244: bits[1] = and(and.3243, not.3169, id=3244) + bit_slice.2849: bits[1] = bit_slice(array_index.2844, start=4, width=1, id=2849) + or.3250: bits[1] = or(not.3232, and.3224, id=3250) + and.3245: bits[1] = and(or.3233, not.3234, id=3245) + and.3247: bits[1] = and(and.3246, not.3234, id=3247) + and.3248: bits[1] = and(or.3235, not.3236, id=3248) + or.3237: bits[1] = or(and.3205, and.3231, id=3237) + not.3251: bits[1] = not(and.3244, id=3251) + and.3256: bits[1] = and(bit_slice.2849, or.3250, id=3256) + or.3252: bits[1] = or(bit_slice.2848, and.3245, id=3252) + not.3253: bits[1] = not(and.3247, id=3253) + and.3249: bits[1] = and(and.3248, or.3237, id=3249) + and.3255: bits[1] = and(or.3250, not.3251, id=3255) + and.3257: bits[1] = and(and.3256, not.3251, id=3257) + and.3258: bits[1] = and(or.3252, not.3253, id=3258) + or.3254: bits[1] = or(and.3227, and.3249, id=3254) + and.3266: bits[1] = and(or.3252, 
not.3253, id=3266) + or.3260: bits[1] = or(bit_slice.2849, and.3255, id=3260) + not.3261: bits[1] = not(and.3257, id=3261) + and.3259: bits[1] = and(and.3258, or.3254, id=3259) + or.3276: bits[1] = or(and.3266, and.3227, id=3276) + and.3267: bits[1] = and(or.3235, not.3236, id=3267) + and.3263: bits[1] = and(or.3260, not.3261, id=3263) + and.3264: bits[1] = and(or.3260, not.3261, id=3264) + or.3262: bits[1] = or(and.3247, and.3259, id=3262) + or.3277: bits[1] = or(or.3276, and.3249, id=3277) + not.3278: bits[1] = not(and.3259, id=3278) + or.3279: bits[1] = or(and.3267, and.3205, id=3279) + and.3269: bits[1] = and(bit_slice.2845, or.3216, id=3269) + and.3271: bits[1] = and(or.3216, not.3217, id=3271) + or.3273: bits[1] = or(and.3263, and.3247, id=3273) + and.3265: bits[1] = and(and.3264, or.3262, id=3265) + and.3292: bits[1] = and(or.3277, not.3278, id=3292) + or.3280: bits[1] = or(or.3279, and.3231, id=3280) + and.3268: bits[1] = and(or.3214, not.3215, id=3268) + and.3270: bits[1] = and(and.3269, not.3217, id=3270) + and.3298: bits[1] = and(or.3277, not.3278, id=3298) + or.3284: bits[1] = or(bit_slice.2845, and.3271, id=3284) + or.3274: bits[1] = or(or.3273, and.3259, id=3274) + not.3275: bits[1] = not(and.3265, id=3275) + and.3293: bits[1] = and(and.3292, or.3280, id=3293) + not.3281: bits[1] = not(and.3249, id=3281) + and.3295: bits[1] = and(or.3277, not.3278, id=3295) + or.3282: bits[1] = or(and.3268, and.3270, id=3282) + and.3299: bits[1] = and(and.3298, or.3284, id=3299) + not.3285: bits[1] = not(and.3270, id=3285) + and.3272: bits[1] = and(not.3168, not.3169, id=3272) + and.3288: bits[1] = and(or.3274, not.3275, id=3288) + and.3289: bits[1] = and(or.3277, not.3278, id=3289) + and.3294: bits[1] = and(and.3293, not.3281, id=3294) + and.3296: bits[1] = and(and.3295, or.3282, id=3296) + not.3283: bits[1] = not(and.3231, id=3283) + and.3300: bits[1] = and(and.3299, not.3285, id=3300) + or.3286: bits[1] = or(not.3167, and.3272, id=3286) + or.3303: bits[1] = or(and.3288, and.3289, id=3303) + and.3290: bits[1] = and(or.3280, not.3281, id=3290) + or.3311: bits[1] = or(and.3288, and.3294, id=3311) + and.3297: bits[1] = and(and.3296, not.3283, id=3297) + and.3301: bits[1] = and(and.3300, or.3286, id=3301) + not.3287: bits[1] = not(and.3207, id=3287) + or.3304: bits[1] = or(or.3303, and.3290, id=3304) + and.3291: bits[1] = and(or.3282, not.3283, id=3291) + or.3306: bits[1] = or(and.3288, and.3294, id=3306) + or.3312: bits[1] = or(or.3311, and.3297, id=3312) + and.3302: bits[1] = and(and.3301, not.3287, id=3302) + or.3305: bits[1] = or(or.3304, and.3291, id=3305) + or.3307: bits[1] = or(or.3306, and.3297, id=3307) + or.3313: bits[1] = or(or.3312, and.3302, id=3313) + not.3309: bits[1] = not(or.3305, id=3309) + or.3308: bits[1] = or(or.3307, and.3302, id=3308) + or.3318: bits[1] = or(not.3168, not.3169, id=3318) + not.3319: bits[1] = not(and.3272, id=3319) + or.3314: bits[1] = or(or.3313, not.3309, id=3314) + and.3322: bits[1] = and(or.3284, not.3285, id=3322) + not.3310: bits[1] = not(or.3308, id=3310) + and.3328: bits[1] = and(or.3286, not.3287, id=3328) + and.3333: bits[1] = and(or.3318, not.3319, id=3333) + and.3338: bits[1] = and(or.3138, not.3139, id=3338) + and.3320: bits[1] = and(or.3284, not.3285, id=3320) + not.3316: bits[1] = not(or.3314, id=3316) + and.3323: bits[1] = and(and.3322, not.3310, id=3323) + and.3315: bits[1] = and(not.3310, not.3309, id=3315) + and.3326: bits[1] = and(or.3286, not.3287, id=3326) + and.3329: bits[1] = and(and.3328, not.3310, id=3329) + and.3331: bits[1] = 
and(or.3318, not.3319, id=3331) + and.3334: bits[1] = and(and.3333, not.3310, id=3334) + and.3336: bits[1] = and(or.3138, not.3139, id=3336) + and.3339: bits[1] = and(and.3338, not.3310, id=3339) + and.3341: bits[1] = and(or.3282, not.3283, id=3341) + and.3321: bits[1] = and(and.3320, not.3316, id=3321) + and.3324: bits[1] = and(and.3323, not.3309, id=3324) + not.3317: bits[1] = not(and.3315, id=3317) + and.3327: bits[1] = and(and.3326, not.3316, id=3327) + and.3330: bits[1] = and(and.3329, not.3309, id=3330) + and.3332: bits[1] = and(and.3331, not.3316, id=3332) + and.3335: bits[1] = and(and.3334, not.3309, id=3335) + and.3337: bits[1] = and(and.3336, not.3316, id=3337) + and.3340: bits[1] = and(and.3339, not.3309, id=3340) + literal.2803: bits[4] = literal(value=0, id=2803) + literal.2823: bits[4] = literal(value=2, id=2823) + literal.2863: bits[4] = literal(value=6, id=2863) + literal.2883: bits[4] = literal(value=8, id=2883) + and.3342: bits[1] = and(and.3341, not.3310, id=3342) + or.3344: bits[1] = or(and.3321, and.3324, id=3344) + and.3325: bits[1] = and(not.3316, not.3317, id=3325) + or.3346: bits[1] = or(and.3327, and.3330, id=3346) + or.3348: bits[1] = or(and.3332, and.3335, id=3348) + or.3350: bits[1] = or(and.3337, and.3340, id=3350) + array_index.2804: bits[8] = array_index(window, indices=[literal.2803], id=2804) + array_index.2824: bits[8] = array_index(window, indices=[literal.2823], id=2824) + array_index.2864: bits[8] = array_index(window, indices=[literal.2863], id=2864) + array_index.2884: bits[8] = array_index(window, indices=[literal.2883], id=2884) + literal.2801: bits[1] = literal(value=0, id=2801) + and.3343: bits[1] = and(and.3342, not.3309, id=3343) + or.3345: bits[1] = or(or.3344, and.3325, id=3345) + or.3347: bits[1] = or(or.3346, and.3325, id=3347) + or.3349: bits[1] = or(or.3348, and.3325, id=3349) + or.3351: bits[1] = or(or.3350, and.3325, id=3351) + literal.2800: bits[1] = literal(value=1, id=2800) + bit_slice.2805: bits[1] = bit_slice(array_index.2804, start=0, width=1, id=2805) + bit_slice.2806: bits[1] = bit_slice(array_index.2804, start=1, width=1, id=2806) + bit_slice.2807: bits[1] = bit_slice(array_index.2804, start=2, width=1, id=2807) + bit_slice.2808: bits[1] = bit_slice(array_index.2804, start=3, width=1, id=2808) + bit_slice.2809: bits[1] = bit_slice(array_index.2804, start=4, width=1, id=2809) + bit_slice.2810: bits[1] = bit_slice(array_index.2804, start=5, width=1, id=2810) + bit_slice.2811: bits[1] = bit_slice(array_index.2804, start=6, width=1, id=2811) + bit_slice.2812: bits[1] = bit_slice(array_index.2804, start=7, width=1, id=2812) + bit_slice.2815: bits[1] = bit_slice(array_index.2814, start=0, width=1, id=2815) + bit_slice.2825: bits[1] = bit_slice(array_index.2824, start=0, width=1, id=2825) + bit_slice.2826: bits[1] = bit_slice(array_index.2824, start=1, width=1, id=2826) + bit_slice.2827: bits[1] = bit_slice(array_index.2824, start=2, width=1, id=2827) + bit_slice.2828: bits[1] = bit_slice(array_index.2824, start=3, width=1, id=2828) + bit_slice.2829: bits[1] = bit_slice(array_index.2824, start=4, width=1, id=2829) + bit_slice.2830: bits[1] = bit_slice(array_index.2824, start=5, width=1, id=2830) + bit_slice.2831: bits[1] = bit_slice(array_index.2824, start=6, width=1, id=2831) + bit_slice.2832: bits[1] = bit_slice(array_index.2824, start=7, width=1, id=2832) + bit_slice.2835: bits[1] = bit_slice(array_index.2834, start=0, width=1, id=2835) + bit_slice.2850: bits[1] = bit_slice(array_index.2844, start=5, width=1, id=2850) + 
bit_slice.2851: bits[1] = bit_slice(array_index.2844, start=6, width=1, id=2851) + bit_slice.2852: bits[1] = bit_slice(array_index.2844, start=7, width=1, id=2852) + bit_slice.2855: bits[1] = bit_slice(array_index.2854, start=0, width=1, id=2855) + bit_slice.2865: bits[1] = bit_slice(array_index.2864, start=0, width=1, id=2865) + bit_slice.2866: bits[1] = bit_slice(array_index.2864, start=1, width=1, id=2866) + bit_slice.2867: bits[1] = bit_slice(array_index.2864, start=2, width=1, id=2867) + bit_slice.2868: bits[1] = bit_slice(array_index.2864, start=3, width=1, id=2868) + bit_slice.2869: bits[1] = bit_slice(array_index.2864, start=4, width=1, id=2869) + bit_slice.2870: bits[1] = bit_slice(array_index.2864, start=5, width=1, id=2870) + bit_slice.2871: bits[1] = bit_slice(array_index.2864, start=6, width=1, id=2871) + bit_slice.2872: bits[1] = bit_slice(array_index.2864, start=7, width=1, id=2872) + bit_slice.2875: bits[1] = bit_slice(array_index.2874, start=0, width=1, id=2875) + bit_slice.2885: bits[1] = bit_slice(array_index.2884, start=0, width=1, id=2885) + bit_slice.2886: bits[1] = bit_slice(array_index.2884, start=1, width=1, id=2886) + bit_slice.2887: bits[1] = bit_slice(array_index.2884, start=2, width=1, id=2887) + bit_slice.2888: bits[1] = bit_slice(array_index.2884, start=3, width=1, id=2888) + bit_slice.2889: bits[1] = bit_slice(array_index.2884, start=4, width=1, id=2889) + bit_slice.2890: bits[1] = bit_slice(array_index.2884, start=5, width=1, id=2890) + bit_slice.2891: bits[1] = bit_slice(array_index.2884, start=6, width=1, id=2891) + bit_slice.2892: bits[1] = bit_slice(array_index.2884, start=7, width=1, id=2892) + ret concat.3352: bits[8] = concat(literal.2801, literal.2801, literal.2801, and.3343, or.3345, or.3347, or.3349, or.3351, id=3352) +} diff --git a/cudax/test/stf/fhe/parse_arctyrex.cu b/cudax/test/stf/fhe/parse_arctyrex.cu new file mode 100644 index 00000000000..4d344054e67 --- /dev/null +++ b/cudax/test/stf/fhe/parse_arctyrex.cu @@ -0,0 +1,433 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+//
+//===----------------------------------------------------------------------===//
+
+/**
+ * @file
+ * @brief This shows how we can asynchronously compose a sequence of operations
+ * described with an IR
+ */
+
+#include <cuda/experimental/stf.cuh>
+
+#include <fstream>
+#include <sstream>
+
+using namespace cuda::experimental::stf;
+
+using logical_slice = logical_data<slice<double>>;
+
+static __global__ void cuda_sleep_kernel(long long int clock_cnt)
+{
+  long long int start_clock = clock64();
+  long long int clock_offset = 0;
+  while (clock_offset < clock_cnt)
+  {
+    clock_offset = clock64() - start_clock;
+  }
+}
+
+void cuda_sleep(double ms, cudaStream_t stream)
+{
+  int device;
+  cudaGetDevice(&device);
+
+  // cudaDevAttrClockRate: Peak clock frequency in kilohertz;
+  int clock_rate;
+  cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, device);
+
+  long long int clock_cnt = (long long int) (ms * clock_rate);
+  cuda_sleep_kernel<<<1, 1, 0, stream>>>(clock_cnt);
+}
+
+const double sleep_time = 1.0;
+
+// z = LITERAL(length, value)
+template <typename Ctx>
+logical_slice LITERAL(Ctx& ctx, size_t n, int, std::string out_symbol = "undefined")
+{
+  auto z = ctx.logical_data(shape_of<slice<double>>(n));
+  z.set_symbol(out_symbol);
+
+  ctx.task(z.write()).set_symbol("LITERAL")->*[](cudaStream_t stream, auto /*unused*/) {
+    cuda_sleep(sleep_time, stream);
+  };
+
+  return z;
+}
+
+// z = OR(x,y)
+template <typename Ctx>
+logical_slice OR(Ctx& ctx, logical_slice x, logical_slice y, std::string out_symbol = "undefined")
+{
+  assert(x.shape().size() == y.shape().size());
+
+  auto z = ctx.logical_data(x.shape());
+  z.set_symbol(out_symbol);
+
+  ctx.task(x.read(), y.read(), z.write()).set_symbol("OR")->*
+    [](cudaStream_t stream, auto /*unused*/, auto /*unused*/, auto /*unused*/) {
+      cuda_sleep(sleep_time, stream);
+    };
+
+  return z;
+}
+
+// z = AND(x,y)
+template <typename Ctx>
+logical_slice AND(Ctx& ctx, logical_slice x, logical_slice y, std::string out_symbol = "undefined")
+{
+  assert(x.shape().size() == y.shape().size());
+
+  auto z = ctx.logical_data(x.shape());
+  z.set_symbol(out_symbol);
+
+  ctx.task(x.read(), y.read(), z.write()).set_symbol("AND")->*
+    [](cudaStream_t stream, auto /*unused*/, auto /*unused*/, auto /*unused*/) {
+      cuda_sleep(sleep_time, stream);
+    };
+
+  return z;
+}
+
+template <typename Ctx>
+logical_slice
+ARRAY_INDEX(Ctx& ctx, logical_slice x, logical_slice /*unused*/, size_t sz, std::string out_symbol = "undefined")
+{
+  auto z = ctx.logical_data(shape_of<slice<double>>(sz));
+  z.set_symbol(out_symbol);
+
+  ctx.task(x.read(), z.write()).set_symbol("ARRAY INDEX")->*[](cudaStream_t stream, auto /*unused*/, auto /*unused*/) {
+    cuda_sleep(sleep_time, stream);
+  };
+
+  return z;
+}
+
+// z = BIT_SLICE(x, position, size)
+template <typename Ctx>
+logical_slice BIT_SLICE(Ctx& ctx, logical_slice x, size_t /*unused*/, size_t sz, std::string out_symbol = "undefined")
+{
+  auto z = ctx.logical_data(shape_of<slice<double>>(sz));
+  z.set_symbol(out_symbol);
+
+  ctx.task(x.read(), z.write()).set_symbol("BIT SLICE")->*[](cudaStream_t stream, auto /*unused*/, auto /*unused*/) {
+    cuda_sleep(sleep_time, stream);
+  };
+
+  return z;
+}
+
+// y = NOT(x)
+template <typename Ctx>
+logical_slice NOT(Ctx& ctx, logical_slice x, std::string out_symbol = "undefined")
+{
+  auto y = ctx.logical_data(x.shape());
+  y.set_symbol(out_symbol);
+
+  ctx.task(x.read(), y.write()).set_symbol("NOT")->*[](cudaStream_t stream, auto /*unused*/, auto /*unused*/) {
+    cuda_sleep(sleep_time, stream);
+  };
+
+  return y;
+}
+
+// y = CONCAT(sz, vector<> inputs)
+template <typename Ctx>
+logical_slice CONCAT(Ctx& ctx, size_t sz, std::vector<logical_slice> inputs, std::string out_symbol = "undefined")
+{
+  auto y = ctx.logical_data(shape_of<slice<double>>(sz));
+  y.set_symbol(out_symbol);
+
+  auto t = ctx.task();
+  t.add_deps(y.write());
+  t.set_symbol("CONCAT");
+  for (auto& input : inputs)
+  {
+    t.add_deps(input.read());
+  }
+
+  t->*[](cudaStream_t stream) {
+    cuda_sleep(sleep_time, stream);
+  };
+
+  return y;
+}
+
+template <typename Ctx>
+void run(const char* inputfile)
+{
+  // Find the handle from its symbol
+  std::map<std::string, logical_slice> logical_slices;
+  std::string output_data_symbol;
+  std::ifstream read(inputfile);
+  Ctx ctx;
+
+  // Indicates if we are parsing the body of the circuit
+  bool in_body = false;
+
+  for (std::string line; std::getline(read, line);)
+  {
+    std::stringstream ss(line);
+
+    // std::cout << "LINE : " << line << std::endl;
+    if (!in_body)
+    {
+      std::string token;
+      ss >> token;
+      // std::cout << "TOKEN : " << token << std::endl;
+
+      if (token == "fn")
+      {
+        // We are parsing the declaration of the function, this starts the body
+        // std::cout << "GOT DECLARATION " << line << std::endl;
+        in_body = true;
+
+        // Look for parameters
+        size_t begin_params, end_params;
+        begin_params = line.find('(');
+        end_params = line.find(')');
+        std::string params = line.substr(begin_params + 1, end_params - begin_params - 1);
+
+        // std::cout << "PARAMS = " << params << std::endl;
+
+        // Parse parameters which are separated by a comma, format = "symbol: type"
+        while (true)
+        {
+          // Find symbol
+          size_t pos;
+          pos = params.find(":");
+          std::string symbol = params.substr(0, pos);
+          // std::cout << symbol << std::endl;
+
+          // We create a dummy allocation so that the data handles refer to actually allocated host memory
+          double* dummy = new double[1];
+          auto param_handle = ctx.logical_data(make_slice(dummy, 1));
+
+          param_handle.set_symbol(symbol);
+          logical_slices[symbol] = param_handle;
+
+          pos = params.find(", ");
+          if (pos == std::string::npos)
+          {
+            break;
+          }
+
+          params.erase(0, pos + 2);
+        }
+      }
+    }
+    else
+    {
+      std::string token;
+      ss >> token;
+      // std::cout << "TOKEN : " << token << std::endl;
+
+      if (token == "}")
+      {
+        // This closes the body
+        in_body = false;
+        continue;
+      }
+
+      // We expect lines of the format : " symbol: type = gate_name(..., id=VALUE)"
+      size_t end_symbol = line.find(":");
+
+      // We look for the first "= " to find the gate name
+      size_t gate_symbol_pos = line.find("= ");
+      std::string gate = line.substr(gate_symbol_pos + 2);
+
+      size_t gate_name_end = gate.find("(");
+      std::string gate_symbol = gate.substr(0, gate_name_end);
+      std::string gate_args = gate.substr(gate_name_end + 1, gate.size() - gate_name_end - 2);
+      std::string gate_outvar_symbol = line.substr(2, end_symbol - 2);
+
+      // Possibly remove the "ret" out of the gate_outvar_symbol
+      size_t ret_pos = gate_outvar_symbol.find("ret ");
+      if (ret_pos != std::string::npos)
+      {
+        // This is our result !
+        gate_outvar_symbol.erase(4);
+
+        output_data_symbol = gate_outvar_symbol;
+      }
+
+      // std::cout << "GATE OUT SYMBOL " << gate_outvar_symbol << std::endl;
+      // std::cout << "GATE DESCRIPTION : " << gate << std::endl;
+      // std::cout << "GATE SYMBOL " << gate_symbol << std::endl;
+      // std::cout << "GATE ARGS " << gate_args << std::endl;
+
+      // We now dispatch between the different gates
+      if (gate_symbol == "literal")
+      {
+        // literal.916: bits[1] = literal(value=1, id=916)
+        int value = 42; // TODO parse
+        size_t sz = 1;
+
+        logical_slices[gate_outvar_symbol] = LITERAL(ctx, sz, value, gate_outvar_symbol);
+
+        continue;
+      }
+
+      if (gate_symbol == "or")
+      {
+        // or.1268: bits[1] = or(or.1267, and.1236, id=1268)
+        size_t pos;
+        pos = gate_args.find(", ");
+        std::string symbol_left = gate_args.substr(0, pos);
+        gate_args.erase(0, pos + 2);
+
+        pos = gate_args.find(", ");
+        std::string symbol_right = gate_args.substr(0, pos);
+        gate_args.erase(0, pos + 2);
+
+        // std::cout << "OR GATE on symbols" << symbol_left << " AND " << symbol_right << std::endl;
+
+        auto data_left = logical_slices[symbol_left];
+        auto data_right = logical_slices[symbol_right];
+        logical_slices[gate_outvar_symbol] = OR(ctx, data_left, data_right, gate_outvar_symbol);
+
+        continue;
+      }
+
+      if (gate_symbol == "and")
+      {
+        size_t pos;
+        pos = gate_args.find(", ");
+        std::string symbol_left = gate_args.substr(0, pos);
+        gate_args.erase(0, pos + 2);
+
+        pos = gate_args.find(", ");
+        std::string symbol_right = gate_args.substr(0, pos);
+        gate_args.erase(0, pos + 2);
+
+        auto data_left = logical_slices[symbol_left];
+        auto data_right = logical_slices[symbol_right];
+        logical_slices[gate_outvar_symbol] = AND(ctx, data_left, data_right, gate_outvar_symbol);
+
+        continue;
+      }
+
+      if (gate_symbol == "bit_slice")
+      {
+        // bit_slice.936: bits[1] = bit_slice(y, start=15, width=1, id=936)
+        size_t pos;
+        pos = gate_args.find(", ");
+        std::string symbol_in = gate_args.substr(0, pos);
+        gate_args.erase(0, pos + 2);
+
+        // hardcoded ...
+        size_t sz = 1;
+
+        auto data_in = logical_slices[symbol_in];
+        logical_slices[gate_outvar_symbol] = BIT_SLICE(ctx, data_in, 42, sz, gate_outvar_symbol);
+
+        // std::cout << "PRODUCED DATA FOR " << gate_outvar_symbol << std::endl;
+
+        continue;
+      }
+
+      if (gate_symbol == "array_index")
+      {
+        // array_index.2804: bits[8] = array_index(window, indices=[literal.2803], id=2804)
+        size_t pos;
+        pos = gate_args.find(", ");
+        std::string symbol_in = gate_args.substr(0, pos);
+        gate_args.erase(0, pos + 2);
+
+        pos = gate_args.find(", ");
+        std::string symbol_indices = gate_args.substr(0, pos);
+        gate_args.erase(0, pos + 2);
+
+        size_t pos_beg = symbol_indices.find("[");
+        size_t pos_end = symbol_indices.find("]");
+        std::string symbol_in_2 = symbol_indices.substr(pos_beg + 1, pos_end - pos_beg - 1);
+        // std::cout << "ARRAY INDEX ... INDEX = " << symbol_in_2 << std::endl;
+
+        // hardcoded ...
+ size_t sz = 1; + + auto data_in = logical_slices[symbol_in]; + auto data_in_2 = logical_slices[symbol_in_2]; + logical_slices[gate_outvar_symbol] = ARRAY_INDEX(ctx, data_in, data_in_2, sz, gate_outvar_symbol); + + // std::cout << "PRODUCED DATA FOR " << gate_outvar_symbol << std::endl; + continue; + } + + if (gate_symbol == "not") + { + // not.953: bits[1] = not(bit_slice.936, id=953) + size_t pos; + pos = gate_args.find(", "); + std::string symbol_in = gate_args.substr(0, pos); + gate_args.erase(0, pos + 2); + + auto data_in = logical_slices[symbol_in]; + logical_slices[gate_outvar_symbol] = NOT(ctx, data_in, gate_outvar_symbol); + + continue; + } + + if (gate_symbol == "concat") + { + // ret concat.1269: bits[16] = concat(or.1238, or.1240, or.1242, or.1244, or.1246, or.1248, or.1250, + // or.1252, or.1254, or.1256, or.1258, or.1260, or.1262, or.1264, or.1266, or.1268, id=1269) + // Remove the end ", id =.." + size_t id_pos = gate_args.find(", id="); + gate_args = gate_args.substr(0, id_pos); + + std::vector inputs; + size_t pos; + + while (true) + { + pos = gate_args.find(", "); + if (pos == std::string::npos) + { + break; + } + + std::string symbol = gate_args.substr(0, pos); + inputs.push_back(logical_slices[symbol]); + gate_args.erase(0, pos + 2); + + // std::cout << "CONCAT ARG = " << symbol << std::endl; + } + + size_t sz = 1; + logical_slices[gate_outvar_symbol] = CONCAT(ctx, sz, inputs, gate_outvar_symbol); + continue; + } + + std::cout << "UNRECOGNIZED GATE !" << std::endl; + abort(); + } + } + + auto output_data = logical_slices[output_data_symbol]; + + ctx.finalize(); +} + +int main(int argc, char** argv) +{ + /* Until we find a simple and "safe" way to pass a file to the test, we + * consider this is not an error ... + * One possible approach would be to convert a default .ir file to a large + * static data processed in the test suite ? + */ + if (argc < 2) + { + fprintf(stderr, "This test needs an input file, skipping.\n"); + return 0; + } + run(argv[1]); + run(argv[1]); +} diff --git a/cudax/test/stf/freeze/constant_logical_data.cu b/cudax/test/stf/freeze/constant_logical_data.cu new file mode 100644 index 00000000000..ae5fe04aed4 --- /dev/null +++ b/cudax/test/stf/freeze/constant_logical_data.cu @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Helper to build constant data based on frozen logical data + * + */ + +#include +#include + +using namespace cuda::experimental::stf; + +int main() +{ + stream_ctx ctx; + + const int N = 16; + + /* Create a constant value */ + auto ld_cst = ctx.logical_data(shape_of>(N)); + ctx.parallel_for(ld_cst.shape(), ld_cst.write())->*[] __device__(size_t i, slice res) { + res(i) = 18 * i - 9; + }; + auto cst = constant_logical_data(ctx, mv(ld_cst)); + + int X[N]; + + for (int i = 0; i < N; i++) + { + X[i] = 5 * i - 3; + } + + auto lX = ctx.logical_data(X).set_symbol("X"); + + for (size_t iter = 0; iter < 4; iter++) + { + auto cst_slice = cst.get(); + ctx.parallel_for(lX.shape(), lX.rw()).set_symbol("X+=cst")->*[cst_slice] __device__(size_t i, auto x) { + x(i) += cst_slice(i); + }; + + auto cst2 = run_once()->*[&]() { + auto ld = ctx.logical_data(shape_of>(N)); + ctx.parallel_for(ld.shape(), ld.write())->*[] __device__(size_t i, slice res) { + res(i) = 4 * i - 2; + }; + return constant_logical_data(ctx, mv(ld)); + }; + + auto cst2_slice = cst2.get(); + ctx.parallel_for(lX.shape(), lX.rw()).set_symbol("X+=cst2")->*[cst2_slice] __device__(size_t i, auto x) { + x(i) += cst2_slice(i); + }; + } + + ctx.finalize(); + + for (int i = 0; i < N; i++) + { + EXPECT(X[i] == (5 * i - 3) + 4 * (18 * i - 9) + 4 * (4 * i - 2)); + } +} diff --git a/cudax/test/stf/freeze/freeze.cu b/cudax/test/stf/freeze/freeze.cu new file mode 100644 index 00000000000..f1da3e9808a --- /dev/null +++ b/cudax/test/stf/freeze/freeze.cu @@ -0,0 +1,82 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Freeze data in read-only fashion + * + */ + +#include +#include + +using namespace cuda::experimental::stf; + +int X0(int i) +{ + return 17 * i + 45; +} + +__global__ void print(slice s) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < s.size(); i += nthreads) + { + printf("%d %d\n", i, s(i)); + } +} + +int main() +{ + stream_ctx ctx; + + cudaStream_t stream = ctx.pick_stream(); + + const int N = 16; + int X[N]; + + for (int i = 0; i < N; i++) + { + X[i] = X0(i); + } + + auto lX = ctx.logical_data(X).set_symbol("X"); + auto lY = ctx.logical_data(lX.shape()).set_symbol("Y"); + + ctx.parallel_for(lX.shape(), lX.rw()).set_symbol("X=2X")->*[] __device__(size_t i, auto x) { + x(i) *= 2; + }; + + auto fx = ctx.freeze(lX); + + auto dX = fx.get(data_place::current_device(), stream); + + print<<<8, 4, 0, stream>>>(dX); + + ctx.parallel_for(lX.shape(), lX.read(), lY.write()).set_symbol("Y=X")->*[] __device__(size_t i, auto x, auto y) { + y(i) = x(i); + }; + + fx.unfreeze(stream); + + ctx.parallel_for(lX.shape(), lX.rw()).set_symbol("X+=1")->*[] __device__(size_t i, auto x) { + x(i) += 1; + }; + + ctx.finalize(); + + for (int i = 0; i < N; i++) + { + EXPECT(X[i] == 2 * X0(i) + 1); + } +} diff --git a/cudax/test/stf/freeze/freeze_rw.cu b/cudax/test/stf/freeze/freeze_rw.cu new file mode 100644 index 00000000000..d9dd67c3c7e --- /dev/null +++ b/cudax/test/stf/freeze/freeze_rw.cu @@ -0,0 +1,83 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Freeze data in read-only fashion + * + */ + +#include +#include + +using namespace cuda::experimental::stf; + +int X0(int i) +{ + return 17 * i + 45; +} + +__global__ void mult(slice s, int val) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < s.size(); i += nthreads) + { + s(i) *= val; + } +} + +int main() +{ + stream_ctx ctx; + + cudaStream_t stream = ctx.pick_stream(); + + const int N = 16; + int X[N]; + + for (int i = 0; i < N; i++) + { + X[i] = X0(i); + } + + auto lX = ctx.logical_data(X).set_symbol("X"); + auto lY = ctx.logical_data(lX.shape()).set_symbol("Y"); + + for (int k = 0; k < 4; k++) + { + auto fx = ctx.freeze(lX, access_mode::rw, data_place::current_device()); + auto dX = fx.get(data_place::current_device(), stream); + mult<<<8, 4, 0, stream>>>(dX, 4); + fx.unfreeze(stream); + + ctx.parallel_for(lX.shape(), lX.read(), lY.write()).set_symbol("Y=X")->*[] __device__(size_t i, auto x, auto y) { + y(i) = x(i); + }; + + ctx.parallel_for(lX.shape(), lY.rw()).set_symbol("Y+=1")->*[] __device__(size_t i, auto y) { + y(i) += 1; + }; + + // ctx.host_launch(lX.read(), lY.read())->*[](auto x, auto y) { + // for (int i = 0; i < x.size(); i++) { + // EXPECT(x(i) == 2*X0(i) + 4); + // } + // + // for (int i = 0; i < y.size(); i++) { + // EXPECT(y(i) == 2*X0(i) + 4); + // } + // }; + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/freeze/task_fence.cu b/cudax/test/stf/freeze/task_fence.cu new file mode 100644 index 00000000000..573dab14638 --- /dev/null +++ b/cudax/test/stf/freeze/task_fence.cu @@ -0,0 +1,98 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Freeze data in read-only fashion + * + */ + +#include +#include + +using namespace cuda::experimental::stf; + +int X0(int i) +{ + return 17 * i + 45; +} + +__global__ void print(slice s) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < s.size(); i += nthreads) + { + printf("%d %d\n", i, s(i)); + } +} + +int main() +{ + stream_ctx ctx; + + const int N = 16; + int X[N]; + + for (int i = 0; i < N; i++) + { + X[i] = X0(i); + } + + auto lX = ctx.logical_data(X).set_symbol("X"); + auto lY = ctx.logical_data(lX.shape()).set_symbol("Y"); + + ctx.parallel_for(lX.shape(), lX.rw()).set_symbol("X=2X")->*[] __device__(size_t i, auto x) { + x(i) *= 2; + }; + + // test 1 : implicit sync of gets + { + auto fx = ctx.freeze(lX); + auto [dX, _] = fx.get(data_place::current_device()); + + // the stream returned by task_fence should depend on the get operation + auto stream2 = ctx.task_fence(); + print<<<8, 4, 0, stream2>>>(dX); + + ctx.parallel_for(lX.shape(), lX.read(), lY.write()).set_symbol("Y=X")->*[] __device__(size_t i, auto x, auto y) { + y(i) = x(i); + }; + + fx.unfreeze(stream2); + } + + // test 2 : unfreeze with no events due to user sync + { + auto fx = ctx.freeze(lX); + auto [dX, _] = fx.get(data_place::current_device()); + + // the stream returned by task_fence should depend on the get operation + auto stream2 = ctx.task_fence(); + print<<<8, 4, 0, stream2>>>(dX); + + // We synchronize so there is nothing to depend on anymore + cudaStreamSynchronize(stream2); + fx.unfreeze(event_list()); + } + + ctx.parallel_for(lX.shape(), lX.rw()).set_symbol("X+=1")->*[] __device__(size_t i, auto x) { + x(i) += 1; + }; + + ctx.finalize(); + + for (int i = 0; i < N; i++) + { + EXPECT(X[i] == 2 * X0(i) + 1); + } +} diff --git a/cudax/test/stf/gnu/06-pdgemm.cpp b/cudax/test/stf/gnu/06-pdgemm.cpp new file mode 100644 index 00000000000..1ae2f363c14 --- /dev/null +++ b/cudax/test/stf/gnu/06-pdgemm.cpp @@ -0,0 +1,371 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief An example that implements a tiled matrix product over multiple devices using CUBLAS + * + * This also illustrates how the same code base can be used both with a + * stream_ctx and a graph_ctx backend. 
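+ *
+ * As an illustrative sketch (the instantiations below are assumed, matching the
+ * templated run() driver in this file), both backends can be exercised as:
+ *
+ *   run<stream_ctx>(N, NB);
+ *   run<graph_ctx>(N, NB);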
+ */ + +#include +#include + +#include + +#define TILED + +using namespace cuda::experimental::stf; + +static std::unordered_map cublas_handles; + +/* Get a CUBLAS handle valid on the current device, or initialize it lazily */ +cublasHandle_t get_cublas_handle() +{ + int dev; + cuda_safe_call(cudaGetDevice(&dev)); + + auto& result = cublas_handles[dev]; + if (result == cublasHandle_t()) + { // not found, default value inserted + // Lazy initialization, and save the handle for future use + cuda_safe_call(cublasCreate(&result)); + } + return result; +} + +template +class matrix +{ +public: + template + matrix( + Ctx& ctx, size_t NROWS, size_t NCOLS, size_t BLOCKSIZE_ROWS, size_t BLOCKSIZE_COLS, const char* _symbol = "matrix") + { + symbol = _symbol; + + m = NROWS; + mb = BLOCKSIZE_ROWS; + + n = NCOLS; + nb = BLOCKSIZE_COLS; + + assert(m % mb == 0); + assert(n % nb == 0); + + size_t s = ((size_t) m) * ((size_t) n) * sizeof(T); + // cuda_safe_call(cudaMallocHost(&h_array, m*n*sizeof(T))); + // fprintf(stderr, "Allocating %ld x %ld x %ld = %ld bytes (%f GB) on host for %s\n", m, n, sizeof(T), s, + // s / (1024.0 * 1024.0 * 1024.0), _symbol); + h_array = (T*) malloc(s); + assert(h_array); + cuda_safe_call(cudaHostRegister(h_array, s, cudaHostRegisterPortable)); + + // Compute the number of blocks + mt = m / mb; + nt = n / nb; + + handles.resize(mt * nt); + + for (size_t colb = 0; colb < nt; colb++) + { + for (size_t rowb = 0; rowb < mt; rowb++) + { + T* addr_h = get_block_h(rowb, colb); + +#ifdef TILED + // tiles are stored contiguously + const size_t ld = mb; +#else + const size_t ld = m; +#endif + + std::ignore = ld; // avoid warning #177-D: variable "ld" was declared but never referenced + auto s = make_slice(addr_h, std::tuple{mb, nb}, ld); + auto tile = ctx.logical_data(s); + + tile.set_symbol(std::string(symbol) + "_" + std::to_string(rowb) + "_" + std::to_string(colb)); + + handles[rowb + colb * mt] = std::move(tile); + } + } + + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + for (int a = 1; a * a <= ndevs; a++) + { + if (ndevs % a == 0) + { + grid_p = a; + grid_q = ndevs / a; + } + } + + assert(grid_p * grid_q == ndevs); + + // std::cout << "FOUND " << ndevs << " DEVICES " + // << "p=" << grid_p << " q=" << grid_q << std::endl; + } + + int get_preferred_devid(int row, int col) + { + return (row % grid_p) + (col % grid_q) * grid_p; + } + + logical_data>& get_handle(int row, int col) + { + return handles[row + col * mt]; + } + + size_t get_index(size_t row, size_t col) + { +#ifdef TILED + // Find which tile contains this element + int tile_row = row / mb; + int tile_col = col / nb; + + size_t tile_size = mb * nb; + + // Look for the index of the begining of the tile + size_t tile_start = (tile_row + mt * tile_col) * tile_size; + + // Offset within the tile + size_t offset = (row % mb) + (col % nb) * mb; + + return tile_start + offset; +#else + return row + col * m; +#endif + } + + T* get_block_h(int brow, int bcol) + { + size_t index = get_index(brow * mb, bcol * nb); + return &h_array[index]; + } + + // Fill with func(Matrix*,row, col) + void fill(T (*func)(matrix*, int, int)) + { + // Fill blocks by blocks + for (int colb = 0; colb < nt; colb++) + { + for (int rowb = 0; rowb < mt; rowb++) + { + T* addr_h = get_block_h(rowb, colb); +#ifdef TILED + // tiles are stored contiguously + int ld = mb; +#else + int ld = m; +#endif + + for (int lrow = 0; lrow < mb; lrow++) + { + for (int lcol = 0; lcol < nb; lcol++) + { + size_t row = lrow + rowb * mb; + size_t col = lcol + colb * nb; + + T val = 
func(this, row, col); + + addr_h[lrow + lcol * ld] = val; + } + } + } + } + } + + T* h_array; + size_t m; // nrows + size_t n; // ncols + + size_t mb; // block size (rows) + size_t nb; // block size (cols) + + size_t mt; // numter of column blocks + size_t nt; // numter of row blocks + + // abstract data handles + std::vector>> handles; + + const char* symbol; + + // for the mapping + int ndevs; + int grid_p, grid_q; +}; + +template +void DGEMM( + Ctx& ctx, + cublasOperation_t transa, + cublasOperation_t transb, + double alpha, + matrix& A, + int A_row, + int A_col, + matrix& B, + int B_row, + int B_col, + double beta, + matrix& C, + int C_row, + int C_col) +{ + auto dev = exec_place::device(C.get_preferred_devid(C_row, C_col)); + + auto t = ctx.task( + dev, A.get_handle(A_row, A_col).read(), B.get_handle(B_row, B_col).read(), C.get_handle(C_row, C_col).rw()); + t.set_symbol("DGEMM"); + + t->*[&](cudaStream_t stream, auto tA, auto tB, auto tC) { + cuda_safe_call(cublasSetStream(get_cublas_handle(), stream)); + int k = tA.extent(transa == CUBLAS_OP_N ? 1 : 0); + cuda_safe_call(cublasDgemm( + get_cublas_handle(), + transa, + transb, + tC.extent(0), + tC.extent(1), + k, + &alpha, + tA.data_handle(), + tA.stride(1), + tB.data_handle(), + tB.stride(1), + &beta, + tC.data_handle(), + tC.stride(1))); + }; +} + +template +void PDGEMM(Ctx& ctx, + cublasOperation_t transa, + cublasOperation_t transb, + double alpha, + matrix& A, + matrix& B, + double beta, + matrix& C) +{ + for (int m = 0; m < C.mt; m++) + { + for (int n = 0; n < C.nt; n++) + { + //========================================= + // alpha*A*B does not contribute; scale C + //========================================= + int inner_k = transa == CUBLAS_OP_N ? A.n : A.m; + if (alpha == 0.0 || inner_k == 0) + { + DGEMM(ctx, transa, transb, alpha, A, 0, 0, B, 0, 0, beta, C, m, n); + } + else if (transa == CUBLAS_OP_N) + { + //================================ + // CUBLAS_OP_N / CUBLAS_OP_N + //================================ + if (transb == CUBLAS_OP_N) + { + assert(A.nt == B.mt); + for (int k = 0; k < A.nt; k++) + { + double zbeta = k == 0 ? beta : 1.0; + DGEMM(ctx, transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); + } + } + //===================================== + // CUBLAS_OP_N / CUBLAS_OP_T + //===================================== + else + { + for (int k = 0; k < A.nt; k++) + { + double zbeta = k == 0 ? beta : 1.0; + DGEMM(ctx, transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); + } + } + } + else + { + //===================================== + // CUBLAS_OP_T / CUBLAS_OP_N + //===================================== + if (transb == CUBLAS_OP_N) + { + for (int k = 0; k < A.mt; k++) + { + double zbeta = k == 0 ? beta : 1.0; + DGEMM(ctx, transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); + } + } + //========================================== + // CUBLAS_OP_T / CUBLAS_OP_T + //========================================== + else + { + for (int k = 0; k < A.mt; k++) + { + double zbeta = k == 0 ? 
beta : 1.0; + DGEMM(ctx, transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); + } + } + } + } + } +} + +double hilbert(matrix* mat, int row, int col) +{ + return 1.0 / (col + row + 1.0) + 2.0 * mat->n * (col == row); +} + +template +void run(size_t N, size_t NB) +{ + /* This is the CUDASTF context */ + Ctx ctx; + + matrix A(ctx, N, N, NB, NB, "A"); + matrix B(ctx, N, N, NB, NB, "B"); + matrix C(ctx, N, N, NB, NB, "C"); + + // (Hilbert matrix + 2*N*Id) to have a diagonal dominant matrix + A.fill(hilbert); + B.fill(hilbert); + C.fill(hilbert); + + PDGEMM(ctx, CUBLAS_OP_N, CUBLAS_OP_N, 1.0, A, B, -2.0, C); + + ctx.finalize(); +} + +int main(int argc, char** argv) +{ + size_t N = 1024; + size_t NB = 128; + + if (argc > 1) + { + N = atoi(argv[1]); + } + + if (argc > 2) + { + NB = atoi(argv[2]); + } + + assert(N % NB == 0); + + run(N, NB); + run(N, NB); +} diff --git a/cudax/test/stf/gnu/07-cholesky.cpp b/cudax/test/stf/gnu/07-cholesky.cpp new file mode 100644 index 00000000000..011de211e5c --- /dev/null +++ b/cudax/test/stf/gnu/07-cholesky.cpp @@ -0,0 +1,718 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief This example implements a Cholesky decomposition over multiple devices using CUBLAS and CUSOLVER + * + * It also illustrates how we can use CUDASTF to allocate temporary data for CUSOLVER in CUDASTF tasks + */ + +#include + +#include + +#define TILED + +using namespace cuda::experimental::stf; + +// Global for the sake of simplicity ! 
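+// Note: this single global stream_ctx is shared by all the task-wrapper
+// routines below (DPOTRF, DTRSM, DSYRK, DGEMM and the PD* drivers), so tasks
+// can be submitted without passing a context argument through every call.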
+stream_ctx ctx; + +/* Get a CUBLAS handle valid on the current device, or initialize it lazily */ +cublasHandle_t& get_cublas_handle() +{ + int dev; + cuda_safe_call(cudaGetDevice(&dev)); + + static std::unordered_map cublas_handles; + auto& result = cublas_handles[dev]; + if (result == cublasHandle_t()) + { // not found, default value inserted + // Lazy initialization, and save the handle for future use + cuda_safe_call(cublasCreate(&result)); + } + return result; +} + +/* Get a CUSOLVER handle valid on the current device, or initialize it lazily */ +cusolverDnHandle_t& get_cusolver_handle() +{ + int dev; + cuda_safe_call(cudaGetDevice(&dev)); + + static std::unordered_map cusolver_handles; + auto& result = cusolver_handles[dev]; + if (result == cusolverDnHandle_t()) + { // not found, default value inserted + // Lazy initialization, and save the handle for future use + cuda_safe_call(cusolverDnCreate(&result)); + } + return result; +} + +template +class matrix +{ +public: + matrix(int NROWS, int NCOLS, int BLOCKSIZE_ROWS, int BLOCKSIZE_COLS, bool is_sym, const char* _symbol = "matrix") + { + symbol = _symbol; + + sym_matrix = is_sym; + + m = NROWS; + mb = BLOCKSIZE_ROWS; + + n = NCOLS; + nb = BLOCKSIZE_COLS; + + assert(m % mb == 0); + assert(n % nb == 0); + + // cuda_safe_call(cudaMallocHost(&h_array, m*n*sizeof(T))); + // fprintf(stderr, "Allocating %ld x %ld x %ld = %ld bytes (%f GB) on host for %s\n", m, n, sizeof(T), s, + // s / (1024.0 * 1024.0 * 1024.0), _symbol); + h_array.resize(m * n); + cuda_safe_call(cudaHostRegister(&h_array[0], h_array.size() * sizeof(T), cudaHostRegisterPortable)); + + // Compute the number of blocks + mt = m / mb; + nt = n / nb; + + handles.resize(mt * nt); + + for (int colb = 0; colb < nt; colb++) + { + int low_rowb = sym_matrix ? colb : 0; + for (int rowb = low_rowb; rowb < mt; rowb++) + { + T* addr_h = get_block_h(rowb, colb); + auto& h = handle(rowb, colb); + +#ifdef TILED + // tiles are stored contiguously + size_t ld = mb; +#else + size_t ld = m; +#endif + std::ignore = ld; // work around bug in compiler + h = ctx.logical_data(make_slice(addr_h, std::tuple{mb, nb}, ld)); + h.set_symbol(std::string(symbol) + "_" + std::to_string(rowb) + "_" + std::to_string(colb)); + } + } + + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + for (int a = 1; a * a <= ndevs; a++) + { + if (ndevs % a == 0) + { + grid_p = a; + grid_q = ndevs / a; + } + } + + assert(grid_p * grid_q == ndevs); + + // std::cout << "FOUND " << ndevs << " DEVICES " + // << "p=" << grid_p << " q=" << grid_q << std::endl; + } + + int get_preferred_devid(int row, int col) + { + return (row % grid_p) + (col % grid_q) * grid_p; + } + + auto& handle(int row, int col) + { + return handles[row + col * mt]; + } + + size_t get_index(size_t row, size_t col) + { +#ifdef TILED + // Find which tile contains this element + int tile_row = row / mb; + int tile_col = col / nb; + + size_t tile_size = mb * nb; + + // Look for the index of the begining of the tile + size_t tile_start = (tile_row + mt * tile_col) * tile_size; + + // Offset within the tile + size_t offset = (row % mb) + (col % nb) * mb; + + return tile_start + offset; +#else + return row + col * m; +#endif + } + + T* get_block_h(int brow, int bcol) + { + size_t index = get_index(brow * mb, bcol * nb); + return &h_array[index]; + } + + // Fill with func(Matrix*,row, col) + template + void fill(Fun&& fun) + { + // Fill blocks by blocks + for (int colb = 0; colb < nt; colb++) + { + int low_rowb = sym_matrix ? 
colb : 0; + for (int rowb = low_rowb; rowb < mt; rowb++) + { + // Each task fills a block + ctx.host_launch(handle(rowb, colb).write())->*[this, fun, rowb, colb](auto sA) { + for (int lcol = 0; lcol < sA.extent(1); lcol++) + { + size_t col = lcol + colb * sA.extent(1); + for (int lrow = 0; lrow < sA.extent(0); lrow++) + { + size_t row = lrow + rowb * sA.extent(0); + sA(lrow, lcol) = fun(*this, row, col); + } + } + }; + } + } + } + + std::vector h_array; + size_t m; // nrows + size_t n; // ncols + + // Is this a sym matrix ? (lower assumed) + bool sym_matrix; + + size_t mb; // block size (rows) + size_t nb; // block size (cols) + + size_t mt; // number of column blocks + size_t nt; // number of row blocks + + // abstract data handles + std::vector>> handles; + + const char* symbol; + + // for the mapping + int ndevs; + int grid_p, grid_q; +}; + +void DPOTRF(cublasFillMode_t uplo, class matrix& A, int A_row, int A_col) +{ + auto& Akk = A.handle(A_row, A_col); + size_t m_akk = Akk.shape().extent(0); + // Note that the handle may be different from the actual handle... + int Lwork_expected; + cuda_safe_call(cusolverDnDpotrf_bufferSize(get_cusolver_handle(), uplo, m_akk, nullptr, 0, &Lwork_expected)); + + auto potrf_buffer = ctx.logical_data(Lwork_expected); + auto devInfo = ctx.logical_data(shape_of>(1)); + + auto t = ctx.task(Akk.rw(), potrf_buffer.write(), devInfo.write()); + t.set_symbol("DPOTRF"); + t->*[&](cudaStream_t s, auto sAkk, auto buffer, auto info) { + auto& h = get_cusolver_handle(); + cuda_safe_call(cusolverDnSetStream(h, s)); + + cuda_safe_call(cusolverDnDpotrf( + h, + uplo, + sAkk.extent(0), + sAkk.data_handle(), + sAkk.stride(1), + buffer.data_handle(), + buffer.extent(0), + info.data_handle())); + }; +} + +void DGEMM( + cublasOperation_t transa, + cublasOperation_t transb, + double alpha, + class matrix& A, + int A_row, + int A_col, + class matrix& B, + int B_row, + int B_col, + double beta, + class matrix& C, + int C_row, + int C_col) +{ + auto t = ctx.task(A.handle(A_row, A_col).read(), B.handle(B_row, B_col).read(), C.handle(C_row, C_col).rw()); + t.set_symbol("DGEMM"); + t->*[&](cudaStream_t s, auto sA, auto sB, auto sC) { + auto& h = get_cublas_handle(); + cuda_safe_call(cublasSetStream(h, s)); + + auto k = (transa == CUBLAS_OP_N) ? sA.extent(1) : sA.extent(0); + cuda_safe_call(cublasDgemm( + h, + transa, + transb, + sC.extent(0), + sC.extent(1), + k, + &alpha, + sA.data_handle(), + sA.stride(1), + sB.data_handle(), + sB.stride(1), + &beta, + sC.data_handle(), + sC.stride(1))); + }; +} + +void DSYRK( + cublasFillMode_t uplo, + cublasOperation_t trans, + double alpha, + class matrix& A, + int A_row, + int A_col, + double beta, + class matrix& C, + int C_row, + int C_col) +{ + auto t = ctx.task(A.handle(A_row, A_col).read(), C.handle(C_row, C_col).rw()); + t.set_symbol("DSYRK"); + t->*[&](cudaStream_t s, auto sA, auto sC) { + auto& h = get_cublas_handle(); + cuda_safe_call(cublasSetStream(h, s)); + + // number of rows of matrix op(A) and C + auto n = sC.extent(0); + + // number of columns of matrix op(A) + auto k = (trans == CUBLAS_OP_N) ? 
sA.extent(1) : sA.extent(0); + + cuda_safe_call( + cublasDsyrk(h, uplo, trans, n, k, &alpha, sA.data_handle(), sA.stride(1), &beta, sC.data_handle(), sC.stride(1))); + }; +} + +void DTRSM( + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t transa, + cublasDiagType_t diag, + double alpha, + class matrix& A, + int A_row, + int A_col, + class matrix& B, + int B_row, + int B_col) +{ + auto t = ctx.task(A.handle(A_row, A_col).read(), B.handle(B_row, B_col).rw()); + t.set_symbol("DTRSM"); + t->*[&](cudaStream_t s, auto sA, auto sB) { + auto& h = get_cublas_handle(); + cuda_safe_call(cublasSetStream(h, s)); + + cuda_safe_call(cublasDtrsm( + h, + side, + uplo, + transa, + diag, + sB.extent(0), + sB.extent(1), + &alpha, + sA.data_handle(), + sA.stride(1), + sB.data_handle(), + sB.stride(1))); + }; +} + +void PDNRM2_HOST(matrix* A, double* result) +{ +#ifdef HAVE_DOT + reserved::dot::set_current_color("red"); +#endif + + for (int rowb = 0; rowb < A->mt; rowb++) + { + for (int colb = 0; colb < A->nt; colb++) + { + ctx.host_launch(A->handle(rowb, colb).read())->*[=](auto sA) { + double res2 = 0.0; + for (size_t col = 0; col < sA.extent(1); col++) + { + for (size_t row = 0; row < sA.extent(0); row++) + { + double v = sA(row, col); + res2 += v * v; + } + } + *result += res2; + }; + } + } +} + +void PDPOTRF(matrix& A) +{ +#ifdef HAVE_DOT + reserved::dot::set_current_color("yellow"); +#endif + + assert(A.m == A.n); + assert(A.mt == A.nt); + + int NBLOCKS = A.mt; + assert(A.mb == A.nb); + + cuda_safe_call(cudaSetDevice(0)); + + nvtxRangePushA("SUBMIT_PDPOTRF"); + for (int K = 0; K < NBLOCKS; K++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(K, K))); + DPOTRF(CUBLAS_FILL_MODE_LOWER, A, K, K); + + for (int row = K + 1; row < NBLOCKS; row++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, K))); + DTRSM(CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, 1.0, A, K, K, A, row, K); + + for (int col = K + 1; col < row; col++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col))); + DGEMM(CUBLAS_OP_N, CUBLAS_OP_T, -1.0, A, row, K, A, col, K, 1.0, A, row, col); + } + + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, row))); + DSYRK(CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, -1.0, A, row, K, 1.0, A, row, row); + } + } + cuda_safe_call(cudaSetDevice(0)); + + nvtxRangePop(); +} + +// Algorithm from PLASMA +void PDTRSM(cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + double alpha, + class matrix& A, + class matrix& B) +{ + // std::cout << "[PDTRSM] START B MT " << B.mt << " NT " << B.nt << std::endl; + + if (side == CUBLAS_SIDE_LEFT) + { + if (uplo == CUBLAS_FILL_MODE_UPPER) + { + // TODO + assert(0); + abort(); + } + else + { + //=========================================== + // CUBLAS_SIDE_LEFT / CUBLAS_FILL_MODE_LOWER / CUBLAS_OP_N + //=========================================== + if (trans == CUBLAS_OP_N) + { + for (int k = 0; k < B.mt; k++) + { + double lalpha = k == 0 ? 
alpha : 1.0; + for (int n = 0; n < B.nt; n++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(k, k))); + DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n); + } + for (int m = k + 1; m < B.mt; m++) + { + for (int n = 0; n < B.nt; n++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(m, k))); + DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n); + } + } + } + } + //================================================ + // CUBLAS_SIDE_LEFT / CUBLAS_FILL_MODE_LOWER / CUBLAS_OP_[C|T] + //================================================ + else + { + for (int k = 0; k < B.mt; k++) + { + double lalpha = k == 0 ? alpha : 1.0; + for (int n = 0; n < B.nt; n++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1))); + DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n); + } + for (int m = k + 1; m < B.mt; m++) + { + for (int n = 0; n < B.nt; n++) + { + cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m))); + DGEMM( + trans, CUBLAS_OP_N, -1.0, A, B.mt - k - 1, B.mt - 1 - m, B, B.mt - k - 1, n, lalpha, B, B.mt - 1 - m, n); + } + } + } + } + } + } + else + { + // TODO + abort(); + } + cuda_safe_call(cudaSetDevice(0)); + // std::cout << "[PDTRSM] END" << std::endl; +} + +void PDPOTRS(matrix& A, class matrix& B, cublasFillMode_t uplo) +{ +#ifdef HAVE_DOT + reserved::dot::set_current_color("green"); +#endif + + // std::cout << "[PDPOTRS] START" << std::endl; + // Call the parallel functions. + PDTRSM( + CUBLAS_SIDE_LEFT, uplo, uplo == CUBLAS_FILL_MODE_UPPER ? CUBLAS_OP_T : CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, 1.0, A, B); + +#ifdef HAVE_DOT + reserved::dot::set_current_color("darkgreen"); +#endif + + PDTRSM( + CUBLAS_SIDE_LEFT, uplo, uplo == CUBLAS_FILL_MODE_UPPER ? CUBLAS_OP_N : CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, 1.0, A, B); + // std::cout << "[PDPOTRS] END" << std::endl; +} + +/***************************************************************************** + * Parallel tile matrix-matrix + *multiplication. + * @see plasma_omp_dgemm + ******************************************************************************/ +void PDGEMM(cublasOperation_t transa, + cublasOperation_t transb, + double alpha, + class matrix& A, + class matrix& B, + double beta, + class matrix& C) +{ +#ifdef HAVE_DOT + reserved::dot::set_current_color("blue"); +#endif + + for (int m = 0; m < C.mt; m++) + { + for (int n = 0; n < C.nt; n++) + { + //========================================= + // alpha*A*B does not contribute; scale C + //========================================= + int inner_k = transa == CUBLAS_OP_N ? A.n : A.m; + if (alpha == 0.0 || inner_k == 0) + { + DGEMM(transa, transb, alpha, A, 0, 0, B, 0, 0, beta, C, m, n); + } + else if (transa == CUBLAS_OP_N) + { + //================================ + // CUBLAS_OP_N / CUBLAS_OP_N + //================================ + if (transb == CUBLAS_OP_N) + { + for (int k = 0; k < A.nt; k++) + { + double zbeta = k == 0 ? beta : 1.0; + DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); + } + } + //===================================== + // CUBLAS_OP_N / CUBLAS_OP_T + //===================================== + else + { + for (int k = 0; k < A.nt; k++) + { + double zbeta = k == 0 ? 
beta : 1.0; + DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); + } + } + } + else + { + //===================================== + // CUBLAS_OP_T / CUBLAS_OP_N + //===================================== + if (transb == CUBLAS_OP_N) + { + for (int k = 0; k < A.mt; k++) + { + double zbeta = k == 0 ? beta : 1.0; + DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); + } + } + //========================================== + // CUBLAS_OP_T / CUBLAS_OP_T + //========================================== + else + { + for (int k = 0; k < A.mt; k++) + { + double zbeta = k == 0 ? beta : 1.0; + DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); + } + } + } + } + } +} + +int main(int argc, char** argv) +{ + int N = 1024; + int NB = 128; + + if (argc > 1) + { + N = atoi(argv[1]); + } + + if (argc > 2) + { + NB = atoi(argv[2]); + } + + int check_result = 1; + if (getenv("CHECK_RESULT")) + { + check_result = atoi(getenv("CHECK_RESULT")); + } + + assert(N % NB == 0); + + // Set up CUBLAS and CUSOLVER + int ndevs; + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + + cuda_safe_call(cudaSetDevice(0)); + + matrix A(N, N, NB, NB, true, "A"); + matrix Aref(N, N, NB, NB, false, "Aref"); + + // (Hilbert matrix + 2*N*Id) to have a diagonal dominant matrix + auto hilbert = [](matrix& mat, int row, int col) { + return 1.0 / (col + row + 1.0) + 2.0 * mat.n * (col == row); + }; + + if (check_result) + { + Aref.fill(hilbert); + } + + A.fill(hilbert); + + /* Right-hand side */ + matrix B_potrs(N, 1, NB, 1, false, "B"); + matrix Bref_potrs(N, 1, NB, 1, false, "Bref"); + + if (check_result) + { + auto rhs_vals = [](matrix&, int row, int /*col*/) { + return 1.0 * (row + 1); + }; + B_potrs.fill(rhs_vals); + Bref_potrs.fill(rhs_vals); + } + + // // Compute ||Bref|| + double Bref_nrm2 = 0.0; + double res_nrm2 = 0.0; + + if (check_result) + { + PDNRM2_HOST(&Bref_potrs, &Bref_nrm2); + } + + cudaEvent_t startEvent_pdpotrf, stopEvent_pdpotrf; + float milliseconds_pdpotrf = 0; + + // for (int row = 0; row < A.mt; row++) + // { + // for (int col = 0; col <= row; col++) + // { + // cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col))); + // NOOP(A, row, col); + // } + // } + + cuda_safe_call(cudaEventCreate(&startEvent_pdpotrf)); + cuda_safe_call(cudaEventCreate(&stopEvent_pdpotrf)); + + cuda_safe_call(cudaEventRecord(startEvent_pdpotrf, ctx.task_fence())); + + PDPOTRF(A); + + cuda_safe_call(cudaEventRecord(stopEvent_pdpotrf, ctx.task_fence())); + + /* + * POTRS + */ + + if (check_result) + { + // Solve AX = B and put the result in B + PDPOTRS(A, B_potrs, CUBLAS_FILL_MODE_LOWER); + + // Compute (AX - B) + // Bref = (Aref*B - Bref) + PDGEMM(CUBLAS_OP_N, CUBLAS_OP_N, 1.0, Aref, B_potrs, -1.0, Bref_potrs); + + // Compute ||AX - B|| = ||Bref|| + PDNRM2_HOST(&Bref_potrs, &res_nrm2); + } + + ctx.finalize(); + + cuda_safe_call(cudaEventElapsedTime(&milliseconds_pdpotrf, startEvent_pdpotrf, stopEvent_pdpotrf)); + + double gflops_pdpotrf = 1.0 / 3.0 * ((double) N * (double) N * (double) N) / (1000000000.0); + std::cout << "[PDPOTRF] ELAPSED: " << milliseconds_pdpotrf + << " ms, GFLOPS: " << gflops_pdpotrf / (milliseconds_pdpotrf / 1000.0) << std::endl; + + if (check_result) + { + if (const auto residual = sqrt(res_nrm2) / sqrt(Bref_nrm2); residual >= 0.01) + { + std::cerr << "[POTRS] ||AX - B|| : " << sqrt(res_nrm2) << '\n'; + std::cerr << "[POTRS] ||B|| : " << sqrt(Bref_nrm2) << '\n'; + std::cerr << "[POTRS] RESIDUAL (||AX - B||/||B||) : " << residual << '\n'; + assert(!"Algorithm did not converge."); 
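+ // This branch is only reached when the relative residual ||AX - B|| / ||B||
+ // is at least 0.01, i.e. the computed solution of the POTRS solve is not
+ // considered accurate enough.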
+ } + } +} diff --git a/cudax/test/stf/gnu/include_only.cpp b/cudax/test/stf/gnu/include_only.cpp new file mode 100644 index 00000000000..080f9322a9f --- /dev/null +++ b/cudax/test/stf/gnu/include_only.cpp @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +int main() +{ + graph_ctx ctx; + ctx.finalize(); + + stream_ctx ctx2; + ctx2.finalize(); +} diff --git a/cudax/test/stf/graph/concurrency_test.cu b/cudax/test/stf/graph/concurrency_test.cu new file mode 100644 index 00000000000..d562c973972 --- /dev/null +++ b/cudax/test/stf/graph/concurrency_test.cu @@ -0,0 +1,84 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +/* + * The goal of this test is to ensure that using read access modes actually + * results in concurrent tasks + */ + +using namespace cuda::experimental::stf; + +static __global__ void cuda_sleep_kernel(long long int clock_cnt) +{ + long long int start_clock = clock64(); + long long int clock_offset = 0; + while (clock_offset < clock_cnt) + { + clock_offset = clock64() - start_clock; + } +} + +int main(int argc, char** argv) +{ + int NTASKS = 256; + int ms = 40; + + if (argc > 1) + { + NTASKS = atoi(argv[1]); + } + + if (argc > 2) + { + ms = atoi(argv[2]); + } + + // cudaDevAttrClockRate: Peak clock frequency in kilohertz; + int clock_rate; + cuda_safe_call(cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, 0)); + long long int clock_cnt = (long long int) (ms * clock_rate); + + graph_ctx ctx; + + int dummy[1]; + auto handle = ctx.logical_data(dummy); + + ctx.task(handle.rw())->*[](cudaGraph_t graph, auto /*unused*/) { + cudaGraphNode_t n; + cuda_safe_call(cudaGraphAddEmptyNode(&n, graph, nullptr, 0)); + }; + + for (int iter = 0; iter < 10; iter++) + { + for (int k = 0; k < NTASKS; k++) + { + ctx.task(handle.read())->*[&](cudaStream_t stream, auto /*unused*/) { + cuda_sleep_kernel<<<1, 1, 0, stream>>>(clock_cnt); + }; + } + + ctx.task(handle.rw())->*[&](cudaGraph_t graph, auto /*unused*/) { + cudaGraphNode_t n; + cuda_safe_call(cudaGraphAddEmptyNode(&n, graph, nullptr, 0)); + }; + } + + ctx.submit(); + + if (argc > 3) + { + std::cout << "Generating DOT output in " << argv[3] << std::endl; + ctx.print_to_dot(argv[3]); + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/graph/epoch.cu b/cudax/test/stf/graph/epoch.cu new file mode 100644 index 00000000000..c877149faf3 --- /dev/null +++ b/cudax/test/stf/graph/epoch.cu @@ -0,0 +1,56 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Test explicit uses of the API to change epoch and create a sequence + * of CUDA graphs + */ + +#include + +using namespace cuda::experimental::stf; + +int main() +{ + graph_ctx ctx; + + const size_t N = 8; + const size_t NITER = 2; + + double A[N]; + for (size_t i = 0; i < N; i++) + { + A[i] = 1.0 * i; + } + + auto lA = ctx.logical_data(A); + + for (size_t k = 0; k < NITER; k++) + { + ctx.parallel_for(blocked_partition(), exec_place::current_device(), lA.shape(), lA.rw()) + ->*[] __host__ __device__(size_t i, slice A) { A(i) = cos(A(i)); }; + + ctx.change_epoch(); + } + + ctx.finalize(); + + for (size_t i = 0; i < N; i++) + { + double Ai_ref = 1.0 * i; + for (size_t k = 0; k < NITER; k++) + { + Ai_ref = cos(Ai_ref); + } + + EXPECT(fabs(A[i] - Ai_ref) < 0.01); + } +} diff --git a/cudax/test/stf/graph/for_each_batched.cu b/cudax/test/stf/graph/for_each_batched.cu new file mode 100644 index 00000000000..a640fcc9448 --- /dev/null +++ b/cudax/test/stf/graph/for_each_batched.cu @@ -0,0 +1,128 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Generate a library call from nested CUDA graphs generated using for_each_batched and algorithms + */ + +#include + +using namespace cuda::experimental::stf; + +// Some fake library doing MATH +// template +void libMATH(graph_ctx ctx, logical_data> x, logical_data> y) +{ + // We only want to have kernels with 4 CTAs to stress the system + auto spec = par<4>(par<128>()); + ctx.launch(spec, exec_place::current_device(), x.read(), y.write()).set_symbol("MATH1")->* + [] __device__(auto t, auto x, auto y) { + for (auto i : t.apply_partition(shape(x))) + { + y(i) = cos(cos(x(i))); + } + }; + + ctx.launch(spec, exec_place::current_device(), x.write(), y.read()).set_symbol("MATH2")->* + [] __device__(auto t, auto x, auto y) { + for (auto i : t.apply_partition(shape(x))) + { + x(i) = sin(sin(y(i))); + }; + }; +} + +template +void libMATH_AS_GRAPH(context_t& ctx, logical_data> x, logical_data> y) +{ + static algorithm alg; + alg.run_as_task(libMATH, ctx, x.rw(), y.write()); +} + +// Some fake lib doing a SWAP +template +void libSWAP(context_t& ctx, logical_data> x, logical_data> y) +{ + // We only want to have kernels with 4 CTAs to stress the system + auto spec = par<4>(par<128>()); + ctx.launch(spec, exec_place::current_device(), x.rw(), y.rw()).set_symbol("SWAP")->* + [] __device__(auto t, auto x, auto y) { + for (auto i : t.apply_partition(shape(x))) + { + auto tmp = x(i); + x(i) = y(i); + y(i) = tmp; + } + }; +} + +template +logical_data> libCOPY(context_t& ctx, logical_data> x) +{ + logical_data> res = ctx.logical_data(x.shape()); + + // We only want to have kernels with 4 CTAs to stress the system + auto spec = par<4>(par<128>()); + ctx.launch(spec, exec_place::current_device(), x.read(), 
res.write()).set_symbol("SWAP")->* + [] __device__(auto t, auto x, auto res) { + for (auto i : t.apply_partition(shape(x))) + { + res(i) = x(i); + } + }; + + return res; +} + +int main() +{ + nvtx_range r("run"); + + stream_ctx ctx; + + const size_t N = 256 * 1024; + const size_t K = 8; + + size_t BATCH_SIZE = 4; + + logical_data> lX[K]; + logical_data> lY[K]; + + for (size_t i = 0; i < K; i++) + { + lX[i] = ctx.logical_data(N); + lY[i] = ctx.logical_data(N); + + ctx.parallel_for(lX[i].shape(), lX[i].write(), lY[i].write()).set_symbol("INIT")->* + [] __device__(size_t i, auto x, auto y) { + x(i) = 2.0 * i + 12.0; + y(i) = -3.0 * i + 17.0; + }; + } + + for_each_batched, slice>( + context(ctx), + K, + BATCH_SIZE, + [&](size_t i) { + return std::make_tuple(lX[i].rw(), lY[i].write()); + }) + ->*[](context ctx, size_t, auto lxi, auto lyi) { + auto tmp = libCOPY(ctx, lxi); + libSWAP(ctx, tmp, lyi); + libMATH_AS_GRAPH(ctx, lxi, lyi); + libSWAP(ctx, lxi, lyi); + }; + + ctx.task_fence(); + + ctx.finalize(); +} diff --git a/cudax/test/stf/graph/for_each_batched_write.cu b/cudax/test/stf/graph/for_each_batched_write.cu new file mode 100644 index 00000000000..2d044d5f678 --- /dev/null +++ b/cudax/test/stf/graph/for_each_batched_write.cu @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Ensure write-only accesses produces an output with for_each_batched + */ + +#include + +using namespace cuda::experimental::stf; + +template +void COPY(context_t& ctx, logical_data> x, logical_data> y) +{ + // We only want to have kernels with 4 CTAs to stress the system + auto spec = par<4>(par<128>()); + ctx.launch(spec, exec_place::current_device(), x.read(), y.write()).set_symbol("COPY")->* + [] __device__(auto t, auto x, auto y) { + for (auto i : t.apply_partition(shape(x))) + { + y(i) = x(i); + } + }; +} + +int main() +{ + stream_ctx ctx; + + const size_t N = 256 * 1024; + const size_t K = 8; + + const size_t BATCH_SIZE = 4; + + logical_data> lX[K]; + logical_data> lY[K]; + + for (size_t i = 0; i < K; i++) + { + lX[i] = ctx.logical_data(N); + lX[i].set_symbol("x" + std::to_string(i)); + ctx.parallel_for(lX[i].shape(), lX[i].write()).set_symbol("INIT")->*[] __device__(size_t i, auto x) { + x(i) = 2.0 * i + 12.0; + }; + } + + for (size_t i = 0; i < K; i++) + { + // NOT INITIALIZED + lY[i] = ctx.logical_data(N); + lY[i].set_symbol("y" + std::to_string(i)); + } + + for_each_batched, slice>( + context(ctx), + K, + BATCH_SIZE, + [&](size_t i) { + return std::make_tuple(lX[i].read(), lY[i].write()); + }) + ->*[](context inner_ctx, size_t, auto lxi, auto lyi) { + COPY(inner_ctx, lxi, lyi); + }; + + for (size_t i = 0; i < K; i++) + { + // TODO check actual content + ctx.task(lY[i].read()).set_symbol("CHECK")->*[](cudaStream_t, auto) {}; + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/graph/freeze_for_graph.cu b/cudax/test/stf/graph/freeze_for_graph.cu new file mode 100644 index 00000000000..425d0690a4b --- /dev/null +++ b/cudax/test/stf/graph/freeze_for_graph.cu @@ -0,0 +1,74 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Ensure temporary data are destroyed + * + */ + +#include +#include + +using namespace cuda::experimental::stf; + +int X0(int i) +{ + return 17 * i + 45; +} + +__global__ void dummy() {} + +int main() +{ + stream_ctx ctx; + const int N = 16; + int X[N]; + + for (int i = 0; i < N; i++) + { + X[i] = X0(i); + } + + auto lX = ctx.logical_data(X); + + auto fX = ctx.freeze(lX, access_mode::rw, data_place::current_device()); + + auto stream = ctx.pick_stream(); + + graph_ctx gctx(stream); + + auto frozen_X = fX.get(data_place::current_device(), stream); + auto lX_alias = gctx.logical_data(frozen_X, data_place::current_device()); + + auto lY = gctx.logical_data(lX.shape()); + + gctx.parallel_for(lX.shape(), lX_alias.read(), lY.write())->*[] __device__(size_t i, auto x, auto y) { + y(i) = x(i); + }; + + gctx.parallel_for(lX.shape(), lX_alias.write(), lY.read())->*[] __device__(size_t i, auto x, auto y) { + x(i) = y(i) + 2; + }; + + gctx.finalize(); + + fX.unfreeze(stream); + + ctx.host_launch(lX.read())->*[](auto x) { + for (int i = 0; i < static_cast(x.size()); i++) + { + EXPECT(x(i) == X0(i) + 2); + } + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/graph/graph_composition.cu b/cudax/test/stf/graph/graph_composition.cu new file mode 100644 index 00000000000..91985720217 --- /dev/null +++ b/cudax/test/stf/graph/graph_composition.cu @@ -0,0 +1,117 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Generate a library call from nested CUDA graphs generated using algorithms + */ + +#include + +using namespace cuda::experimental::stf; + +// Some fake library doing MATH +void libMATH(graph_ctx ctx, logical_data> x, logical_data> y) +{ + // We only want to have kernels with 4 CTAs to stress the system + auto spec = par<4>(par<128>()); + ctx.launch(spec, exec_place::current_device(), x.read(), y.write()).set_symbol("MATH1")->* + [] __device__(auto t, auto x, auto y) { + for (auto i : t.apply_partition(shape(x))) + { + y(i) = cos(cos(x(i))); + } + }; + + ctx.launch(spec, exec_place::current_device(), x.write(), y.read()).set_symbol("MATH2")->* + [] __device__(auto t, auto x, auto y) { + for (auto i : t.apply_partition(shape(x))) + { + x(i) = sin(sin(y(i))); + }; + }; +} + +template +void libMATH_AS_GRAPH(context_t& ctx, logical_data> x, logical_data> y) +{ + static algorithm alg; + alg.run_as_task(libMATH, ctx, x.rw(), y.write()); +} + +// Some fake lib doing a SWAP +template +void libSWAP(context_t& ctx, logical_data> x, logical_data> y) +{ + // We only want to have kernels with 4 CTAs to stress the system + auto spec = par<4>(par<128>()); + ctx.launch(spec, exec_place::current_device(), x.rw(), y.rw()).set_symbol("SWAP")->* + [] __device__(auto t, auto x, auto y) { + for (auto i : t.apply_partition(shape(x))) + { + auto tmp = x(i); + x(i) = y(i); + y(i) = tmp; + } + }; +} + +template +logical_data> libCOPY(context_t& ctx, logical_data> x) +{ + logical_data> res = ctx.logical_data(x.shape()); + + // We only want to have kernels with 4 CTAs to stress the system + auto spec = par<4>(par<128>()); + ctx.launch(spec, exec_place::current_device(), x.read(), res.write()).set_symbol("SWAP")->* + [] __device__(auto t, auto x, auto res) { + for (auto i : t.apply_partition(shape(x))) + { + res(i) = x(i); + } + }; + + return res; +} + +int main() +{ + nvtx_range r("run"); + + stream_ctx ctx; + + const size_t N = 256 * 1024; + const size_t K = 8; + + logical_data> lX[K]; + logical_data> lY[K]; + + for (size_t i = 0; i < K; i++) + { + lX[i] = ctx.logical_data(N); + lY[i] = ctx.logical_data(N); + + ctx.parallel_for(lX[i].shape(), lX[i].write(), lY[i].write()).set_symbol("INIT")->* + [] __device__(size_t i, auto x, auto y) { + x(i) = 2.0 * i + 12.0; + y(i) = -3.0 * i + 17.0; + }; + } + + for (size_t i = 0; i < K; i++) + { + auto tmp = libCOPY(ctx, lX[i]); + libSWAP(ctx, tmp, lY[i]); + libMATH_AS_GRAPH(ctx, lX[i], lY[i]); + libSWAP(ctx, lX[i], lY[i]); + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/graph/graph_ctx_low_level.cu b/cudax/test/stf/graph/graph_ctx_low_level.cu new file mode 100644 index 00000000000..d0052d5245a --- /dev/null +++ b/cudax/test/stf/graph/graph_ctx_low_level.cu @@ -0,0 +1,54 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Test low level API of the graph context + */ + +#include + +using namespace cuda::experimental::stf; + +int main(int argc, char** argv) +{ + graph_ctx ctx; + + double X[1024], Y[1024]; + auto handle_X = ctx.logical_data(X); + auto handle_Y = ctx.logical_data(Y); + + for (int k = 0; k < 10; k++) + { + graph_task<> t = ctx.task(); + t.add_deps(handle_X.rw()); + t.start(); + cudaGraphNode_t n; + cuda_safe_call(cudaGraphAddEmptyNode(&n, t.get_graph(), nullptr, 0)); + t.end(); + } + + graph_task<> t2 = ctx.task(); + t2.add_deps(handle_X.read(), handle_Y.rw()); + t2.start(); + cudaGraphNode_t n2; + cuda_safe_call(cudaGraphAddEmptyNode(&n2, t2.get_graph(), nullptr, 0)); + t2.end(); + + ctx.submit(); + + if (argc > 1) + { + std::cout << "Generating DOT output in " << argv[1] << std::endl; + ctx.print_to_dot(argv[1]); + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/graph/graph_tmp_data.cu b/cudax/test/stf/graph/graph_tmp_data.cu new file mode 100644 index 00000000000..fc0369ba311 --- /dev/null +++ b/cudax/test/stf/graph/graph_tmp_data.cu @@ -0,0 +1,60 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Ensure temporary data are destroyed + * + */ + +#include +#include + +using namespace cuda::experimental::stf; + +double X0(int i) +{ + return sin((double) i); +} + +__global__ void dummy() {} + +int main() +{ + // stream_ctx ctx; + graph_ctx ctx; + const int N = 16; + double X[N]; + + for (int i = 0; i < N; i++) + { + X[i] = X0(i); + } + + auto lX = ctx.logical_data(X); + + for (int i = 0; i < 10; i++) + { + // fprintf(stderr, "START loop %ld\n", i); + auto lY = ctx.logical_data(lX.shape()); + lY.set_symbol("tmp" + std::to_string(i)); + + // fprintf(stderr, "START pfor %ld\n", i); + ctx.parallel_for(lX.shape(), lX.rw(), lY.write())->*[] _CCCL_DEVICE(size_t ind, auto dX, auto dY) { + dY(ind) = dX(ind); + dX(ind) = dY(ind) + 1.0; + }; + // fprintf(stderr, "End loop %ld\n", i); + } + // fprintf(stderr, "OVER...\n"); + + ctx.finalize(); +} diff --git a/cudax/test/stf/graph/many.cu b/cudax/test/stf/graph/many.cu new file mode 100644 index 00000000000..678de8102b5 --- /dev/null +++ b/cudax/test/stf/graph/many.cu @@ -0,0 +1,34 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Ensure we can create graph contexts many times + */ + +#include + +using namespace cuda::experimental::stf; + +int main() +{ + for (size_t i = 0; i < 10240; i++) + { + graph_ctx ctx; + auto lA = ctx.logical_data(shape_of>(64)); + ctx.launch(lA.write())->*[] _CCCL_DEVICE(auto t, slice A) { + for (auto i : t.apply_partition(shape(A))) + { + A(i) = 2 * i; + } + }; + ctx.finalize(); + } +} diff --git a/cudax/test/stf/graph/multiple_graph_ctx.cu b/cudax/test/stf/graph/multiple_graph_ctx.cu new file mode 100644 index 00000000000..e4eeedcafec --- /dev/null +++ b/cudax/test/stf/graph/multiple_graph_ctx.cu @@ -0,0 +1,69 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Ensure we can use multiple graph contexts simultaneously and launch them + */ + +#include + +using namespace cuda::experimental::stf; + +__global__ void dummy() {} + +int main(int argc, char** argv) +{ + graph_ctx ctx; + graph_ctx ctx_2; + + double X[1024], Y[1024]; + auto handle_X = ctx.logical_data(make_slice(X, 1024)); + auto handle_Y = ctx.logical_data(make_slice(Y, 1024)); + + double Z[1024]; + auto handle_Z = ctx_2.logical_data(make_slice(Z, 1024)); + + for (int k = 0; k < 10; k++) + { + ctx.task(handle_X.rw())->*[&](cudaStream_t s, auto /*unused*/) { + dummy<<<1, 1, 0, s>>>(); + }; + } + + ctx.task(handle_X.read(), handle_Y.rw())->*[&](cudaStream_t s, auto /*unused*/, auto /*unused*/) { + dummy<<<1, 1, 0, s>>>(); + }; + + ctx_2.task(handle_Z.rw())->*[&](cudaStream_t s, auto /*unused*/) { + dummy<<<1, 1, 0, s>>>(); + }; + + cudaStream_t stream; + cuda_safe_call(cudaStreamCreate(&stream)); + + ctx.submit(stream); + ctx_2.submit(stream); + + if (argc > 1) + { + std::cout << "Generating DOT output in " << argv[1] << std::endl; + ctx.print_to_dot(argv[1]); + } + + if (argc > 2) + { + std::cout << "Generating DOT output in " << argv[2] << std::endl; + ctx_2.print_to_dot(argv[2]); + } + + ctx.finalize(); + ctx_2.finalize(); +} diff --git a/cudax/test/stf/graph/static_graph_ctx.cu b/cudax/test/stf/graph/static_graph_ctx.cu new file mode 100644 index 00000000000..1a27cd55970 --- /dev/null +++ b/cudax/test/stf/graph/static_graph_ctx.cu @@ -0,0 +1,51 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Ensure a graph context can be defined as a static variable + */ + +#include + +using namespace cuda::experimental::stf; + +// Static graph ctx +graph_ctx ctx; + +__global__ void dummy() {} + +int main(int argc, char** argv) +{ + double X[1024], Y[1024]; + auto handle_X = ctx.logical_data(X); + auto handle_Y = ctx.logical_data(Y); + + for (int k = 0; k < 10; k++) + { + ctx.task(handle_X.rw())->*[&](cudaStream_t s, auto /*unused*/) { + dummy<<<1, 1, 0, s>>>(); + }; + } + + ctx.task(handle_X.read(), handle_Y.rw())->*[&](cudaStream_t s, auto /*unused*/, auto /*unused*/) { + dummy<<<1, 1, 0, s>>>(); + }; + + ctx.submit(); + + if (argc > 1) + { + std::cout << "Generating DOT output in " << argv[1] << std::endl; + ctx.print_to_dot(argv[1]); + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/green_context/axpy_gc.cu b/cudax/test/stf/green_context/axpy_gc.cu new file mode 100644 index 00000000000..beec7bd34a4 --- /dev/null +++ b/cudax/test/stf/green_context/axpy_gc.cu @@ -0,0 +1,112 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +// Green contexts are only supported since CUDA 12.4 +#if CUDA_VERSION >= 12040 +__global__ void axpy(double a, slice x, slice y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + size_t n = x.extent(0); + for (int ind = tid; ind < n; ind += nthreads) + { + y(ind) += a * x(ind); + } +} + +void debug_info(cudaStream_t stream, CUgreenCtx g_ctx) +{ + // Get the green context associated to that CUDA stream + CUgreenCtx stream_cugc; + cuda_safe_call(cuStreamGetGreenCtx(CUstream(stream), &stream_cugc)); + assert(stream_cugc != nullptr); + + CUcontext stream_green_primary; + CUcontext place_green_primary; + + unsigned long long stream_ctxId; + unsigned long long place_ctxId; + + // Convert green contexts to primary contexts and get their ID + cuda_safe_call(cuCtxFromGreenCtx(&stream_green_primary, stream_cugc)); + cuda_safe_call(cuCtxGetId(stream_green_primary, &stream_ctxId)); + + cuda_safe_call(cuCtxFromGreenCtx(&place_green_primary, g_ctx)); + cuda_safe_call(cuCtxGetId(place_green_primary, &place_ctxId)); + + // Make sure the stream belongs to the same green context as the execution place + EXPECT(stream_ctxId == place_ctxId); +} +#endif // CUDA_VERSION >= 12040 + +int main() +{ +#if CUDA_VERSION < 12040 + fprintf(stderr, "Green contexts are not supported by this version of CUDA: skipping test.\n"); + return 0; +#else + int ndevs; + const int num_sms = 8; + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + + stream_ctx ctx; + const double alpha = 2.0; + + int NITER = 30; + const int n = 12; + + double X[n], Y[n]; + + for (int ind = 0; ind < n; ind++) + { + X[ind] = 1.0 * ind; + Y[ind] = 2.0 * ind - 3.0; + } + + auto handle_X = ctx.logical_data(make_slice(&X[0], n)); + auto handle_Y = ctx.logical_data(make_slice(&Y[0], n)); + + // The green_context_helper class automates the creation of green context views + std::vector 
gc(ndevs); + for (int devid = 0; devid < ndevs; devid++) + { + gc[devid] = green_context_helper(num_sms, devid); + } + + for (int iter = 0; iter < NITER; iter++) + { + for (int devid = 0; devid < ndevs; devid++) + { + auto& g_ctx = gc[devid]; + auto cnt = g_ctx.get_count(); + ctx.task(exec_place::green_ctx(g_ctx.get_view(iter % cnt)), handle_X.read(), handle_Y.rw()) + ->*[&](cudaStream_t stream, auto dX, auto dY) { + debug_info(stream, g_ctx.get_view(iter % cnt).g_ctx); + axpy<<<16, 16, 0, stream>>>(alpha, dX, dY); + }; + } + } + ctx.host_launch(handle_X.read(), handle_Y.read())->*[&](auto hX, auto hY) { + for (int ind = 0; ind < n; ind++) + { + EXPECT(fabs(hX(ind) - 1.0 * ind) < 0.00001); + EXPECT(fabs(hY(ind) - (2.0 * ind - 3.0) - NITER * ndevs * alpha * hX(ind)) < 0.00001); + } + }; + + ctx.finalize(); +#endif // CUDA_VERSION < 12040 +} diff --git a/cudax/test/stf/green_context/cuda_graph.cu b/cudax/test/stf/green_context/cuda_graph.cu new file mode 100644 index 00000000000..8f9ea28003b --- /dev/null +++ b/cudax/test/stf/green_context/cuda_graph.cu @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +// Green contexts are only supported since CUDA 12.4 +#if CUDA_VERSION >= 12040 +__global__ void axpy(double a, slice x, slice y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + size_t n = x.extent(0); + for (int ind = tid; ind < n; ind += nthreads) + { + y(ind) += a * x(ind); + } +} +#endif // CUDA_VERSION >= 12040 + +int main() +{ +#if CUDA_VERSION < 12040 + fprintf(stderr, "Green contexts are not supported by this version of CUDA: skipping test.\n"); + return 0; +#else + int ndevs; + const int num_sms = 16; + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + + graph_ctx ctx; + const double alpha = 2.0; + + int NITER = 30; + const int n = 12; + + double X[n], Y[n]; + + for (int ind = 0; ind < n; ind++) + { + X[ind] = 1.0 * ind; + Y[ind] = 2.0 * ind - 3.0; + } + + auto handle_X = ctx.logical_data(make_slice(&X[0], n)); + auto handle_Y = ctx.logical_data(make_slice(&Y[0], n)); + + // The green_context_helper class automates the creation of green context views + green_context_helper gc(num_sms); + + for (int iter = 0; iter < NITER; iter++) + { + auto cnt = gc.get_count(); + ctx.task(exec_place::green_ctx(gc.get_view(iter % cnt)), handle_X.read(), handle_Y.rw()) + ->*[&](cudaStream_t stream, auto dX, auto dY) { + axpy<<<16, 16, 0, stream>>>(alpha, dX, dY); + }; + } + + ctx.host_launch(handle_X.read(), handle_Y.read())->*[&](auto hX, auto hY) { + for (int ind = 0; ind < n; ind++) + { + EXPECT(fabs(hX(ind) - 1.0 * ind) < 0.00001); + EXPECT(fabs(hY(ind) - (2.0 * ind - 3.0) - NITER * ndevs * alpha * hX(ind)) < 0.00001); + } + }; + + ctx.finalize(); +#endif // CUDA_VERSION < 12040 +} diff --git a/cudax/test/stf/green_context/gc_grid.cu b/cudax/test/stf/green_context/gc_grid.cu new file mode 100644 index 00000000000..06322b5343f --- /dev/null +++ b/cudax/test/stf/green_context/gc_grid.cu @@ -0,0 +1,110 @@ 
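+// A minimal sketch of the green-context pattern shared by axpy_gc.cu and
+// cuda_graph.cu above (CUDA 12.4+). green_context_helper carves SM partitions
+// out of a device; exec_place::green_ctx(view) then runs a task on one
+// partition, and the stream passed to the lambda belongs to that partition.
+// The kernel and launch shape below are illustrative only.
+//
+//   green_context_helper gc(8 /* SMs per partition */);
+//   auto cnt = gc.get_count();
+//   ctx.task(exec_place::green_ctx(gc.get_view(iter % cnt)), handle_X.read(), handle_Y.rw())
+//       ->*[&](cudaStream_t stream, auto dX, auto dY) {
+//     axpy<<<16, 16, 0, stream>>>(alpha, dX, dY);
+//   };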
+//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +// Green contexts are only supported since CUDA 12.4 +#if CUDA_VERSION >= 12040 +__global__ void axpy(double a, slice x, slice y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + size_t n = x.extent(0); + for (int ind = tid; ind < n; ind += nthreads) + { + y(ind) += a * x(ind); + } +} + +void debug_info(cudaStream_t stream, CUgreenCtx g_ctx) +{ + // Get the green context associated to that CUDA stream + CUgreenCtx stream_cugc; + cuda_safe_call(cuStreamGetGreenCtx(CUstream(stream), &stream_cugc)); + assert(stream_cugc != nullptr); + + CUcontext stream_green_primary; + CUcontext place_green_primary; + + unsigned long long stream_ctxId; + unsigned long long place_ctxId; + + // Convert green contexts to primary contexts and get their ID + cuda_safe_call(cuCtxFromGreenCtx(&stream_green_primary, stream_cugc)); + cuda_safe_call(cuCtxGetId(stream_green_primary, &stream_ctxId)); + + cuda_safe_call(cuCtxFromGreenCtx(&place_green_primary, g_ctx)); + cuda_safe_call(cuCtxGetId(place_green_primary, &place_ctxId)); + + // Make sure the stream belongs to the same green context as the execution place + EXPECT(stream_ctxId == place_ctxId); +} +#endif // CUDA_VERSION >= 12040 + +int main() +{ +#if CUDA_VERSION < 12040 + fprintf(stderr, "Green contexts are not supported by this version of CUDA: skipping test.\n"); + return 0; +#else + int ndevs; + const int num_sms = 8; + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + + stream_ctx ctx; + + int NITER = 8; + const int n = 16 * 1024 * 1024; + + std::vector X(n); + std::vector Y(n); + + for (int ind = 0; ind < n; ind++) + { + X[ind] = 1.0 * ind; + Y[ind] = 2.0 * ind - 3.0; + } + + auto handle_X = ctx.logical_data(make_slice(&X[0], n)); + auto handle_Y = ctx.logical_data(make_slice(&Y[0], n)); + + std::vector places; + + // The green_context_helper class automates the creation of green context views + std::vector gc(ndevs); + for (int devid = 0; devid < ndevs; devid++) + { + gc[devid] = green_context_helper(num_sms, devid); + + auto& g_ctx = gc[devid]; + auto cnt = g_ctx.get_count(); + for (size_t i = 0; i < cnt; i++) + { + places.push_back(exec_place::green_ctx(g_ctx.get_view(i))); + } + } + + auto where = make_grid(places); + + for (int iter = 0; iter < NITER; iter++) + { + ctx.parallel_for(blocked_partition(), where, handle_X.shape(), handle_X.rw(), handle_Y.read()) + ->*[] __device__(size_t i, auto x, auto y) { + x(i) += y(i); + }; + } + + ctx.finalize(); +#endif // CUDA_VERSION < 12040 +} diff --git a/cudax/test/stf/hash/ctx_hash.cu b/cudax/test/stf/hash/ctx_hash.cu new file mode 100644 index 00000000000..e545fcd1693 --- /dev/null +++ b/cudax/test/stf/hash/ctx_hash.cu @@ -0,0 +1,69 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
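+// gc_grid.cu above combines every green-context view into a single execution
+// grid and then spreads the index space over that grid with blocked_partition.
+// A condensed sketch of that pattern (the vector's element type, exec_place,
+// is presumed, as the template argument is not visible in this transcript):
+//
+//   std::vector<exec_place> places;   // one entry per green-context view
+//   // ... push exec_place::green_ctx(view) for every view of every device ...
+//   auto where = make_grid(places);
+//   ctx.parallel_for(blocked_partition(), where, handle_X.shape(), handle_X.rw(), handle_Y.read())
+//       ->*[] __device__(size_t i, auto x, auto y) {
+//     x(i) += y(i);
+//   };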
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +template +__global__ void inc_kernel(S sA) +{ + sA(threadIdx.x)++; +} + +template +__global__ void copy_kernel(S1 sA, S2 sB) +{ + sB(threadIdx.x) = sA(threadIdx.x); +} + +template +void run() +{ + int A[10] = {0}; + int B[10] = {0}; + Ctx ctx; + + auto lA = ctx.logical_data(A); + auto lB = ctx.logical_data(B); + + for (size_t k = 0; k < 10; k++) + { + // fprintf(stderr, "iter %zu - 0 : ctx.hash %zu\n", k, ctx.hash()); + ctx.task(lA.rw())->*[](cudaStream_t stream, auto sA) { + inc_kernel<<<1, 10, 0, stream>>>(sA); + }; + + // fprintf(stderr, "iter %zu - 1 : ctx.hash %zu\n", k, ctx.hash()); + ctx.task(lA.read(), lB.rw())->*[](cudaStream_t stream, auto sA, auto sB) { + copy_kernel<<<1, 10, 0, stream>>>(sA, sB); + }; + + // fprintf(stderr, "iter %zu - 2 : ctx.hash %zu\n", k, ctx.hash()); + ctx.task(lB.rw())->*[](cudaStream_t stream, auto sB) { + inc_kernel<<<1, 10, 0, stream>>>(sB); + }; + + // fprintf(stderr, "iter %zu - 3 : ctx.hash %zu\n", k, ctx.hash()); + } + + ctx.host_launch(lA.read(), lB.read())->*[&](auto /*unused*/, auto /*unused*/) { + // fprintf(stderr, "HOST END : ctx.hash %zu\n", ctx.hash()); + }; + + ctx.finalize(); +} + +int main() +{ + run(); + run(); +} diff --git a/cudax/test/stf/hash/logical_data.cu b/cudax/test/stf/hash/logical_data.cu new file mode 100644 index 00000000000..cc62608971c --- /dev/null +++ b/cudax/test/stf/hash/logical_data.cu @@ -0,0 +1,51 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +template +__global__ void inc_kernel(S sA) +{ + sA(threadIdx.x)++; +} + +template +void run() +{ + int A[10] = {0}; + stream_ctx ctx; + + auto l = ctx.logical_data(A); + + for (size_t k = 0; k < 10; k++) + { + // size_t h = l.hash(); + // fprintf(stderr, "iter %zu : logical data hash %zu ctx.hash %zu\n", k, h, ctx.hash()); + + ctx.task(l.rw())->*[](cudaStream_t stream, auto sA) { + inc_kernel<<<1, 10, 0, stream>>>(sA); + }; + } + + ctx.host_launch(l.read())->*[&](auto /*unused*/) { + // fprintf(stderr, "HOST end : logical data hash %zu ctx.hash %zu\n", l.hash(), ctx.hash()); + }; + + ctx.finalize(); +} + +int main() +{ + run(); + run(); +} diff --git a/cudax/test/stf/hashtable/fusion.cu b/cudax/test/stf/hashtable/fusion.cu new file mode 100644 index 00000000000..dfc60f174a4 --- /dev/null +++ b/cudax/test/stf/hashtable/fusion.cu @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +// Iterate over every item in the hashtableA, and add them to B + +__global__ void gpu_merge_hashtable(hashtable A, const hashtable B) +{ + unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x; + while (threadid < reserved::kHashTableCapacity) + { + if (B.addr[threadid].key != reserved::kEmpty) + { + uint32_t value = B.addr[threadid].value; + if (value != reserved::kEmpty) + { + // printf("INSERTING key %d value %d\n", pHashTableB[threadid].key, value); + A.insert(B.addr[threadid]); + } + } + threadid += blockDim.x * gridDim.x; + } +} + +int main() +{ + stream_ctx ctx; + + hashtable A; + A.insert(reserved::KeyValue(107, 4)); + A.insert(reserved::KeyValue(108, 6)); + + hashtable B; + B.insert(reserved::KeyValue(7, 14)); + B.insert(reserved::KeyValue(8, 16)); + + auto lA = ctx.logical_data(A); + auto lB = ctx.logical_data(B); + + ctx.task(lA.rw(), lB.read())->*[](auto stream, auto hA, auto hB) { + gpu_merge_hashtable<<<32, 128, 0, stream>>>(hA, hB); + cuda_safe_call(cudaGetLastError()); + }; + + ctx.host_launch(lA.read())->*[](auto hA) { + EXPECT(hA.get(107) == 4); + EXPECT(hA.get(108) == 6); + EXPECT(hA.get(7) == 14); + EXPECT(hA.get(8) == 16); + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/hashtable/fusion_reduction.cu b/cudax/test/stf/hashtable/fusion_reduction.cu new file mode 100644 index 00000000000..f84619499fc --- /dev/null +++ b/cudax/test/stf/hashtable/fusion_reduction.cu @@ -0,0 +1,122 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +using namespace cuda::experimental::stf; + +__global__ void gpu_merge_hashtable(hashtable A, const hashtable B) +{ + unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x; + while (threadid < B.get_capacity()) + { + if (B.addr[threadid].key != reserved::kEmpty) + { + uint32_t value = B.addr[threadid].value; + if (value != reserved::kEmpty) + { + // printf("INSERTING key %d value %d\n", pHashTableB[threadid].key, value); + A.insert(B.addr[threadid]); + } + } + threadid += blockDim.x * gridDim.x; + } +} + +void cpu_merge_hashtable(hashtable A, const hashtable B) +{ + for (unsigned int i = 0; i < B.get_capacity(); i++) + { + if (B.addr[i].key != reserved::kEmpty) + { + uint32_t value = B.addr[i].value; + if (value != reserved::kEmpty) + { + // printf("INSERTING key %d value %d\n", pHashTableB[threadid].key, value); + A.insert(B.addr[i]); + } + } + } +} + +class hashtable_fusion_t : public stream_reduction_operator +{ + void op(const hashtable& in, hashtable& inout, const exec_place& e, cudaStream_t s) override + { + if (e.affine_data_place() == data_place::host) + { + cuda_safe_call(cudaStreamSynchronize(s)); // TODO use a callback + cpu_merge_hashtable(inout, in); + } + else + { + gpu_merge_hashtable<<<32, 32, 0, s>>>(inout, in); + } + } + + void init_op(hashtable& /*unused*/, const exec_place& /*unused*/, cudaStream_t /*unused*/) override + { + // This init operator is a no-op because hashtables are already + // initialized as empty tables + } +}; + +// A kernel to fill the hashtable with some fictious values +__global__ void fill_table(size_t dev_id, size_t cnt, hashtable h) +{ + unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int nthreads = blockDim.x * gridDim.x; + + for (unsigned int i = threadid; i < cnt; i += nthreads) + { + uint32_t key = dev_id * 1000 + i; + uint32_t value = 2 * i; + + reserved::KeyValue kvs(key, value); + h.insert(kvs); + } +} + +int main() +{ + stream_ctx ctx; + + // Explicit capacity of 2048 entries + hashtable refh(2048); + auto h_handle = ctx.logical_data(refh); + + auto fusion_op = std::make_shared(); + + for (size_t dev_id = 0; dev_id < 4; dev_id++) + { + ctx.task(h_handle.redux(fusion_op))->*[&](auto stream, auto h) { + EXPECT(h.get_capacity() == 2048); + fill_table<<<32, 32, 0, stream>>>(dev_id, 10, h); + }; + } + + ctx.host_launch(h_handle.read())->*[&](auto h) { + // Check that the table contains all values + for (size_t dev_id = 0; dev_id < 4; dev_id++) + { + for (unsigned i = 0; i < 10; i++) + { + uint32_t key = static_cast(dev_id * 1000 + i); + uint32_t value = 2 * i; + + EXPECT(h.get(key) == value); + } + } + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/hashtable/parallel_for.cu b/cudax/test/stf/hashtable/parallel_for.cu new file mode 100644 index 00000000000..6d5494841aa --- /dev/null +++ b/cudax/test/stf/hashtable/parallel_for.cu @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
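+// A sketch of the redux() access mode used in fusion_reduction.cu above. A
+// user-provided reduction operator derived from stream_reduction_operator
+// (its template argument, presumably hashtable, and the make_shared argument
+// are not visible in this transcript) lets several tasks contribute partial
+// results that the runtime merges on demand:
+//
+//   auto fusion_op = std::make_shared<hashtable_fusion_t>();
+//   for (size_t dev_id = 0; dev_id < 4; dev_id++)
+//   {
+//     ctx.task(h_handle.redux(fusion_op))->*[&](cudaStream_t s, auto h) {
+//       fill_table<<<32, 32, 0, s>>>(dev_id, 10, h);   // per-device partial insert
+//     };
+//   }
+//   // A later read() access (e.g. host_launch) triggers the merge operator.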
+// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +using namespace cuda::experimental::stf; + +int main() +{ + stream_ctx ctx; + + // This constructor automatically initializes an empty hashtable on the host + hashtable h; + auto lh = ctx.logical_data(h); + + ctx.parallel_for(box(16), lh.rw())->*[] _CCCL_DEVICE(size_t i, auto h) { + uint32_t key = 10 * i; + uint32_t value = 17 + i * 14; + h.insert(key, value); + }; + + ctx.finalize(); + + // Thanks to the write-back mechanism on lh, h has been updated + for (size_t i = 0; i < 16; i++) + { + assert(h.get(i * 10) == 17 + i * 14); + } +} diff --git a/cudax/test/stf/hashtable/parallel_for_shape.cu b/cudax/test/stf/hashtable/parallel_for_shape.cu new file mode 100644 index 00000000000..c0368181fed --- /dev/null +++ b/cudax/test/stf/hashtable/parallel_for_shape.cu @@ -0,0 +1,39 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +using namespace cuda::experimental::stf; + +int main() +{ + stream_ctx ctx; + + // Create a logical data from a shape of hashtable + auto lh = ctx.logical_data(shape_of()); + + // A write() access is needed because we initialized lh from a shape, so there is no reference copy + ctx.parallel_for(box(16), lh.write())->*[] _CCCL_DEVICE(size_t i, auto h) { + uint32_t key = 10 * i; + uint32_t value = 17 + i * 14; + h.insert(key, value); + }; + + ctx.host_launch(lh.rw())->*[](auto h) { + for (int i = 0; i < 16; i++) + { + EXPECT(h.get(i * 10) == 17 + i * 14); + } + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/hashtable/test.cu b/cudax/test/stf/hashtable/test.cu new file mode 100644 index 00000000000..190accf32f4 --- /dev/null +++ b/cudax/test/stf/hashtable/test.cu @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
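+// The two hashtable parallel_for tests above differ only in how the logical
+// data is created, which determines the first legal access mode. box(16) is a
+// dense 1D iteration shape, independent of the data itself. A condensed sketch
+// (the shape_of template argument, presumably hashtable, is an assumption):
+//
+//   // (a) created from an existing host hashtable: rw() is fine, and
+//   //     write-back updates h after finalize()
+//   hashtable h;
+//   auto lh = ctx.logical_data(h);
+//   ctx.parallel_for(box(16), lh.rw())->*[] _CCCL_DEVICE(size_t i, auto h) {
+//     h.insert(uint32_t(10 * i), uint32_t(17 + i * 14));
+//   };
+//
+//   // (b) created from a shape only: the first access must be write()
+//   auto lh2 = ctx.logical_data(shape_of<hashtable>());
+//   ctx.parallel_for(box(16), lh2.write())->*[] _CCCL_DEVICE(size_t i, auto h) {
+//     h.insert(uint32_t(10 * i), uint32_t(17 + i * 14));
+//   };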
+// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +// Insert the key/values in kvs into the hashtable +__global__ void gpu_hashtable_insert_kernel(hashtable h, const reserved::KeyValue* kvs, unsigned int numkvs) +{ + unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x; + if (threadid < numkvs) + { + h.insert(kvs[threadid]); + } +} + +void gpu_hashtable_insert(hashtable d_h, const reserved::KeyValue* device_kvs, unsigned int num_kvs, cudaStream_t stream) +{ + // Have CUDA calculate the thread block size + auto [mingridsize, threadblocksize] = reserved::compute_occupancy(gpu_hashtable_insert_kernel); + + // Insert all the keys into the hash table + int gridsize = ((uint32_t) num_kvs + threadblocksize - 1) / threadblocksize; + gpu_hashtable_insert_kernel<<>>(d_h, device_kvs, (uint32_t) num_kvs); +} + +int main() +{ + stream_ctx ctx; + + // This constructor automatically initializes an empty hashtable on the host + hashtable h; + auto lh = ctx.logical_data(h); + + // Create an array of values on the host + reserved::KeyValue kvs_array[16]; + for (uint32_t i = 0; i < 16; i++) + { + kvs_array[i].key = i * 10; + kvs_array[i].value = 17 + i * 14; + } + auto h_kvs_array = ctx.logical_data(make_slice(&kvs_array[0], 16)); + + ctx.task(lh.rw(), h_kvs_array.read())->*[](auto stream, auto h, auto a) { + gpu_hashtable_insert(h, a.data_handle(), static_cast(a.extent(0)), stream); + }; + + ctx.finalize(); + + // Thanks to the write-back mechanism on lh, h has been updated + for (size_t i = 0; i < 16; i++) + { + assert(h.get(i * 10) == 17 + i * 14); + } +} diff --git a/cudax/test/stf/interface/data_from_device.cu b/cudax/test/stf/interface/data_from_device.cu new file mode 100644 index 00000000000..a0e8487b9d5 --- /dev/null +++ b/cudax/test/stf/interface/data_from_device.cu @@ -0,0 +1,82 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +template +__global__ void axpy(size_t n, T a, const T* x, T* y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int ind = tid; ind < n; ind += nthreads) + { + y[ind] += a * x[ind]; + } +} + +template +__global__ void setup_vectors(size_t n, T* x, T* y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (size_t ind = tid; ind < n; ind += nthreads) + { + x[ind] = 1.0 * ind; + y[ind] = 2.0 * ind - 3.0; + } +} + +template +void run() +{ + Ctx ctx; + const size_t n = 12; + const double alpha = 2.0; + + double *dX, *dY; + cuda_safe_call(cudaMalloc((void**) &dX, n * sizeof(double))); + cuda_safe_call(cudaMalloc((void**) &dY, n * sizeof(double))); + + // Use a kernel to setup values + setup_vectors<<<16, 16>>>(n, dX, dY); + cuda_safe_call(cudaDeviceSynchronize()); + // We here provide device addresses and memory node 1 (which is assumed to + // be device 0) + auto handle_X = ctx.logical_data(make_slice(dX, n), data_place::device(0)); + auto handle_Y = ctx.logical_data(make_slice(dY, n), data_place::device(0)); + + ctx.task(handle_X.read(), handle_Y.rw())->*[&](cudaStream_t stream, auto X, auto Y) { + axpy<<<16, 128, 0, stream>>>(n, alpha, X.data_handle(), Y.data_handle()); + }; + + // Access Ask to use X, Y and Z on the host + ctx.host_launch(handle_X.read(), handle_Y.read())->*[&](auto X, auto Y) { + for (size_t ind = 0; ind < n; ind++) + { + // X unchanged + EXPECT(fabs(X(ind) - 1.0 * ind) < 0.00001); + // Y = Y + alpha X + EXPECT(fabs(Y(ind) - (-3.0 + ind * (2.0 + alpha))) < 0.00001); + } + }; + + ctx.finalize(); +} + +int main() +{ + run(); + run(); +} diff --git a/cudax/test/stf/interface/data_from_device_2.cu b/cudax/test/stf/interface/data_from_device_2.cu new file mode 100644 index 00000000000..3f71facd717 --- /dev/null +++ b/cudax/test/stf/interface/data_from_device_2.cu @@ -0,0 +1,97 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
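+// A minimal sketch of the pattern shared by the data_from_device* tests above:
+// a logical data can be created from memory that already lives on a device by
+// passing the device data place explicitly; transfers to the host (for
+// host_launch) and final write-back are then handled by the runtime.
+//
+//   double* dX;
+//   cuda_safe_call(cudaMalloc((void**) &dX, n * sizeof(double)));
+//   // ... initialize dX with a kernel, then cudaDeviceSynchronize() ...
+//   auto lX = ctx.logical_data(make_slice(dX, n), data_place::device(0));
+//   ctx.host_launch(lX.read())->*[&](auto X) {
+//     // X(i) is valid host-visible data here
+//   };
+//   ctx.finalize();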
+// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +template +__global__ void axpy(int N, T a, const T* x, T* y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int ind = tid; ind < N; ind += nthreads) + { + y[ind] += a * x[ind]; + } +} + +template +__global__ void setup_vectors(int N, T* x, T* y, T* z) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int ind = tid; ind < N; ind += nthreads) + { + x[ind] = 1.0 * ind; + y[ind] = 2.0 * ind - 3.0; + z[ind] = 7.0 * ind + 6.0; + } +} + +template +void run() +{ + Ctx ctx; + + const double alpha = 2.0; + const int N = 12; + + double *dX, *dY, *dZ; + cuda_safe_call(cudaMalloc((void**) &dX, N * sizeof(double))); + cuda_safe_call(cudaMalloc((void**) &dY, N * sizeof(double))); + cuda_safe_call(cudaMalloc((void**) &dZ, N * sizeof(double))); + + // Use a kernel to setup values + setup_vectors<<<16, 16>>>(N, dX, dY, dZ); + cuda_safe_call(cudaDeviceSynchronize()); + + // We here provide device addresses and memory node 1 (which is assumed to + // be device 0) + auto handle_X = ctx.logical_data(make_slice(dX, N), data_place::device(0)); + auto handle_Y = ctx.logical_data(make_slice(dY, N), data_place::device(0)); + auto handle_Z = ctx.logical_data(make_slice(dZ, N), data_place::device(0)); + + ctx.task(handle_X.read(), handle_Y.rw())->*[&](cudaStream_t stream, auto X, auto Y) { + axpy<<<16, 128, 0, stream>>>(N, alpha, X.data_handle(), Y.data_handle()); + }; + + ctx.task(handle_X.read(), handle_Z.rw())->*[&](cudaStream_t stream, auto X, auto Z) { + axpy<<<16, 128, 0, stream>>>(N, alpha, X.data_handle(), Z.data_handle()); + }; + + ctx.task(handle_Y.read(), handle_Z.rw())->*[&](cudaStream_t stream, auto Y, auto Z) { + axpy<<<16, 128, 0, stream>>>(N, alpha, Y.data_handle(), Z.data_handle()); + }; + + // Access Ask to use X, Y and Z on the host + ctx.host_launch(handle_X.read(), handle_Y.read(), handle_Z.read())->*[&](auto X, auto Y, auto Z) { + for (size_t ind = 0; ind < N; ind++) + { + // X unchanged + EXPECT(fabs(X(ind) - 1.0 * ind) < 0.00001); + // Y = Y + alpha X + EXPECT(fabs(Y(ind) - (-3.0 + ind * (2.0 + alpha))) < 0.00001); + // Z = Z + alpha (X + alpha Y) + EXPECT(fabs(Z(ind) - ((6.0 - 3 * alpha) + ind * (7.0 + 3 * alpha + alpha * alpha))) < 0.00001); + } + }; + + ctx.finalize(); +} + +int main() +{ + run(); + run(); +} diff --git a/cudax/test/stf/interface/data_from_device_async.cu b/cudax/test/stf/interface/data_from_device_async.cu new file mode 100644 index 00000000000..d0f026c5c7a --- /dev/null +++ b/cudax/test/stf/interface/data_from_device_async.cu @@ -0,0 +1,102 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +template +__global__ void axpy(int n, T a, T* x, T* y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int ind = tid; ind < n; ind += nthreads) + { + y[ind] += a * x[ind]; + } +} + +template +__global__ void setup_vectors(int n, T* x, T* y, T* z) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int ind = tid; ind < n; ind += nthreads) + { + x[ind] = 1.0 * ind; + y[ind] = 2.0 * ind - 3.0; + z[ind] = 7.0 * ind + 6.0; + } +} + +int main() +{ +// FIXME +#if 0 + stream_ctx ctx; + + cudaStream_t stream; + cuda_safe_call(cudaStreamCreate(&stream)); + + const double alpha = 2.0; + + size_t n = 12; + + double *dX, *dY, *dZ; + cuda_safe_call(cudaMallocAsync((void**) &dX, n * sizeof(double), stream)); + cuda_safe_call(cudaMallocAsync((void**) &dY, n * sizeof(double), stream)); + cuda_safe_call(cudaMallocAsync((void**) &dZ, n * sizeof(double), stream)); + + // Use a kernel to setup values + setup_vectors<<<16, 16, 0, stream>>>(n, dX, dY, dZ); + + auto stream_ready = make_event(stream); + + // We here provide device addresses and memory node 1 (which is assumed to + // be device 0) + auto handle_X = ctx.logical_data(slice(dX, { n }), data_place::device(0), std::string("X"), stream_ready); + auto handle_Y = ctx.logical_data(slice(dY, { n }), data_place::device(0), std::string("Y"), stream_ready); + auto handle_Z = ctx.logical_data(slice(dZ, { n }), data_place::device(0), std::string("Z"), stream_ready); + + ctx.task(handle_X.read(), handle_Y.rw())->*[](cudaStream_t stream, auto X, auto Y) { + axpy<<<16, 128, 0, stream>>>(n, alpha, X.base(), Y.base()); + }; + + ctx.task(handle_X.read(), handle_Z.rw())->*[](cudaStream_t stream, auto X, auto Z) { + axpy<<<16, 128, 0, stream>>>(n, alpha, X.base(), Z.base()); + }; + + ctx.task(handle_Y.read(), handle_Z.rw())->*[](cudaStream_t stream, auto Y, auto Z) { + axpy<<<16, 128, 0, stream>>>(n, alpha, Y.base(), Z.base()); + }; + + // Access Ask to use X, Y and Z on the host + ctx.task(exec_place::host, handle_X.read(), handle_Y.read(), handle_Z.read()) + ->* + [](cudaStream_t stream, auto X, auto Y, auto Z) { + cuda_safe_call(cudaStreamSynchronize(stream)); + + for (int ind = 0; ind < n; ind++) { + // X unchanged + assert(fabs(X(ind) - 1.0 * ind) < 0.00001); + // Y = Y + alpha X + assert(fabs(Y(ind) - (-3.0 + ind * (2.0 + alpha))) < 0.00001); + // Z = Z + alpha (X + alpha Y) + assert(fabs(Z(ind) - ((6.0 - 3 * alpha) + ind * (7.0 + 3 * alpha + alpha * alpha))) < 0.00001); + } + }; + + ctx.finalize(); +#endif + + return 0; +} diff --git a/cudax/test/stf/interface/data_from_device_wb.cu b/cudax/test/stf/interface/data_from_device_wb.cu new file mode 100644 index 00000000000..be7ff580582 --- /dev/null +++ b/cudax/test/stf/interface/data_from_device_wb.cu @@ -0,0 +1,78 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +/* + * This test makes sure write-back works even if the original data place was a device + */ + +template +__global__ void setup(slice s) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int ind = tid; ind < s.size(); ind += nthreads) + { + s(ind) = 1.0 * ind; + } +} + +template +__global__ void check(T* x, size_t n) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int ind = tid; ind < n; ind += nthreads) + { + assert(x[ind] == 2.0 * ind + 1.0); + } +} + +template +void run() +{ + Ctx ctx; + const size_t n = 12; + + double* dX; + cuda_safe_call(cudaMalloc((void**) &dX, n * sizeof(double))); + + // We here provide device addresses and memory node 1 (which is assumed to + // be device 0) + auto handle_X = ctx.logical_data(make_slice(dX, n), data_place::device(0)); + + ctx.task(handle_X.write())->*[&](cudaStream_t stream, auto X) { + setup<<<16, 128, 0, stream>>>(X); + }; + + ctx.host_launch(handle_X.rw())->*[&](auto X) { + for (size_t ind = 0; ind < n; ind++) + { + X(ind) = 2.0 * X(ind) + 1.0; + } + }; + + ctx.finalize(); + + // Check if data was properly written-back with a blocking kernel + check<<<16, 128>>>(dX, n); +} + +int main() +{ + run(); + run(); +} diff --git a/cudax/test/stf/interface/graph_use_device_data.cu b/cudax/test/stf/interface/graph_use_device_data.cu new file mode 100644 index 00000000000..0f05003209c --- /dev/null +++ b/cudax/test/stf/interface/graph_use_device_data.cu @@ -0,0 +1,100 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include +// #include + +using namespace cuda::experimental::stf; + +template +__global__ void axpy(int N, T a, T* x, T* y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int ind = tid; ind < N; ind += nthreads) + { + y[ind] += a * x[ind]; + } +} + +template +__global__ void setup_vectors(int N, T* x, T* y, T* z) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int ind = tid; ind < N; ind += nthreads) + { + x[ind] = 1.0 * ind; + y[ind] = 2.0 * ind - 3.0; + z[ind] = 7.0 * ind + 6.0; + } +} + +int main(int argc, char** argv) +{ + graph_ctx ctx; + const int N = 12; + const double alpha = 2.0; + + double *dX, *dY, *dZ; + + cuda_safe_call(cudaMalloc((void**) &dX, N * sizeof(double))); + cuda_safe_call(cudaMalloc((void**) &dY, N * sizeof(double))); + cuda_safe_call(cudaMalloc((void**) &dZ, N * sizeof(double))); + + // Use a kernel to setup values + setup_vectors<<<16, 16>>>(N, dX, dY, dZ); + cuda_safe_call(cudaDeviceSynchronize()); + + // We here provide device addresses and memory node 1 (which is assumed to + // be device 0) + auto handle_X = ctx.logical_data(make_slice(dX, N), data_place::device(0)); + auto handle_Y = ctx.logical_data(make_slice(dY, N), data_place::device(0)); + auto handle_Z = ctx.logical_data(make_slice(dZ, N), data_place::device(0)); + + // Y = Y + alpha X + ctx.task(handle_X.read(), handle_Y.rw())->*[&](cudaStream_t stream, auto dX, auto dY) { + axpy<<<16, 16, 0, stream>>>(N, alpha, dX.data_handle(), dY.data_handle()); + }; + + // Z = Z + alpha X + ctx.task(handle_X.read(), handle_Z.rw())->*[&](cudaStream_t stream, auto dX, auto dZ) { + axpy<<<16, 16, 0, stream>>>(N, alpha, dX.data_handle(), dZ.data_handle()); + }; + + // Z = Z + alpha Y + ctx.task(handle_Y.read(), handle_Z.rw())->*[&](cudaStream_t stream, auto dY, auto dZ) { + axpy<<<16, 16, 0, stream>>>(N, alpha, dY.data_handle(), dZ.data_handle()); + }; + + ctx.host_launch(handle_X.read(), handle_Y.read(), handle_Z.read())->*[&](auto hX, auto hY, auto hZ) { + for (size_t ind = 0; ind < N; ind++) + { + // X unchanged + EXPECT(fabs(hX(ind) - 1.0 * ind) < 0.00001); + // Y = Y + alpha X + EXPECT(fabs(hY(ind) - (-3.0 + ind * (2.0 + alpha))) < 0.00001); + // Z = Z + alpha (X + alpha Y) + EXPECT(fabs(hZ(ind) - ((6.0 - 3 * alpha) + ind * (7.0 + 3 * alpha + alpha * alpha))) < 0.00001); + } + }; + + ctx.submit(); + + if (argc > 1) + { + std::cout << "Generating DOT output in " << argv[1] << std::endl; + ctx.print_to_dot(argv[1]); + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/interface/mix_stream_and_graph.cu b/cudax/test/stf/interface/mix_stream_and_graph.cu new file mode 100644 index 00000000000..6da50cdf490 --- /dev/null +++ b/cudax/test/stf/interface/mix_stream_and_graph.cu @@ -0,0 +1,87 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Test that we can use a local graph context within a task of the + * stream backend + */ + +#include +#include + +using namespace cuda::experimental::stf; + +template +__global__ void setup(slice s) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (size_t ind = tid; ind < s.size(); ind += nthreads) + { + s(ind) = T(ind); + } +} + +template +__global__ void add(slice s, T val) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (size_t ind = tid; ind < s.size(); ind += nthreads) + { + s(ind) += val; + } +} + +int main() +{ + stream_ctx ctx; + constexpr int N = 12; + + int X[N]; + + for (int i = 0; i < N; i++) + { + X[i] = i; + } + + auto lX = ctx.logical_data(X); + + ctx.task(lX.rw())->*[](cudaStream_t stream, auto sX) { + graph_ctx gctx; + auto lX_alias = gctx.logical_data(sX, data_place::current_device()); + + // X(i) = (i + 17) + gctx.task(lX_alias.rw())->*[](cudaStream_t stream2, auto sX) { + add<<<16, 128, 0, stream2>>>(sX, 17); + }; + + // X(i) = 2*(i + 17) + 1 + gctx.host_launch(lX_alias.rw())->*[&](auto sX) { + for (int ind = 0; ind < N; ind++) + { + sX(ind) = 2 * sX(ind) + 1; + } + }; + + gctx.submit(stream); + // no sync ! + }; + + ctx.finalize(); + + for (int ind = 0; ind < N; ind++) + { + EXPECT(X[ind] == 2 * (ind + 17) + 1); + } +} diff --git a/cudax/test/stf/interface/mix_stream_and_graph_2.cu b/cudax/test/stf/interface/mix_stream_and_graph_2.cu new file mode 100644 index 00000000000..b3a9c8d47f3 --- /dev/null +++ b/cudax/test/stf/interface/mix_stream_and_graph_2.cu @@ -0,0 +1,119 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Test that we can use a local graph context within a task of the + * stream backend to create a CUDA graph that we can launch multiple + * times. 
+ */ + +#include +#include + +using namespace cuda::experimental::stf; + +template +__global__ void setup(slice s) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (size_t ind = tid; ind < s.size(); ind += nthreads) + { + s(ind) = T(ind); + } +} + +template +__global__ void add(slice s, T val) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (size_t ind = tid; ind < s.size(); ind += nthreads) + { + s(ind) += val; + } +} + +__global__ void slice_add(slice s_from, slice s_to) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (size_t ind = tid; ind < s_from.size(); ind += nthreads) + { + s_to(ind) += s_from(ind); + } +} + +int main() +{ + stream_ctx ctx; + const int N = 12; + const size_t K = 10; + + int X[N]; + int Y[N]; + + for (int i = 0; i < N; i++) + { + X[i] = i; + Y[i] = -i; + } + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + /* + * Create a CUDA graph from a single task, and launch it many times + */ + + ctx.task(lX.rw(), lY.rw())->*[&](cudaStream_t stream, auto sX, auto sY) { + graph_ctx gctx; + auto lX_alias = gctx.logical_data(sX, data_place::current_device()); + auto lY_alias = gctx.logical_data(sY, data_place::current_device()); + + for (size_t ii = 0; ii < 10; ii++) + { + gctx.task(lX_alias.rw())->*[](cudaStream_t stream2, auto sX) { + add<<<16, 128, 0, stream2>>>(sX, 17); + }; + gctx.task(lY_alias.rw())->*[](cudaStream_t stream2, auto sY) { + add<<<16, 128, 0, stream2>>>(sY, 17); + }; + gctx.task(lX_alias.read(), lY_alias.rw())->*[](cudaStream_t stream2, auto sX, auto sY) { + slice_add<<<16, 128, 0, stream2>>>(sX, sY); + }; + gctx.task(lX_alias.rw())->*[](cudaStream_t stream2, auto sX) { + add<<<16, 128, 0, stream2>>>(sX, 17); + }; + gctx.task(lY_alias.rw())->*[](cudaStream_t stream2, auto sY) { + add<<<16, 128, 0, stream2>>>(sY, 17); + }; + } + + // gctx.host_launch(lX_alias.rw())->*[&](auto sX) { + // for (size_t ind = 0; ind < N; ind++) { + // sX(ind) = 2 * sX(ind) + 1; + // } + // }; + + // gctx.print_to_dot("gctx" + std::to_string(iter)); + auto exec_graph = gctx.instantiate(); + for (size_t iter = 0; iter < K; iter++) + { + cudaGraphLaunch(*exec_graph, stream); + } + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/interface/move_operator.cu b/cudax/test/stf/interface/move_operator.cu new file mode 100644 index 00000000000..25901b28329 --- /dev/null +++ b/cudax/test/stf/interface/move_operator.cu @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
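+// A condensed sketch of the pattern from the mix_stream_and_graph*.cu tests
+// above: inside a stream-backend task, a local graph_ctx aliases the device
+// slices it received, records its own task graph, and either submits it on the
+// task's stream or instantiates it once and launches the executable graph
+// several times.
+//
+//   ctx.task(lX.rw(), lY.rw())->*[&](cudaStream_t stream, auto sX, auto sY) {
+//     graph_ctx gctx;
+//     auto lX_alias = gctx.logical_data(sX, data_place::current_device());
+//     auto lY_alias = gctx.logical_data(sY, data_place::current_device());
+//     // ... record gctx tasks on lX_alias / lY_alias ...
+//     auto exec_graph = gctx.instantiate();
+//     for (size_t iter = 0; iter < K; iter++)
+//     {
+//       cudaGraphLaunch(*exec_graph, stream);   // replay the recorded graph
+//     }
+//   };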
+// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +template +class foo +{ +public: + template + foo(Ctx& ctx, T* array, size_t n) + : h(ctx.logical_data(array, n)) + {} + +private: + logical_data_untyped h; +}; + +template +void run() +{ + Ctx ctx; + + const int N = 16; + double X[N], Y[N], Z[N]; + + for (size_t ind = 0; ind < N; ind++) + { + X[ind] = 0.0; + Y[ind] = 0.0; + Z[ind] = 0.0; + } + + // Move logical_data_untyped directly + logical_data_untyped h1 = ctx.logical_data(X); + logical_data_untyped h2(std::move(h1)); + + // Ensures the methodology used in the move ctor of logical_data_untyped is working + // with multiple handles... + logical_data_untyped h3 = ctx.logical_data(Y); + logical_data_untyped h4(std::move(h3)); + + // Make sure a class containing a logical_data_untyped is movable + foo A = foo(ctx, &Z[0], N); + foo B = std::move(A); + + ctx.finalize(); +} + +int main() +{ + run(); + run(); +} diff --git a/cudax/test/stf/interface/scal.cu b/cudax/test/stf/interface/scal.cu new file mode 100644 index 00000000000..ae1bf0c6d2d --- /dev/null +++ b/cudax/test/stf/interface/scal.cu @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +template +__global__ void scal(size_t n, T a, T* x) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (size_t ind = tid; ind < n; ind += nthreads) + { + x[ind] = a * x[ind]; + } +} + +double x_init(int i) +{ + return cos((double) i); +} + +template +void run() +{ + Ctx ctx; + const int n = 4096; + double X[n]; + + for (int ind = 0; ind < n; ind++) + { + X[ind] = x_init(ind); + } + + auto handle_X = ctx.logical_data(X); + + double alpha = 2.0; + int niter = 4; + for (int iter = 0; iter < niter; iter++) + { + ctx.task(handle_X.rw())->*[&](cudaStream_t s, auto sX) { + scal<<<16, 128, 0, s>>>(sX.size(), alpha, sX.data_handle()); + }; + } + + // Ask to use Y on the host + ctx.host_launch(handle_X.read())->*[&](auto sX) { + for (int ind = 0; ind < n; ind++) + { + EXPECT(fabs(sX(ind) - pow(alpha, niter) * (x_init(ind))) < 0.00001); + } + }; + + ctx.finalize(); +} + +int main() +{ + run(); + run(); +} diff --git a/cudax/test/stf/interface/scalar_div.cu b/cudax/test/stf/interface/scalar_div.cu new file mode 100644 index 00000000000..d773e87cec0 --- /dev/null +++ b/cudax/test/stf/interface/scalar_div.cu @@ -0,0 +1,124 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +static __global__ void scalar_div(const double* a, const double* b, double* c) +{ + *c = (*a) / (*b); +} + +static __global__ void scalar_minus(const double* a, double* res) +{ + *res = -(*a); +} + +/** + * This class is an example of class to issue tasks when accessing a scalar + * value. + * + * This is not meant to be the most efficient approach, but this is + * supposedly convenient. + * + */ +template +class scalar +{ +public: + scalar(Ctx* ctx, bool is_tmp = false) + : ctx(ctx) + { + size_t s = sizeof(double); + + if (is_tmp) + { + // There is no physical backing for this temporary vector + h_addr = NULL; + } + else + { + h_addr = (double*) malloc(s); + cuda_safe_call(cudaHostRegister(h_addr, s, cudaHostRegisterPortable)); + } + + data_place d = is_tmp ? data_place::invalid : data_place::host; + handle = ctx->logical_data(make_slice(h_addr), d); + } + + // Copy constructor + scalar(const scalar& a) + : ctx(a.ctx) + { + h_addr = NULL; + handle = ctx->logical_data(make_slice((double*) nullptr)); + + ctx->task(handle.write(), a.handle.read())->*[](cudaStream_t stream, auto dst, auto src) { + // There are likely much more efficient ways. + cuda_safe_call( + cudaMemcpyAsync(dst.data_handle(), src.data_handle(), sizeof(double), cudaMemcpyDeviceToDevice, stream)); + }; + } + + scalar operator/(scalar const& rhs) const + { + // Submit a task that computes this/rhs + scalar res(ctx); + + ctx->task(handle.read(), rhs.handle.read(), res.handle.write()) + ->*[](cudaStream_t stream, auto x, auto y1, auto result) { + scalar_div<<<1, 1, 0, stream>>>(x.data_handle(), y1.data_handle(), result.data_handle()); + }; + + return res; + } + + scalar operator-() const + { + // Submit a task that computes -s + scalar res(ctx); + ctx->task(handle.read(), res.handle.write())->*[](cudaStream_t stream, auto x, auto result) { + scalar_minus<<<1, 1, 0, stream>>>(x.data_handle(), result.data_handle()); + }; + + return res; + } + + Ctx* ctx; + mutable logical_data> handle; + double* h_addr; +}; + +template +void run() +{ + Ctx ctx; + scalar a(&ctx); + scalar b(&ctx); + + *a.h_addr = 42.0; + *b.h_addr = 12.3; + + scalar c = (-a) / b; + + ctx.host_launch(c.handle.read())->*[](auto x) { + EXPECT(fabs(*x.data_handle() - (-42.0) / 12.3) < 0.001); + }; + + ctx.finalize(); +} + +int main() +{ + run(); + run(); +} diff --git a/cudax/test/stf/interface/stream_add_callback.cu b/cudax/test/stf/interface/stream_add_callback.cu new file mode 100644 index 00000000000..1fb6f0d08b1 --- /dev/null +++ b/cudax/test/stf/interface/stream_add_callback.cu @@ -0,0 +1,50 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
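+// scalar_div.cu above wraps a single double in a small class whose arithmetic
+// operators each submit a task, so an expression like (-a) / b builds a
+// dependency chain instead of synchronizing. The core of the division operator,
+// condensed from the test (member handle is presumed to be a
+// logical_data<slice<double>>, as the template arguments are not visible here):
+//
+//   scalar operator/(scalar const& rhs) const
+//   {
+//     scalar res(ctx);   // result backed by a fresh logical data
+//     ctx->task(handle.read(), rhs.handle.read(), res.handle.write())
+//         ->*[](cudaStream_t stream, auto x, auto y, auto r) {
+//       scalar_div<<<1, 1, 0, stream>>>(x.data_handle(), y.data_handle(), r.data_handle());
+//     };
+//     return res;
+//   }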
+// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +void host_inc(cudaStream_t /*unused*/, cudaError_t /*unused*/, void* userData) +{ + /* Retrieve a pointer to the arguments and destroy it */ + int* var = static_cast(userData); + *var = *var + 1; +} + +int main() +{ + int cnt = 0; + + stream_ctx ctx; + auto h_cnt = ctx.logical_data(make_slice(&cnt, 1)); + + int NITER = 2; + for (int iter = 0; iter < NITER; iter++) + { + // Enqueue a dummy GPU task + ctx.task(h_cnt.rw())->*[&](cudaStream_t /*unused*/, auto /*unused*/) { + // no-op + }; + + // Enqueue a host callback + ctx.task(exec_place::host, h_cnt.rw())->*[&](cudaStream_t stream, auto s_cnt) { + cuda_safe_call(cudaStreamAddCallback(stream, host_inc, s_cnt.data_handle(), 0)); + cuda_safe_call(cudaGetLastError()); + }; + } + + // Ask to use Y on the host + ctx.host_launch(h_cnt.read())->*[&](auto s_cnt) { + EXPECT(s_cnt(0) == NITER); + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/local_stf/interop_cuda.cu b/cudax/test/stf/local_stf/interop_cuda.cu new file mode 100644 index 00000000000..ff1f8aa687d --- /dev/null +++ b/cudax/test/stf/local_stf/interop_cuda.cu @@ -0,0 +1,70 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief This example illustrates how to introduce CUDASTF contexts within existing stream-synchronized library calls + */ + +#include + +using namespace cuda::experimental::stf; + +// B += alpha*A; +__global__ void axpy(double alpha, double* d_ptrA, double* d_ptrB, size_t N) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < N; i += nthreads) + { + d_ptrB[i] += alpha * d_ptrA[i]; + } +} + +int main() +{ + double *d_ptrA, *d_ptrB; + const size_t N = 128 * 1024; + const size_t NITER = 128; + + // User allocated memory + cuda_safe_call(cudaMalloc(&d_ptrA, N * sizeof(double))); + cuda_safe_call(cudaMalloc(&d_ptrB, N * sizeof(double))); + + cudaStream_t stream; + cuda_safe_call(cudaStreamCreate(&stream)); + + async_resources_handle handle; + for (size_t i = 0; i < NITER; i++) + { + stream_ctx ctx(stream, handle); + auto lA = ctx.logical_data(make_slice(d_ptrA, N), data_place::current_device()); + auto lB = ctx.logical_data(make_slice(d_ptrB, N), data_place::current_device()); + + ctx.parallel_for(lA.shape(), lA.write())->*[] __device__(size_t i, auto a) { + a(i) = sin(double(i)); + }; + ctx.parallel_for(lB.shape(), lB.write())->*[] __device__(size_t i, auto b) { + b(i) = cos(double(i)); + }; + + ctx.task(lA.read(), lB.rw())->*[=](cudaStream_t s, auto a, auto b) { + axpy<<<128, 32, 0, s>>>(3.0, a.data_handle(), b.data_handle(), N); + }; + + // Note that this is non-blocking because we have creating the stream_ctx + // relative to a user-provided CUDA stream + ctx.finalize(); + + axpy<<<128, 32, 0, stream>>>(2.0, d_ptrA, d_ptrB, N); + } + cuda_safe_call(cudaStreamSynchronize(stream)); +} diff --git a/cudax/test/stf/local_stf/legacy_to_stf.cu b/cudax/test/stf/local_stf/legacy_to_stf.cu new file mode 100644 index 
00000000000..3373f39ec0b --- /dev/null +++ b/cudax/test/stf/local_stf/legacy_to_stf.cu @@ -0,0 +1,215 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +__global__ void initA(double* d_ptrA, size_t N) +{ + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + size_t nthreads = blockDim.x * gridDim.x; + for (size_t i = tid; i < N; i += nthreads) + { + d_ptrA[i] = sin((double) i); + } +} + +__global__ void initB(double* d_ptrB, size_t N) +{ + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + size_t nthreads = blockDim.x * gridDim.x; + for (size_t i = tid; i < N; i += nthreads) + { + d_ptrB[i] = cos((double) i); + } +} + +// B += alpha*A; +__global__ void axpy(double alpha, double* d_ptrA, double* d_ptrB, size_t N) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < N; i += nthreads) + { + d_ptrB[i] += alpha * d_ptrA[i]; + } +} + +__global__ void empty_kernel() +{ + // no-op +} + +double X0(int i) +{ + return sin((double) i); +} + +double Y0(int i) +{ + return cos((double) i); +} + +void ref_lib_call(cudaStream_t stream, double* d_ptrA, double* d_ptrB, size_t N) +{ + initA<<<128, 32, 0, stream>>>(d_ptrA, N); + initB<<<128, 32, 0, stream>>>(d_ptrB, N); + axpy<<<128, 32, 0, stream>>>(3.0, d_ptrA, d_ptrB, N); + empty_kernel<<<16, 8, 0, stream>>>(); +} + +void lib_call(cudaStream_t stream, double* d_ptrA, double* d_ptrB, size_t N) +{ + stream_ctx ctx(stream); + auto lA = ctx.logical_data(make_slice(d_ptrA, N), data_place::current_device()); + auto lB = ctx.logical_data(make_slice(d_ptrB, N), data_place::current_device()); + ctx.task(lA.write())->*[=](cudaStream_t s, auto a) { + initA<<<128, 32, 0, s>>>(a.data_handle(), N); + }; + + ctx.task(lB.write())->*[=](cudaStream_t s, auto b) { + initB<<<128, 32, 0, s>>>(b.data_handle(), N); + }; + + ctx.task(lA.read(), lB.rw())->*[=](cudaStream_t s, auto a, auto b) { + axpy<<<128, 32, 0, s>>>(3.0, a.data_handle(), b.data_handle(), N); + }; + + ctx.task()->*[](cudaStream_t s) { + empty_kernel<<<16, 8, 0, s>>>(); + }; + + // Note that this is non-blocking because we have creating the stream_ctx + // relative to a user-provided CUDA stream + ctx.finalize(); +} + +void lib_call_with_handle(async_resources_handle& handle, cudaStream_t stream, double* d_ptrA, double* d_ptrB, size_t N) +{ + stream_ctx ctx(stream, handle); + auto lA = ctx.logical_data(make_slice(d_ptrA, N), data_place::current_device()); + auto lB = ctx.logical_data(make_slice(d_ptrB, N), data_place::current_device()); + ctx.task(lA.write())->*[=](cudaStream_t s, auto a) { + initA<<<128, 32, 0, s>>>(a.data_handle(), N); + }; + + ctx.task(lB.write())->*[=](cudaStream_t s, auto b) { + initB<<<128, 32, 0, s>>>(b.data_handle(), N); + }; + + ctx.task(lA.read(), lB.rw())->*[=](cudaStream_t s, auto a, auto b) { + axpy<<<128, 32, 0, s>>>(3.0, a.data_handle(), b.data_handle(), N); + }; + + ctx.task()->*[](cudaStream_t s) { + empty_kernel<<<16, 8, 0, s>>>(); + }; + + // Note that this is non-blocking because we have creating the stream_ctx + // 
relative to a user-provided CUDA stream + ctx.finalize(); +} + +template <typename Ctx_t> +void lib_call_generic(async_resources_handle& handle, cudaStream_t stream, double* d_ptrA, double* d_ptrB, size_t N) +{ + Ctx_t ctx(stream, handle); + auto lA = ctx.logical_data(make_slice(d_ptrA, N), data_place::current_device()); + auto lB = ctx.logical_data(make_slice(d_ptrB, N), data_place::current_device()); + ctx.task(lA.write())->*[=](cudaStream_t s, auto a) { + initA<<<128, 32, 0, s>>>(a.data_handle(), N); + }; + + ctx.task(lB.write())->*[=](cudaStream_t s, auto b) { + initB<<<128, 32, 0, s>>>(b.data_handle(), N); + }; + + ctx.task(lA.read(), lB.rw())->*[=](cudaStream_t s, auto a, auto b) { + axpy<<<128, 32, 0, s>>>(3.0, a.data_handle(), b.data_handle(), N); + }; + + ctx.task()->*[](cudaStream_t s) { + empty_kernel<<<16, 8, 0, s>>>(); + }; + + ctx.submit(); +} + +int main() +{ + double *d_ptrA, *d_ptrB; + const size_t N = 128 * 1024; + const size_t NITER = 128; + + // User allocated memory + cuda_safe_call(cudaMalloc(&d_ptrA, N * sizeof(double))); + cuda_safe_call(cudaMalloc(&d_ptrB, N * sizeof(double))); + + cudaStream_t stream; + cuda_safe_call(cudaStreamCreate(&stream)); + + nvtxRangePushA("warmup"); + for (size_t i = 0; i < NITER; i++) + { + ref_lib_call(stream, d_ptrA, d_ptrB, N); + } + cuda_safe_call(cudaStreamSynchronize(stream)); + nvtxRangePop(); + + nvtxRangePushA("ref"); + for (size_t i = 0; i < NITER; i++) + { + ref_lib_call(stream, d_ptrA, d_ptrB, N); + } + cuda_safe_call(cudaStreamSynchronize(stream)); + nvtxRangePop(); + + nvtxRangePushA("local stf"); + for (size_t i = 0; i < NITER; i++) + { + lib_call(stream, d_ptrA, d_ptrB, N); + } + cuda_safe_call(cudaStreamSynchronize(stream)); + nvtxRangePop(); + + nvtxRangePushA("local stf handle"); + async_resources_handle handle; + for (size_t i = 0; i < NITER; i++) + { + lib_call_with_handle(handle, stream, d_ptrA, d_ptrB, N); + } + cuda_safe_call(cudaStreamSynchronize(stream)); + nvtxRangePop(); + + nvtxRangePushA("generic graph handle"); + for (size_t i = 0; i < NITER; i++) + { + lib_call_generic<graph_ctx>(handle, stream, d_ptrA, d_ptrB, N); + } + cuda_safe_call(cudaStreamSynchronize(stream)); + + nvtxRangePushA("generic stream handle"); + for (size_t i = 0; i < NITER; i++) + { + lib_call_generic<stream_ctx>(handle, stream, d_ptrA, d_ptrB, N); + } + cuda_safe_call(cudaStreamSynchronize(stream)); + + nvtxRangePushA("generic context handle"); + for (size_t i = 0; i < NITER; i++) + { + lib_call_generic<context>(handle, stream, d_ptrA, d_ptrB, N); + } + cuda_safe_call(cudaStreamSynchronize(stream)); + + nvtxRangePop(); +} diff --git a/cudax/test/stf/loop_dispatch/dispatch_on_streams.cu b/cudax/test/stf/loop_dispatch/dispatch_on_streams.cu new file mode 100644 index 00000000000..d49bb96d1b0 --- /dev/null +++ b/cudax/test/stf/loop_dispatch/dispatch_on_streams.cu @@ -0,0 +1,44 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
+// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +int main() +{ +#if CUDA_VERSION < 12040 + fprintf(stderr, "Green contexts are not supported by this version of CUDA: skipping test.\n"); + return 0; +#else + + context ctx; + + auto lX = ctx.logical_data(size_t(32 * 1024 * 1024)); + + ctx.parallel_for(lX.shape(), lX.write())->*[] __device__(size_t i, auto x) { + x(i) = 3 * i - 7; + }; + + for (auto& sub_place : + place_partition(ctx.async_resources(), exec_place::current_device(), place_partition_scope::green_context)) + { + for (size_t i = 0; i < 4; i++) + { + ctx.parallel_for(sub_place, lX.shape(), lX.rw())->*[] __device__(size_t i, auto x) { + x(i) += 1; + }; + } + } + + ctx.finalize(); +#endif +} diff --git a/cudax/test/stf/loop_dispatch/loop_dispatch.cu b/cudax/test/stf/loop_dispatch/loop_dispatch.cu new file mode 100644 index 00000000000..86369203896 --- /dev/null +++ b/cudax/test/stf/loop_dispatch/loop_dispatch.cu @@ -0,0 +1,41 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +int main() +{ + context ctx; + + // Loop count + int n = 1024; + + auto lB = ctx.logical_data(size_t(1024 * 1024)); + ctx.parallel_for(lB.shape(), lB.write())->*[] __device__(size_t i, auto b) { + b(i) = 42; + }; + + loop_dispatch(ctx, exec_place::all_devices(), 0, n, [&](size_t) { + auto lA = ctx.logical_data(size_t(1024 * 1024)); + + ctx.parallel_for(ctx.current_exec_place(), lA.shape(), lA.write())->*[] __device__(size_t i, auto a) { + a(i) = (int) (10.0 * cos((double) i)); + }; + + ctx.parallel_for(ctx.current_exec_place(), lA.shape(), lA.rw(), lB.read())->*[] __device__(size_t i, auto a, auto b) { + a(i) += b(i); + }; + }); + + ctx.finalize(); +} diff --git a/cudax/test/stf/loop_dispatch/nested_loop_dispatch.cu b/cudax/test/stf/loop_dispatch/nested_loop_dispatch.cu new file mode 100644 index 00000000000..29c8a455cb7 --- /dev/null +++ b/cudax/test/stf/loop_dispatch/nested_loop_dispatch.cu @@ -0,0 +1,52 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +using namespace cuda::experimental::stf; + +int main() +{ + context ctx; + + auto lB = ctx.logical_data(size_t(1024 * 1024)); + + ctx.parallel_for(lB.shape(), lB.write())->*[] __device__(size_t i, auto b) { + b(i) = 42; + }; + + // A fake grid which should work regardless of the underlying machine + auto grid = exec_place::repeat(exec_place::current_device(), 8); + + // Split the affinity into 4 parts + loop_dispatch(ctx, grid, place_partition_scope::cuda_device, 0, 4, [&](size_t) { + // We should have 2 places per subplace + EXPECT(ctx.current_affinity().size() == 2); + + // This should use ctx.current_affinity() implicitly + loop_dispatch(ctx, 0, 4, [&](size_t) { + auto lA = ctx.logical_data(size_t(1024 * 1024)); + + ctx.parallel_for(ctx.current_exec_place(), lA.shape(), lA.write())->*[] __device__(size_t i, auto a) { + a(i) = (int) (10.0 * cuda::std::cos((double) i)); + }; + + ctx.parallel_for(ctx.current_exec_place(), lA.shape(), lA.rw(), lB.read()) + ->*[] __device__(size_t i, auto a, auto b) { + a(i) += b(i); + }; + }); + }); + + ctx.finalize(); +} diff --git a/cudax/test/stf/parallel_for/fdtd.cu b/cudax/test/stf/parallel_for/fdtd.cu new file mode 100644 index 00000000000..0e97d2e7afd --- /dev/null +++ b/cudax/test/stf/parallel_for/fdtd.cu @@ -0,0 +1,254 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +// FIXME : MSVC has trouble with box constructors +#if !defined(_CCCL_COMPILER_MSVC) +void write_vtk_2D(const std::string& filename, slice Ez, double dx, double dy, double /*unused*/) +{ + FILE* f = fopen(filename.c_str(), "w"); + + const size_t pos_z = Ez.extent(2) / 2; + const size_t nx = Ez.extent(0); + + const size_t size = Ez.extent(0) * Ez.extent(1); + + fprintf(f, "# vtk DataFile Version 3.0\n"); + fprintf(f, "vtk output\n"); + fprintf(f, "ASCII\n"); + fprintf(f, "DATASET UNSTRUCTURED_GRID\n"); + fprintf(f, "POINTS %ld float\n", 4 * size); + + for (size_t y = 0; y < Ez.extent(1); y++) + { + for (size_t x = 0; x < Ez.extent(0); x++) + { + fprintf(f, "%lf %lf 0.0\n", dx * static_cast(x + 0), dy * static_cast(y + 0)); + fprintf(f, "%lf %lf 0.0\n", dx * static_cast(x + 1), dy * static_cast(y + 0)); + fprintf(f, "%lf %lf 0.0\n", dx * static_cast(x + 1), dy * static_cast(y + 1)); + fprintf(f, "%lf %lf 0.0\n", dx * static_cast(x + 0), dy * static_cast(y + 1)); + } + } + + fprintf(f, "CELLS %ld %ld\n", size, 5 * size); + + size_t cell_id = 0; + for (size_t y = 0; y < Ez.extent(1); y++) + { + for (size_t x = 0; x < Ez.extent(0); x++) + { + const size_t point_offset = cell_id * 4; + fprintf(f, + "4 %d %d %d %d\n", + (int) (point_offset + 0), + (int) (point_offset + 1), + (int) (point_offset + 2), + (int) (point_offset + 3)); + + cell_id++; + } + } + + fprintf(f, "CELL_TYPES %ld\n", size); + + for (size_t ii = 0; ii < size; ii++) + { + fprintf(f, "5\n"); + } + + fprintf(f, "CELL_DATA %ld\n", size); + fprintf(f, "SCALARS Ez double 1\n"); + fprintf(f, "LOOKUP_TABLE default\n"); + + for (size_t y = 0; y < Ez.extent(1); y++) + { + for (size_t x = 0; x < Ez.extent(0); x++) + { + fprintf(f, "%lf\n", Ez(x, y, pos_z)); + } + } + + fclose(f); +} + +// Define the source function +__device__ double Source(double t, double x, double y, double z) +{ + constexpr double pi = 3.14159265358979323846; + constexpr double freq = 1e9; + constexpr double omega = (2 * pi * freq); + constexpr double wavelength = 3e8 / freq; + constexpr double k = 2 * pi / wavelength; + return sin(k * x - omega * t); +} +#endif // !defined(_CCCL_COMPILER_MSVC) + +int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) +{ +#if !defined(_CCCL_COMPILER_MSVC) + stream_ctx ctx; + + // Domain dimensions + const size_t SIZE_X = 100; + const size_t SIZE_Y = 100; + const size_t SIZE_Z = 100; + + // Grid spacing + const double DX = 0.01; + const double DY = 0.01; + const double DZ = 0.01; + + // Define the electric and magnetic fields + auto data_shape = shape_of>(SIZE_X, SIZE_Y, SIZE_Z); + auto lEx = ctx.logical_data(data_shape); + auto lEy = ctx.logical_data(data_shape); + auto lEz = ctx.logical_data(data_shape); + auto lHx = ctx.logical_data(data_shape); + auto lHy = ctx.logical_data(data_shape); + auto lHz = ctx.logical_data(data_shape); + + // Define the permittivity and permeability of the medium + auto lepsilon = ctx.logical_data(data_shape); + auto lmu = ctx.logical_data(data_shape); + + const double EPSILON = 8.85e-12; // Permittivity of free space + const double MU = 1.256e-6; // Permeability of free space + + // CFL condition DT <= min(DX, DY, DZ) * sqrt(epsilon_max * mu_max) + double DT = 0.25 * min(min(DX, DY), DZ) * sqrt(EPSILON * MU); + + // Initialize E + ctx.parallel_for(data_shape, lEx.write(), lEy.write(), lEz.write()) + ->*[] _CCCL_DEVICE(size_t i, size_t j, 
size_t k, auto Ex, auto Ey, auto Ez) { + Ex(i, j, k) = 0.0; + Ey(i, j, k) = 0.0; + Ez(i, j, k) = 0.0; + }; + + // Initialize H + ctx.parallel_for(data_shape, lHx.write(), lHy.write(), lHz.write()) + ->*[] _CCCL_DEVICE(size_t i, size_t j, size_t k, auto Hx, auto Hy, auto Hz) { + Hx(i, j, k) = 0.0; + Hy(i, j, k) = 0.0; + Hz(i, j, k) = 0.0; + }; + + // Initialize permittivity and permeability fields + ctx.parallel_for(data_shape, lepsilon.write(), lmu.write()) + ->*[=] _CCCL_DEVICE(size_t i, size_t j, size_t k, auto epsilon, auto mu) { + epsilon(i, j, k) = EPSILON; + mu(i, j, k) = MU; + }; + + // Set the source function at the center of the grid + + const size_t center_x = SIZE_X / 2; + const size_t center_y = SIZE_Y / 2; + const size_t center_z = SIZE_Z / 2; + + // Initialize the time loop + size_t timesteps = 10; + if (argc > 1) + { + timesteps = (size_t) atol(argv[1]); + } + + // No output by default + int64_t output_freq = -1; + if (argc > 2) + { + output_freq = (int64_t) atol(argv[2]); + } + + /* Index shapes for Electric fields, Magnetic fields, and the indices where there is a source */ + box<3> Es({1ul, SIZE_X - 1}, {1ul, SIZE_Y - 1}, {1ul, SIZE_Z - 1}); + box<3> Hs({0ul, SIZE_X - 1}, {0ul, SIZE_Y - 1}, {0ul, SIZE_Z - 1}); + box<3> source_s({center_x, center_x + 1}, {center_y, center_y + 1}, {center_z, center_z + 1}); + + for (size_t n = 0; n < timesteps; n++) + { + // Update the electric fields + + // Update Ex + ctx.parallel_for(Es, lEx.rw(), lHy.read(), lHz.read(), lepsilon.read()) + ->*[=] + _CCCL_DEVICE(size_t i, size_t j, size_t k, auto Ex, auto Hy, auto Hz, auto epsilon) { + Ex(i, j, k) = Ex(i, j, k) + + (DT / (epsilon(i, j, k) * DX)) * (Hz(i, j, k) - Hz(i, j - 1, k) - Hy(i, j, k) + Hy(i, j, k - 1)); + }; + + // Update Ey + ctx.parallel_for(Es, lEy.rw(), lHx.read(), lHz.read(), lepsilon.read()) + ->*[=] + _CCCL_DEVICE(size_t i, size_t j, size_t k, auto Ey, auto Hx, auto Hz, auto epsilon) { + Ey(i, j, k) = Ey(i, j, k) + + (DT / (epsilon(i, j, k) * DY)) * (Hx(i, j, k) - Hx(i, j, k - 1) - Hz(i, j, k) + Hz(i - 1, j, k)); + }; + + // Update Ez + ctx.parallel_for(Es, lEz.rw(), lHx.read(), lHy.read(), lepsilon.read()) + ->*[=] + _CCCL_DEVICE(size_t i, size_t j, size_t k, auto Ez, auto Hx, auto Hy, auto epsilon) { + Ez(i, j, k) = Ez(i, j, k) + + (DT / (epsilon(i, j, k) * DZ)) * (Hy(i, j, k) - Hy(i - 1, j, k) - Hx(i, j, k) + Hx(i, j - 1, k)); + }; + + // Add the source function at the center of the grid + ctx.parallel_for(source_s, lEz.rw())->*[=] _CCCL_DEVICE(size_t i, size_t j, size_t k, auto Ez) { + Ez(i, j, k) = Ez(i, j, k) + Source(n * DT, i * DX, j * DY, k * DZ); + }; + + // Update the magnetic fields + + // Update Hx + ctx.parallel_for(Hs, lHx.rw(), lEy.read(), lEz.read(), lmu.read()) + ->*[=] _CCCL_DEVICE(size_t i, size_t j, size_t k, auto Hx, auto Ey, auto Ez, auto mu) { + Hx(i, j, k) = Hx(i, j, k) + - (DT / (mu(i, j, k) * DY)) * (Ez(i, j + 1, k) - Ez(i, j, k) - Ey(i, j, k + 1) + Ey(i, j, k)); + }; + + // Update Hy + ctx.parallel_for(Hs, lHy.rw(), lEx.read(), lEz.read(), lmu.read()) + ->*[=] _CCCL_DEVICE(size_t i, size_t j, size_t k, auto Hy, auto Ex, auto Ez, auto mu) { + Hy(i, j, k) = Hy(i, j, k) + - (DT / (mu(i, j, k) * DZ)) * (Ex(i, j, k + 1) - Ex(i, j, k) - Ez(i + 1, j, k) + Ez(i, j, k)); + }; + + // Update Hz + ctx.parallel_for(Hs, lHz.rw(), lEx.read(), lEy.read(), lmu.read()) + ->*[=] _CCCL_DEVICE(size_t i, size_t j, size_t k, auto Hz, auto Ex, auto Ey, auto mu) { + Hz(i, j, k) = Hz(i, j, k) + - (DT / (mu(i, j, k) * DX)) * (Ey(i + 1, j, k) - Ey(i, j, k) - Ex(i, j + 1, k) 
+ Ex(i, j, k)); + }; + + if (output_freq > 0) + { + ctx.host_launch(lEz.read())->*[=](auto Ez) { + // Output the electric field at the center of the grid + fprintf(stderr, "%ld\t%le\n", n, Ez(center_x, center_y, center_z)); + + if (output_freq > 0 && n % output_freq == 0) + { + std::string filename = "Ez" + std::to_string(n) + ".vtk"; + + // Dump a 2D slice of Ez in VTK + write_vtk_2D(filename, Ez, DX, DY, DZ); + } + }; + } + } + + ctx.finalize(); +#endif // !defined(_CCCL_COMPILER_MSVC) +} diff --git a/cudax/test/stf/parallel_for/fdtd_acc.cpp b/cudax/test/stf/parallel_for/fdtd_acc.cpp new file mode 100644 index 00000000000..272b3088bae --- /dev/null +++ b/cudax/test/stf/parallel_for/fdtd_acc.cpp @@ -0,0 +1,300 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Reference OpenACC implementation of the FDTD example + */ +#include + +#include +#include +#include + +#define Ex(i, j, k) (Ex_array[(i) + (j) * SIZE_X + (k) * SIZE_X * SIZE_Y]) +#define Ey(i, j, k) (Ey_array[(i) + (j) * SIZE_X + (k) * SIZE_X * SIZE_Y]) +#define Ez(i, j, k) (Ez_array[(i) + (j) * SIZE_X + (k) * SIZE_X * SIZE_Y]) +#define Hx(i, j, k) (Hx_array[(i) + (j) * SIZE_X + (k) * SIZE_X * SIZE_Y]) +#define Hy(i, j, k) (Hy_array[(i) + (j) * SIZE_X + (k) * SIZE_X * SIZE_Y]) +#define Hz(i, j, k) (Hz_array[(i) + (j) * SIZE_X + (k) * SIZE_X * SIZE_Y]) +#define mu(i, j, k) (mu_array[(i) + (j) * SIZE_X + (k) * SIZE_X * SIZE_Y]) +#define epsilon(i, j, k) (epsilon_array[(i) + (j) * SIZE_X + (k) * SIZE_X * SIZE_Y]) + +void write_vtk_2D( + const std::string& filename, + double* Ez_array, + double dx, + double dy, + double dz, + size_t SIZE_X, + size_t SIZE_Y, + size_t SIZE_Z) +{ + FILE* f = fopen(filename.c_str(), "w"); + + const size_t pos_z = SIZE_Z / 2; + const size_t size = SIZE_X * SIZE_Y; + + fprintf(f, "# vtk DataFile Version 3.0\n"); + fprintf(f, "vtk output\n"); + fprintf(f, "ASCII\n"); + fprintf(f, "DATASET UNSTRUCTURED_GRID\n"); + fprintf(f, "POINTS %ld float\n", 4 * size); + + for (size_t y = 0; y < SIZE_Y; y++) + { + for (size_t x = 0; x < SIZE_X; x++) + { + fprintf(f, "%lf %lf 0.0\n", dx * static_cast(x + 0), dy * static_cast(y + 0)); + fprintf(f, "%lf %lf 0.0\n", dx * static_cast(x + 1), dy * static_cast(y + 0)); + fprintf(f, "%lf %lf 0.0\n", dx * static_cast(x + 1), dy * static_cast(y + 1)); + fprintf(f, "%lf %lf 0.0\n", dx * static_cast(x + 0), dy * static_cast(y + 1)); + } + } + + fprintf(f, "CELLS %ld %ld\n", size, 5 * size); + + size_t cell_id = 0; + for (size_t y = 0; y < SIZE_Y; y++) + { + for (size_t x = 0; x < SIZE_X; x++) + { + const size_t point_offset = cell_id * 4; + fprintf(f, + "4 %d %d %d %d\n", + (int) (point_offset + 0), + (int) (point_offset + 1), + (int) (point_offset + 2), + (int) (point_offset + 3)); + + cell_id++; + } + } + + fprintf(f, "CELL_TYPES %ld\n", size); + + for (size_t ii = 0; ii < size; ii++) + { + fprintf(f, "5\n"); + } + + fprintf(f, "CELL_DATA %ld\n", size); + fprintf(f, "SCALARS Ez double 1\n"); + fprintf(f, "LOOKUP_TABLE default\n"); + + for (size_t y = 0; y < SIZE_Y; y++) + { + for (size_t x = 0; x < SIZE_X; x++) + { + 
fprintf(f, "%lf\n", Ez(x, y, pos_z)); + } + } + + fclose(f); +} + +// Define the source function +double Source(double t, double x, double y, double z) +{ + const double pi = 3.14159265358979323846; + const double freq = 1e9; + const double omega = (2 * pi * freq); + const double wavelength = 3e8 / freq; + const double k = 2 * pi / wavelength; + return sin(k * x - omega * t); +} + +int main(int argc, char** argv) +{ + // Domain dimensions + const size_t SIZE_X = 100; + const size_t SIZE_Y = 100; + const size_t SIZE_Z = 100; + + // Grid spacing + const double DX = 0.01; + const double DY = 0.01; + const double DZ = 0.01; + + // Define the electric and magnetic fields + double* Ex_array = new double[SIZE_X * SIZE_Y * SIZE_Z]; + double* Ey_array = new double[SIZE_X * SIZE_Y * SIZE_Z]; + double* Ez_array = new double[SIZE_X * SIZE_Y * SIZE_Z]; + double* Hx_array = new double[SIZE_X * SIZE_Y * SIZE_Z]; + double* Hy_array = new double[SIZE_X * SIZE_Y * SIZE_Z]; + double* Hz_array = new double[SIZE_X * SIZE_Y * SIZE_Z]; + + // Define the permittivity and permeability of the medium + double* epsilon_array = new double[SIZE_X * SIZE_Y * SIZE_Z]; + double* mu_array = new double[SIZE_X * SIZE_Y * SIZE_Z]; + + const double EPSILON = 8.85e-12; // Permittivity of free space + const double MU = 1.256e-6; // Permeability of free space + + // CFL condition DT <= min(DX, DY, DZ) * sqrt(epsilon_max * mu_max) + double DT = 0.25 * std::min(std::min(DX, DY), DZ) * sqrt(EPSILON * MU); + +// Initialize the fields, permittivity, permeability, and conductivity +#pragma acc parallel loop collapse(3) + for (size_t k = 0; k < SIZE_Z; k++) + { + for (size_t j = 0; j < SIZE_Y; j++) + { + for (size_t i = 0; i < SIZE_X; i++) + { + Ex(i, j, k) = 0.0; + Ey(i, j, k) = 0.0; + Ez(i, j, k) = 0.0; + Hx(i, j, k) = 0.0; + Hy(i, j, k) = 0.0; + Hz(i, j, k) = 0.0; + mu(i, j, k) = MU; + epsilon(i, j, k) = EPSILON; + } + } + } + + // Set the source function at the center of the grid + const size_t center_x = SIZE_X / 2; + const size_t center_y = SIZE_Y / 2; + const size_t center_z = SIZE_Z / 2; + + // Initialize the time loop + size_t timesteps = 1000; + if (argc > 1) + { + timesteps = (size_t) atol(argv[1]); + } + + // No output by default + ssize_t output_freq = -1; + if (argc > 2) + { + output_freq = (ssize_t) atol(argv[2]); + } + + for (size_t n = 0; n < timesteps; n++) + { +// Update the electric fields + +// Update Ex +#pragma acc parallel loop collapse(3) async(1) + for (size_t k = 1; k < SIZE_Z - 1; k++) + { + for (size_t j = 1; j < SIZE_Y - 1; j++) + { + for (size_t i = 1; i < SIZE_X - 1; i++) + { + Ex(i, j, k) = + Ex(i, j, k) + + (DT / (epsilon(i, j, k) * DX)) * (Hz(i, j, k) - Hz(i, j - 1, k) - Hy(i, j, k) + Hy(i, j, k - 1)); + } + } + } + +#pragma acc parallel loop collapse(3) async(2) + for (size_t k = 1; k < SIZE_Z - 1; k++) + { + for (size_t j = 1; j < SIZE_Y - 1; j++) + { + for (size_t i = 1; i < SIZE_X - 1; i++) + { + Ey(i, j, k) = + Ey(i, j, k) + + (DT / (epsilon(i, j, k) * DY)) * (Hx(i, j, k) - Hx(i, j, k - 1) - Hz(i, j, k) + Hz(i - 1, j, k)); + } + } + } + +#pragma acc parallel loop collapse(3) async(3) + for (size_t k = 1; k < SIZE_Z - 1; k++) + { + for (size_t j = 1; j < SIZE_Y - 1; j++) + { + for (size_t i = 1; i < SIZE_X - 1; i++) + { + Ez(i, j, k) = + Ez(i, j, k) + + (DT / (epsilon(i, j, k) * DZ)) * (Hy(i, j, k) - Hy(i - 1, j, k) - Hx(i, j, k) + Hx(i, j - 1, k)); + + // Add the source function at the center of the grid + if (i == center_x && j == center_y && k == center_z) + { + Ez(i, j, k) = Ez(i, j, k) + 
Source(n * DT, i * DX, j * DY, k * DZ); + } + } + } + } + +#pragma acc wait(1) +#pragma acc wait(2) +#pragma acc wait(3) + +#pragma acc parallel loop collapse(3) async(1) + for (size_t k = 0; k < SIZE_Z - 1; k++) + { + for (size_t j = 0; j < SIZE_Y - 1; j++) + { + for (size_t i = 0; i < SIZE_X - 1; i++) + { + Hx(i, j, k) = + Hx(i, j, k) - (DT / (mu(i, j, k) * DY)) * (Ez(i, j + 1, k) - Ez(i, j, k) - Ey(i, j, k + 1) + Ey(i, j, k)); + } + } + } + +#pragma acc parallel loop collapse(3) async(2) + for (size_t k = 0; k < SIZE_Z - 1; k++) + { + for (size_t j = 0; j < SIZE_Y - 1; j++) + { + for (size_t i = 0; i < SIZE_X - 1; i++) + { + Hy(i, j, k) = + Hy(i, j, k) - (DT / (mu(i, j, k) * DZ)) * (Ex(i, j, k + 1) - Ex(i, j, k) - Ez(i + 1, j, k) + Ez(i, j, k)); + } + } + } + +#pragma acc parallel loop collapse(3) async(3) + for (size_t k = 0; k < SIZE_Z - 1; k++) + { + for (size_t j = 0; j < SIZE_Y - 1; j++) + { + for (size_t i = 0; i < SIZE_X - 1; i++) + { + Hz(i, j, k) = + Hz(i, j, k) - (DT / (mu(i, j, k) * DX)) * (Ey(i + 1, j, k) - Ey(i, j, k) - Ex(i, j + 1, k) + Ex(i, j, k)); + } + } + } + +#pragma acc wait(1) +#pragma acc wait(2) +#pragma acc wait(3) + + if (output_freq > 0) + { + // Output the electric field at the center of the grid + fprintf(stderr, "%ld\t%le\n", n, Ez(center_x, center_y, center_z)); + + if (output_freq > 0 && n % output_freq == 0) + { + std::string filename = "Ez" + std::to_string(n) + ".vtk"; + + // Dump a 2D slice of Ez in VTK + write_vtk_2D(filename, Ez_array, DX, DY, DZ, SIZE_X, SIZE_Y, SIZE_Z); + } + } + } + + return 0; +} diff --git a/cudax/test/stf/parallel_for/parallel_for_all_devs.cu b/cudax/test/stf/parallel_for/parallel_for_all_devs.cu new file mode 100644 index 00000000000..5d856d57de5 --- /dev/null +++ b/cudax/test/stf/parallel_for/parallel_for_all_devs.cu @@ -0,0 +1,82 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +template +__global__ void axpy(size_t start, size_t cnt, T a, const T* x, T* y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int ind = tid; ind < cnt; ind += nthreads) + { + y[ind + start] += a * x[ind + start]; + } +} + +double X0(size_t i) +{ + return sin((double) i); +} + +double Y0(size_t i) +{ + return cos((double) i); +} + +int main() +{ + stream_ctx ctx; + + const int N = 1024 * 1024 * 32; + double *X, *Y; + + X = new double[N]; + Y = new double[N]; + + for (size_t ind = 0; ind < N; ind++) + { + X[ind] = X0(ind); + Y[ind] = Y0(ind); + } + + auto handle_X = ctx.logical_data(X, {N}); + auto handle_Y = ctx.logical_data(Y, {N}); + + auto all_devs = exec_place::all_devices(); + + double alpha = 3.14; + + /* Compute Y = Y + alpha X */ + ctx.parallel_for(tiled_partition<1024 * 1024>(), all_devs, handle_X.shape(), handle_X.read(), handle_Y.rw()) + ->*[=] _CCCL_DEVICE(size_t i, auto sX, auto sY) { + sY(i) += alpha * sX(i); + }; + + /* Check the result on the host */ + ctx.task(exec_place::host, handle_X.read(), handle_Y.read())->*[&](cudaStream_t s, auto sX, auto sY) { + cuda_safe_call(cudaStreamSynchronize(s)); + + for (size_t ind = 0; ind < N; ind++) + { + // Y should be Y0 + alpha X0 + EXPECT(fabs(sY(ind) - (Y0(ind) + alpha * X0(ind))) < 0.0001); + + // X should be X0 + EXPECT(fabs(sX(ind) - X0(ind)) < 0.0001); + } + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/parallel_for/parallel_for_box.cu b/cudax/test/stf/parallel_for/parallel_for_box.cu new file mode 100644 index 00000000000..78b9b70d8b6 --- /dev/null +++ b/cudax/test/stf/parallel_for/parallel_for_box.cu @@ -0,0 +1,91 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +int main() +{ + stream_ctx ctx; + + const size_t N = 16; + const size_t M = 16; + + /* + * First test an explicit shape by only specifying its size + */ + + std::vector A(M * N); + auto lA = ctx.logical_data(make_slice(&A[0], std::tuple{M, N}, M)); + + ctx.parallel_for(lA.shape(), lA.write())->*[=] _CCCL_DEVICE(size_t i, size_t j, auto sA) { + sA(i, j) = 42.0; + }; + + // Create a subset of a limited size + box subset_shape(3, 4); + ctx.parallel_for(subset_shape, lA.rw())->*[=] _CCCL_DEVICE(size_t i, size_t j, auto sA) { + sA(i, j) = 13.0; + }; + + bool checkedA = false; + ctx.host_launch(lA.rw())->*[&checkedA](auto sA) { + for (size_t j = 0; j < sA.extent(1); j++) + { + for (size_t i = 0; i < sA.extent(0); i++) + { + double expected = (i < 3 && j < 4) ? 
13.0 : 42.0; + if (sA(i, j) != expected) + { + fprintf(stderr, "sA(%zu,%zu) = %lf, expected %lf\n", i, j, sA(i, j), expected); + } + } + } + + checkedA = true; + }; + + std::vector B(M * N); + auto lB = ctx.logical_data(make_slice(&B[0], std::tuple{M, N}, M)); + + ctx.parallel_for(lB.shape(), lB.write())->*[=] _CCCL_DEVICE(size_t i, size_t j, auto sB) { + sB(i, j) = 42.0; + }; + + // Create a subset of a limited size + box subset_shape_2({2, 5}, {5, 8}); + ctx.parallel_for(subset_shape_2, lB.rw())->*[=] _CCCL_DEVICE(size_t i, size_t j, auto sB) { + sB(i, j) = 13.0; + }; + + bool checkedB = false; + ctx.host_launch(lB.rw())->*[&checkedB](auto sB) { + for (size_t j = 0; j < sB.extent(1); j++) + { + for (size_t i = 0; i < sB.extent(0); i++) + { + double expected = (2 <= i && i < 5 && 5 <= j && j < 8) ? 13.0 : 42.0; + if (sB(i, j) != expected) + { + fprintf(stderr, "sB(%zu,%zu) = %lf, expected %lf\n", i, j, sB(i, j), expected); + } + } + } + + checkedB = true; + }; + + ctx.finalize(); + + assert(checkedA); + assert(checkedB); +} diff --git a/cudax/test/stf/parallel_for/parallel_for_repeat.cu b/cudax/test/stf/parallel_for/parallel_for_repeat.cu new file mode 100644 index 00000000000..a6878caf3f6 --- /dev/null +++ b/cudax/test/stf/parallel_for/parallel_for_repeat.cu @@ -0,0 +1,75 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief This creates a dummy grid of devices (repeating device 0 multiple + * times) to check that parallel_for works on a grid of places. + */ + +#include +#include + +using namespace cuda::experimental::stf; + +double X0(size_t i) +{ + return sin((double) i); +} + +double Y0(size_t i) +{ + return cos((double) i); +} + +int main() +{ + context ctx; + + const int N = 1024 * 1024 * 32; + double *X, *Y; + + X = new double[N]; + Y = new double[N]; + + for (size_t ind = 0; ind < N; ind++) + { + X[ind] = X0(ind); + Y[ind] = Y0(ind); + } + + auto handle_X = ctx.logical_data(X, {N}); + auto handle_Y = ctx.logical_data(Y, {N}); + + auto where = exec_place::repeat(exec_place::current_device(), 8); + + double alpha = 3.14; + size_t NITER = 5; + + /* Compute Y = Y + alpha X */ + for (size_t k = 0; k < NITER; k++) + { + ctx.parallel_for(tiled_partition<1024 * 1024>(), where, handle_X.shape(), handle_X.read(), handle_Y.rw()) + ->*[=] _CCCL_DEVICE(size_t i, auto sX, auto sY) { + sY(i) += alpha * sX(i); + }; + } + + ctx.finalize(); + + for (size_t ind = 0; ind < N; ind++) + { + // Y should be Y0 + NITER alpha X0 + assert(fabs(Y[ind] - (Y0(ind) + NITER * alpha * X0(ind))) < 0.0001); + + // X should be X0 + assert(fabs(X[ind] - X0(ind)) < 0.0001); + } +} diff --git a/cudax/test/stf/parallel_for/test2_parallel_for_context.cu b/cudax/test/stf/parallel_for/test2_parallel_for_context.cu new file mode 100644 index 00000000000..5ff11ffafca --- /dev/null +++ b/cudax/test/stf/parallel_for/test2_parallel_for_context.cu @@ -0,0 +1,73 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Check that parallel_for constructs do update slices with shapes of different dimensions + */ + +#include + +using namespace cuda::experimental::stf; + +__host__ __device__ double x0(size_t i, size_t j) +{ + return sin((double) (i - j)); +} + +__host__ __device__ double y0(size_t i, size_t j) +{ + return cos((double) (i + j)); +} + +int main() +{ + context ctx; + + const size_t N = 16; + double X[2 * N * 2 * N]; + double Y[N * N]; + + auto lx = ctx.logical_data(make_slice(&X[0], std::tuple{2 * N, 2 * N}, 2 * N)); + auto ly = ctx.logical_data(make_slice(&Y[0], std::tuple{N, N}, N)); + + ctx.parallel_for(lx.shape(), lx.write())->*[=] _CCCL_DEVICE(size_t i, size_t j, auto sx) { + sx(i, j) = x0(i, j); + }; + + ctx.parallel_for(ly.shape(), lx.read(), ly.write())->*[=] _CCCL_DEVICE(size_t i, size_t j, auto sx, auto sy) { + sy(i, j) = y0(i, j); + for (size_t ii = 0; ii < 2; ii++) + { + for (size_t jj = 0; jj < 2; jj++) + { + sy(i, j) += sx(2 * i + ii, 2 * j + jj); + } + } + }; + + ctx.parallel_for(exec_place::host, ly.shape(), ly.read())->*[=] __host__(size_t i, size_t j, slice sy) { + double expected = y0(i, j); + for (size_t ii = 0; ii < 2; ii++) + { + for (size_t jj = 0; jj < 2; jj++) + { + expected += x0(2 * i + ii, 2 * j + jj); + } + } + if (fabs(sy(i, j) - expected) > 0.001) + { + printf("sy(%zu, %zu) %f expect %f\n", i, j, sy(i, j), expected); + } + // assert(fabs(sy(i, j) - expected) < 0.001); + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/parallel_for/test_parallel_for.cu b/cudax/test/stf/parallel_for/test_parallel_for.cu new file mode 100644 index 00000000000..1688704ff73 --- /dev/null +++ b/cudax/test/stf/parallel_for/test_parallel_for.cu @@ -0,0 +1,134 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief This test ensures that the parallel_for API works properly with its + * different overloads (with places, partitioners, etc...) 
+ */ + +#include + +using namespace cuda::experimental::stf; + +__host__ __device__ double x0(size_t ind) +{ + return sin((double) ind); +} + +__host__ __device__ double y0(size_t ind) +{ + return cos((double) ind); +} + +int main() +{ + for (int style = 0; style < 7; ++style) + { + stream_ctx ctx; + + const int N = 16; + double X[2 * N], Y[N]; + + for (size_t ind = 0; ind < 2 * N; ind++) + { + X[ind] = x0(ind); + } + + for (size_t ind = 0; ind < N; ind++) + { + Y[ind] = y0(ind); + } + + auto lx = ctx.logical_data(X); + auto ly = ctx.logical_data(Y); + + switch (style) + { + case 0: + // Basic usage + ctx.parallel_for(ly.shape(), lx.read(), ly.rw())->*[=] _CCCL_DEVICE(size_t pos, auto sx, auto sy) { + sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1)); + }; + break; + case 1: + // This throws because you're attempting to run a device lambda on the host + try + { + ctx.parallel_for(exec_place::host, ly.shape(), lx.read(), ly.rw()) + ->*[=] _CCCL_DEVICE(size_t pos, auto sx, auto sy) { + sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1)); + }; + } + catch (std::exception&) + { + // All good, but don't check the result + ctx.finalize(); + continue; + } + assert(0); + break; + case 2: + // This throws because you're attempting to run a host lambda on a device + try + { + ctx.parallel_for(ly.shape(), lx.read(), ly.rw())->*[=] __host__(size_t pos, auto sx, auto sy) { + sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1)); + }; + } + catch (std::exception&) + { + // All good, but don't check the result + ctx.finalize(); + continue; + } + assert(0); + break; + case 3: + // This works because it dynamically selects the dual function to run on the host + ctx.parallel_for(exec_place::host, ly.shape(), lx.read(), ly.rw()) + ->*[=] __host__ __device__(size_t pos, slice sx, slice sy) { + sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1)); + }; + break; + case 4: + // This works because it dynamically selects the dual function to run on the device + ctx.parallel_for(exec_place::current_device(), ly.shape(), lx.read(), ly.rw()) + ->*[=] __host__ __device__(size_t pos, slice sx, slice sy) { + sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1)); + }; + break; + case 5: + // This works because it dynamically selects the dual function to run on the current device + ctx.parallel_for(ly.shape(), lx.read(), ly.rw()) + ->*[=] __host__ __device__(size_t pos, slice sx, slice sy) { + sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1)); + }; + break; + case 6: + // This works because it dispatches on all devices + ctx.parallel_for(blocked_partition(), exec_place::all_devices(), ly.shape(), lx.read(), ly.rw()) + ->*[=] __host__ __device__(size_t pos, slice sx, slice sy) { + sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1)); + }; + break; + default: + assert(0); + } + + /* Check the result on the host */ + ctx.parallel_for(exec_place::host, ly.shape(), lx.read(), ly.read())->*[=](size_t pos, auto sx, auto sy) { + EXPECT(fabs(sx(2 * pos) - x0(2 * pos)) < 0.0001); + EXPECT(fabs(sx(2 * pos + 1) - x0(2 * pos + 1)) < 0.0001); + EXPECT(fabs(sy(pos) - (y0(pos) + 0.5 * (x0(2 * pos) + x0(2 * pos + 1)))) < 0.0001); + }; + ctx.finalize(); + } +} diff --git a/cudax/test/stf/parallel_for/tiled_loops.cu b/cudax/test/stf/parallel_for/tiled_loops.cu new file mode 100644 index 00000000000..bdc2f7fa8b8 --- /dev/null +++ b/cudax/test/stf/parallel_for/tiled_loops.cu @@ -0,0 +1,84 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with 
LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Apply partitioning operations on shapes to manipulate data subsets + */ + +#include +#include + +using namespace cuda::experimental::stf; + +// Compute which part ID has the item at position "index" +__host__ __device__ size_t ref_tiling(size_t index, size_t tile_size, size_t nparts) +{ + // in which tile is this ? + size_t tile_id = index / tile_size; + + // part which owns this tile + return (tile_id % nparts); +} + +int main() +{ + stream_ctx ctx; + + const size_t nparts = 4; + const size_t tile_size = 8; + + // Be sure to pick a number that is not a divisor to stress the tiling operator + const size_t N = nparts * 3 * tile_size + 7; + + double Y[N]; + + auto ly = ctx.logical_data(Y); + + // Init Y + ctx.parallel_for(ly.shape(), ly.write())->*[=] _CCCL_DEVICE(size_t pos, auto sy) { + sy(pos) = -1.0; + }; + + /* + * We apply a tiling operator on the shape of ly, to work on subsets of ly. + * For each subset, we put the id of the subset in the corresponding + * entries of Y + * + * Note that these tasks are serialized as they perform a rw() on the + * logical data as a whole. + */ + for (size_t part_id = 0; part_id < nparts; part_id++) + { + ctx.parallel_for(tiled(ly.shape(), part_id, nparts), ly.rw())->*[=] _CCCL_DEVICE(size_t pos, auto sy) { + sy(pos) = (double) part_id; + }; + } + + bool checked = false; + bool* pchecked = &checked; + + /* Check the result on the host */ + ctx.parallel_for(exec_place::host, ly.shape(), ly.read())->*[=](size_t pos, slice sy) { + int expected = static_cast(ref_tiling(pos, tile_size, nparts)); + int value = (int) sy(pos); + if (expected != value) + { + printf("POS %zu -> %d (expected %d)\n", pos, value, expected); + } + assert(expected == value); + *pchecked = true; + }; + + ctx.finalize(); + + // Ensure verification code did occur + assert(checked); +} diff --git a/cudax/test/stf/places/cuda_stream_place.cu b/cudax/test/stf/places/cuda_stream_place.cu new file mode 100644 index 00000000000..44c0881f75a --- /dev/null +++ b/cudax/test/stf/places/cuda_stream_place.cu @@ -0,0 +1,89 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief An AXPY kernel using an exec place attached to a specific CUDA stream + * + */ + +#include + +#include "nvtx3/nvToolsExtCudaRt.h" + +using namespace cuda::experimental::stf; + +double X0(size_t i) +{ + return sin((double) i); +} + +double Y0(size_t i) +{ + return cos((double) i); +} + +int main() +{ + cudaStream_t stream; + cuda_safe_call(cudaStreamCreate(&stream)); + nvtxNameCudaStreamA(stream, "user stream"); + + // context ctx; + stream_ctx ctx; + const size_t N = 16; + double X[N], Y[N]; + + for (size_t i = 0; i < N; i++) + { + X[i] = X0(i); + Y[i] = Y0(i); + } + + double alpha = 3.14; + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + /* Compute Y = Y + alpha X */ + auto where = exec_place::cuda_stream(stream); + + for (size_t iter = 0; iter < 10; iter++) + { + ctx.parallel_for(where, lX.shape(), lX.read(), lY.rw())->*[alpha] __device__(size_t i, auto x, auto y) { + y(i) += alpha * x(i); + }; + } + + /* Associate the CUDA stream with a unique internal ID to speed up synchronizations */ + auto rstream = register_stream(ctx.async_resources(), stream); + auto where2 = exec_place::cuda_stream(rstream); + + for (size_t iter = 0; iter < 10; iter++) + { + ctx.parallel_for(where2, lX.shape(), lX.read(), lY.rw())->*[alpha] __device__(size_t i, auto x, auto y) { + y(i) += alpha * x(i); + }; + } + + // Remove the association + unregister_stream(ctx.async_resources(), rstream); + + ctx.finalize(); + + for (size_t i = 0; i < N; i++) + { + assert(fabs(Y[i] - (Y0(i) + 2 * 10.0 * alpha * X0(i))) < 0.0001); + assert(fabs(X[i] - X0(i)) < 0.0001); + } + + cuda_safe_call(cudaStreamDestroy(stream)); +} diff --git a/cudax/test/stf/places/managed.cu b/cudax/test/stf/places/managed.cu new file mode 100644 index 00000000000..4c7c3da57fa --- /dev/null +++ b/cudax/test/stf/places/managed.cu @@ -0,0 +1,87 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief An AXPY kernel implemented with a task of the CUDA stream backend + * where the task accesses managed memory from the device + * + */ + +#include + +#include + +using namespace cuda::experimental::stf; + +__global__ void axpy(double a, slice x, slice y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < x.size(); i += nthreads) + { + y(i) += a * x(i); + } +} + +double X0(size_t i) +{ + return sin((double) i); +} + +double Y0(size_t i) +{ + return cos((double) i); +} + +int main() +{ + // Verify whether this device can access memory concurrently from CPU and GPU. 
+ int dev; + cuda_safe_call(cudaGetDevice(&dev)); + assert(dev >= 0); + cudaDeviceProp prop; + cuda_safe_call(cudaGetDeviceProperties(&prop, dev)); + if (!prop.concurrentManagedAccess) + { + fprintf(stderr, "Concurrent CPU/GPU access not supported, skipping test.\n"); + return 0; + } + + stream_ctx ctx; + const size_t N = 16; + double X[N], Y[N]; + + for (size_t i = 0; i < N; i++) + { + X[i] = X0(i); + Y[i] = Y0(i); + } + + double alpha = 3.14; + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + /* Compute Y = Y + alpha X, but leave X on the host and access it with mapped memory */ + ctx.task(lX.read(data_place::managed), lY.rw())->*[&](cudaStream_t s, auto dX, auto dY) { + axpy<<<16, 128, 0, s>>>(alpha, dX, dY); + }; + + ctx.finalize(); + + for (size_t i = 0; i < N; i++) + { + assert(fabs(Y[i] - (Y0(i) + alpha * X0(i))) < 0.0001); + assert(fabs(X[i] - X0(i)) < 0.0001); + } +} diff --git a/cudax/test/stf/places/managed_from_shape.cu b/cudax/test/stf/places/managed_from_shape.cu new file mode 100644 index 00000000000..fbda56a9b3d --- /dev/null +++ b/cudax/test/stf/places/managed_from_shape.cu @@ -0,0 +1,92 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Make sure we can automatically allocate and use data in managed memory based on their shape + * + */ + +#include + +#include + +using namespace cuda::experimental::stf; + +__global__ void axpy(double a, slice x, slice y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < x.size(); i += nthreads) + { + y(i) += a * x(i); + } +} + +__host__ __device__ double X0(size_t i) +{ + return sin((double) i); +} + +double Y0(size_t i) +{ + return cos((double) i); +} + +int main() +{ + // Verify whether this device can access memory concurrently from CPU and GPU. 
+ int dev; + cuda_safe_call(cudaGetDevice(&dev)); + assert(dev >= 0); + cudaDeviceProp prop; + cuda_safe_call(cudaGetDeviceProperties(&prop, dev)); + if (!prop.concurrentManagedAccess) + { + fprintf(stderr, "Concurrent CPU/GPU access not supported, skipping test.\n"); + return 0; + } + + stream_ctx ctx; + const size_t N = 16; + double Y[N]; + + for (size_t i = 0; i < N; i++) + { + Y[i] = Y0(i); + } + + double alpha = 3.14; + + auto lX = ctx.logical_data(shape_of>(N)); + auto lY = ctx.logical_data(Y); + + // Make sure X is created automatically in managed memory + ctx.parallel_for(lX.shape(), lX.write(data_place::managed))->*[] _CCCL_DEVICE(size_t i, auto X) { + X(i) = X0(i); + }; + + /* Compute Y = Y + alpha X, but leave X in managed memory */ + ctx.task(lX.read(data_place::managed), lY.rw())->*[&](cudaStream_t s, auto dX, auto dY) { + axpy<<<16, 128, 0, s>>>(alpha, dX, dY); + }; + + ctx.host_launch(lX.read(data_place::managed), lY.read())->*[=](auto X, auto Y) { + for (size_t i = 0; i < N; i++) + { + EXPECT(fabs(Y(i) - (Y0(i) + alpha * X0(i))) < 0.0001); + EXPECT(fabs(X(i) - X0(i)) < 0.0001); + } + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/places/managed_from_user.cu b/cudax/test/stf/places/managed_from_user.cu new file mode 100644 index 00000000000..8f79708c28c --- /dev/null +++ b/cudax/test/stf/places/managed_from_user.cu @@ -0,0 +1,90 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief An AXPY kernel implemented with a task of the CUDA stream backend + * where the task accesses managed memory from the device. This tests + * explicitely created managed memory, and passes it to a logical data. + */ + +#include + +#include + +using namespace cuda::experimental::stf; + +__global__ void axpy(double a, slice x, slice y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < x.size(); i += nthreads) + { + y(i) += a * x(i); + } +} + +double X0(size_t i) +{ + return sin((double) i); +} + +double Y0(size_t i) +{ + return cos((double) i); +} + +int main() +{ + // Verify whether this device can access memory concurrently from CPU and GPU. 
+ int dev; + cuda_safe_call(cudaGetDevice(&dev)); + assert(dev >= 0); + cudaDeviceProp prop; + cuda_safe_call(cudaGetDeviceProperties(&prop, dev)); + if (!prop.concurrentManagedAccess) + { + fprintf(stderr, "Concurrent CPU/GPU access not supported, skipping test.\n"); + return 0; + } + + stream_ctx ctx; + const size_t N = 16; + double Y[N]; + + double* X; + cuda_safe_call(cudaMallocManaged(&X, N * sizeof(double))); + + for (size_t i = 0; i < N; i++) + { + X[i] = X0(i); + Y[i] = Y0(i); + } + + double alpha = 3.14; + + auto lX = ctx.logical_data(make_slice(X, N), data_place::managed); + auto lY = ctx.logical_data(Y); + + /* Compute Y = Y + alpha X, but leave X in managed memory */ + ctx.task(lX.read(data_place::managed), lY.rw())->*[&](cudaStream_t s, auto dX, auto dY) { + axpy<<<16, 128, 0, s>>>(alpha, dX, dY); + }; + + ctx.finalize(); + + for (size_t i = 0; i < N; i++) + { + assert(fabs(Y[i] - (Y0(i) + alpha * X0(i))) < 0.0001); + assert(fabs(X[i] - X0(i)) < 0.0001); + } +} diff --git a/cudax/test/stf/places/non_current_device.cu b/cudax/test/stf/places/non_current_device.cu new file mode 100644 index 00000000000..4f682773a65 --- /dev/null +++ b/cudax/test/stf/places/non_current_device.cu @@ -0,0 +1,77 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +template <typename T> +__global__ void axpy(int n, T a, const T* x, T* y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int ind = tid; ind < n; ind += nthreads) + { + y[ind] += a * x[ind]; + } +} + +int main() +{ + int ndevs; + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + + cuda_safe_call(cudaSetDevice(0)); + + if (ndevs < 2) + { + fprintf(stderr, "Skipping test that needs at least 2 devices.\n"); + return 0; + } + + stream_ctx ctx; + + const double alpha = 2.0; + const int n = 12; + + double X[n], Y[n]; + + for (int ind = 0; ind < n; ind++) + { + X[ind] = 1.0 * ind; + Y[ind] = 2.0 * ind - 3.0; + } + + auto handle_X = ctx.logical_data(X); + auto handle_Y = ctx.logical_data(Y); + + /* Compute Y = Y + alpha X on device 1, which is not the current device */ + ctx.task(exec_place::device(1), handle_X.read(), handle_Y.rw())->*[&](cudaStream_t stream, auto X, auto Y) { + axpy<<<16, 128, 0, stream>>>(n, alpha, X.data_handle(), Y.data_handle()); + }; + + // Ask to use X and Y on the host + ctx.task(exec_place::host, handle_X.read(), handle_Y.read())->*[&](cudaStream_t stream, auto X, auto Y) { + cuda_safe_call(cudaStreamSynchronize(stream)); + + for (int ind = 0; ind < n; ind++) + { + // X unchanged + EXPECT(fabs(X(ind) - 1.0 * ind) < 0.00001); + // Y = Y + alpha X + EXPECT(fabs(Y(ind) - (-3.0 + ind * (2.0 + alpha))) < 0.00001); + } + }; + + ctx.finalize(); + + return 0; +} diff --git a/cudax/test/stf/places/place_partition.cu b/cudax/test/stf/places/place_partition.cu new file mode 100644 index 00000000000..bf3a918efe9 --- /dev/null +++ b/cudax/test/stf/places/place_partition.cu @@ -0,0 +1,52 @@ +//===----------------------------------------------------------------------===// +// +// Part of
CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +void print_partition(async_resources_handle& handle, exec_place place, place_partition_scope scope) +{ + fprintf(stderr, "-----------\n"); + fprintf( + stderr, "PARTITION %s (scope: %s):\n", place.to_string().c_str(), place_partition_scope_to_string(scope).c_str()); + for (auto sub_place : place_partition(handle, place, scope)) + { + fprintf(stderr, "[%s] subplace: %s\n", place.to_string().c_str(), sub_place.to_string().c_str()); + } + + fprintf(stderr, "-----------\n"); +} + +int main() +{ +#if CUDA_VERSION < 12040 + fprintf(stderr, "Green contexts are not supported by this version of CUDA: skipping test.\n"); + return 0; +#else + async_resources_handle handle; + + print_partition(handle, exec_place::all_devices(), place_partition_scope::cuda_device); + + print_partition(handle, exec_place::all_devices(), place_partition_scope::cuda_stream); + + print_partition(handle, exec_place::current_device(), place_partition_scope::cuda_stream); + + print_partition(handle, exec_place::current_device(), place_partition_scope::green_context); + print_partition(handle, exec_place::current_device(), place_partition_scope::green_context); + + print_partition(handle, exec_place::repeat(exec_place::current_device(), 4), place_partition_scope::green_context); + + print_partition(handle, exec_place::current_device(), place_partition_scope::cuda_device); + + print_partition(handle, exec_place::repeat(exec_place::current_device(), 4), place_partition_scope::cuda_stream); +#endif +} diff --git a/cudax/test/stf/places/recursion.cu b/cudax/test/stf/places/recursion.cu new file mode 100644 index 00000000000..3af51e4dc98 --- /dev/null +++ b/cudax/test/stf/places/recursion.cu @@ -0,0 +1,41 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +void rec_func(exec_place_grid places) +{ + if (places.size() == 1) + { + // places->print("SINGLE"); + } + else + { + // places->print("REC"); + for (int i = 0; i < 2; i++) + { + // Take every other places from the grid + auto half_places = partition_cyclic(places, dim4(2), pos4(i)); + rec_func(half_places); + } + } +} + +int main() +{ + auto places = exec_place::all_devices(); + // places->print("ALL"); + + rec_func(places); + + return 0; +} diff --git a/cudax/test/stf/reclaiming/graph.cu b/cudax/test/stf/reclaiming/graph.cu new file mode 100644 index 00000000000..3c77cb15980 --- /dev/null +++ b/cudax/test/stf/reclaiming/graph.cu @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#if !defined(_CCCL_COMPILER_MSVC) +using namespace cuda::experimental::stf; + +__global__ void kernel() +{ + // No-op +} +#endif // !defined(_CCCL_COMPILER_MSVC) + +int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) +{ +// TODO fix setenv +#if !defined(_CCCL_COMPILER_MSVC) + int nblocks = 4; + size_t block_size = 1024 * 1024; + + if (argc > 1) + { + nblocks = atoi(argv[1]); + } + + if (argc > 2) + { + block_size = atoi(argv[2]); + } + + // At most 1 buffer is allocated at the same time + setenv("MAX_ALLOC_CNT", "1", 0); + + graph_ctx ctx; + + ::std::vector<logical_data<slice<char>>> handles(nblocks); + + char* h_buffer = new char[nblocks * block_size]; + + for (int i = 0; i < nblocks; i++) + { + handles[i] = ctx.logical_data(make_slice(&h_buffer[i * block_size], block_size)); + handles[i].set_symbol("D_" + std::to_string(i)); + } + + // We only have 2 buffers, so we are forced to reuse the buffer from D0 for D2 + for (int i = 0; i < 3; i++) + { + ctx.task(handles[i % nblocks].rw())->*[&](cudaStream_t s, auto /*unused*/) { + kernel<<<1, 1, 0, s>>>(); + }; + } + + ctx.submit(); + + if (argc > 3) + { + std::cout << "Generating DOT output in " << argv[3] << std::endl; + ctx.print_to_dot(argv[3]); + } + + ctx.finalize(); +#endif // !defined(_CCCL_COMPILER_MSVC) +} diff --git a/cudax/test/stf/reclaiming/graph_2.cu b/cudax/test/stf/reclaiming/graph_2.cu new file mode 100644 index 00000000000..9099a7be571 --- /dev/null +++ b/cudax/test/stf/reclaiming/graph_2.cu @@ -0,0 +1,66 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
+// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +__global__ void kernel() +{ + // No-op +} + +int main(int argc, char** argv) +{ + int nblocks = 4; + size_t block_size = 1024 * 1024; + + if (argc > 1) + { + nblocks = atoi(argv[1]); + } + + if (argc > 2) + { + block_size = atoi(argv[2]); + } + + // At most 2 buffers are allocated at the same time + setenv("MAX_ALLOC_CNT", "2", 1); + + graph_ctx ctx; + ::std::vector<logical_data<slice<char>>> handles(nblocks); + std::vector<char> h_buffer(nblocks * block_size); + + for (int i = 0; i < nblocks; i++) + { + handles[i] = ctx.logical_data(make_slice(&h_buffer[i * block_size], block_size)); + handles[i].set_symbol("D_" + std::to_string(i)); + } + + // We only have 2 buffers, so we are forced to reuse the buffer from D0 for D2 + for (int i = 0; i < 3; i++) + { + ctx.task(handles[i % nblocks].read(), handles[(i + 1) % nblocks].rw()) + ->*[&](cudaStream_t s, auto /*unused*/, auto /*unused*/) { + kernel<<<1, 1, 0, s>>>(); + }; + } + + ctx.submit(); + + if (argc > 3) + { + std::cout << "Generating DOT output in " << argv[3] << std::endl; + ctx.print_to_dot(argv[3]); + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/reclaiming/graph_real_oom.cu b/cudax/test/stf/reclaiming/graph_real_oom.cu new file mode 100644 index 00000000000..e71fe028f4c --- /dev/null +++ b/cudax/test/stf/reclaiming/graph_real_oom.cu @@ -0,0 +1,78 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +__global__ void dummy() {} + +int main(int argc, char** argv) +{ + const int dev_id = 0; + cuda_safe_call(cudaSetDevice(dev_id)); + + cudaDeviceProp prop; + cuda_safe_call(cudaGetDeviceProperties(&prop, dev_id)); + + const size_t total_mem_ref = prop.totalGlobalMem; + + size_t free_mem, total_mem; + cuda_safe_call(cudaMemGetInfo(&free_mem, &total_mem)); + + std::cout << "Device memory: " << total_mem_ref / (1024 * 1024.) << " MB" + << " FREE/TOTAL=" << free_mem / (1024 * 1024.) << "/" << total_mem / (1024 * 1024.) << std::endl; + + // Warning: this should represent 6% of the device's available memory + size_t block_size = free_mem / 32; + int nblocks = 2; + + // We preallocate most of the available device memory to avoid being limited by host memory. + void* wasted_mem; + size_t wasted_size = block_size * 29; + cuda_safe_call(cudaMalloc(&wasted_mem, wasted_size)); + + std::cout << "Wasted: " << wasted_size / (1024 * 1024.)
<< " MB" << std::endl; + + graph_ctx ctx; + + std::vector>> handles(nblocks); + + char* h_buffer = new char[nblocks * block_size]; + + for (int i = 0; i < 2; i++) + { + handles[i] = ctx.logical_data(make_slice(&h_buffer[i * block_size], block_size)); + handles[i].set_symbol("D_" + std::to_string(i)); + } + + // There can only be a single handle on the device at the same time + for (int i = 0; i < nblocks; i++) + { + ctx.task(handles[i % nblocks].rw())->*[](cudaStream_t stream, auto /*unused*/) { + dummy<<<1, 1, 0, stream>>>(); + }; + } + + // Checking the amount of memory actually available now + cuda_safe_call(cudaMemGetInfo(&free_mem, &total_mem)); + std::cout + << "Device memory: FREE/TOTAL=" << free_mem / (1024 * 1024.) << "/" << total_mem / (1024 * 1024.) << std::endl; + + ctx.submit(); + + if (argc > 1) + { + std::cout << "Generating DOT output in " << argv[1] << std::endl; + ctx.print_to_dot(argv[1]); + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/reclaiming/stream.cu b/cudax/test/stf/reclaiming/stream.cu new file mode 100644 index 00000000000..7af2e4373ed --- /dev/null +++ b/cudax/test/stf/reclaiming/stream.cu @@ -0,0 +1,142 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +__global__ void kernel(int i, slice buf) +{ + buf[0] = (char) i; +} + +static __global__ void cuda_sleep_kernel(long long int clock_cnt) +{ + long long int start_clock = clock64(); + long long int clock_offset = 0; + while (clock_offset < clock_cnt) + { + clock_offset = clock64() - start_clock; + } +} + +void cuda_sleep(double ms, cudaStream_t stream) +{ + int device; + cudaGetDevice(&device); + + // cudaDevAttrClockRate: Peak clock frequency in kilohertz; + int clock_rate; + cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, device); + + long long int clock_cnt = (long long int) (ms * clock_rate); + cuda_sleep_kernel<<<1, 1, 0, stream>>>(clock_cnt); +} + +// Dummy allocator which only allocates a single block on a single device +// +// The first allocation succeeds, next attempt will fail until buffer is +// deallocated. 
+class one_block_allocator : public block_allocator_interface +{ +public: + one_block_allocator() = default; + +public: + // Note that this allocates memory immediately, so we simply ignore the event list and leave it unmodified + void* allocate(backend_ctx_untyped&, const data_place& memory_node, ::std::ptrdiff_t& s, event_list&) override + { + if (busy) + { + s = -s; + return nullptr; + } + + EXPECT(memory_node.is_device()); + if (!base) + { + cuda_safe_call(cudaMalloc(&base, s)); + } + + busy = true; + + return base; + } + + void deallocate(backend_ctx_untyped&, const data_place&, event_list&, void*, size_t) override + { + EXPECT(busy); + busy = false; + } + + event_list deinit(backend_ctx_untyped&) override + { + return event_list(); + } + + std::string to_string() const override + { + return "dummy"; + } + +private: + // We have a single block, so we keep its address, and a flag to indicate + // if it's busy + void* base = nullptr; + bool busy = false; +}; + +int main(int argc, char** argv) +{ + int nblocks = 4; + size_t block_size = 1024 * 1024; + + if (argc > 1) + { + nblocks = atoi(argv[1]); + } + + if (argc > 2) + { + block_size = atoi(argv[2]); + } + + stream_ctx ctx; + + auto dummy_alloc = block_allocator<one_block_allocator>(ctx); + ctx.set_allocator(dummy_alloc); + + ::std::vector<logical_data<slice<char>>> handles(nblocks); + + char* h_buffer = new char[nblocks * block_size]; + + for (int i = 0; i < nblocks; i++) + { + handles[i] = ctx.logical_data(make_slice(&h_buffer[i * block_size], block_size)); + handles[i].set_symbol("D_" + std::to_string(i)); + } + + // Only a single buffer is available, so it must be reclaimed and reused across tasks + for (int i = 0; i < nblocks; i++) + { + ctx.task(handles[i % nblocks].rw())->*[&](cudaStream_t s, auto buf) { + // Wait 100 ms to make the asynchronous execution more stressful + cuda_sleep(100, s); + kernel<<<1, 1, 0, s>>>(i, buf); + }; + } + + ctx.finalize(); + + for (int i = 0; i < nblocks; i++) + { + EXPECT(h_buffer[block_size * i] == i); + } +} diff --git a/cudax/test/stf/reductions/many_inc.cu b/cudax/test/stf/reductions/many_inc.cu new file mode 100644 index 00000000000..af5b96d0571 --- /dev/null +++ b/cudax/test/stf/reductions/many_inc.cu @@ -0,0 +1,51 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
+// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +__global__ void add(int* ptr) +{ + *ptr = *ptr + 1; +} + +int main() +{ + stream_ctx ctx; + + auto redux_op = std::make_shared>(); + + int a = 17; + + auto handle = ctx.logical_data(make_slice(&a, 1)); + + int ndevs; + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + + int K = 1024; + for (int i = 0; i < K; i++) + { + // Increment the variable by 1 + ctx.task(exec_place::device(i % ndevs), handle.redux(redux_op))->*[](auto stream, auto s) { + add<<<1, 1, 0, stream>>>(s.data_handle()); + }; + } + + // Total value should be initial value + K + ctx.task(exec_place::host, handle.read())->*[&](auto stream, auto s) { + cuda_safe_call(cudaStreamSynchronize(stream)); + EXPECT(s(0) == 17 + K); + // printf("VALUE %d expected %d\n", s(0), 17 + K); + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/reductions/redux_test.cu b/cudax/test/stf/reductions/redux_test.cu new file mode 100644 index 00000000000..93200835bc5 --- /dev/null +++ b/cudax/test/stf/reductions/redux_test.cu @@ -0,0 +1,42 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +__global__ void add(int* ptr, int value) +{ + *ptr = *ptr + value; +} + +int main() +{ + stream_ctx ctx; + + int a = 17; + auto handle = ctx.logical_data(make_slice(&a, 1)); + auto redux_op = std::make_shared>(); + ctx.task(handle.redux(redux_op))->*[](auto stream, auto s) { + add<<<1, 1, 0, stream>>>(s.data_handle(), 42); + }; + + ctx.task(handle.rw())->*[](auto stream, auto s) { + add<<<1, 1, 0, stream>>>(s.data_handle(), 1); + }; + + ctx.task(exec_place::host, handle.read())->*[](auto stream, auto s) { + cuda_safe_call(cudaStreamSynchronize(stream)); + EXPECT(s(0) == 17 + 42 + 1); + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/reductions/redux_test2.cu b/cudax/test/stf/reductions/redux_test2.cu new file mode 100644 index 00000000000..02ca26ea993 --- /dev/null +++ b/cudax/test/stf/reductions/redux_test2.cu @@ -0,0 +1,78 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +__global__ void add(int* ptr, int value) +{ + *ptr = *ptr + value; +} + +__global__ void check_val(const int* ptr, int expected) +{ + assert(*ptr == expected); +} + +/* + * This test ensures that we can reconstruct a piece of data where there are + * multiple "shared" instances, and "redux" instances + */ +int main() +{ + stream_ctx ctx; + + int ndevs; + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + + if (ndevs < 2) + { + fprintf(stderr, "Skipping test: need at least 2 devices.\n"); + return 0; + } + + auto redux_op = std::make_shared>(); + + int a = 17; + + // init op (17) + auto handle = ctx.logical_data(make_slice(&a, 1)); + + // RW dev0 (18) + ctx.task(exec_place::device(0), handle.rw())->*[](auto stream, auto s) { + add<<<1, 1, 0, stream>>>(s.data_handle(), 1); + }; + + // READ dev1 (18) + ctx.task(exec_place::device(1), handle.read())->*[](auto stream, auto s) { + check_val<<<1, 1, 0, stream>>>(s.data_handle(), 18); + }; + + // REDUX dev1 (18 + 42) + ctx.task(exec_place::device(1), handle.redux(redux_op))->*[](auto stream, auto s) { + add<<<1, 1, 0, stream>>>(s.data_handle(), 42); + }; + + // READ dev0 (18 + 42) + ctx.task(exec_place::device(0), handle.read())->*[](auto stream, auto s) { + check_val<<<1, 1, 0, stream>>>(s.data_handle(), 18 + 42); + }; + + // READ + ctx.task(exec_place::host, handle.read())->*[](auto stream, auto s) { + cuda_safe_call(cudaStreamSynchronize(stream)); + EXPECT(s(0) == 18 + 42); + // printf("VALUE %d expected %d\n", s(0), 18 + 42); + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/reductions/slice2d_reduction.cu b/cudax/test/stf/reductions/slice2d_reduction.cu new file mode 100644 index 00000000000..d60005f65c4 --- /dev/null +++ b/cudax/test/stf/reductions/slice2d_reduction.cu @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +__global__ void add(slice s, int val) +{ + size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + size_t nthreads = blockDim.x * gridDim.x; + + for (size_t j = 0; j < s.extent(1); j++) + { + for (size_t i = tid; i < s.extent(0); i += nthreads) + { + s(i, j) += val; + } + } +} + +int main() +{ + stream_ctx ctx; + + int array[6] = {0, 1, 2, 3, 4, 5}; + + // auto handle = ctx.logical_data(slice(&array[0], std::tuple{ 2, 3 }, 2)); + auto handle = ctx.logical_data(make_slice(&array[0], std::tuple{2, 3}, 2)); + + auto redux_op = std::make_shared>(); + + ctx.task(handle.redux(redux_op))->*[](auto stream, auto s) { + add<<<32, 32, 0, stream>>>(s, 42); + }; + + ctx.task(exec_place::host, handle.read())->*[](auto stream, auto s) { + cuda_safe_call(cudaStreamSynchronize(stream)); + + for (size_t j = 0; j < s.extent(1); j++) + { + for (size_t i = 0; i < s.extent(0); i++) + { + // fprintf(stderr, "%d\t", s(i, j)); + } + // fprintf(stderr, "\n"); + } + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/reductions/slice_custom_op.cu b/cudax/test/stf/reductions/slice_custom_op.cu new file mode 100644 index 00000000000..05d25965b97 --- /dev/null +++ b/cudax/test/stf/reductions/slice_custom_op.cu @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +struct OR_op +{ + static void init_host(bool& out) + { + out = false; + }; + static __device__ void init_gpu(bool& out) + { + out = false; + }; + static void op_host(const bool& in, bool& inout) + { + inout |= in; + }; + static __device__ void op_gpu(const bool& in, bool& inout) + { + inout |= in; + }; +}; + +int main() +{ + stream_ctx ctx; + + int ndevs; + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + + bool A[4] = {false, false, true, false}; + bool B[4] = {false, false, false, false}; + bool C[4] = {true, false, false, true}; + + auto lA = ctx.logical_data(make_slice(&A[0], 4)); + auto lB = ctx.logical_data(make_slice(&B[0], 4)); + auto lC = ctx.logical_data(make_slice(&C[0], 4)); + + auto op = std::make_shared>(); + + // C |= A + ctx.task(lC.redux(op), lA.read())->*[](auto stream, auto sC, auto sA) { + cudaMemcpyAsync(sC.data_handle(), sA.data_handle(), sA.extent(0) * sizeof(bool), cudaMemcpyDeviceToDevice, stream); + }; + + // C |= B + ctx.task(lC.redux(op), lB.read())->*[](auto stream, auto sC, auto sB) { + cudaMemcpyAsync(sC.data_handle(), sB.data_handle(), sB.extent(0) * sizeof(bool), cudaMemcpyDeviceToDevice, stream); + }; + + ctx.task(exec_place::host, lC.read())->*[](auto stream, auto sC) { + cuda_safe_call(cudaStreamSynchronize(stream)); + for (size_t i = 0; i < sC.extent(0); i++) + { + // fprintf(stderr, "RESULT C[i] = %d\n", sC(i)); + } + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/reductions/successive_reductions.cu b/cudax/test/stf/reductions/successive_reductions.cu new file mode 100644 index 00000000000..01cdef27401 --- /dev/null +++ 
b/cudax/test/stf/reductions/successive_reductions.cu @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +using scalar_t = slice_stream_interface; + +template +__global__ void set_value(T* addr, T val) +{ + *addr = val; +} + +template +__global__ void check_value_and_reset(T* addr, T val, T reset_val) +{ + assert(*addr == val); + *addr = reset_val; +} + +template +__global__ void add_val(T* inout_addr, T val) +{ + *inout_addr += val; +} + +int main() +{ + stream_ctx ctx; + const int N = 4; + + auto var_handle = ctx.logical_data(shape_of>(1)); + var_handle.set_symbol("var"); + + auto redux_op = std::make_shared>(); + + const int niters = 4; + for (int iter = 0; iter < niters; iter++) + { + // We add i (total = N(N-1)/2 + initial_value) + for (int i = 0; i < N; i++) + { + ctx.task(var_handle.redux(redux_op))->*[=](cudaStream_t stream, auto d_var) { + add_val<<<1, 1, 0, stream>>>(d_var.data_handle(), i); + cuda_safe_call(cudaGetLastError()); + }; + } + + // Check that we have the expected value, and reset it so that we can perform another reduction + ctx.task(var_handle.rw())->*[=](cudaStream_t stream, auto d_var) { + int expected = (N * (N - 1)) / 2; + check_value_and_reset<<<1, 1, 0, stream>>>(d_var.data_handle(), expected, 0); + }; + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/reductions/successive_reductions_pfor.cu b/cudax/test/stf/reductions/successive_reductions_pfor.cu new file mode 100644 index 00000000000..af1fb868ca9 --- /dev/null +++ b/cudax/test/stf/reductions/successive_reductions_pfor.cu @@ -0,0 +1,45 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +int main() +{ + stream_ctx ctx; + const int N = 4; + + auto var_handle = ctx.logical_data(shape_of>(1)); + var_handle.set_symbol("var"); + + auto op = std::make_shared>(); + + const int niters = 4; + for (int iter = 0; iter < niters; iter++) + { + // We add i (total = N(N-1)/2 + initial_value) + for (int i = 0; i < N; i++) + { + ctx.parallel_for(var_handle.shape(), var_handle.redux(op))->*[=] _CCCL_DEVICE(size_t ind, auto d_var) { + atomicAdd(d_var.data_handle(), i); + }; + } + + // Check that we have the expected value, and reset it so that we can perform another reduction + ctx.parallel_for(var_handle.shape(), var_handle.rw())->*[=] _CCCL_DEVICE(size_t ind, auto d_var) { + assert(d_var(0) == (N * (N - 1)) / 2); + d_var(0) = 0; + }; + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/reductions/sum.cu b/cudax/test/stf/reductions/sum.cu new file mode 100644 index 00000000000..21c8f9c004b --- /dev/null +++ b/cudax/test/stf/reductions/sum.cu @@ -0,0 +1,108 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief Use the reduction access mode to add variables concurrently on different places + */ + +#include +#include + +using namespace cuda::experimental::stf; + +using scalar_t = slice_stream_interface; + +template +__global__ void set_value(T* addr, T val) +{ + *addr = val; +} + +template +__global__ void add(const T* in_addr, T* inout_addr) +{ + // printf("ADD: %d += %d : RES %d\n", *inout_addr, *in_addr, *inout_addr + *in_addr); + + *inout_addr += *in_addr; +} + +template +__global__ void add_val(T* inout_addr, T val) +{ + *inout_addr += val; +} + +/* + * Define a SUM reduction operator over a scalar + */ +class scalar_sum_t : public stream_reduction_operator_untyped +{ +public: + scalar_sum_t() + : stream_reduction_operator_untyped() {}; + + void stream_redux_op( + logical_data_untyped& d, + const data_place& /*unused*/, + instance_id_t inout_instance_id, + const data_place& /*unused*/, + instance_id_t in_instance_id, + const exec_place& /*unused*/, + cudaStream_t s) override + { + auto& in_instance = d.instance(in_instance_id); + auto& inout_instance = d.instance(inout_instance_id); + // fprintf(stderr, "REDUX OP d %p inout (node %d id %d addr %p) in (node %d id %d addr %p)\n", d, + // inout_memory_node, inout_instance_id, *inout_instance, in_memory_node, in_instance_id, *in_instance); + add<<<1, 1, 0, s>>>(in_instance.data_handle(), inout_instance.data_handle()); + } + + void stream_init_op(logical_data_untyped& d, + const data_place& /*unused*/, + instance_id_t out_instance_id, + const exec_place& /*unused*/, + cudaStream_t s) override + { + auto& out_instance = d.instance(out_instance_id); + // fprintf(stderr, "REDUX INIT d %p memory node %d instance id %d => addr %p\n", d, out_memory_node, + // out_instance_id, *out_instance); + set_value<<<1, 1, 0, s>>>(out_instance.data_handle(), 0); + } +}; + +int main() +{ + const int N = 4; + + stream_ctx ctx; + + auto var_handle = 
ctx.logical_data(shape_of>(1)); + var_handle.set_symbol("var"); + + auto redux_op = std::make_shared(); + + // We add i (total = N(N-1)/2 + initial_value) + for (int i = 0; i < N; i++) + { + ctx.task(var_handle.redux(redux_op))->*[&](cudaStream_t stream, auto d_var) { + add_val<<<1, 1, 0, stream>>>(d_var.data_handle(), i); + }; + } + + // Check result + ctx.task(exec_place::host, var_handle.read())->*[&](cudaStream_t stream, auto h_var) { + cuda_safe_call(cudaStreamSynchronize(stream)); + int expected = (N * (N - 1)) / 2; + EXPECT(h_var(0) == expected); + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/reductions/sum_array.cu b/cudax/test/stf/reductions/sum_array.cu new file mode 100644 index 00000000000..9ffda9ff180 --- /dev/null +++ b/cudax/test/stf/reductions/sum_array.cu @@ -0,0 +1,119 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include + +using namespace cuda::experimental::stf; + +using scalar_t = slice_stream_interface; + +template +__global__ void set_value(T* addr, T val) +{ + *addr = val; +} + +template +__global__ void add(const T* in_addr, T* inout_addr) +{ + *inout_addr += *in_addr; +} + +/* + * Define a SUM reduction operator over a scalar + */ +class scalar_sum_t : public stream_reduction_operator_untyped +{ +public: + scalar_sum_t() + : stream_reduction_operator_untyped() {}; + + void stream_redux_op( + logical_data_untyped& d, + const data_place& /*unused*/, + instance_id_t inout_instance_id, + const data_place& /*unused*/, + instance_id_t in_instance_id, + const exec_place& /*unused*/, + cudaStream_t s) override + { + auto& in_instance = d.instance(in_instance_id); + auto& inout_instance = d.instance(inout_instance_id); + add<<<1, 1, 0, s>>>(in_instance.data_handle(), inout_instance.data_handle()); + } + + void stream_init_op(logical_data_untyped& d, + const data_place& /*unused*/, + instance_id_t out_instance_id, + const exec_place& /*unused*/, + cudaStream_t s) override + { + auto& out_instance = d.instance(out_instance_id); + // fprintf(stderr, "REDUX INIT d %p memory node %d instance id %d => addr %p\n", d, out_memory_node, + // out_instance_id, *out_instance); + set_value<<<1, 1, 0, s>>>(out_instance.data_handle(), 0); + } +}; + +int main() +{ + stream_ctx ctx; + + const int N = 128; + + // We have an array, and a handle for each entry of the array + int array[N]; + logical_data> array_handles[N]; + + /* + * We are going to compute the sum of this array + */ + for (int i = 0; i < N; i++) + { + array[i] = i; + array_handles[i] = ctx.logical_data(&array[i], {1}); + array_handles[i].set_symbol(std::string("array[") + std::to_string(i) + std::string("]")); + } + + logical_data> var_handle = ctx.logical_data(shape_of>(1)); + var_handle.set_symbol("var"); + + int check_sum = 0; + for (int i = 0; i < N; i++) + { + check_sum += array[i]; + } + + auto redux_op = std::make_shared(); + + for (int i = 0; i < N; i++) + { + ctx.task(var_handle.redux(redux_op), array_handles[i].read())->*[](cudaStream_t stream, auto d_var, auto d_array_i) { + add<<<1, 1, 0, stream>>>(d_array_i.data_handle(), d_var.data_handle()); + 
}; + } + + // Force the reconstruction of data on the device, so that no transfers are + // necessary while reconstructing the result. + // This will of course not be necessary in the future ... + ctx.task(var_handle.read())->*[](cudaStream_t /*unused*/, auto /*unused*/) {}; + + // Check result + ctx.task(exec_place::host, var_handle.read())->*[=](cudaStream_t stream, auto h_var) { + cuda_safe_call(cudaStreamSynchronize(stream)); + int value = h_var(0); + EXPECT(value == check_sum); + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/reductions/sum_multiple_places.cu b/cudax/test/stf/reductions/sum_multiple_places.cu new file mode 100644 index 00000000000..ebbf87a0874 --- /dev/null +++ b/cudax/test/stf/reductions/sum_multiple_places.cu @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +template +__global__ void add_val(slice inout, T val) +{ + inout(0) += val; +} + +int main() +{ + stream_ctx ctx; + + int ndevs; + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + + const int N = 4; + + int var = 42; + + auto var_handle = ctx.logical_data(make_slice(&var, 1)); + var_handle.set_symbol("var"); + + auto redux_op = std::make_shared>(); + + // We add i twice (total = N(N-1) + initial_value) + for (int i = 0; i < N; i++) + { + // device + for (int d = 0; d < ndevs; d++) + { + ctx.task(exec_place::device(d), var_handle.redux(redux_op))->*[=](cudaStream_t s, auto var) { + add_val<<<1, 1, 0, s>>>(var, i); + }; + } + + // host + ctx.host_launch(var_handle.redux(redux_op))->*[=](auto var) { + var(0) += i; + }; + } + + // Check result + ctx.host_launch(var_handle.read())->*[&](auto var) { + int expected = 42 + (N * (N - 1)) / 2 * (ndevs + 1); + EXPECT(var(0) == expected); + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/reductions/sum_multiple_places_no_refvalue.cu b/cudax/test/stf/reductions/sum_multiple_places_no_refvalue.cu new file mode 100644 index 00000000000..89cc446839e --- /dev/null +++ b/cudax/test/stf/reductions/sum_multiple_places_no_refvalue.cu @@ -0,0 +1,118 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief This test ensures that we can apply a reduction on a logical data + * described using a shape + */ + +#include +#include + +using namespace cuda::experimental::stf; + +using scalar_t = slice; + +template +__global__ void set_value(T* addr, T val) +{ + *addr = val; +} + +// instance id are put for debugging purpose ... 
+template +__global__ void add(const T* in_addr, T* inout_addr) +{ + *inout_addr += *in_addr; +} + +template +__global__ void add_val(T* inout_addr, T val) +{ + *inout_addr += val; +} + +class scalar_sum_t : public stream_reduction_operator +{ +public: + void op(const scalar_t& in, scalar_t& inout, const exec_place& e, cudaStream_t s) override + { + if (e.affine_data_place() == data_place::host) + { + // TODO make a callback when the situation gets better + cuda_safe_call(cudaStreamSynchronize(s)); + *inout.data_handle() += *in.data_handle(); + } + else + { + // this is not the host, so this has to be a device ... (XXX) + add<<<1, 1, 0, s>>>(in.data_handle(), inout.data_handle()); + } + } + + void init_op(scalar_t& out, const exec_place& e, cudaStream_t s) override + { + if (e.affine_data_place() == data_place::host) + { + // TODO make a callback when the situation gets better + cuda_safe_call(cudaStreamSynchronize(s)); + *out.data_handle() = 0; + } + else + { + // this is not the host, so this has to be a device ... (XXX) + set_value<<<1, 1, 0, s>>>(out.data_handle(), 0); + } + } +}; + +int main() +{ + stream_ctx ctx; + + int ndevs; + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + + const int N = 4; + + auto var_handle = ctx.logical_data(shape_of>(1)); + var_handle.set_symbol("var"); + + auto redux_op = std::make_shared(); + + // We add i twice (total = N(N-1) + initial_value) + for (int i = 0; i < N; i++) + { + // device + for (int d = 0; d < ndevs; d++) + { + ctx.task(exec_place::device(d), var_handle.redux(redux_op))->*[&](cudaStream_t s, auto var) { + add_val<<<1, 1, 0, s>>>(var.data_handle(), i); + }; + } + + // host + ctx.task(exec_place::host, var_handle.redux(redux_op))->*[&](cudaStream_t s, auto var) { + cuda_safe_call(cudaStreamSynchronize(s)); + *var.data_handle() += i; + }; + } + + // Check result + ctx.task(exec_place::host, var_handle.read())->*[&](cudaStream_t s, auto var) { + cuda_safe_call(cudaStreamSynchronize(s)); + int value = *var.data_handle(); + int expected = (N * (N - 1)) / 2 * (ndevs + 1); + EXPECT(value == expected); + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/reductions/write_back_after_redux.cu b/cudax/test/stf/reductions/write_back_after_redux.cu new file mode 100644 index 00000000000..48be6689096 --- /dev/null +++ b/cudax/test/stf/reductions/write_back_after_redux.cu @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief This test ensures that data are properly reconstructed after a + * reduction phase during the write-back mechanism + */ + +#include +#include + +#include + +using namespace cuda::experimental::stf; + +template +__global__ void add_val(slice inout, T val) +{ + inout(0) += val; +} + +int main() +{ + stream_ctx ctx; + + int ndevs; + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + + const int N = 4; + + int var = 42; + + auto var_handle = ctx.logical_data(make_slice(&var, 1)); + var_handle.set_symbol("var"); + + auto redux_op = std::make_shared>(); + + // We add i twice (total = N(N-1) + initial_value) + for (int i = 0; i < N; i++) + { + // device + for (int d = 0; d < ndevs; d++) + { + ctx.task(exec_place::device(d), var_handle.redux(redux_op))->*[=](cudaStream_t s, auto var) { + add_val<<<1, 1, 0, s>>>(var, i); + }; + } + + // host + ctx.host_launch(var_handle.redux(redux_op))->*[=](auto var) { + var(0) += i; + }; + } + + ctx.finalize(); + + int expected = 42 + (N * (N - 1)) / 2 * (ndevs + 1); + EXPECT(var == expected); +} diff --git a/cudax/test/stf/slice/pinning.cu b/cudax/test/stf/slice/pinning.cu new file mode 100644 index 00000000000..9e5eaf0930c --- /dev/null +++ b/cudax/test/stf/slice/pinning.cu @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#include + +using namespace cuda::experimental::stf; + +int main() +{ + stream_ctx ctx; + + // Contiguous 1D + double* X = new double[1024]; + auto handle_X = ctx.logical_data(make_slice(X, 1024)); + + // Contiguous 2D + double* X2 = new double[1024 * 1024]; + auto handle_X2 = ctx.logical_data(make_slice(X2, std::tuple{1024, 1024}, 1024)); + + // Contiguous 3D + double* X4 = new double[128 * 128 * 128]; + auto handle_X4 = ctx.logical_data(make_slice(X4, std::tuple{128, 128, 128}, 128, 128 * 128)); + + // Discontiguous 2D + double* X3 = new double[128 * 8]; + auto handle_X3 = ctx.logical_data(make_slice(X3, std::tuple{64, 8}, 128)); + + // Discontiguous 3D + double* X5 = new double[32 * 4 * 4]; + auto handle_X5 = ctx.logical_data(make_slice(X5, std::tuple{16, 4, 4}, 32, 32 * 4)); + + double* X6 = new double[32 * 4 * 4]; + auto handle_X6 = ctx.logical_data(make_slice(X6, std::tuple{32, 2, 4}, 32, 32 * 4)); + + double* X7 = new double[128 * 128 * 128]; + cuda_safe_call(cudaHostRegister(X7, 128 * 128 * 128, cudaHostRegisterPortable)); + auto handle_X7 = ctx.logical_data(make_slice(X7, std::tuple{128, 128, 128}, 128, 128 * 128)); + + // Detect that this was already pinned + double* X9 = new double[1024]; + cuda_safe_call(cudaHostRegister(X9, 1024, cudaHostRegisterPortable)); + auto handle_X9 = ctx.logical_data(make_slice(X9, 1024)); + + // Detect that this was already pinned + double* X8 = new double[4 * 4 * 4]; + cuda_safe_call(cudaHostRegister(X8, 4 * 4 * 4, cudaHostRegisterPortable)); + auto handle_X8 = ctx.logical_data(make_slice(X8, std::tuple{1, 4, 4}, 4, 4 * 4)); + + ctx.finalize(); +} diff --git a/cudax/test/stf/static_error_checks/CMakeLists.txt 
b/cudax/test/stf/static_error_checks/CMakeLists.txt new file mode 100644 index 00000000000..576cb992b68 --- /dev/null +++ b/cudax/test/stf/static_error_checks/CMakeLists.txt @@ -0,0 +1,53 @@ +set(static_assert_tests + gnu_launch.cpp + gnu_parallel_for.cpp + static.cu + task_prototype.cu + stream_task_prototype.cu + graph_task_prototype.cu +) + +## cudax_add_stf_static_assert_test +# +# Adds an EXCLUDE_FROM_ALL build target for `source` and a ctest that +# triggers the compilation and checks for "static assertion" in the output. +# +# target_name_var: Variable name to overwrite with the name of the test +# target. Useful for post-processing target information. +# source: The source file for the test. +# cn_target: The reference cudax target with configuration information. +# Additional args are passed to cudax_stf_configure_target. +function(cudax_stf_add_static_assert_test target_name_var source cn_target) + cudax_get_target_property(config_dialect ${cn_target} DIALECT) + cudax_get_target_property(config_prefix ${cn_target} PREFIX) + + get_filename_component(filename ${source} NAME_WE) + + set(test_target "${config_prefix}.tests.stf.error.${filename}") + + add_executable(${test_target} ${source}) + cccl_configure_target(${test_target} DIALECT ${config_dialect}) + cudax_clone_target_properties(${test_target} ${cn_target}) + cudax_stf_configure_target(${test_target} ${ARGN}) + set_target_properties(${test_target} PROPERTIES + EXCLUDE_FROM_ALL true + EXCLUDE_FROM_DEFAULT_BUILD true + ) + + add_test(NAME ${test_target} + COMMAND ${CMAKE_COMMAND} --build "${CMAKE_BINARY_DIR}" + --target ${test_target} + --config $ + ) + set_tests_properties(${test_target} PROPERTIES + PASS_REGULAR_EXPRESSION "static_assert|static assertion" + ) + + set(${target_name_var} ${test_target} PARENT_SCOPE) +endfunction() + +foreach(cn_target IN LISTS cudax_TARGETS) + foreach(source IN LISTS static_assert_tests) + cudax_stf_add_static_assert_test(test_target "${source}" ${cn_target}) + endforeach() +endforeach() diff --git a/cudax/test/stf/static_error_checks/gnu_launch.cpp b/cudax/test/stf/static_error_checks/gnu_launch.cpp new file mode 100644 index 00000000000..8c39ad6d4d9 --- /dev/null +++ b/cudax/test/stf/static_error_checks/gnu_launch.cpp @@ -0,0 +1,27 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +int main() +{ + context ctx; + + auto A = ctx.logical_data(size_t(128)); + + // We cannot launch a device lambda without a CUDA compiler, + // as we normally expect an extended lambda (which is also missing here) + ctx.launch(par(128), A.write())->*[](auto th, auto A) { A(th.rank()) = 0; + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/static_error_checks/gnu_parallel_for.cpp b/cudax/test/stf/static_error_checks/gnu_parallel_for.cpp new file mode 100644 index 00000000000..fcc485d01de --- /dev/null +++ b/cudax/test/stf/static_error_checks/gnu_parallel_for.cpp @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +int main() +{ + context ctx; + + auto A = ctx.logical_data(size_t(1)); + + // We cannot run a parallel_for on a device lambda without a CUDA compiler, + // as we normally expect an extended lambda (which is also missing here) + ctx.parallel_for(A.shape(), A.write())->*[](size_t i, auto A) { + A(i) = 0; + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/static_error_checks/graph_task_prototype.cu b/cudax/test/stf/static_error_checks/graph_task_prototype.cu new file mode 100644 index 00000000000..e3504ce388f --- /dev/null +++ b/cudax/test/stf/static_error_checks/graph_task_prototype.cu @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +int main() +{ + graph_ctx ctx; + + auto A = ctx.logical_data(size_t(1)); + auto B = ctx.logical_data(size_t(1)); + + // Task with an invalid prototype + ctx.task(A.rw(), B.rw())->*[](cudaStream_t s, auto A) { + + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/static_error_checks/static.cu b/cudax/test/stf/static_error_checks/static.cu new file mode 100644 index 00000000000..b45602d8314 --- /dev/null +++ b/cudax/test/stf/static_error_checks/static.cu @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
+// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +int main() +{ + static_assert(0); + return EXIT_FAILURE; +} diff --git a/cudax/test/stf/static_error_checks/stream_task_prototype.cu b/cudax/test/stf/static_error_checks/stream_task_prototype.cu new file mode 100644 index 00000000000..20ef95f725f --- /dev/null +++ b/cudax/test/stf/static_error_checks/stream_task_prototype.cu @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +int main() +{ + stream_ctx ctx; + + auto A = ctx.logical_data(size_t(1)); + auto B = ctx.logical_data(size_t(1)); + + // Task with an invalid prototype + ctx.task(A.rw(), B.rw())->*[](cudaStream_t s, auto A) { + + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/static_error_checks/task_prototype.cu b/cudax/test/stf/static_error_checks/task_prototype.cu new file mode 100644 index 00000000000..d23a5c0443a --- /dev/null +++ b/cudax/test/stf/static_error_checks/task_prototype.cu @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +int main() +{ + context ctx; + + auto A = ctx.logical_data(size_t(1)); + auto B = ctx.logical_data(size_t(1)); + + // Task with an invalid prototype + ctx.task(A.rw(), B.rw())->*[](cudaStream_t s, auto A) { + + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/stencil/stencil-1D.cu b/cudax/test/stf/stencil/stencil-1D.cu new file mode 100644 index 00000000000..843a1fb13d9 --- /dev/null +++ b/cudax/test/stf/stencil/stencil-1D.cu @@ -0,0 +1,210 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +#include + +using namespace cuda::experimental::stf; + +static graph_ctx ctx; + +/* + * DATA BLOCKS + * | GHOSTS | DATA | GHOSTS | + */ +template +class data_block +{ +public: + data_block(size_t beg, size_t end, size_t ghost_size) + : ghost_size(ghost_size) + , block_size(end - beg + 1) + , array(new T[block_size + 2 * ghost_size]) + , left_interface(new T[ghost_size]) + , right_interface(new T[ghost_size]) + , handle(ctx.logical_data(array.get(), block_size + 2 * ghost_size)) + , left_handle(ctx.logical_data(left_interface.get(), ghost_size)) + , right_handle(ctx.logical_data(right_interface.get(), ghost_size)) + {} + + T* get_array_in_task() + { + return handle.instance().data_handle(); + } + + T* get_array() + { + return array.get(); + } + +public: + size_t ghost_size; + size_t block_size; + std::unique_ptr array; + std::unique_ptr left_interface; + std::unique_ptr right_interface; + + // HANDLE = whole data + boundaries + logical_data> handle; + + // A piece of data to store the left part of the block + logical_data> left_handle; + + // A piece of data to store the right part of the block + logical_data> right_handle; +}; + +template +__global__ void copy_kernel(size_t cnt, T* dst, T* src) +{ + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < cnt; idx += blockDim.x * gridDim.x) + { + dst[idx] = src[idx]; + } +} + +template +__global__ void stencil_kernel(size_t cnt, size_t ghost_size, T* array, T* array1) +{ + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < cnt; idx += blockDim.x * gridDim.x) + { + int idx2 = idx + ghost_size; + array[idx2] = 0.9 * array1[idx2] + 0.05 * array1[idx2 - 1] + 0.05 * array1[idx2 + 1]; + } +} + +// bn1.array = bn.array +template +void stencil(data_block& bn, data_block& bn1) +{ + ctx.task(bn.handle.rw(), bn1.handle.read()) + ->*[bs = bn.block_size, gs = bn.ghost_size](cudaStream_t stream, auto s1, auto s2) { + stencil_kernel<<<256, 64, 0, stream>>>(bs, gs, s1.data_handle(), s2.data_handle()); + }; +} + +template +void update_inner_interfaces(data_block& bn) +{ + // LEFT + ctx.task(bn.handle.read(), bn.left_handle.rw())->*[gs = bn.ghost_size](cudaStream_t stream, auto s1, auto s2) { + copy_kernel<<<(gs > 64 ? 256 : 1), 64, 0, stream>>>(gs, s2.data_handle(), s1.data_handle() + gs); + }; + + // RIGHT + ctx.task(bn.handle.read(), bn.right_handle.rw()) + ->*[bs = bn.block_size, gs = bn.ghost_size](cudaStream_t stream, auto s1, auto s2) { + copy_kernel<<<(gs > 64 ? 256 : 1), 64, 0, stream>>>(gs, s2.data_handle(), s1.data_handle() + bs); + }; +} + +template +void update_outer_interfaces(data_block& bn, data_block& left, data_block& right) +{ + ctx.task(bn.handle.rw(), left.right_handle.read())->*[gs = bn.ghost_size](cudaStream_t stream, auto s1, auto s2) { + copy_kernel<<<(gs > 64 ? 256 : 1), 64, 0, stream>>>(gs, s1.data_handle(), s2.data_handle()); + }; + ctx.task(bn.handle.rw(), right.left_handle.read()) + ->*[bs = bn.block_size, gs = bn.ghost_size](cudaStream_t stream, auto s1, auto s2) { + copy_kernel<<<(gs > 64 ? 
256 : 1), 64, 0, stream>>>(gs, s1.data_handle() + gs + bs, s2.data_handle()); + }; +} + +// bn1.array = bn.array +template +void copy_array(data_block& bn, data_block& bn1) +{ + ctx.task(bn1.handle.rw(), bn.handle.read()) + ->*[sz = bn.block_size + 2 * bn.ghost_size](cudaStream_t stream, auto s1, auto s2) { + copy_kernel<<<256, 64, 0, stream>>>(sz, s1.data_handle(), s2.data_handle()); + }; +} + +int main(int argc, char** argv) +{ + size_t NBLOCKS = 2; + size_t BLOCK_SIZE = 1024 * 64; + + size_t TOTAL_SIZE = NBLOCKS * BLOCK_SIZE; + + std::vector U0(NBLOCKS * BLOCK_SIZE); + for (size_t idx = 0; idx < NBLOCKS * BLOCK_SIZE; idx++) + { + U0[idx] = (idx == 0) ? 1.0 : 0.0; + } + + std::vector> Un; + std::vector> Un1; + + // Create blocks and allocates host data + for (size_t b = 0; b < NBLOCKS; b++) + { + int beg = b * BLOCK_SIZE; + int end = (b + 1) * BLOCK_SIZE; + + Un.push_back(data_block(beg, end, 1)); + Un1.push_back(data_block(beg, end, 1)); + } + + // Fill blocks with initial values + for (size_t b = 0; b < NBLOCKS; b++) + { + size_t beg = b * BLOCK_SIZE; + // int end = (b+1)*BLOCK_SIZE; + + double* Un_vals = Un[b].get_array(); + double* Un1_vals = Un1[b].get_array(); + // Attention, unusual loop: index goes all through BLOCK_SIZE inclusive. + for (size_t local_idx = 0; local_idx <= BLOCK_SIZE; local_idx++) + { + Un_vals[local_idx] = U0[(beg + local_idx - 1) % TOTAL_SIZE]; + Un1_vals[local_idx] = U0[(beg + local_idx - 1) % TOTAL_SIZE]; + } + } + + // Create the graph - it starts out empty + + int NITER = 400; + for (int iter = 0; iter < NITER; iter++) + { + // UPDATE Un from Un1 + for (size_t b = 0; b < NBLOCKS; b++) + { + stencil(Un[b], Un1[b]); + } + + for (size_t b = 0; b < NBLOCKS; b++) + { + // Update the internal copies of the left and right boundaries + update_inner_interfaces(Un[b]); + } + + for (size_t b = 0; b < NBLOCKS; b++) + { + update_outer_interfaces(Un[b], Un[(b - 1 + NBLOCKS) % NBLOCKS], Un[(b + 1) % NBLOCKS]); + } + + for (size_t b = 0; b < NBLOCKS; b++) + { + copy_array(Un[b], Un1[b]); + } + } + + ctx.submit(); + + if (argc > 1) + { + std::cout << "Generating DOT output in " << argv[1] << std::endl; + ctx.print_to_dot(argv[1]); + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/stress/empty_tasks.cu b/cudax/test/stf/stress/empty_tasks.cu new file mode 100644 index 00000000000..9c872c75a8b --- /dev/null +++ b/cudax/test/stf/stress/empty_tasks.cu @@ -0,0 +1,54 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +int main(int argc, char** argv) +{ + stream_ctx ctx; + const size_t N = 16; + double X[N], Y[N]; + + for (size_t i = 0; i < N; i++) + { + X[i] = 1.0; + Y[i] = 2.0; + } + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + +#ifdef NDEBUG + size_t iter_cnt = 10000000; +#else + size_t iter_cnt = 10000; + fprintf(stderr, "Warning: Running with small problem size in debug mode, should use DEBUG=0.\n"); +#endif + + if (argc > 1) + { + iter_cnt = atol(argv[1]); + } + + std::chrono::steady_clock::time_point start, stop; + start = std::chrono::steady_clock::now(); + for (size_t iter = 0; iter < iter_cnt; iter++) + { + ctx.task(lX.read(), lY.rw())->*[&](cudaStream_t, auto, auto) {}; + ctx.task(lY.read(), lX.rw())->*[&](cudaStream_t, auto, auto) {}; + } + stop = std::chrono::steady_clock::now(); + ctx.finalize(); + + std::chrono::duration duration = stop - start; + fprintf(stderr, "Elapsed: %.2lf us per task\n", duration.count() * 1000000.0 / (2 * iter_cnt)); +} diff --git a/cudax/test/stf/stress/empty_tasks_alloc.cu b/cudax/test/stf/stress/empty_tasks_alloc.cu new file mode 100644 index 00000000000..7bfeae653e1 --- /dev/null +++ b/cudax/test/stf/stress/empty_tasks_alloc.cu @@ -0,0 +1,46 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +int main(int argc, char** argv) +{ + stream_ctx ctx; + const size_t N = 16; +#ifdef NDEBUG + int iter_cnt = 1000000; +#else + int iter_cnt = 10000; + fprintf(stderr, "Warning: Running with small problem size in debug mode, should use DEBUG=0.\n"); +#endif + + if (argc > 1) + { + iter_cnt = atoi(argv[1]); + } + + std::chrono::steady_clock::time_point start, stop; + start = std::chrono::steady_clock::now(); + for (int iter = 0; iter < iter_cnt; iter++) + { + auto lX = ctx.logical_data(shape_of>(N)); + auto lY = ctx.logical_data(shape_of>(N)); + ctx.task(lX.write())->*[](cudaStream_t, auto) {}; + ctx.task(lY.write())->*[](cudaStream_t, auto) {}; + ctx.task(lX.read(), lY.rw())->*[](cudaStream_t, auto, auto) {}; + } + stop = std::chrono::steady_clock::now(); + ctx.finalize(); + + std::chrono::duration duration = stop - start; + fprintf(stderr, "Elapsed: %.2lf us per task\n", duration.count() * 1000000.0 / (3 * iter_cnt)); +} diff --git a/cudax/test/stf/stress/kernel_chain.cu b/cudax/test/stf/stress/kernel_chain.cu new file mode 100644 index 00000000000..621ac74adaf --- /dev/null +++ b/cudax/test/stf/stress/kernel_chain.cu @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +__global__ void swap(slice dst, const slice src) +{ + size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + size_t nthreads = blockDim.x * gridDim.x; + size_t n = dst.size(); + + for (size_t i = tid; i < n; i += nthreads) + { + double tmp = dst(i); + dst(i) = src(i); + src(i) = tmp; + } +} + +int main(int argc, char** argv) +{ + stream_ctx ctx; + const size_t N = 16; + double X[N], Y[N]; + + for (size_t i = 0; i < N; i++) + { + X[i] = 1.0; + Y[i] = 2.0; + } + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + +#ifdef NDEBUG + size_t iter_cnt = 10000000; +#else + size_t iter_cnt = 10000; + fprintf(stderr, "Warning: Running with small problem size in debug mode, should use DEBUG=0.\n"); +#endif + + if (argc > 1) + { + iter_cnt = atol(argv[1]); + } + + std::chrono::steady_clock::time_point start, stop; + start = std::chrono::steady_clock::now(); + for (size_t iter = 0; iter < iter_cnt; iter++) + { + ctx.task(lX.read(), lY.rw())->*[&](cudaStream_t s, auto dX, auto dY) { + swap<<<4, 16, 0, s>>>(dY, dX); + }; + ctx.task(lY.read(), lX.rw())->*[&](cudaStream_t s, auto dY, auto dX) { + swap<<<4, 16, 0, s>>>(dX, dY); + }; + } + stop = std::chrono::steady_clock::now(); + ctx.finalize(); + + std::chrono::duration duration = stop - start; + fprintf(stderr, "Elapsed: %.2lf us per task pair\n", duration.count() * 1000000.0 / (iter_cnt)); +} diff --git a/cudax/test/stf/stress/kernel_chain_fused.cu b/cudax/test/stf/stress/kernel_chain_fused.cu new file mode 100644 index 00000000000..9e8007d1ebb --- /dev/null +++ b/cudax/test/stf/stress/kernel_chain_fused.cu @@ -0,0 +1,70 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +__global__ void swap(slice dst, const slice src) +{ + size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + size_t nthreads = blockDim.x * gridDim.x; + size_t n = dst.size(); + + for (size_t i = tid; i < n; i += nthreads) + { + double tmp = dst(i); + dst(i) = src(i); + src(i) = tmp; + } +} + +int main(int argc, char** argv) +{ + stream_ctx ctx; + const size_t N = 16; + double X[N], Y[N]; + + for (size_t i = 0; i < N; i++) + { + X[i] = 1.0; + Y[i] = 2.0; + } + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + +#ifdef NDEBUG + size_t iter_cnt = 10000000; +#else + size_t iter_cnt = 10000; + fprintf(stderr, "Warning: Running with small problem size in debug mode, should use DEBUG=0.\n"); +#endif + + if (argc > 1) + { + iter_cnt = atol(argv[1]); + } + + std::chrono::steady_clock::time_point start, stop; + start = std::chrono::steady_clock::now(); + for (size_t iter = 0; iter < iter_cnt; iter++) + { + ctx.task(lY.rw(), lY.rw())->*[&](cudaStream_t s, auto dX, auto dY) { + ::swap<<<4, 16, 0, s>>>(dY, dX); + ::swap<<<4, 16, 0, s>>>(dX, dY); + }; + } + stop = std::chrono::steady_clock::now(); + ctx.finalize(); + + std::chrono::duration duration = stop - start; + fprintf(stderr, "Elapsed: %.2lf us per task\n", duration.count() * 1000000.0 / (iter_cnt)); +} diff --git a/cudax/test/stf/stress/launch_overhead.cu b/cudax/test/stf/stress/launch_overhead.cu new file mode 100644 index 00000000000..49b78a529ee --- /dev/null +++ b/cudax/test/stf/stress/launch_overhead.cu @@ -0,0 +1,65 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +int main(int argc, char** argv) +{ + stream_ctx ctx; + const size_t N = 16; + double X[N], Y[N]; + + for (size_t i = 0; i < N; i++) + { + X[i] = 1.0; + Y[i] = 2.0; + } + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + +#ifdef NDEBUG + int iter_cnt = 1000000; +#else + int iter_cnt = 10000; + fprintf(stderr, "Warning: Running with small problem size in debug mode, should use DEBUG=0.\n"); +#endif + + if (argc > 1) + { + iter_cnt = atoi(argv[1]); + } + + std::chrono::steady_clock::time_point start, stop; + start = std::chrono::steady_clock::now(); + for (int iter = 0; iter < iter_cnt; iter++) + { + ctx.launch(lX.read(), lY.rw())->*[] _CCCL_DEVICE(auto th, auto X, auto Y) { + for (size_t i = th.rank(); i < X.size(); i += th.size()) + { + Y(i) = 2.0 * X(i); + } + }; + + ctx.launch(lX.rw(), lY.read())->*[] _CCCL_DEVICE(auto th, auto X, auto Y) { + for (size_t i = th.rank(); i < X.size(); i += th.size()) + { + X(i) = 0.5 * Y(i); + } + }; + } + stop = std::chrono::steady_clock::now(); + ctx.finalize(); + + std::chrono::duration duration = stop - start; + fprintf(stderr, "Elapsed: %.2lf us per task\n", duration.count() * 1000000.0 / (2 * iter_cnt)); +} diff --git a/cudax/test/stf/stress/launch_vs_parallelfor.cu b/cudax/test/stf/stress/launch_vs_parallelfor.cu new file mode 100644 index 00000000000..ed95f48211a --- /dev/null +++ b/cudax/test/stf/stress/launch_vs_parallelfor.cu @@ -0,0 +1,125 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include +#include + +#define MAX_ITER 200 + +using namespace cuda::experimental::stf; + +int main(int argc, char** argv) +{ + stream_ctx ctx; + + int N0 = 128; + if (argc > 2) + { + N0 = atoi(argv[2]); + } + + // fprintf(stderr, "Using %d...\n", N0); + + size_t N = size_t(N0) * 1024 * 1024; + +#if 0 + auto number_devices = 1; + auto all_devs = exec_place::repeat(exec_place::device(0), number_devices); +#else + auto all_devs = exec_place::all_devices(); +#endif + + data_place cdp; + // if (argc > 2) { + // switch (atoi(argv[2])) { + // case 0: cdp = data_place::composite(blocked_partition(), all_devs); break; + // case 1: cdp = data_place::managed; break; + // case 2: cdp = data_place::device(0); break; + // case 3: cdp = data_place::host; break; + // default: abort(); + // } + // } else { + cdp = data_place::composite(blocked_partition(), all_devs); + // } + + fprintf(stderr, "data place: %s\n", cdp.to_string().c_str()); + + // data_place cdp = data_place::device(0); + // data_place cdp = data_place::managed; + + auto data_logical = ctx.logical_data(N); + // initialize centroids + ctx.parallel_for(blocked_partition(), all_devs, data_logical.shape(), data_logical.write(cdp)) + ->*[=] _CCCL_DEVICE(size_t ind, auto data) { + data(ind) = 0; + }; + int cur_iter = 1; + + const char* const method = argc >= 2 ? 
argv[1] : "launch-partition"; + + stopwatch sw(stopwatch::autostart, ctx.task_fence()); + + if (strcmp(method, "launch-partition") == 0) + { + for (cur_iter = 1; cur_iter < MAX_ITER; ++cur_iter) + { + ctx.launch(all_devs, data_logical.write(cdp)).set_symbol("launch assignment") + ->*[=] _CCCL_DEVICE(auto&& t, auto&& data) { + for (auto ind : t.apply_partition(shape(data))) + { + data(ind) = 0; + } + }; + } + } + else if (strcmp(method, "launch") == 0) + { + for (cur_iter = 1; cur_iter < MAX_ITER; ++cur_iter) + { + ctx.launch(all_devs, data_logical.rw(cdp))->*[=] _CCCL_DEVICE(auto t, auto data) {}; + } + } + else if (strcmp(method, "parallel") == 0) + { + for (cur_iter = 1; cur_iter < MAX_ITER; ++cur_iter) + { + ctx.parallel_for(blocked_partition(), all_devs, data_logical.shape(), data_logical.rw(cdp)) + ->*[=] _CCCL_DEVICE(size_t ind, auto data) {}; + } + } + else if (strcmp(method, "parallel-indexed") == 0) + { + for (cur_iter = 1; cur_iter < MAX_ITER; ++cur_iter) + { + ctx.parallel_for(blocked_partition(), all_devs, data_logical.shape(), data_logical.rw(cdp)) + .set_symbol("parallel for assignment") + ->*[=] _CCCL_DEVICE(size_t ind, auto data) { + data(ind) = 0; + }; + } + } + else + { + fprintf(stderr, "Must choose one of launch-partition, parallel-indexed, launch, parallel\n"); + return 1; + } + + sw.stop(ctx.task_fence()); + + ctx.finalize(); + + printf("Method: %s, elapsed: %f ms\n", argv[1], sw.elapsed().count()); +} diff --git a/cudax/test/stf/stress/many_read.cu b/cudax/test/stf/stress/many_read.cu new file mode 100644 index 00000000000..079f3f5e01b --- /dev/null +++ b/cudax/test/stf/stress/many_read.cu @@ -0,0 +1,70 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +#include + +using namespace std::chrono; +using namespace cuda::experimental::stf; + +/* wall-clock time */ +double gettime() +{ + auto now = system_clock::now().time_since_epoch(); + return duration_cast>(now).count(); +} + +int main(int argc, char** argv) +{ + stream_ctx ctx; + const size_t N = 16; + double X[N], Y[N]; + + for (size_t i = 0; i < N; i++) + { + X[i] = 1.0; + Y[i] = 2.0; + } + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + size_t nloops = 10; + size_t inner_nloops = 1000; + + if (argc > 1) + { + nloops = atol(argv[1]); + } + + if (argc > 2) + { + inner_nloops = atol(argv[2]); + } + + for (size_t j = 0; j < nloops; j++) + { + std::chrono::steady_clock::time_point start, stop; + start = std::chrono::steady_clock::now(); + + for (size_t i = 0; i < inner_nloops; i++) + { + ctx.task(lX.read(), lY.rw())->*[](cudaStream_t, auto, auto) {}; + } + + stop = std::chrono::steady_clock::now(); + std::chrono::duration duration = stop - start; + + fprintf(stderr, "Elapsed: %.2lf us per task\n", duration.count() * 1000000.0 / (inner_nloops)); + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/stress/parallel_for_overhead.cu b/cudax/test/stf/stress/parallel_for_overhead.cu new file mode 100644 index 00000000000..48abb4dedc7 --- /dev/null +++ b/cudax/test/stf/stress/parallel_for_overhead.cu @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +int main(int argc, char** argv) +{ + stream_ctx ctx; + const size_t N = 16; + double X[N], Y[N]; + + for (size_t i = 0; i < N; i++) + { + X[i] = 1.0; + Y[i] = 2.0; + } + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + +#ifdef NDEBUG + int iter_cnt = 1000000; +#else + int iter_cnt = 10000; + fprintf(stderr, "Warning: Running with small problem size in debug mode, should use DEBUG=0.\n"); +#endif + + if (argc > 1) + { + iter_cnt = atoi(argv[1]); + } + + std::chrono::steady_clock::time_point start, stop; + start = std::chrono::steady_clock::now(); + for (int iter = 0; iter < iter_cnt; iter++) + { + ctx.parallel_for(lX.shape(), lX.read(), lY.rw())->*[] _CCCL_DEVICE(size_t i, auto X, auto Y) { + Y(i) = 2.0 * X(i); + }; + + ctx.parallel_for(lX.shape(), lY.rw(), lY.read())->*[] _CCCL_DEVICE(size_t i, auto X, auto Y) { + X(i) = 0.5 * Y(i); + }; + } + stop = std::chrono::steady_clock::now(); + ctx.finalize(); + + std::chrono::duration duration = stop - start; + fprintf(stderr, "Elapsed: %.2lf us per task\n", duration.count() * 1000000.0 / (2 * iter_cnt)); +} diff --git a/cudax/test/stf/stress/task_bench.cu b/cudax/test/stf/stress/task_bench.cu new file mode 100644 index 00000000000..769b057075b --- /dev/null +++ b/cudax/test/stf/stress/task_bench.cu @@ -0,0 +1,319 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include // For rand() and srand() +#include // accumulate + +using namespace cuda::experimental::stf; + +enum test_id +{ + TRIVIAL = 0, + STENCIL = 1, + FFT = 2, + SWEEP = 3, + TREE = 4, + RANDOM = 5, +}; + +std::string test_name(test_id id) +{ + switch (id) + { + case TRIVIAL: + return "TRIVIAL"; + case STENCIL: + return "STENCIL"; + case FFT: + return "FFT"; + case SWEEP: + return "SWEEP"; + case TREE: + return "TREE"; + case RANDOM: + return "RANDOM"; + default: + return "unknown"; + } +} + +int log2Int(int n) +{ + int result = 0; + while (n >>= 1) + { // Divide n by 2 until n becomes 0 + ++result; + } + return result; +} + +#if defined(_CCCL_COMPILER_MSVC) +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_MSVC(4702) // unreachable code +#endif // _CCCL_COMPILER_MSVC +bool skip_task(test_id id, int t, int i, int /*W*/) +{ + switch (id) + { + case TRIVIAL: + case STENCIL: + case FFT: + case RANDOM: + return false; + case SWEEP: + // return (i <= t) && (t - i < W); + return (i <= t); + case TREE: { + if (t == 0) + { + return false; + } + int stride = 1 << (t); + return (i % stride != 0); + } + default: + abort(); + } + + // should not be reached + abort(); + return true; +} +#if defined(_CCCL_COMPILER_MSVC) +_CCCL_DIAG_POP +#endif // _CCCL_COMPILER_MSVC + +std::vector input_deps(test_id id, int t, int i, int W) +{ + std::vector res; + + // no input deps for the first step + if (t == 0) + { + return res; + } + + switch (id) + { + case TRIVIAL: + // D(t,i) = NIL + break; + case STENCIL: + // D(t, i) = {i, i-1, i+1} + res.push_back(i); + if (i > 0) + { + res.push_back(i - 1); + } + if (i < W - 1) + { + res.push_back(i + 1); + } + break; + case FFT: + // D(t,i) = {i, i - 2^t, i+2^t} + res.push_back(i); + { + if (t < 32) + { + int two_t1 = 1 << (t - 1); + if (i - two_t1 >= 0) + { + res.push_back(i - two_t1); + } + if (i + two_t1 < W) + { + res.push_back(i + two_t1); + } + } + } + + break; + case SWEEP: + // D(t,i) = (i, i-1) + res.push_back(i); + if (i > 0) + { + res.push_back(i - 1); + } + break; + case TREE: + // D(t,i) = (t <= log2(W)) {i - 2^(-t)W(i mod 2^(-t+1)W)} else {i, i + 2^(t-1)*W^-1} + { + int stride = 1 << (t - 1); + res.push_back(i); + if (i + stride < W) + { + res.push_back(i + stride); + } + } + break; + case RANDOM: + // Differs from TaskBench ( D(t,i) = {i | 0 <= i < W && random() < 0.5)) + // TaskBench topology assumes there can be arbitrarily large numbers of deps + // for (int j = 0; j < W; j++) { + // double r = static_cast(rand()) / RAND_MAX; + // if (r < 0.5) + // res.push_back(j); + // } + for (int k = 0; k < 2; k++) + { + res.push_back(rand() % W); + } + break; + default: + abort(); + } + + return res; +} + +void bench(context& ctx, test_id id, size_t width, size_t nsteps, size_t repeat_cnt) +{ + std::chrono::steady_clock::time_point start, stop; + + const size_t b = nsteps; + + std::vector>> data(width * b); + const size_t data_size = 128; + + /* + * Note : CUDASTF_DOT_REMOVE_DATA_DEPS=1 CUDASTF_DOT_NO_FENCE=1 ACUDASTF_DOT_DISPLAY_EPOCHS=1 + * CUDASTF_DOT_FILE=pif.dot build/nvcc/tests/stress/task_bench 8 6 1 3; dot -Tpdf pif.dot -o pif.pdf + */ + ctx.get_dot()->set_tracing(false); + + for (size_t t = 0; t < b; t++) + { + for (size_t i = 0; i < width; i++) + { + auto d = ctx.logical_data(data_size); + 
ctx.task(d.write())->*[](cudaStream_t, auto) {}; + data[t * width + i] = d; + } + } + ctx.get_dot()->set_tracing(true); + + cuda_safe_call(cudaStreamSynchronize(ctx.task_fence())); + ctx.change_epoch(); // for better DOT rendering + + std::vector tv; + const int niter = 10; + size_t task_cnt; + size_t deps_cnt; + + for (size_t iter = 0; iter < niter; iter++) + { + task_cnt = 0; + deps_cnt = 0; + start = std::chrono::steady_clock::now(); + + for (size_t k = 0; k < repeat_cnt; k++) + { + for (size_t t = 0; t < nsteps; t++) + { + for (size_t i = 0; i < width; i++) + { + if (!skip_task(id, t, i, width)) + { + auto tsk = ctx.task(); + tsk.add_deps(data[(t % b) * width + i].rw()); + + auto deps = input_deps(id, t, i, width); + for (int d : deps) + { + tsk.add_deps(data[((t - 1 + b) % b) * width + d].read()); + deps_cnt++; + } + + tsk.set_symbol(std::to_string(t) + "," + std::to_string(i)); + tsk->*[](cudaStream_t) {}; + task_cnt++; + } + } + } + } + + cuda_safe_call(cudaStreamSynchronize(ctx.task_fence())); + ctx.change_epoch(); // for better DOT rendering + stop = std::chrono::steady_clock::now(); + + std::chrono::duration duration = stop - start; + tv.push_back(duration.count() * 1000000.0 / (task_cnt)); + } + + // Compute the mean (average) + double sum = ::std::accumulate(tv.begin(), tv.end(), 0.0); + double mean = sum / tv.size(); + + // Compute the standard deviation + double sq_sum = ::std::accumulate(tv.begin(), tv.end(), 0.0, [mean](double acc, double val) { + return acc + std::pow(val - mean, 2); + }); + double variance = sq_sum / tv.size(); + double standardDeviation = ::std::sqrt(variance); + + fprintf(stderr, + "[%s] Elapsed: %.3lf+-%.4lf us per task (%zu tasks, %zu deps, %lf deps/task (avg)\n", + test_name(id).c_str(), + mean, + standardDeviation, + task_cnt, + deps_cnt, + (1.0 * deps_cnt) / task_cnt); +} + +int main(int argc, char** argv) +{ + context ctx; + + size_t width = 8; + if (argc > 1) + { + width = atol(argv[1]); + } + + size_t nsteps = width; + if (argc > 2) + { + nsteps = atol(argv[2]); + } + + size_t repeat_cnt = 10; + if (argc > 3) + { + repeat_cnt = atol(argv[3]); + } + + int id = -1; // all + if (argc > 4) + { + id = atoi(argv[4]); + } + + if (id == -1) + { + bench(ctx, TRIVIAL, width, nsteps, repeat_cnt); + bench(ctx, STENCIL, width, nsteps, repeat_cnt); + bench(ctx, FFT, width, nsteps, repeat_cnt); + bench(ctx, SWEEP, width, nsteps, repeat_cnt); + bench(ctx, TREE, width, nsteps, repeat_cnt); + bench(ctx, RANDOM, width, nsteps, repeat_cnt); + } + else + { + bench(ctx, test_id(id), width, nsteps, repeat_cnt); + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/threads/axpy-threads-2.cu b/cudax/test/stf/threads/axpy-threads-2.cu new file mode 100644 index 00000000000..9bf7dd95013 --- /dev/null +++ b/cudax/test/stf/threads/axpy-threads-2.cu @@ -0,0 +1,115 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief An AXPY kernel implemented with a task of the CUDA stream backend + * + */ + +#include + +#include +#include + +using namespace cuda::experimental::stf; + +static __global__ void cuda_sleep_kernel(long long int clock_cnt) +{ + long long int start_clock = clock64(); + long long int clock_offset = 0; + while (clock_offset < clock_cnt) + { + clock_offset = clock64() - start_clock; + } +} + +void cuda_sleep(double ms, cudaStream_t stream) +{ + int device; + cudaGetDevice(&device); + + // cudaDevAttrClockRate: Peak clock frequency in kilohertz; + int clock_rate; + cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, device); + + long long int clock_cnt = (long long int) (ms * clock_rate); + cuda_sleep_kernel<<<1, 1, 0, stream>>>(clock_cnt); +} + +__global__ void axpy(double a, slice x, slice y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < x.size(); i += nthreads) + { + y(i) += a * x(i); + } +} + +double X0(int i) +{ + return sin((double) i); +} + +double Y0(int i) +{ + return cos((double) i); +} + +void mytask(stream_ctx ctx, int /*id*/, logical_data> lX) +{ + // std::cout << "Thread " << id << " is executing.\n"; + + const size_t N = 16; + + double alpha = 3.14; + + auto lY = ctx.logical_data(N); + ctx.task(lY.write())->*[](cudaStream_t, auto) {}; + + /* Compute Y = Y + alpha X */ + for (size_t i = 0; i < 10; i++) + { + ctx.task(lX.read(), lY.rw())->*[&](cudaStream_t s, auto dX, auto dY) { + axpy<<<16, 128, 0, s>>>(alpha, dX, dY); + cuda_sleep(100.0, s); + }; + } +} + +int main() +{ + stream_ctx ctx; + const size_t N = 16; + auto lX = ctx.logical_data(N); + ctx.task(lX.write())->*[](cudaStream_t, auto) {}; + + std::vector threads; + // Launch 8 threads. + for (int i = 0; i < 10; ++i) + { + threads.emplace_back(mytask, ctx, i, lX); + } + + // Wait for all threads to complete. + for (auto& th : threads) + { + th.join(); + } + + ctx.task(lX.rw())->*[&](cudaStream_t s, auto) { + cuda_sleep(100.0, s); + }; + + ctx.finalize(); +} diff --git a/cudax/test/stf/threads/axpy-threads-pfor.cu b/cudax/test/stf/threads/axpy-threads-pfor.cu new file mode 100644 index 00000000000..8c73edda3a9 --- /dev/null +++ b/cudax/test/stf/threads/axpy-threads-pfor.cu @@ -0,0 +1,114 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief An AXPY kernel implemented with a task of the CUDA stream backend + * + */ + +#include + +#include +#include + +using namespace cuda::experimental::stf; + +static __global__ void cuda_sleep_kernel(long long int clock_cnt) +{ + long long int start_clock = clock64(); + long long int clock_offset = 0; + while (clock_offset < clock_cnt) + { + clock_offset = clock64() - start_clock; + } +} + +void cuda_sleep(double ms, cudaStream_t stream) +{ + int device; + cudaGetDevice(&device); + + // cudaDevAttrClockRate: Peak clock frequency in kilohertz; + int clock_rate; + cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, device); + + long long int clock_cnt = (long long int) (ms * clock_rate); + cuda_sleep_kernel<<<1, 1, 0, stream>>>(clock_cnt); +} + +__global__ void axpy(double a, slice x, slice y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < x.size(); i += nthreads) + { + y(i) += a * x(i); + } +} + +double X0(int i) +{ + return sin((double) i); +} + +double Y0(int i) +{ + return cos((double) i); +} + +void mytask(stream_ctx ctx, int /*id*/) +{ + // std::cout << "Thread " << id << " is executing.\n"; + + const size_t N = 16; + + double alpha = 3.14; + + auto lX = ctx.logical_data(N); + auto lY = ctx.logical_data(N); + + ctx.parallel_for(lX.shape(), lX.write())->*[] __device__(size_t i, auto x) { + x(i) = 1.0; + }; + + ctx.task(lY.write())->*[](cudaStream_t, auto) {}; + + /* Compute Y = Y + alpha X */ + for (size_t i = 0; i < 10; i++) + { + ctx.task(lX.read(), lY.rw())->*[&](cudaStream_t s, auto dX, auto dY) { + axpy<<<16, 128, 0, s>>>(alpha, dX, dY); + cuda_sleep(100.0, s); + }; + } +} + +int main() +{ + stream_ctx ctx; + + std::vector threads; + // Launch 8 threads. + for (int i = 0; i < 10; ++i) + { + threads.emplace_back(mytask, ctx, i); + } + + // Wait for all threads to complete. + for (auto& th : threads) + { + th.join(); + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/threads/axpy-threads.cu b/cudax/test/stf/threads/axpy-threads.cu new file mode 100644 index 00000000000..d217de2949c --- /dev/null +++ b/cudax/test/stf/threads/axpy-threads.cu @@ -0,0 +1,112 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief An AXPY kernel implemented with a task of the CUDA stream backend + * + */ + +#include + +#include +#include + +using namespace cuda::experimental::stf; + +static __global__ void cuda_sleep_kernel(long long int clock_cnt) +{ + long long int start_clock = clock64(); + long long int clock_offset = 0; + while (clock_offset < clock_cnt) + { + clock_offset = clock64() - start_clock; + } +} + +void cuda_sleep(double ms, cudaStream_t stream) +{ + int device; + cudaGetDevice(&device); + + // cudaDevAttrClockRate: Peak clock frequency in kilohertz; + int clock_rate; + cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, device); + + long long int clock_cnt = (long long int) (ms * clock_rate); + cuda_sleep_kernel<<<1, 1, 0, stream>>>(clock_cnt); +} + +__global__ void axpy(double a, slice x, slice y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < x.size(); i += nthreads) + { + y(i) += a * x(i); + } +} + +double X0(int i) +{ + return sin((double) i); +} + +double Y0(int i) +{ + return cos((double) i); +} + +void mytask(stream_ctx ctx, int /*id*/) +{ + // std::cout << "Thread " << id << " is executing.\n"; + + const size_t N = 16; + + double alpha = 3.14; + + auto lX = ctx.logical_data(N); + auto lY = ctx.logical_data(N); + + ctx.task(lX.write())->*[](cudaStream_t, auto) {}; + + ctx.task(lY.write())->*[](cudaStream_t, auto) {}; + + /* Compute Y = Y + alpha X */ + for (size_t i = 0; i < 10; i++) + { + ctx.task(lX.read(), lY.rw())->*[&](cudaStream_t s, auto dX, auto dY) { + axpy<<<16, 128, 0, s>>>(alpha, dX, dY); + cuda_sleep(100.0, s); + }; + } +} + +int main() +{ + stream_ctx ctx; + + std::vector threads; + // Launch 8 threads. + for (int i = 0; i < 10; ++i) + { + threads.emplace_back(mytask, ctx, i); + } + + // Wait for all threads to complete. + for (auto& th : threads) + { + th.join(); + } + + ctx.finalize(); +} diff --git a/cudax/test/stf/tools/auto_dump/auto_dump.cu b/cudax/test/stf/tools/auto_dump/auto_dump.cu new file mode 100644 index 00000000000..835016fcf2a --- /dev/null +++ b/cudax/test/stf/tools/auto_dump/auto_dump.cu @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief This test makes sure we can generate a dot file + */ + +#include + +#include + +using namespace cuda::experimental::stf; + +int main() +{ +#if !defined(_CCCL_COMPILER_MSVC) + // Generate a random dirname + srand(static_cast(time(nullptr))); + int r = rand(); + + std::string dirname = "dump_" + std::to_string(r); + setenv("CUDASTF_AUTO_DUMP", "1", 1); + setenv("CUDASTF_AUTO_DUMP_DIR", dirname.c_str(), 1); + + context ctx; + auto lA = ctx.logical_data(shape_of>(64)); + + // Ignored logical data + auto lB = ctx.logical_data(shape_of>(64)); + lB.set_auto_dump(false); + + ctx.parallel_for(lA.shape(), lA.write(), lB.write())->*[] __device__(size_t i, auto a, auto b) { + a(i) = 42 + i; + b(i) = 42; + }; + ctx.parallel_for(lA.shape(), lA.rw())->*[] __device__(size_t i, auto a) { + a(i) += 3; + }; + ctx.finalize(); + + EXPECT(std::filesystem::exists(dirname)); + EXPECT(std::filesystem::is_directory(dirname)); + + // Files 0 and 1 should exist + for (int i = 0; i < 2; i++) + { + std::string f = dirname + "/" + std::to_string(i); + EXPECT(std::filesystem::exists(f)); + } + + // File 2 should not exist + EXPECT(!std::filesystem::exists(dirname + "/" + std::to_string(2))); + + std::filesystem::remove_all(dirname); +#endif // !_CCCL_COMPILER_MSVC +} diff --git a/cudax/test/stf/utility/timing_with_fences.cu b/cudax/test/stf/utility/timing_with_fences.cu new file mode 100644 index 00000000000..1551a37c018 --- /dev/null +++ b/cudax/test/stf/utility/timing_with_fences.cu @@ -0,0 +1,95 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +using namespace cuda::experimental::stf; + +static __global__ void cuda_sleep_kernel(long long int clock_cnt) +{ + long long int start_clock = clock64(); + long long int clock_offset = 0; + while (clock_offset < clock_cnt) + { + clock_offset = clock64() - start_clock; + } +} + +void cuda_sleep(double ms, cudaStream_t stream) +{ + int device; + cudaGetDevice(&device); + + // cudaDevAttrClockRate: Peak clock frequency in kilohertz; + int clock_rate; + cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, device); + + long long int clock_cnt = (long long int) (ms * clock_rate); + cuda_sleep_kernel<<<1, 1, 0, stream>>>(clock_cnt); +} + +template +void run(int NTASKS, int ms) +{ + Ctx_t ctx; + + int dummy[1]; + auto handle = ctx.logical_data(dummy); + + cudaEvent_t start, stop; + cuda_safe_call(cudaEventCreate(&start)); + cuda_safe_call(cudaEventCreate(&stop)); + + // warm-up + ctx.task(handle.rw())->*[ms](cudaStream_t stream, auto) { + cuda_sleep(ms, stream); + }; + + cuda_safe_call(cudaEventRecord(start, ctx.task_fence())); + + for (int iter = 0; iter < NTASKS; iter++) + { + ctx.task(handle.rw())->*[ms](cudaStream_t stream, auto) { + cuda_sleep(ms, stream); + }; + } + + cuda_safe_call(cudaEventRecord(stop, ctx.task_fence())); + + ctx.finalize(); + + float elapsed; + cuda_safe_call(cudaEventElapsedTime(&elapsed, start, stop)); + + // Give 25% margin of error + float expected = 1.0f * NTASKS * ms; + float err = fabs(elapsed - expected) / expected; + EXPECT(err < 0.25f); +} + +int main(int argc, char** argv) +{ + int NTASKS = 25; + int ms = 200; + + if (argc > 1) + { + NTASKS = atoi(argv[1]); + } + + if (argc > 2) + { + ms = atoi(argv[2]); + } + + run(NTASKS, ms); + run(NTASKS, ms); + run(NTASKS, ms); +} diff --git a/docs/cudax/index.rst b/docs/cudax/index.rst index 4e4110db969..10b9e3e5aa1 100644 --- a/docs/cudax/index.rst +++ b/docs/cudax/index.rst @@ -5,10 +5,11 @@ CUDA Experimental .. toctree:: :hidden: - :maxdepth: 3 + :maxdepth: 1 container memory_resource + stf ${repo_docs_api_path}/cudax_api ``CUDA Experimental`` (``cudax``) provides experimental new features that are still in development and subject to change. @@ -19,6 +20,7 @@ Specifically, ``cudax`` provides: - :ref:`an owning type erased memory resource ` - :ref:`stream-ordered memory resources ` - dimensions description functionality + - :ref:`an implementation of the STF (Sequential Task Flow) programming model ` Stability Guarantees --------------------- diff --git a/docs/cudax/stf.rst b/docs/cudax/stf.rst new file mode 100644 index 00000000000..6e0efc6edaa --- /dev/null +++ b/docs/cudax/stf.rst @@ -0,0 +1,2056 @@ +.. _stf: + +CUDASTF +======= + +.. contents:: + :depth: 2 + +CUDASTF is an implementation of the Sequential Task Flow model for CUDA. + +The availability of parallelism within modern hardware has dramatically +increased, with large nodes now featuring multiple accelerators. As a +result, maximizing concurrency at the application level in a scalable +manner has become a crucial priority. To effectively hide latencies, it +is essential to achieve the highest level of asynchrony possible. + +CUDASTF introduces a tasking model that automates data transfers while +enforcing implicit data-driven dependencies. + +Implemented as a header-only C++ library, CUDASTF builds on top of CUDA +APIs to simplify the development of multi-GPU applications. 
+
+CUDASTF is currently capable of generating parallel applications using either the CUDA stream API or the CUDA graph API.
+
+The Sequential Task Flow (STF) programming model
+------------------------------------------------
+
+The CUDASTF programming model involves defining logical data and submitting tasks that operate on this data. CUDASTF automatically deduces the dependencies between different tasks and orchestrates both computation and data movement to ensure efficient execution with as much concurrency as possible.
+
+CUDASTF employs the `Sequential Task Flow `__ (STF) programming model, which enables the extraction of concurrency from a sequence of tasks annotated with appropriate data accesses and their respective modes (read-only, write-only, or read/write).
+
+For instance, two tasks modifying the same data will be serialized (in order to maintain read-after-write and write-after-write coherency), whereas two tasks reading the same data without modification can be executed concurrently (read-after-read is coherent in any order). A task must wait until all preceding modifications have been completed before reading a piece of data (read-after-write). Similarly, a task that needs to modify data can only do so once all preceding reads have finished (write-after-read).
+
+Applying these simple rules to a complex algorithm (initially expressed serially as a sequence of tasks) results in a directed acyclic graph (DAG) of tasks, which enables CUDASTF to devise concurrent execution for the given algorithm.
+
+By providing data use annotations to CUDASTF, programmers benefit from both automated parallelization and transparent data management. Through a specialized cache coherency protocol, CUDASTF automates data allocation and transfers. As a result, programmers can focus on developing efficient task-based algorithms instead of grappling with asynchrony and asynchronous data management.
+
+To illustrate how a sequence of tasks can be transformed into a parallel application using annotated data accesses, consider the following example involving three logical data pieces denoted as X, Y and Z:
+
+::
+
+   T1[X(rw)], T2[X(read), Y(rw)], T3[X(read), Z(rw)], T4[Y(read), Z(rw)]
+
+``T2`` and ``T3`` read ``X``, which is modified by ``T1``, creating a read-after-write dependency between ``T1`` and ``T2``, as well as between ``T1`` and ``T3``. Since ``T2`` and ``T3`` only perform concurrent read accesses, they can execute concurrently. ``T4`` reads ``Y`` and ``Z``, which were modified by ``T2`` and ``T3``, respectively, resulting in read-after-write dependencies between ``T2`` and ``T4``, and between ``T3`` and ``T4``. The resulting dependency graph is shown below.
+
+.. image:: stf/images/graph_01.png
+
+Getting started with CUDASTF
+----------------------------
+
+Getting CUDASTF
+^^^^^^^^^^^^^^^
+
+CUDASTF is part of the CUDA Experimental library of the CCCL project. It is not distributed with the CUDA Toolkit like the rest of CCCL. It is only available on the `CCCL GitHub repository `_.
+
+Using CUDASTF
+^^^^^^^^^^^^^
+
+CUDASTF is a header-only C++ library which only requires including its main header. The CUDASTF API is part of the ``cuda::experimental::stf`` C++ namespace, and we will assume for brevity that we are using this namespace in the rest of this document.
+
+.. code:: cpp
+
+   #include <cuda/experimental/stf.cuh>
+
+   using namespace cuda::experimental::stf;
+
+Compiling
+^^^^^^^^^
+
+CUDASTF requires a compiler conforming to the C++17 standard or later.
+Although there is no need to link against CUDASTF itself, the library internally utilizes the CUDA library.
+
+.. code:: bash
+
+   # Compilation flags
+   nvcc -std=c++17 --expt-relaxed-constexpr --extended-lambda -I$(cudastf_path)
+   # Linking flags
+   nvcc -lcuda
+
+It is also possible to use CUDASTF without nvcc. This is for example useful when calling existing CUDA libraries such as CUBLAS which do not require authoring custom kernels. Note that CUDASTF APIs intended to automatically generate CUDA kernels such as ``parallel_for`` or ``launch`` are disabled when compiling without nvcc.
+
+.. code:: bash
+
+   # Compilation flags
+   g++ -I$(cudastf_path)
+   # Linking flags
+   g++ -lcuda -lcudart
+
+Using CUDASTF within a cmake project
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+As part of the CCCL project, CUDASTF uses CMake for its build and installation infrastructure, and CMake is the recommended way of building applications that use CUDASTF.
+
+This is facilitated by the CMake Package Manager, as illustrated in this simple example which is available `here `_ and described in the next paragraph.
+
+A simple example
+^^^^^^^^^^^^^^^^
+
+The following example illustrates the use of CUDASTF to implement the well-known AXPY kernel, which computes ``Y = Y + alpha * X`` where ``X`` and ``Y`` are two vectors, and ``alpha`` is a scalar value.
+
+.. code:: cpp
+
+    #include <cuda/experimental/stf.cuh>
+
+    using namespace cuda::experimental::stf;
+
+    template <typename T>
+    __global__ void axpy(T a, slice<T> x, slice<T> y) {
+        int tid = blockIdx.x * blockDim.x + threadIdx.x;
+        int nthreads = gridDim.x * blockDim.x;
+
+        for (int ind = tid; ind < x.size(); ind += nthreads) {
+            y(ind) += a * x(ind);
+        }
+    }
+
+    int main(int argc, char** argv) {
+        context ctx;
+
+        const size_t N = 16;
+        double X[N], Y[N];
+
+        for (size_t ind = 0; ind < N; ind++) {
+            X[ind] = sin((double)ind);
+            Y[ind] = cos((double)ind);
+        }
+
+        auto lX = ctx.logical_data(X);
+        auto lY = ctx.logical_data(Y);
+
+        double alpha = 3.14;
+
+        /* Compute Y = Y + alpha X */
+        ctx.task(lX.read(), lY.rw())->*[&](cudaStream_t s, auto sX, auto sY) {
+            axpy<<<16, 128, 0, s>>>(alpha, sX, sY);
+        };
+
+        ctx.finalize();
+    }
+
+The code is organized into several steps, which will be described in more detail in the following sections:
+
+1. include CUDASTF headers
+2. declare a CUDASTF context
+3. create logical data
+4. submit and wait for the completion of pending work
+
+More examples can be found in the ``examples`` directory in the sources.
+
+Backends and contexts
+-------------------------------
+
+The code snippet below includes the required CUDASTF header. It then creates a context object, which is an entry point for every API call, and which stores the state of the CUDASTF library and keeps track of all resources and all dependencies. This context must eventually be destroyed using the ``finalize()`` method.
+
+.. code:: cpp
+
+   context ctx;
+
+There are currently three context backends available in CUDASTF, with a common API but possibly different implementations, and a few specific extensions. The ``context`` class, which is a generic context implementation, should be preferred for writing generic code. Using a specific context type might reduce compilation time, but provides less flexibility.
+
+The ``stream_ctx`` class defines a context type that relies on CUDA streams and CUDA events to implement synchronizations. Tasks are launched eagerly. This is the context type used by default in the generic ``context`` type.
+
+The ``graph_ctx`` class is a context type that implements task parallelism by means of CUDA graphs. Tasks (and all related operations) are put into CUDA graphs. Note that the lambda function describing a task is captured immediately (during the ``ctx.task`` API call) even if the execution is deferred. The underlying CUDA graph is launched when a synchronization with the host is needed, or when the context is finalized. Other circumstances such as task fences might flush all pending operations and result in a graph launch. Subsequent operations would be put in a new CUDA graph. Selecting this backend is an easy way to adopt CUDA graphs, and can be beneficial in terms of performance with repeated task patterns. Unlike with other context types, a task is not allowed to synchronize with the CUDA stream (e.g. with ``cudaStreamSynchronize``) from within its body.
+
+Using either ``context``, ``stream_ctx`` or ``graph_ctx`` should result in the same behaviour, even if the underlying implementation differs. One may switch from one type to another by adapting how the context object is initialized, or by statically selecting an appropriate type:
+
+.. code:: cpp
+
+   // assigns a graph_ctx() to a generic context
+   context ctx = graph_ctx();
+
+   // statically select a context based on CUDA streams and CUDA events
+   stream_ctx ctx;
+
+   // statically select a context based on CUDA graphs
+   graph_ctx ctx;
+
+For the most part, all types can be used interchangeably. The difference lies in the mechanisms used internally to implement synchronization and to execute computation. There can be a minor runtime overhead and an increased compilation time when using the generic context type, but this generic type can be required when CUDASTF automatically selects the context type (see Algorithms).
+
+Tasks in the Stream backend
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``stream_ctx`` backend utilizes CUDA streams and events to provide synchronization. Each ``stream_task`` in the ``stream_ctx`` backend represents a task that is associated with an input CUDA stream. Asynchronous work can be submitted in the body of the task using this input stream. Once the ``stream_task`` completes, all work submitted within the task’s body is assumed to be synchronized with the associated stream.
+
+Users can query the stream associated with a ``stream_task`` using its ``get_stream()`` method.
+
+Tasks in the Graph backend
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In the ``graph_ctx`` environment, a CUDA graph is either created internally or passed in by the user during construction. If the user supplies the CUDA graph, CUDASTF can automatically insert CUDA graph nodes to enable subsequent tasks to be submitted as child graphs of the user-supplied graph.
+
+Creating a ``graph_task`` results in creating a child graph in the aforementioned graph associated with the ``graph_ctx`` object. The child graph implements the body of the task, and CUDASTF automatically inserts the appropriate dependencies to ensure this child graph is executed only after all of its dependencies are fulfilled. CUDASTF may also add other nodes in the supporting CUDA graph, such as those needed for data transfers or data allocations.
+
+Users can retrieve the graph associated with a ``graph_task`` by using its ``get_graph()`` method.
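+
+To make the deferred execution of the graph backend concrete, here is a minimal sketch. It reuses the ``axpy`` kernel and the ``X``/``Y`` arrays from the simple example above (an assumption made for brevity): the task body below runs immediately and merely records the kernel launch into a child graph, and the kernel only executes once the underlying CUDA graph is launched, e.g. when the context is finalized or at a task fence.
+
+.. code:: cpp
+
+   context ctx = graph_ctx();
+
+   auto lX = ctx.logical_data(X);
+   auto lY = ctx.logical_data(Y);
+
+   double alpha = 3.14;
+
+   // Executed right away, but this only *records* the kernel launch into a child graph
+   ctx.task(lX.read(), lY.rw())->*[&](cudaStream_t s, auto sX, auto sY) {
+       axpy<<<16, 128, 0, s>>>(alpha, sX, sY);
+   };
+
+   // The CUDA graph is instantiated and launched here
+   ctx.finalize();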
+ +Logical data +------------ + +In traditional computing, “data”, such as a matrix describing a neural +network layer, typically refers to a location in memory with a defined +address. However, in mixed CPU/GPU systems, the same conceptual data may +exist simultaneously in multiple locations and have multiple addresses +(typically the CPU-tied RAM plus one or more copies in the +high-bandwidth memory used by GPUs). CUDASTF refers to such conceptual +data as *logical data*, an abstract handle for data that may get +transparently transferred to or replicated over the different places +used by CUDASTF tasks. When user code creates a logical data object from +a user-provided object (e.g. an array of ``double``), they transfer the +ownership of the original data to CUDASTF. As a result, any access to +the original data should be performed through the logical data +interface, as CUDASTF may transfer the logical data to a CUDA device +where it can be modified, rendering the original data invalid. By doing +this, user code is relieved of all memory allocation chores and of +keeping track of which physical location holds the correct data at +different stages of computation. + +A logical data is created by calling the ``ctx.logical_data`` member +function. The resulting object will be used to specify data accesses +within tasks. + +In the following example, a stack array ``X`` is used to define a new +logical data object ``lX``, which should be subsequently used instead of +``X``: + +.. code:: cpp + + double X[N]; + auto lX = ctx.logical_data(X); + +Each logical data object internally maintains various *data instances*, +which are replicas of the logical data at different *data places*. For +instance, there could be an instance in host memory, as well as +instances in the embedded memory of CUDA device 0 and CUDA device 1. +CUDASTF ensures that tasks have access to *valid* data instances where +they execute and may dynamically create new instances or destroy +existing ones. + +In the example above, ``X`` is initially on the host (on the CPU stack). +If a task is subsequently launched on device ``0`` that modifies data +through ``lX``, a new data instance will be created in memory associated +with device ``0``. In addition making that allocation, CUDASTF ensures +that a data transfer is issued asynchronously from the host to the +device, so that the task is given a *valid* replica of ``X``. Given that +the task modifies data through ``lX``, the instance associated to the +host will also be invalidated, so CUDASTF will later copy data back to +the host if another task needs to access ``X`` from the CPU. + +Data interfaces +^^^^^^^^^^^^^^^ + +CUDASTF implements a generic interface to manipulate different types of +data formats across the machine. + +Every type of data format is described using three separate types : + +- its shape, which stores parameters which will be common to all instance. For + a fixed-sized vector, the shape would for example contain the length of the + vector. +- a per-instance type that describes a specific data instance. For a + fixed-sized vector, this type would for example contain the address of the + vector. +- a data interface class which implements operations such as allocating a data + instance based on its shape, or copying an instance into another instance. + +Defining custom data interfaces (advanced) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. 
toctree::
+   :maxdepth: 1
+
+   stf/custom_data_interface
+
+
+The CUDASTF API is designed to be extensible, so that advanced users may define their own data interfaces. This can be useful when manipulating data formats which are not regular multidimensional arrays, or to provide direct access to a domain-specific or an application-specific data format.
+
+A complete example is given :ref:`here ` to illustrate how to implement a custom data interface.
+
+Write-back policy
+^^^^^^^^^^^^^^^^^
+
+When a logical data object is destroyed, the original data instance is updated (unless the logical data was created without a reference value, e.g. from a shape). The result is only guaranteed to be available on the corresponding data place after the ``finalize()`` method has been called on the context. Likewise, when calling ``finalize()``, a write-back mechanism is automatically issued on all logical data associated with the context if they were not already destroyed.
+
+Write-back is enabled by default, but it is possible to disable it for a specific logical data by calling its ``set_write_back(bool flag)`` method. Enabling write-back on a logical data which was defined from a shape and has no reference data instance will result in an error.
+
+Slices
+^^^^^^
+
+To facilitate the use of potentially non-contiguous multi-dimensional arrays, we have introduced a C++ data structure class called ``slice``. A slice is a partial specialization of C++’s ``std::mdspan`` (or ``std::experimental::mdspan`` depending on the C++ revision).
+
+.. code:: cpp
+
+   template <typename T, size_t dimensions = 1>
+   using slice = mdspan<T, dextents<size_t, dimensions>, layout_stride>;
+
+When creating a ``logical_data`` from a C++ array, CUDASTF automatically describes it as a slice instantiated with the scalar element type and the dimensionality of the array. Here is an example with a 1D array of ``double``.
+
+.. code:: cpp
+
+   double A[128];
+   context ctx;
+   auto lA = ctx.logical_data(A);
+
+Internally, all instances of ``A`` are described as ``slice<double>`` where ``double`` is the scalar element type, and ``1`` is the dimensionality of the array. The default dimension corresponds to ``1``, so ``slice<double>`` is equivalent to ``slice<double, 1>``.
+
+The ``mdspan`` facility provides a `variety of methods `__ also available to its alias ``slice``:
+
+- ``T *data_handle()`` gives the address of the first element
+- ``operator()`` so that ``A(i)`` is the ``i``-th element of a slice of dimension ``1``, and ``A(i, j)`` is the element at coordinates ``(i, j)`` in a 2D slice.
+- ``size_t size()`` returns the total number of elements in the slice
+- ``size_t extent(size_t dim)`` returns the size of a slice in a given dimension (run-time version)
+- ``size_t stride(size_t dim)`` returns the distance in memory between two elements in a given dimension, expressed as a number of elements (run-time version)
+
+Slices can be passed by value, copied, or moved. Copying a slice does not copy the underlying data. Slices can be passed as arguments to CUDA kernels. Example:
+
+.. code:: cpp
+
+   template <typename T>
+   __global__ void axpy(T a, slice<T> x, slice<T> y) {
+       int tid = blockIdx.x * blockDim.x + threadIdx.x;
+       int nthreads = gridDim.x * blockDim.x;
+
+       for (int ind = tid; ind < x.size(); ind += nthreads) {
+           y(ind) += a * x(ind);
+       }
+   }
+
+Defining slices with multiple dimensions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Slices can be used on data with multiple dimensions, and possibly non-contiguous data.
+ +For example, to define a 2D slice, we can use the ``make_slice`` method +which takes a base pointer, a tuple with all dimensions, and then the +*strides*. The number of stride values is equal to the number of +dimensions minus one. The i-th stride defines the number of elements in +memory between two successive elements along dimension i. + +.. code:: c++ + + double A[5 * 2]; + + // contiguous 2D slice + slice s = make_slice(A, std::tuple { 5, 2 }, 5); + + // non-contiguous 2D slice + slice s2 = make_slice(A, std::tuple { 4, 2 }, 5); + +In the second example, ``s2`` defines a non-contiguous 2D slice because +the stride is greater than the extent in the first dimension. We will +here *skip* an element between between ``s2(3, 0)`` (which is ``A[3]``) +and ``s2(0, 1)`` (which is ``A[5]``) + +Similarly with 3D data, we need to define 2 strides and 3 extent values +: + +.. code:: c++ + + double A[5 * 3 * 40]; + + // contiguous 3D slice + slice s = make_slice(A, std::tuple { 5, 3, 40 }, 5, 5 * 3); + + // non-contiguous 3D slice + slice s2 = make_slice(A, std::tuple { 4, 3, 40 }, 5, 5 * 3); + + // non-contiguous 3D slice + slice s3 = make_slice(A, std::tuple { 5, 2, 40 }, 5, 5 * 3); + +Such slices can also be used to create logical data : + +.. code:: c++ + + double A[32 * 32]; + + // Contiguous 2D slice + auto lX = ctx.logical_data(make_slice(A, std::tuple { 32, 32 }, 32)); + + // Non-contiguous 2D slice + auto lX2 = ctx.logical_data(make_slice(A, std::tuple { 24, 32 }, 32)); + +Defining logical data from a shape +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Data interfaces supports data which are only described as a shape. For +example, a user may want to define a vector of 10 integers, and later +fill it with a task. In this case, there is no need to have a *reference +instance* associated to that logical data because CUDASTF will +automatically allocate an instance on its first usage. + +.. code:: cpp + + auto lX = ctx.logical_data(shape_of>(10)); + + ctx.task(lX.write())->*[](cudaStream_t stream, auto X) { + cudaMemsetAsync(X.data_handle(), 0, X.size()*sizeof(int), stream); + }; + +In this example, ``lX`` is defined using a shape only, and there is no +physical backing needed to create it. Note that since there exists no +valid *data instance* of ``lX``, the first task needs to make a +write-only access (using the ``write()`` member of ``lX``). A write-only +access will indeed allocate ``lX`` at the appropriate location, but it +will not try to load a valid copy of it prior to executing the task. + +Using other access modes such as ``read()``, ``redux()`` or ``rw()`` +that attempt to provide a valid instance will result in an error. + +Similarly, it is possible to define a logical data from a slice shapes +with multiple dimensions. + +.. code:: cpp + + auto lX_2D = ctx.logical_data(shape_of>({16, 24})); + auto lX_3D = ctx.logical_data(shape_of>({16, 24, 10})); + +Tasks +----- + +A task is created by calling the ``ctx.task`` member function. It takes +an optional argument that specifies the execution location of the task. +If none is provided, the current CUDA device will be used, which is +equivalent to passing ``exec_place::current_device()``. Data accesses +are specified using a list of data dependencies. Each dependency is +described by calling the ``read()``, ``rw()``, or ``write()`` method of +the logical data object. + +In the example below, ``X`` is accessed in read-only mode and ``Y`` +needs to be updated so it uses a read-write access mode. + +.. 
code:: cpp + + __global__ void axpy(size_t n, double a, const double *x, double *y) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (size_t ind = tid; ind < n; ind += nthreads) { + y[ind] += a * x[ind]; + } + } + ... + ctx.task(lX.read(), lY.rw())->*[&](cudaStream_t s, slice sX, slice sY) { + axpy<<<16, 128, 0, s>>>(sX.size(), alpha, sX.data_handle(), sY.data_handle()); + }; + +The object returned by the call ``ctx.task()`` overloads +``operator->*()`` to accept a lambda function on the right-hand side. +This makes it easy for user code to pass the task’s body to the context +with a syntax akin to a control flow statement. The first argument of +the lambda function is a ``cudaStream_t`` that can be used to submit +work asynchronously on the selected device within the body of the task. +For each logical data, CUDASTF passes a *data instance* to the lambda +function. These *data instances* provide access to a local copy of the +logical data, which is coherent with respect to the CUDA stream passed +to the task. + +For example, data instances associated to 1D arrays of ``double`` are +typed as ``slice`` if the data is in write or read-write mode, +and ``slice`` if the data is in read-only mode. The +``.data_handle()`` method of this type returns the base address of the +underlying array, and the ``.size()`` method returns the total number of +elements. For multi-dimensional arrays, ``.extent(d)`` returns the size +along dimension ``d``. (For a 1D array, ``.size()`` is therefore +equivalent to ``.extent(0)``.) + +Better yet, the CUDA kernel could manipulate slices directly instead of +resorting to unsafe pointers as parameters: + +.. code:: cpp + + __global__ void axpy(double a, slice x, slice y) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (size_t ind = tid; ind < x.size(); ind += nthreads) { + y(ind) += a * x(ind); + } + } + ... + + ctx.task(lX.read(), lY.rw())->*[&](cudaStream_t s, slice sX, slice sY) { + axpy<<<16, 128, 0, s>>>(alpha, sX, sY); + }; + +Task submission can be further simplified to rely on type deduction with +``auto``, which also makes code more generic: + +.. code:: cpp + + ctx.task(lX.read(), lY.rw())->*[&](cudaStream_t s, auto sX, auto sY) { + axpy<<<16, 128, 0, s>>>(alpha, sX, sY); + }; + +*It is important to note that the body of the task construct is executed +directly at the submission of the task, and not when the task is +actually ready for execution. As a result, the body of the task here +submits a CUDA kernel in the stream, but it is not the CUDA kernel +itself.* For example, attempting to use slices ``sX`` and ``sY`` in the +example above immediately in the lambda function would be incorrect; the +right way is to pass them to a kernel synchronized with the stream +``s``. CUDA execution semantics will ensure that by the time the kernel +runs, ``sX`` and ``sY`` will be valid. + +Example of creating and using multiple tasks +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Often, complex algorithms involve multiple processing stages, each with +its own inputs and outputs. In CUDASTF it suffices to express computing +stages in a sequential manner along with their data dependencies. +CUDASTF will ensure optimal parallel execution without requiring the +user code to explicitly define a dependency graph. Consider the +following example consisting of four tasks, of which three run on GPUs: + +.. 
code:: cpp + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + // Task 1 + ctx.task(lX.read(), lY.read())->*[](cudaStream_t stream, auto sX, auto sY) { + K1<<<..., stream>>>(sX, sY); + K2<<<..., stream>>>(sX, sY);: + }; + + // Task 2 + ctx.task(lX.rw())->*[](cudaStream_t stream, auto sX) { + K3<<<..., stream>>>(sX); + }; + + // Task 3 + ctx.task(lY.rw())->*[](cudaStream_t stream, auto sY) { + K4<<<..., stream>>>(sY); + }; + + // Task 4 + ctx.host_launch(lX.read(), lY.read())->*[](auto sX, auto sY) { + callback(sX, sY); + }; + +Tasks ``T2`` and ``T3`` depend on ``T1`` because they respectively +modify ``X`` and ``Y``, which were accessed in read-only mode by ``T1``. +Task ``T4``, executed on the host, reads both ``X`` and ``Y``, and +therefore needs to wait for the completion of ``T2`` and ``T3``. Note +that Task ``T1`` submits multiple CUDA kernels in the same CUDA stream. +This illustrates how a task in CUDASTF encapsulates a piece of work that +is asynchronous with respect to CUDA stream semantics. + +The resulting task graph under the STF programming model is shown below. + +.. image:: stf/images/task-sequence-user.png + +In full detail, the resulting graph of asynchronous operations includes +additional data allocations of ``X`` and ``Y`` on the current device, as +well as copies to and from the device. These automated steps highlight +how CUDASTF alleviates much of the complexity associated with using +multiple processing units, allowing programmers to focus on algorithmic +matters instead. + +.. image:: stf/images/task-sequence.png + +Lower-level task API +^^^^^^^^^^^^^^^^^^^^ + +.. toctree:: + :maxdepth: 1 + + stf/lower_level_api + +A lower-level API that does not rely on lambda functions is also +available, and is described `here `. + +Synchronization +--------------- + +It is important to note that each task body (passed to the context via +``operator->*()``) is executed immediately and is used to *submit work +asynchronously* with respect to the synchronization semantic of the CUDA +stream. CUDASTF ensures that any operation enqueued in the stream +attached to the task within task body may access the specified data in a +coherently, with respect to the requested access modes. + +.. code:: cpp + + ctx.submit(); + // Unrelated CPU-based code might go here... + ctx.finalize(); + +Due to the asynchronous nature of task parallelism, it is necessary to +ensure that all operations are properly scheduled and executed. As +CUDASTF transparently handles data management (allocations, transfers, +…), there can be outstanding asynchronous operations that were not +submitted explicitly by the user. Therefore it is not sufficient to use +native CUDA synchronization operations because they are not aware of +CUDASTF’s state. Client code must call ``ctx.finalize()`` instead of +``cudaStreamSynchronize()`` or ``cudaDeviceSynchronize()``. + +- ``ctx.submit()`` initiates the submission of all asynchronous tasks + within the sequence +- ``ctx.finalize()`` awaits the conclusion of all outstanding + asynchronous operations in the context, automatically invoking + ``ctx.submit()`` if not previously called by user code + +Usually, creating the task and invoking ``ctx.finalize()`` is +sufficient. However, manually calling ``ctx.submit()`` can be beneficial +in at least two situations. First, it allows for executing additional +unrelated work on the CPU (or another GPU) between submission and +synchronization. 
Second, when it’s necessary for two contexts to run +concurrently, using the sequence +``ctx1.submit(); ctx2.submit(); ctx1.finalize(); ctx2.finalize();`` +achieves this goal (whereas calling +``ctx1.finalize(); ctx2.finalize();`` without the ``submit()`` calls +would wait for the completion of the first task before starting the +second). + +To wait for the completion of all pending operations (tasks, transfers, ...), +an asynchronous fence mechanism is available : + +.. code:: cpp + + cudaStream_t stream = ctx.task_fence(); + cudaStreamSynchronize(stream); + +Places +------ + +To assist users with managing data and execution affinity, CUDASTF +provides the notion of *place*. Places can represent either *execution +places*, which determine where code is executed, or *data places*, +specifying the location of data across the machine’s non-uniform memory. +One of CUDASTF’s goals is to ensure efficient data placement in line +with the execution place by default, while also providing users the +option to easily customize placement if necessary. Execution places +allow users to express where computation occurs without directly +engaging with the underlying CUDA APIs or dealing with the complex +synchronization that emerges from combining various execution places +asynchronously. + +Execution places +^^^^^^^^^^^^^^^^ + +A task’s constructor allows choosing an execution place. The example +below creates a logical data variable that describes an integer as a +vector of one ``int``. The logical data variable is then updated on +device ``0`` and on device ``1`` before being accessed again from the +host. + +The first argument passed to ``ctx.task`` is called an *execution place* +and tells CUDASTF where the task is expected to execute. +``exec_place::device(id)`` means that the task will run on device +``id``, and ``exec_place::host`` specifies that the task will execute on +the host. + +Regardless of the *execution place*, it is important to note that the +task’s body (i.e., the contents of the lambda function) corresponds to +CPU code that is expected to launch computation asynchronously. When +using ``exec_place::device(id)``, CUDASTF will automatically set the +current CUDA device to ``id`` when the task is started, and restore the +previous current device when the task ends. ``exec_place::host`` does +not affect the current CUDA device. + +.. code:: cpp + + context ctx; + + int X = 42; + + auto lX = ctx.logical_data(slice(&X, { 1 })); + + ctx.task(exec_place::device(0), lX.rw())->*[](cudaStream_t stream, auto sX) { + inc_kernel<<<1, 1, 0, stream>>>(sX); + }; + + ctx.task(exec_place::device(1), lX.rw())->*[](cudaStream_t stream, auto sX) { + inc_kernel<<<1, 1, 0, stream>>>(sX); + }; + + ctx.task(exec_place::host, lX.read())->*[](cudaStream_t stream, auto sX) { + cudaStreamSynchronize(stream); + assert(sX(0) == 44); + }; + + ctx.finalize(); + +Tasks submitted on the host are also executed immediately upon task +creation and not when dependencies are ready. Asynchronous semantics are +observed in accordance to CUDA serialization on the ``cudaStream_t`` +lambda parameter. Therefore, the code shown synchronizes explicitly with +the CUDA stream by calling ``cudaStreamSynchronize(stream)``. This +ensures the value ``sX`` is read only after data is guaranteed to be +valid, i.e., after the completion of prior operations in the stream. +This is disallowed in the ``graph_ctx`` backend. + +An alternative solution which is compatible with all types of backend is +to use ``ctx.host_launch``: + +.. 
code:: cpp + + ctx.host_launch(lX.read())->*[](auto sX) { + assert(sX(0) == 44); + }; + +The ``ctx.host_launch`` member function circumvents synchronization of +the CPU thread with CUDA execution by invoking the lambda function as a +CUDA callback, thereby maintaining optimal asynchronous semantics for +the entire workload. Since no explicit synchronization with the +underlying CUDA stream is needed, ``ctx.host_launch`` is thus compatible +with the CUDA graph backend (i.e., a context of type ``graph_ctx``). + +Data places +^^^^^^^^^^^ + +By default, logical data is associated with the device where it is +currently processed. A task launched on a device should therefore have +its data loaded into the global memory of that device, whereas a task +executed on the host would access data in host memory (RAM). These are +defined as the *affine* data places of an execution place. + +In the example below, data places are not specified for the two tasks +created. Consequently, the affine data places will be chosen for the two +tasks: the memory of device ``0`` for the first task and the host RAM +for the second task. + +.. code:: cpp + + ctx.task(exec_place::device(0), lA.rw())->*[](cudaStream_t s, auto a) { + ... + }; + + ctx.task(exec_place::host, lA.rw())->*[](cudaStream_t s, auto a) { + ... + }; + +The code above is equivalent with: + +.. code:: cpp + + ctx.task(exec_place::device(0), lA.rw(data_place::affine))->*[](cudaStream_t s, auto a) { + ... + }; + + ctx.task(exec_place::device(0), lA.rw(data_place::affine))->*[](cudaStream_t s, auto a) { + ... + }; + +The affinity can also be made explicit: + +.. code:: cpp + + ctx.task(exec_place::device(0), lA.rw(data_place::device(0)))->*[](cudaStream_t s, auto a) { + ... + }; + + ctx.task(exec_place::device(0), lA.rw(data_place::host))->*[](cudaStream_t s, auto a) { + ... + }; + +CUDASTF also allows to localize data and execution on different places. +The example below ensures that an instance of logical data ``A`` located +in host memory is passed to the task so that it can be accessed from +device ``0``: + +.. code:: cpp + + ctx.task(exec_place::device(0), lA.rw(data_place::host))->*[](cudaStream_t s, auto a) { + ... + }; + +Overriding affinity can be advantageous when a task is known to make +only sparse accesses to a piece of logical data. By overriding affinity, +transferring large amounts of data is avoided; the paging system of CUDA +`Unified +Memory `__ +will automatically fault in the portions of the data actually used. +Conversely, we can launch a task on the host that accesses data located +on a device: + +.. code:: cpp + + ctx.task(exec_place::host, lA.rw(data_place::device(0)))->*[](cudaStream_t s, auto a) { + ... + }; + +Alternatively, assuming there are at least two devices available, in +unified memory it is possible to access the memory of one device from +another: + +.. code:: cpp + + ctx.task(exec_place::device(0), lA.rw(data_place::device(1)))->*[](cudaStream_t s, auto a) { + ... + }; + +Non-affine data placement therefore provides flexibility and can be used +to improve performance or to address memory capacity issues when +accessing large data sets. They however assume that the system can +perform such accesses, which may depend on the hardware (NVLINK, UVM, …) +and the OS (WSL has limited support and lower performance when accessing +host memory from CUDA kernels, for example). + +Grid of places +^^^^^^^^^^^^^^ + +CUDASTF also makes it possible to manipulate places which are a +collection of multiple places. 
In particular, it is possible to define an execution place that
+corresponds to multiple device execution places.
+
+Creating grids of places
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Grids of execution places are described with the ``exec_place_grid``
+class. This class is parameterized by two template parameters: a scalar
+execution place type which represents the type of each individual
+element, and a partitioning class which defines how data and indices are
+spread across the different places of the grid.
+
+The scalar execution place can for example be ``exec_place_device``
+if all entries are devices, or it can be the base ``exec_place`` class
+if the type of the places is not homogeneous in the grid, or if the type
+is not known statically, for example.
+
+It is possible to generate a 1D grid from a vector of places:
+
+.. code:: c++
+
+    exec_place exec_place::grid(std::vector<exec_place> places);
+
+For example, this is used to implement the ``exec_place::all_devices()``
+helper which creates a grid of all devices.
+
+.. code:: c++
+
+    inline exec_place_grid exec_place::all_devices() {
+        int ndevs;
+        cuda_safe_call(cudaGetDeviceCount(&ndevs));
+
+        std::vector<exec_place> devices;
+        devices.reserve(ndevs);
+        for (int d = 0; d < ndevs; d++) {
+            devices.push_back(exec_place::device(d));
+        }
+
+        return exec_place::grid(std::move(devices));
+    }
+
+The default partitioner class associated with
+``exec_place::all_devices()`` is ``null_partition``, which means that no
+partitioning operator is defined if none is provided.
+
+It is possible to retrieve the total number of elements in a grid using
+the ``size_t size()`` method. For ``exec_place::all_devices()``, this
+corresponds to the total number of devices.
+
+Shaped grids
+~~~~~~~~~~~~
+
+To fit the needs of applications, grids of places need not be 1D
+arrays, and can be structured as a multi-dimensional grid described with
+a ``dim4`` class. There is indeed another constructor which takes such a
+``dim4`` parameter:
+
+.. code:: c++
+
+    exec_place::grid(std::vector<exec_place> places, dim4 dims);
+
+Note that the total size of ``dims`` must match the size of the vector
+of places.
+
+It is possible to query the *shape* of the grid using the following
+methods:
+
+- ``dim4 get_dims()`` returns the shape of the grid
+- ``int get_dim(int axis_id)`` returns the number of elements along
+  direction ``axis_id``
+
+Given an ``exec_place_grid``, it is also possible to create a new grid
+with a different shape using the ``reshape`` member of the
+``exec_place_grid``. In this example, a grid of 8 devices is reshaped
+into a cube of size 2.
+
+.. code:: c++
+
+    // This assumes places.size() == 8
+    auto places = exec_place::all_devices();
+    auto places_reshaped = places.reshape(dim4(2, 2, 2));
+
+Partitioning policies
+~~~~~~~~~~~~~~~~~~~~~
+
+Partitioning policies make it possible to express how data are
+dispatched over the different places of a grid, or how the index space
+of a ``parallel_for`` will be scattered across these places (a usage
+sketch is given after the list below).
+
+.. code:: c++
+
+    class MyPartition : public partitioner_base {
+    public:
+        template <typename S_out, typename S_in>
+        static const S_out apply(const S_in& in, pos4 position, dim4 grid_dims);
+
+        pos4 get_executor(pos4 data_coords, dim4 data_dims, dim4 grid_dims);
+    };
+
+A partitioning class must implement an ``apply`` method which takes:
+
+- a reference to a shape of type ``S_in``
+- a position within a grid of execution places, described using an
+  object of type ``pos4``
+- the dimensions of this grid, expressed as a ``dim4`` object.
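+
+As a usage sketch (assuming a context ``ctx``, a 1D logical data ``lX``,
+the predefined ``blocked_partition`` policy described below, and the
+``[execution place], [partitioner], shape`` argument order shown in the
+reference card at the end of this document), a partitioner is passed to
+``parallel_for`` together with a grid of places so that the index space
+is split across the devices:
+
+.. code:: cpp
+
+    // Sketch: dispatch the 1D shape of lX over all devices, each device
+    // receiving one contiguous block of indices (blocked layout).
+    auto all_devs = exec_place::all_devices();
+
+    ctx.parallel_for(all_devs, blocked_partition(), lX.shape(), lX.rw())
+        ->*[] __device__(size_t i, auto sX) {
+        sX(i) *= 2.0;
+    };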
+
+``apply`` returns a shape which corresponds to the subset of the ``in``
+shape associated with this entry of the grid. Note that the output shape
+type ``S_out`` may be different from the ``S_in`` type of the input
+shape.
+
+To support different types of shapes, appropriate overloads of the
+``apply`` method should be implemented.
+
+This ``apply`` method is typically used by the ``parallel_for``
+construct in order to dispatch indices over the different places.
+
+A partitioning class must also implement the ``get_executor`` virtual
+method, which allows CUDASTF to use localized data allocators. This
+method indicates, for each entry of a shape, on which place this entry
+should *preferably* be allocated.
+
+``get_executor`` returns a ``pos4`` coordinate in the execution place
+grid, and its arguments are:
+
+- a coordinate within the shape, described as a ``pos4`` object
+- the dimensions of the shape, expressed as a ``dim4`` object
+- the dimensions of the execution place grid, expressed as a ``dim4`` object
+
+Defining ``get_executor`` makes it possible to map a piece of data
+over an execution place grid. The ``get_executor`` method of the
+partitioning policy attached to an execution place grid therefore
+defines the *affine data place* of a logical data accessed on that grid.
+
+Predefined partitioning policies
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+There are currently two policies readily available in CUDASTF:
+
+- ``tiled_partition`` dispatches entries of a shape using a *tiled*
+  layout. For multi-dimensional shapes, the outermost dimension is
+  dispatched into contiguous tiles of size ``TILE_SIZE``.
+- ``blocked_partition`` dispatches entries of the shape using a *blocked*
+  layout, where each entry of the grid of places receives approximately
+  the same contiguous portion of the shape, dispatched along the
+  outermost dimension.
+
+This illustrates how a 2D shape is dispatched over 3 places using the
+blocked layout:
+
+.. code:: text
+
+    __________________________________
+   |           |           |         |
+   |           |           |         |
+   |           |           |         |
+   |    P 0    |    P 1    |   P 2   |
+   |           |           |         |
+   |           |           |         |
+   |___________|___________|_________|
+
+This illustrates how a 2D shape is dispatched over 3 places using a
+tiled layout, where the dimension of the tiles is indicated by the
+``TILE_SIZE`` parameter:
+
+.. code:: text
+
+    ________________________________
+   |     |     |     |     |     |  |
+   |     |     |     |     |     |  |
+   |     |     |     |     |     |  |
+   | P 0 | P 1 | P 2 | P 0 | P 1 |P2|
+   |     |     |     |     |     |  |
+   |     |     |     |     |     |  |
+   |_____|_____|_____|_____|_____|__|
+
+.. _parallel_for_construct:
+
+``parallel_for`` construct
+--------------------------
+
+CUDASTF provides a helper construct which creates CUDA kernels (or CPU
+kernels) that execute an operation over an index space described as a
+*shape*.
+
+Example with a 1-dimensional array
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The example below illustrates processing a 1D array using
+``parallel_for``:
+
+.. code:: cpp
+
+    int A[128];
+    auto lA = ctx.logical_data(A);
+
+    ctx.parallel_for(exec_place::device(1), lA.shape(), lA.write())->*[] __device__ (size_t i, auto sA) {
+        sA(i) = 2 * i + 1;
+    };
+
+The ``parallel_for`` construct consists of four main elements:
+
+- an execution place that indicates where the code will be executed;
+- a shape defining the index space of the generated kernel;
+- a set of data dependencies;
+- a body of code specified using the ``->*`` operator.
+
+In the example above, the kernel is launched on the CUDA device with
+index 1, which corresponds to the second installed GPU.
Each logical +data object has a corresponding *data shape*, which can be accessed +through the ``shape()`` member function of the ``logical_data`` +parametrized type. (The shape of logical data can be thought of as full +information about the layout, without the actual data.) In this example, +``lA`` is the logical data associated with a 1D slice of size 128, which +naturally entails iteration over indices in a 1D dimension ranging from +0 to 127 (inclusive). The library associates the iteration strategy with +the data shape. The statement modifies ``lA``, so the lambda function +will be executed only when the data is ready. The lambda function has +the ``__device__`` attribute because a device execution place was +specified. The first parameter corresponds to the index within the shape +(``size_t i`` for a 1D shape). Subsequent parameters are the data +instances associated with the logical data arguments (e.g., +``slice sA``). + +Example with multi-dimensional arrays +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For multidimensional data shapes, iteration (and consequently the lambda +function) requires additional parameters. Consider an example that uses +``parallel_for`` to iterate over 2D arrays: + +.. code:: cpp + + const size_t N = 16; + double X[2 * N * 2 * N]; + double Y[N * N]; + + auto lx = ctx.logical_data(make_slice(&X[0], std::tuple{ 2 * N, 2 * N }, 2 * N)); + auto ly = ctx.logical_data(make_slice(&Y[0], std::tuple{ N, N }, N)); + + ctx.parallel_for(lx.shape(), lx.write())->*[=] __device__(size_t i, size_t j, auto sx) { sx(i, j) = 0.1; }; + + ctx.parallel_for(ly.shape(), lx.read(), ly.write())->*[=] __device__(size_t i, size_t j, auto sx, auto sy) { + sy(i, j) = y0(i, j); + for (size_t ii = 0; ii < 2; ii++) + for (size_t jj = 0; jj < 2; jj++) { + sy(i, j) += sx(2 * i + ii, 2 * j + jj); + } + }; + +Variables ``lx`` and ``ly`` are logical data objects that describe 2D +arrays, so their shapes are 2D index spaces as well. Consequently, a +``parallel_for`` construct applied to ``lx.shape()`` is passed two +indices, ``size_t i`` and ``size_t j``. In the second call of +``parallel_for``, two logical data objects with different shapes are +accessed within the same construct. Generally, ``parallel_for`` can +iterate any number of objects in lockstep, regardless of their +individual shapes. + +Passing a lambda with a signature that starts with a number of +``size_t`` parameters that does not match the dimensionality of the +shape will result in a compilation error. + +Box shape +^^^^^^^^^ + +There are situations where the desired index space does not correspond +to the shape of a logical data object. For those cases, CUDASTF also +provides the template class ``box`` (located in +the header ``cudastf/utility/dimensions.h``) that allows user code to +define multidimensional shapes with explicit bounds. The template +parameter represents the dimension of the shape. + +Box shapes with extents +^^^^^^^^^^^^^^^^^^^^^^^ + +Passing a shape object defined as ``box<2>({2, 3})`` to ``parallel_for`` +will correspond to a 2-dimensional iteration where the first index +varies from 0 through 1 and the second from 0 through 2. Consider: + +.. 
code:: cpp + + ctx.parallel_for(box<2>({2, 3}))->*[] __device__(size_t i, size_t j) { + printf("%ld, %ld\n", i, j); + }; + +The code above will print (in an unspecified order): + +:: + + 0, 0 + 1, 0 + 0, 1 + 1, 1 + 0, 2 + 1, 2 + +Since the ``box`` default template parameter is 1, it is also possible +to write code to iterate over all values of ``i`` from 0 through 3: + +.. code:: cpp + + ctx.parallel_for(box({4}))->*[] __device__(size_t i) { + printf("%ld\n", i); + }; + +Box shapes with lower and upper bounds +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Box shapes can be defined based on their lower and upper bounds. The +lower bounds are inclusive, while the upper bounds are exclusive. +Consider an example similar to the previous one: + +.. code:: cpp + + ctx.parallel_for(box<2>({{5, 8}, {2, 4}}))->*[] __device__(size_t i, size_t j) { + printf("%ld, %ld\n", i, j); + }; + +It will output (in an unspecified order): + +:: + + 5, 2 + 6, 2 + 7, 2 + 5, 3 + 6, 3 + 7, 3 + +Defining custom shapes (advanced) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Users typically map the ``parallel_for`` construct over the shape of a +logical data, or over a box shape describing a regular multidimensional +domain, but it is possible to define new types of shapes to describe an +index space. + +To define a new type of shape ``S`` (where ``S`` typically has a form of +``shape_of``) that can be used as an indexable shape for +``parallel_for``, ``shape_of`` must define inner type ``coords_t`` +and member function ``index_to_coords`` as follows: + +.. code:: c++ + + template + class shape_of { + ... + public: + using coords_t = ...; + + // This transforms a 1D index into a coordinate + __device__ __host__ coords_t index_to_coords(size_t index) const { + ... + } + }; + +The dimensionality of this ``coord_t`` tuple type determines the number +of arguments passed to the lambda function in ``parallel_for``. + +.. _launch_construct: + +``launch`` construct +-------------------- + +The ``ctx.launch`` primitive in CUDASTF is a kernel-launch mechanism +that handles the mapping and launching of a single kernel onto execution +places implicitly. + +Syntax: + +.. code-block:: cpp + + ctx.launch([thread hierarchy spec], [execution place], logicalData1.accessMode(), logicalData2.accessMode()) + ->*[capture list] __device__ (auto th_spec, auto data1, auto data2 ...) { + // Kernel implementation + }; + +.. _example-with-a-1-dimensional-array-1: + +Example with a 1-dimensional array +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The example below illustrates processing a 1D array using ``launch``: + +.. code:: cpp + + ctx.launch(par(1024), all_devs, handle_X.read(cdp), handle_Y.rw(cdp))->*[=] __device__(thread_info t, slice x, slice y) { + size_t tid = t.thread_id(); + size_t nthreads = t.get_num_threads(); + for (size_t ind = tid; ind < N; ind += nthreads) { + y(ind) += alpha * x(ind); + } + }; + +The ``launch`` construct consists of five main elements: + +- an optional ``execution_policy`` that explicitly specifies the launch + shape. Here we specify that a group of 1024 independent threads should + execute the loop described in the body of the launch construct. +- an execution place that indicates where the code will be executed; +- a set of data dependencies; +- a body of code specified using the ``->*`` operator. +- a parameter to the kernel ``thread_info t`` for thread properties. + +In the example above, the kernel is launched on all of the available +CUDA devices. 
The lambda function has the ``__device__`` attribute +because a device execution place was specified. The first parameter +corresponds to the per thread information that the user can query. This +includes a global thread id and the total number of threads that will be +executing the kernel. Subsequent parameters are the data instances +associated with the logical data arguments (e.g., ``slice x``). + +Describing a thread hierarchy +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The thread hierarchy specification describes the structure of the parallelism of this kernel. Level sizes can be computed automatically, be a dynamic value or be specified at compile-time. +Threads in a parallel group (`par`) are executed independently. +Threads in a concurrent group (`con`) can be synchronized using the `sync()` API which issues a group-level barrier. + +.. code-block:: cpp + + con() // A single level of threads which are allowed to synchronize + par(128) // A single level of 128 threads which cannot synchronize + par<128>() // A single level with a statically defined size + +Thread are described in a hierarchical manner : we can nest multiple groups with different characteristics. + +.. code-block:: cpp + + par(128, con<256>()) // A two-level thread hierarchy with 128 independent groups of 256 synchronizable threads.. + +Within each group, we can provide additional information such memory automatically shared among group members. + +.. code-block:: cpp + + con(256, mem(64)) // A group of 256 threads sharing 64 bytes of memory + +We can also provide some affinity information if we want to ensure that group of threads is mapped on a specific level of the machine hierarchy. + +The different scopes available are : + +- `hw_scope::thread` : CUDA threads +- `hw_scope::block` : CUDA blocks +- `hw_scope::device` : CUDA device +- `hw_scope::all` : all machine + +This for example describes a thread hierarchy where the inner-most level is limited to CUDA threads (i.e. it cannot span multiple CUDA blocks). And the overall kernel can be mapped at most on a single device, but not on multiple devices). + +.. code-block:: cpp + + par(hw_scope::device | hw_scope::block, par<128>(hw_scope::thread)); + +Manipulating a thread hierarchy +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When the `ctx.launch` construct is executed, a thread hierarchy object is passed to the function which implements the kernel. This object is available on the device(s) (assuming a device execution place), and makes it possible to query the structure of the hierarchy (e.g. based on the type of the thread hierarchy object), and allows the threads in the hierarchy to interact by the means of the `sync()` primitive for groups marked as concurrent (`con`). + +As an example, let us consider that we have the `par(128, con<32>())` hierarchy. + +.. code-block:: cpp + + th.rank(); // rank of the thread within the entire hierarchy (a number between 0 and 4096) + th.size(); // size of the group within the entire hierarchy (4096) + th.rank(i); // rank of the thread within the i-th level of the hierarchy (e.g. a number between 0 and 128 for th.rank(0)) + th.size(i); // size of the group at the i-th level of the hierarchy (128) + th.static_with(i) // size of the group at the i-th level if known at compile time (constexpr value) + +We can query if the i-th level is synchronizable or not using the following API: + +.. 
code-block:: cpp + + th.is_synchronizable(i); // if i is a dynamic value + th.template is_synchronizable(); // if i is known at compile time + +If the level is synchonizable, we can call + +.. code-block:: cpp + + th.sync(i) // synchronize all threads of the i-th level + th.sync() // synchronize all threads of the top-most level (level 0) + +We can query the affinity of the i-th level with + +.. code-block:: cpp + + th.get_scope(i); // returns the scope of the i-th level + +It is possible to get the amount of memory available for each level : + +.. code-block:: cpp + + th.get_mem(i); // returns the amount of memory available at i-th level + +And we can retrieve the corresponding per-level buffer as a slice (which +CUDASTF will automatically allocate in the most appropriate level of the memory +hierarchy among shared memory, device memory or managed memory) : + +.. code-block:: cpp + + slice smem = th.template storage(1); + +The depth of the thread hierarchy corresponds to the number of nested levels in the hierarchy : + +.. code-block:: cpp + + th.depth(); // (constexpr) return the number of levels in the hierarchy + +To simplify how we navigate within hierarchies, applying the `inner()` method +returns a thread hierarchy where the top-most level was removed. The returned specification +will differ between the different threads which call `inner()`. + +.. code-block:: cpp + + auto spec = con<128>(con()); + ... + auto ti = th.inner(); + ti.size(); // size within the par() hierarchy (automatically computed value) + ti.rank(); // rank within the par() hierarchy + ... + th.inner().sync(); // synchronize threads in the same block of the second level of the hierarchy + +C++ Types of logical data and tasks +----------------------------------- + +To prevent a common class of errors, CUDASTF strives to align its +processing semantics with C++ types as closely as possible. As shown in +the various examples, the use of the ``auto`` keyword is usually +recommended to create readable code while type safety is still enforced. + +.. _logical-data-1: + +Logical data +^^^^^^^^^^^^ + +The result of calling ``ctx.logical_data()`` is an object whose type +contains information about the underlying data interface used to +manipulate the logical data object. For example, a contiguous array of +``double`` is internally represented as a ``slice`` (which is an alias +of ``std::experimental::mdspan``) so that we can use the following type: + +.. code:: cpp + + double X[16]; + logical_data> lX = ctx.logical_data(X); + +For simplicity and without losing any information, users can typically +rely on the ``auto`` keyword: + +.. code:: cpp + + double X[16]; + auto lX = ctx.logical_data(X); + +One may for example store the logical data of a ``slice`` in a C++ +class or structure in such as way: + +.. code:: cpp + + class foo { + ... + mutable logical_data> ldata; + }; + +Note the use of the ``mutable`` qualifier because a task accessing a +``const foo`` object might want to read the ``ldata`` field. Submitting a +task that use this logical data in read only mode would modify the +internal data structures of the logical data, but should probably appear +as a ``const`` operation from user’s perspective. Without this ``mutable`` +qualifier, we could not have a ``const`` qualifier on the ``f`` variable +in the following code : + +.. code:: cpp + + void func(context &ctx, const foo &f) { + ctx.task(f.ldata.read())->*[](cudaStream_t stream, auto) { + ... do work ... 
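+        // Reading f.ldata only updates CUDASTF's internal bookkeeping;
+        // the mutable qualifier is what makes this legal on a const foo&.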
+ }; + } + +Tasks +^^^^^ + +With a ``stream_ctx`` backend, ``ctx.task(lX.read(), lY.rw())`` returns +an object of type ``stream_task``, where the template arguments +``TX`` and ``TY`` are the types associated to the data interfaces in +logical data ``lX`` and ``lY``. Assuming two arrays of ``double``, which +CUDASTF internally manages as ``slice`` objects, the type of +this task will be: + +.. code:: cpp + + stream_task, slice> + +The type of the task contains information about the element type and its +modifiability — read-only access is mapped to a slice of +``const double`` as opposed to ``double``. The type information is +propagated further from the task object to the lambda invoked by means +of ``operator->*`` in such a way that type errors are detected during +compilation. + +.. code:: cpp + + double X[16], Y[16]; + logical_data> lX = ctx.logical_data(X); + logical_data> lY = ctx.logical_data(Y); + + // results in a compilation error due to the erroneous slice type + ctx.task(lX.read(), lY.rw())->*[](cudaStream_t s, slice x, slice y) { + ... + }; + +In most cases, it’s recommended to use the ``auto`` C++ keyword to +automatically obtain the correct data types: + +.. code:: cpp + + double X[16], Y[16]; + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + ctx.task(lX.read(), lY.rw())->*[](cudaStream_t s, auto x, auto y) { + ... + }; + +In the graph backend, the untyped task type equivalent to +``stream_task<>`` is ``graph_task``, and the equivalent to +``stream_task`` would be, for example, ``graph_task``. +When using the generic context type, CUDASTF would create a task of type +``unified_task``. + +Dynamically-typed tasks +^^^^^^^^^^^^^^^^^^^^^^^ + +In certain circumstances, the exact data accessed by a task (and +consequently the type of a task as discussed above) may not be available +statically. For example, updating a part of the computation domain might +require accessing the closest neighbors of that part. The neighbors are +known only dynamically, meaning that it is not possible to directly pass +task dependencies as arguments to the ``ctx.task()`` call. + +For such situations CUDASTF offers a dynamically-typed task, called +``stream_task<>`` in the ``stream_ctx`` backend, whose member function +``add_deps`` allows adding dependencies dynamically: + +.. code:: cpp + + double X[16], Y[16]; + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + stream_task<> t = ctx.task(); + t.add_deps(lX.read(), lY.rw()); + +This dynamic approach entails a loss of expressiveness. The API based on +the ``->*`` notation is only compatible with *statically-typed* tasks, +as the user-provided lambda function needs to be passed data instances +of the proper types (for example ``slice``) by CUDASTF. As a +consequence, the ``stream_task<>`` needs to be manipulated with the +`low-level API <#lower-level-api>`__. + +Combining typed and untyped tasks +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +It is possible to dynamically add dependencies to a typed task, but the +type of the task will not reflect the dynamically added dependencies. +This allows for combining the low-level API with the ``->*`` notation in +the following way: + +.. code:: cpp + + double X[16], Y[16]; + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + auto t = ctx.task(lX.read()); + t.add_deps(lY.rw()); + t->*[&](cudaStream_t s, auto x) { + slice y = t.template get>(1); + }; + +The program remains safe because accesses are checked dynamically. 
+However, any errors will be caught at runtime instead of during +compilation. + +Untyped tasks cannot be converted to typed tasks. On the other hand, +typed tasks can be converted implicitly to untyped tasks (thus losing +all the benefits of statically available types): + +.. code:: cpp + + stream_task<> t = ctx.task(lX.read()); + + +Tools +----- + +Visualizing task graphs +^^^^^^^^^^^^^^^^^^^^^^^ + +In order to visualize the task graph generated by CUDASTF, it is +possible to generate a file in the Graphviz format. This visualization +helps to better understand the application, and can be helpful to +optimize the algorithms as it sometimes allow to identify inefficient +patterns. + +Let us consider the ``examples/01-axpy.cu`` example which we compile as +usual with ``make build/examples/01-axpy``. + +.. code:: bash + + # Run the application with CUDASTF_DOT_FILE set to the filename + CUDASTF_DOT_FILE=axpy.dot build/examples/01-axpy + + # Generate the visualization from this dot file + ## PDF format + dot -Tpdf axpy.dot -o axpy.pdf + ## PNG format + dot -Tpng axpy.dot -o axpy.png + +We obtain a visualization like this, where we only see a single task +with little : + +.. image:: stf/images/dot-output-axpy.png + +To have more information, we can enhance the application with some extra +debugging information. For example, we can specify what is the name of a +logical data using the ``set_symbol`` method of the ``logical_data`` +class. As illustrated here : + +.. code:: c++ + + auto lX = ctx.logical_data(X).set_symbol("X"); + auto lY = ctx.logical_data(Y); + lY.set_symbol("Y"); + +We can also annotate tasks with symbols. Instead of writing this : + +.. code:: c++ + + ctx.task(lX.read(), lY.rw())->*[&](cudaStream_t s, auto dX, auto dY) { axpy<<<16, 128, 0, s>>>(alpha, dX, dY); }; + +We can write code like this : + +.. code:: c++ + + // Inlined notation + ctx.task(lX.read(), lY.rw()).set_symbol("axpy")->*[&](cudaStream_t s, auto dX, auto dY) { axpy<<<16, 128, 0, s>>>(alpha, dX, dY); }; + + // Explicit manipulation of the task class + auto t = ctx.task(lX.read(), lY.rw()); + t.set_symbol("axpy"); + t->*[&](cudaStream_t s, auto dX, auto dY) { axpy<<<16, 128, 0, s>>>(alpha, dX, dY); }; + +We then obtain the following output, which contains more interesting +annotations : + +.. image:: stf/images/dot-output-axpy-annotated.png + +On a more elaborated application, such as the ``examples/heat_mgpu.cu`` +example, we can easily understand the overall workflow thanks to this +visualization. + +.. code:: bash + + CUDASTF_DOT_FILE=heat.dot build/examples/heat_mgpu 1000 8 4 + dot -Tpng heat.dot -o heat.png + +.. image:: stf/images/dot-output-heat.png + +For advanced users, it is also possible to display internally generated +asynchronous operations by setting the ``CUDASTF_DOT_IGNORE_PREREQS`` +environment variable to 0. + +.. code:: c++ + + CUDASTF_DOT_IGNORE_PREREQS=0 CUDASTF_DOT_FILE=axpy-with-events.dot build/examples/01-axpy + dot -Tpng axpy-with-events.dot -o axpy-with-events.png + +.. image:: stf/images/dot-output-axpy-events.png + +It is possible to color the different tasks accordingly to the device +executing it by setting the ``CUDASTF_DOT_COLOR_BY_DEVICE`` environment +variable. + +To reduce the amount of information displayed in the graph, we can +remove the list of data associated to each task by setting the +``CUDASTF_DOT_REMOVE_DATA_DEPS`` environment variable. 
+ +Kernel tuning with ncu +^^^^^^^^^^^^^^^^^^^^^^ + +Users can analyze the performance of kernels generated using +``ctx.parallel_for`` and ``ctx.launch`` using the ``ncu`` tool. + +Naming kernels +~~~~~~~~~~~~~~ + +However, displayed kernel names would be hardly exploitable as they +would all have the same name. One possible work-around is to let ``ncu`` +rename kernels accordingly to ``NVTX`` annotations. To do so, a symbol +must be associated to the ``ctx.parallel_for`` and ``ctx.launch`` +constructs using the ``set_symbol`` method. In the following example, we +name the generated kernel “updateA” : + +.. code:: cpp + + int A[128]; + auto lA = ctx.logical_data(A); + + ctx.parallel_for(lA.shape(), lA.write()).set_symbol("updateA")->*[] __device__ (size_t i, auto sA) { + A(i) = 2*i + 1; + }; + +Example with miniWeather +~~~~~~~~~~~~~~~~~~~~~~~~ + +Kernel tuning should always be performed on optimized code : + +.. code:: bash + + make build/examples/miniweather + +The following command will analyse the performance of kernels : + +.. code:: bash + + ncu --section=ComputeWorkloadAnalysis --print-nvtx-rename=kernel --nvtx -o output build/examples/miniWeather + +Note that ``--print-nvtx-rename=kernel --nvtx`` is used to name kernels +accordingly to ``NVTX`` traces (which are enabled by the ``set_symbol`` +API). Failing to do so would results in all kernel names being +``thrust::cuda_cub::core::_kernel_agent``. To properly display renamed +kernel names, users may have to set option +``Options->Report UI->NVTX Rename Mode`` to a value equal to ``Kernel`` +or ``All``. + +Depending on machine configuration, users may also have to execute the +``ncu`` command as root or to setup their machine accordingly. This is +for example required when the following message is appears during +``ncu`` trace collection : + +:: + + ==ERROR== ERR_NVGPUCTRPERM - The user does not have permission to access NVIDIA GPU Performance Counters on the target device 0. For instructions on enabling permissions and to get more information see https://developer.nvidia.com/ERR_NVGPUCTRPERM + +The file generated by ``ncu`` can be opened using ``ncu-ui`` : + +.. code:: bash + + ncu-ui output.ncu-rep + +In this case, we can see that the kernel are named accordingly to the +symbols set in the tasks of the miniWeather examples : |image1| + +.. |image1| image:: stf/images/ncu-ui.png + +CUDASTF Reference Card +---------------------- + +This section gives a brief overview of the CUDASTF API. + +Using CUDASTF +^^^^^^^^^^^^^ + +STF is a C++ header-only library which API is defined in the `cuda::experimental::stf` namespace. + +.. code-block:: cpp + + #include + + using cuda::experimental::stf; + +Creating a Context +^^^^^^^^^^^^^^^^^^ + +Contexts store the state of the CUDASTF library and are used as an entry point for all API calls. The `finalize()` method must be called upon completion. + +.. code-block:: cpp + + context ctx; + ctx.finalize(); + +By default, this relies on the ``stream_ctx`` back-end, but it is also possible to select at runtime a ``graph_ctx`` back-end : + +.. code-block:: cpp + + context ctx = graph_ctx(); + +Or one may statically select either backends : + +.. code-block:: cpp + + stream_ctx ctx; + graph_ctx ctx; + +Logical Data +^^^^^^^^^^^^ + +Creating a Logical Data +~~~~~~~~~~~~~~~~~~~~~~~ + +Purpose: Encapsulates data structures (e.g. arrays, slices) to be shared and accessed by tasks. Logical data represents the abstraction of data in the model. + +.. 
code-block:: cpp + + // Create a logical data from an existing piece of data + auto ctx.logical_data(data view [data_place = data_place::current_device()]); + +Examples: + +- Describing an array + +.. code-block:: cpp + + int X[1024]; + auto lX = ctx.logical_data(X); + +- Describing a vector of n elements of type T located at address `addr` + +.. code-block:: cpp + + auto data_handle = ctx.logical_data(slice(addr {n})); + +- Describing a contiguous matrix of size (m, n) + +.. code-block:: cpp + + auto data_handle = ctx.logical_data(slice(addr {m, n})); + +- Describing a matrix of size (m, n) with a stride of ld elements + +.. code-block:: cpp + + auto data_handle = ctx.logical_data(slice(addr {m, n}, {ld})); + +- Create a logical data from a shape + +.. code-block:: cpp + + auto ctx.logical_data(shape); + +Examples: + +.. code-block:: cpp + + auto lX = ctx.logical_data(shape_of>(1024))); + auto lY = ctx.logical_data(lX.shape()); + +Tasks +^^^^^ + +Data Dependency +~~~~~~~~~~~~~~~ + +Purpose: Define how a logical data should be used in a task construct (and derivated constructs such as `parallel_for`, `launch`, `host_launch`). + +Syntax: + +.. code-block:: cpp + + logicalData.accessMode([data place]) + +- **Data Places**: Specify where a logical data in the data dependencies should be located: + + - `data_place::affine` (default): Locate data on the data place affine to the execution place (e.g., device memory when running on a CUDA device). + - `data_place::managed`: Use managed memory. + - `data_place::device(i)`: Put data in the memory of the i-th CUDA device (which may be different from the current device or the device of the execution place). + +- **Access Modes**: + + - `.read()`: Read-only access. + - `.write()`: Write-only access. + - `.rw()`: Read and write access. + +Task Creation +~~~~~~~~~~~~~ + +Purpose: Define computational tasks that operate on logical data. Tasks can specify data dependencies and access modes. + +Syntax: + +.. code-block:: cpp + + ctx.task([execution place] dependency1 dependency2 ...) + ->*[&](cudaStream_t stream, auto data1, auto data2 ...) { + // Task implementation using stream + }; + +- **Execution Place**: Specify where the task should be executed: + + - `exec_place::current_device()` (default): Run on current CUDA device. + - `exec_place::device(ID)`: Run on CUDA device identified by its index. + - `exec_place::host`: Run on the host (Note: this is providing a CUDA stream which should be used to submit CUDA callbacks. For example, users should typically use the `host_launch` API instead). + +Examples: + +.. code-block:: cpp + + ctx.task(lX.read(), lY.rw()) + ->*[&](cudaStream_t s, auto dX, auto dY) { + axpy<<<16, 128, 0, s>>>(alpha, dX, dY); + }; + +Host-Side Task Execution with `host_launch` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Purpose: Execute tasks on the host (CPU) while still utilizing the task and data management system. + +.. code-block:: cpp + + ctx.host_launch(logicalData1.accessMode(), logicalData2.accessMode() ...) + ->*[capture list](auto data1, auto data2 ...) { + // Host-based task implementation here + }; + +Kernel authoring +^^^^^^^^^^^^^^^^ + +In addition to user-provided CUDA kernels or CUDA libraries, CUDASTF makes it possible to author compute kernels directly on top of logical data using the `parallel_for` and `launch` mechanisms. + +`parallel_for` construct +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Purpose: Apply a kernel on each coordinate of a shape. + +Syntax: + +.. 
code-block:: cpp + + ctx.parallel_for([execution place], [partitioner], shape, logicalData1.accessMode(), logicalData2.accessMode(), ...) + ->*[capture list] __device__ (size_t index1, size_t index2, ... auto data1, auto data2 ...) { + // Kernel implementation + }; + +Examples: + +- Applying a kernel on a 1D array: + +.. code-block:: cpp + + double X[N]; + double Y[N]; + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + ctx.parallel_for(lY.shape(), lX.read(), lY.rw()) + ->*[alpha] __device__(size_t i, auto dX, auto dY) { + dY(i) += alpha * dX(i); + }; + +- Applying a kernel on a 2D matrix: + +.. code-block:: cpp + + double X[N*N]; + double Y[N]; + + auto lX = ctx.logical_data(make_slice(&X[0], std::tuple{N, N})); + auto lY = ctx.logical_data(Y); + + ctx.parallel_for(lX.shape(), lX.rw(), lY.read()) + ->*[](size_t i, size_t j, auto dX, auto dY) { + dX(i, j) += dY(i) * dY(j); + }; + +See :ref:`this section ` for more details. + +`launch` construct +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The `launch` construct makes it possible to launch structured compute kernels. Contrary to `parallel_for` which applies the same operation on every member of a shape, `launch` executes a kernel over a thread hierarchy. + +Syntax: + +.. code-block:: cpp + + template + ctx.launch([thread hierarchy spec], [execution place], logicalData1.accessMode(), logicalData2.accessMode(), ...) + ->*[capture list] __device__ (thread_hierarchy_spec_t th, auto data1, auto data2 ...) { + // Kernel implementation + }; + + +The thread hierarchy passed to the construct is defined as a nested object of the form + +.. code-block:: cpp + + {par|con}<[static_width]>([width], [scope], [mem(size)] [nested hierarchy specification]) + +When passed to the user-provided kernel implementation, this object provides different mechanisms to let threads interact or to query the structure of the parallelism + +.. code-block:: cpp + + th.rank(); // get thread rank within the entire hierarchy + th.rank(i); // get thread rank at i-th level + th.template rank(); // (constexpr) get thread rank at the i-th level at compile time + + th.size(); // get the total number of threads + th.size(i); // get the number of threads at the i-th level + th.template size(); // (constexpr) get the number of threads at the i-th level at compile time + + th.get_scope(i); // get the affinity of the i-th level + th.template storage(i); // get the local storage associated to the i-th level as a slice + + th.sync(); // issue a barrier among all threads + th.sync(i); // issue a barrier among threads of the i-th level + th.is_synchronizable(i); // check if we can call sync(i) + th.template is_synchronizable(); // (constexpr) check if we can call sync(i) at compile time + + th.depth(); // (constexpr) get the depth of the thread hierarchy + th.inner(); // get the thread hierarchy subset obtained by removing the top-most level of the hierarchy + +Examples: + +.. code-block:: cpp + + con(128); + par(); + con<128>(); + con(128, par(32)); + con(128, mem(64), con(32)); + par(hw_scope::device | hw_scope::block, par<128>(hw_scope::thread)); + +See :ref:`this section ` for more details. diff --git a/docs/cudax/stf/custom_data_interface.rst b/docs/cudax/stf/custom_data_interface.rst new file mode 100644 index 00000000000..0c84cc6c88f --- /dev/null +++ b/docs/cudax/stf/custom_data_interface.rst @@ -0,0 +1,330 @@ +.. _stf_custom_data_interface: + +CUDASTF offers an extensible API that allows users to implement their +own data interface. 
+ +Let us for example go through the different steps to implement a data +interface for a very simple simple implementation of a matrix class. + +For the sake of simplicity, we here only consider the CUDA stream +backend, but adding support for the CUDA graph backend simply require +some extra steps which use the CUDA graph API. + +Implementation of the ``matrix`` class +====================================== + +For the sake of simplicity, we consider a very simple representation of +matrix, only defined by the dimensions m and n, and by the base address +of the matrix which we assume to be contiguous. + +.. code:: c++ + + template + class matrix { + public: + matrix(size_t m, size_t n, T* base) : m(m), n(n), base(base) {} + __host__ __device__ T& operator()(size_t i, size_t j) { return base[i + j * m]; } + __host__ __device__ const T& operator()(size_t i, size_t j) const { return base[i + j * m]; } + size_t m, n; + T* base; + }; + +Defining the shape of a matrix +============================== + +The first step consists in defining what is the *shape* of a matrix. The +shape of a matrix should be a class that defines all parameters which +are the same for all data instances, ``m`` and ``n``. On the other hand, +the base address should not be part of this shape class, because each +data instance will have its own base address. + +To define what is the shape of a matrix, we need to specialize the +``cudastf::shape_of`` trait class. + +.. code:: c++ + + template + class cudastf::shape_of> { + public: + /** + * @brief The default constructor. + * + * All `shape_of` specializations must define this constructor. + */ + shape_of() = default; + + explicit shape_of(size_t m, size_t n) : m(m), n(n) {} + + /** + * @name Copies a shape. + * + * All `shape_of` specializations must define this constructor. + */ + shape_of(const shape_of&) = default; + + /** + * @brief Extracts the shape from a matrix + * + * @param M matrix to get the shape from + * + * All `shape_of` specializations must define this constructor. + */ + shape_of(const matrix& M) : shape_of>(M.m, M.n) {} + + /// Mandatory method : defined the total number of elements in the shape + size_t size() const { return m * n; } + + size_t m; + size_t n; + }; + +We here see that ``shape_of>`` contains two ``size_t`` fields +``m`` and ``n``. + +In addition, we need to define a default constructor and a copy +constructors. + +To implement the ``.shape()`` member of the ``logical_data`` class, we +need to define a constructor which takes a const reference to a matrix. + +Finally, if the ``ctx.parallel_for`` construct is needed, we must define +a ``size_t size() const`` method which computes the total number of +elements in a shape. + +Hash of a matrix +================ + +For internal needs, such as using (unordered) maps of data instances, +CUDASTF need to have specialized forms of the ``std::hash`` trait class. + +The ``()`` operator of this class should compute a unique identifier +associated to the description of the data instance. This typically means +computing a hash of the matrix sizes, and of the base address. Note that +this hash *does not* depend on the actual content of the matrix. + +In code snippet, we are using the ``cudastf::hash_combine`` helper which +updates a hash value with another value. This function is available from +the ``cudastf/utility/hash.h`` header. + +.. 
code:: c++ + + template + struct std::hash> { + std::size_t operator()(matrix const& m) const noexcept { + // Combine hashes from the base address and sizes + return cudastf::hash_all(m.m, m.n, m.base); + } + }; + +Defining a data interface +========================= + +We can now implement the actual data interface for a matrix class, which +defines the basic operations that CUDASTF need to perform on a matrix. + +The ``matrix_stream_interface`` class inherits from the +``data_interface`` class, but to implement a data interface using APIs +based on CUDA streams, ``matrix_stream_interface`` inherits from +``stream_data_interface_simple>`` which contains pure virtual +functions that need to be implemented. + +.. code:: c++ + + template + class matrix_stream_interface : public stream_data_interface_simple> { + public: + using base = stream_data_interface_simple>; + using base::shape_t; + + /// Initialize from an existing matrix + matrix_stream_interface(matrix m) : base(std::move(m)) {} + + /// Initialize from a shape of matrix + matrix_stream_interface(shape_t s) : base(s) {} + + /// Copy the content of an instance to another instance + /// + /// This implementation assumes that we have registered memory if one of the data place is the host + void stream_data_copy(const data_place& dst_memory_node, instance_id_t dst_instance_id, + const data_place& src_memory_node, instance_id_t src_instance_id, cudaStream_t stream) override { + assert(src_memory_node != dst_memory_node); + + cudaMemcpyKind kind = cudaMemcpyDeviceToDevice; + if (src_memory_node == data_place::host) { + kind = cudaMemcpyHostToDevice; + } + + if (dst_memory_node == data_place::host) { + kind = cudaMemcpyDeviceToHost; + } + + const matrix& src_instance = this->instance(src_instance_id); + const matrix& dst_instance = this->instance(dst_instance_id); + + size_t sz = src_instance.m * src_instance.n * sizeof(T); + + cuda_safe_call(cudaMemcpyAsync((void*) dst_instance.base, (void*) src_instance.base, sz, kind, stream)); + } + + /// allocate an instance on a specific data place + /// + /// setting *s to a negative value informs CUDASTF that the allocation + /// failed, and that a memory reclaiming mechanism need to be performed. 
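+    /// (CUDASTF will then try to reclaim memory from other instances before retrying.)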
+ void stream_data_allocate(backend_ctx_untyped& ctx, const data_place& memory_node, instance_id_t instance_id, ssize_t& s, + void** extra_args, cudaStream_t stream) override { + matrix& instance = this->instance(instance_id); + size_t sz = instance.m * instance.n * sizeof(T); + + T* base_ptr; + + if (memory_node == data_place::host) { + // Fallback to a synchronous method as there is no asynchronous host allocation API + cuda_safe_call(cudaStreamSynchronize(stream)); + cuda_safe_call(cudaHostAlloc(&base_ptr, sz, cudaHostAllocMapped)); + } else { + cuda_safe_call(cudaMallocAsync(&base_ptr, sz, stream)); + } + + // By filling a positive number, we notify that the allocation was succesful + *s = sz; + + instance.base = base_ptr; + } + + /// deallocate an instance + void stream_data_deallocate(backend_ctx_untyped& ctx, const data_place& memory_node, instance_id_t instance_id, void* extra_args, + cudaStream_t stream) override { + matrix& instance = this->instance(instance_id); + if (memory_node == data_place::host) { + // Fallback to a synchronous method as there is no asynchronous host deallocation API + cuda_safe_call(cudaStreamSynchronize(stream)); + cuda_safe_call(cudaFreeHost(instance.base)); + } else { + cuda_safe_call(cudaFreeAsync(instance.base, stream)); + } + } + + /// Register the host memory associated to an instance of matrix + /// + /// Note that this pin_host_memory method is not mandatory, but then it is + /// the responsability of the user to only passed memory that is already + /// registered, and the allocation method on the host must allocate + /// registered memory too. Otherwise, copy methods need to be synchronous. + bool pin_host_memory(instance_id_t instance_id) override { + matrix& instance = this->instance(instance_id); + if (!instance.base) { + return false; + } + + cuda_safe_call(pin_memory(instance.base, instance.m * instance.n * sizeof(T))); + + return true; + } + + /// Unregister memory pinned by pin_host_memory + void unpin_host_memory(instance_id_t instance_id) override { + matrix& instance = this->instance(instance_id); + unpin_memory(instance.base); + } + }; + +``matrix_stream_interface`` must meet the following requirements so that +they can be used in the CUDA stream backend : - It must provide +constructors which take either a matrix, or a shape of matrix as +arguments. - It must implement the ``stream_data_copy``, +``stream_data_allocate`` and ``stream_data_deallocate`` virtual methods, +which respectively define how to copy an instance into another instance, +how to allocate an instance, and how to deallocate an instance. - It may +implement the ``pin_host_memory`` and ``unpin_host_memory`` virtual +methods which respectively register and unregister the memory associated +to an instance allocated on the host. These two methods are not +mandatory, but it is the responsibility of the user to either only pass +and allocate registered host buffers, or to ensure that the copy method +does not require such memory pinning. Similarly, accessing an instance +located in host memory from a device typically requires to access +registered memory. + +Associating a data interface with the CUDA stream backend +========================================================= + +To ensure that we can initialize a logical data from a matrix, or from +the shape of a matrix with ``stream_ctx::logical_data``, we then need to +specialize the ``cudastf::streamed_interface_of`` trait class. 
+
+The resulting class must simply define a type named ``type`` which is
+the type of the data interface for the CUDA stream backend.
+
+.. code:: c++
+
+    template <typename T>
+    class cudastf::streamed_interface_of<matrix<T>> {
+    public:
+        using type = matrix_stream_interface<T>;
+    };
+
+Once we have defined this trait class, it is for example possible to
+initialize a logical data from a matrix, or from a matrix shape:
+
+.. code:: c++
+
+    std::vector<double> v(m * n, 0);
+    matrix<double> M(m, n, &v[0]);
+
+    // Initialize from a matrix
+    auto lM = ctx.logical_data(M);
+
+    // Initialize from a shape
+    auto lM2 = ctx.logical_data(shape_of<matrix<double>>(m, n));
+
+Example of code using the ``matrix`` data interface
+===================================================
+
+We can now use the ``matrix`` class in CUDASTF, and access it from
+tasks. In this code, we first initialize a matrix on the host, we then
+apply a task which will update its content on the current device. We
+finally check that the content is correct, by means of the
+write-back mechanism that automatically updates the reference data
+instance of a logical data when calling ``ctx.sync()``.
+
+.. code:: c++
+
+    template <typename T>
+    __global__ void kernel(matrix<T> M) {
+        int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
+        int nthreads_x = gridDim.x * blockDim.x;
+
+        int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+        int nthreads_y = gridDim.y * blockDim.y;
+
+        for (int x = tid_x; x < M.m; x += nthreads_x)
+            for (int y = tid_y; y < M.n; y += nthreads_y) {
+                M(x, y) += -x + 7 * y;
+            }
+    }
+
+    int main() {
+        stream_ctx ctx;
+
+        const size_t m = 8;
+        const size_t n = 10;
+        std::vector<double> v(m * n);
+
+        for (size_t j = 0; j < n; j++)
+            for (size_t i = 0; i < m; i++) {
+                v[i + j * m] = 17 * i + 23 * j;
+            }
+
+        matrix<double> M(m, n, &v[0]);
+
+        auto lM = ctx.logical_data(M);
+
+        // M(i,j) += -i + 7*j
+        ctx.task(lM.rw())->*[](cudaStream_t s, auto dM) { kernel<<<dim3(8, 8), dim3(8, 8), 0, s>>>(dM); };
+
+        ctx.sync();
+
+        for (size_t j = 0; j < n; j++)
+            for (size_t i = 0; i < m; i++) {
+                assert(v[i + j * m] == (17 * i + 23 * j) + (-i + 7 * j));
+            }
+    }
diff --git a/docs/cudax/stf/images/dot-output-axpy-annotated.png b/docs/cudax/stf/images/dot-output-axpy-annotated.png
new file mode 100644
index 0000000000000000000000000000000000000000..3101370b58bfb4e34088689838ba83ee56ad54b4
GIT binary patch
literal 11489
(binary image data omitted)
diff --git a/docs/cudax/stf/images/dot-output-axpy-annotated.png b/docs/cudax/stf/images/dot-output-axpy-annotated.png new file mode 100644 index 0000000000000000000000000000000000000000..3101370b58bfb4e34088689838ba83ee56ad54b4 GIT binary patch (binary PNG image data omitted)
diff --git a/docs/cudax/stf/images/dot-output-axpy-events.png b/docs/cudax/stf/images/dot-output-axpy-events.png new file mode 100644 index 0000000000000000000000000000000000000000..d2c3e25aa724039eefbe888c1caaad37f7a0c3c3 GIT binary patch (binary PNG image data omitted)
diff --git a/docs/cudax/stf/lower_level_api.rst b/docs/cudax/stf/lower_level_api.rst new file mode 100644 index 00000000000..8dda9fcef46 --- /dev/null +++ b/docs/cudax/stf/lower_level_api.rst @@ -0,0 +1,114 @@ +.. _stf_lower_level_api: +
+Lower-level API +=============== +
+In some situations, the use of
+In some situations, the use of ``operator->*()`` on the object returned
+by ``ctx.task()`` (where ``ctx`` is a stream or graph context) may not
+be suitable, for example when the number of parameters is not known
+statically. To address such situations, CUDASTF provides a lower-level
+interface for creating tasks, which is described below.
+
+.. code:: cpp
+
+    #include "cudastf/stf.h"
+    #include "cudastf/__stf/stream/stream_ctx.h"
+
+    using namespace cudastf;
+
+    template <typename T>
+    __global__ void axpy(int n, T a, T* x, T* y) {
+        int tid = blockIdx.x * blockDim.x + threadIdx.x;
+        int nthreads = gridDim.x * blockDim.x;
+
+        for (int ind = tid; ind < n; ind += nthreads) {
+            y[ind] += a * x[ind];
+        }
+    }
+
+    int main(int argc, char** argv) {
+        stream_ctx ctx;
+
+        const size_t N = 16;
+        double X[N], Y[N];
+
+        for (size_t ind = 0; ind < N; ind++) {
+            X[ind] = sin(double(ind));
+            Y[ind] = cos(double(ind));
+        }
+
+        auto lX = ctx.logical_data(X);
+        auto lY = ctx.logical_data(Y);
+
+        double alpha = 3.14;
+
+        /* Compute Y = Y + alpha X */
+        auto t = ctx.task(lX.read(), lY.rw());
+        t.start();
+        slice<double> sX = t.get<0>();
+        slice<double> sY = t.get<1>();
+        axpy<<<16, 128, 0, t.get_stream()>>>(sX.size(), alpha, sX.data_handle(), sY.data_handle());
+        t.end();
+
+        ctx.sync();
+    }
+
+The ``ctx.task()`` call returns a task object. This object provides
+access to the local description of the data associated with the task and
+a CUDA stream that can be used to submit work asynchronously. The
+beginning of the task body and its end are delimited by the ``.start()``
+and ``.end()`` calls. Failing to call either of these methods, or calling
+them more than once or in the wrong order, results in undefined behavior.
+
+Asynchrony is achieved by using the CUDA stream, which provides a
+mechanism to submit work on the execution place (here, implicitly the
+current CUDA device). CUDA ensures that all kernels synchronized with
+this CUDA stream will only be executed once all prerequisites have been
+fulfilled (e.g., preceding tasks, data transfers, etc.). In addition,
+CUDASTF performs all the necessary synchronization so that future tasks
+will be properly synchronized with the operations enqueued in the CUDA
+stream associated with this task after calling ``.start()`` and before
+calling ``.end()``.
+
+Compatibility with CUDA graphs
+==============================
+
+Similarly to the CUDA stream backend with a context of type
+``stream_ctx``, the CUDA graph backend ``graph_ctx`` also provides a
+low-level interface.
+
+.. code:: cpp
+
+    graph_ctx ctx;
+
+    double X[1024], Y[1024];
+    auto lX = ctx.logical_data(X);
+    auto lY = ctx.logical_data(Y);
+
+    for (int k = 0; k < 10; k++) {
+        graph_task t = ctx.task();
+        t.add_deps(lX.rw());
+        t.start();
+        cudaGraphNode_t n;
+        cuda_safe_call(cudaGraphAddEmptyNode(&n, t.get_graph(), nullptr, 0));
+        t.end();
+    }
+
+    graph_task t2 = ctx.task();
+    t2.add_deps(lX.read(), lY.rw());
+    t2.start();
+    cudaGraphNode_t n2;
+    cuda_safe_call(cudaGraphAddEmptyNode(&n2, t2.get_graph(), nullptr, 0));
+    t2.end();
+
+    ctx.sync();
+
+A task in the CUDA graph backend corresponds to a *child graph*
+automatically inserted into the CUDA graph associated with a ``graph_ctx``
+context. The example above creates 10 tasks that modify logical data
+``lX``, followed by a task that reads ``lX`` and modifies ``lY``. The
+code illustrates how one can add dependencies to a task by using the
+``add_deps`` method.
+
+Similarly to the CUDA stream backend, a task is delimited by a pair of
+calls to the ``start()``/``end()`` member functions.
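+
+For illustration only, the child graph owned by a task may itself contain
+several nodes. The following sketch reuses the same ``ctx``, ``lX`` and
+``lY`` as above; empty nodes stand in for real work, and the edge between
+them is created with the regular CUDA graph API
+(``cudaGraphAddDependencies``):
+
+.. code:: cpp
+
+    graph_task t3 = ctx.task();
+    t3.add_deps(lX.read(), lY.rw());
+    t3.start();
+    // Two placeholder nodes inside the child graph of this task
+    cudaGraphNode_t a, b;
+    cuda_safe_call(cudaGraphAddEmptyNode(&a, t3.get_graph(), nullptr, 0));
+    cuda_safe_call(cudaGraphAddEmptyNode(&b, t3.get_graph(), nullptr, 0));
+    // Within the child graph, make node b depend on node a
+    cuda_safe_call(cudaGraphAddDependencies(t3.get_graph(), &a, &b, 1));
+    t3.end();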
diff --git a/docs/repo.toml b/docs/repo.toml index bda9b01918a..e745a6dc14d 100644 --- a/docs/repo.toml +++ b/docs/repo.toml @@ -174,7 +174,10 @@ doxygen_predefined = [ "CUB_STATIC_ASSERT(cond,msg)=", "CUB_RUNTIME_FUNCTION", "CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED", - "CUB_IGNORE_DEPRECATED_CPP_DIALECT" + "CUB_IGNORE_DEPRECATED_CPP_DIALECT", + "CUDASTF_HOST", + "CUDASTF_DEVICE", + "CUDASTF_HOST_DEVICE" ] # make sure to use ./fetch_imgs.sh @@ -265,6 +268,9 @@ doxygen_predefined = [ "_CCCL_DIAG_SUPPRESS_ICC(x)=", "_CCCL_DIAG_SUPPRESS_MSVC(x)=", "_CCCL_DIAG_SUPPRESS_NVHPC(x)=", + "CUDASTF_HOST=", + "CUDASTF_DEVICE=", + "CUDASTF_HOST_DEVICE=", "DOXYGEN_SHOULD_SKIP_THIS", "DOXYGEN_ACTIVE", "_LIBCUDACXX_DEPRECATED_IN_CXX11", @@ -375,6 +381,21 @@ doxygen_input = [ "../../cudax/include/cuda/experimental/__launch/*.cuh", "../../cudax/include/cuda/experimental/__memory_resource/*.cuh", "../../cudax/include/cuda/experimental/__stream/*.cuh", + "../../cudax/include/cuda/experimental/stf.cuh", + "../../cudax/include/cuda/experimental/__stf/*.cuh", + "../../cudax/include/cuda/experimental/__stf/internal/*.cuh", + "../../cudax/include/cuda/experimental/__stf/utility/*.cuh", + "../../cudax/include/cuda/experimental/__stf/localization/*.cuh", + "../../cudax/include/cuda/experimental/__stf/allocators/*.cuh", + "../../cudax/include/cuda/experimental/__stf/graph/*.cuh", + "../../cudax/include/cuda/experimental/__stf/graph/internal/*.cuh", + "../../cudax/include/cuda/experimental/__stf/graph/interfaces/*.cuh", + "../../cudax/include/cuda/experimental/__stf/places/*.cuh", + "../../cudax/include/cuda/experimental/__stf/places/exec/*.cuh", + "../../cudax/include/cuda/experimental/__stf/places/exec/host/*.cuh", + "../../cudax/include/cuda/experimental/__stf/stream/*.cuh", + "../../cudax/include/cuda/experimental/__stf/stream/internal/*.cuh", + "../../cudax/include/cuda/experimental/__stf/stream/interfaces/*.cuh", ] doxygen_predefined = [ @@ -422,7 +443,7 @@ doxygen_conf_extra = """ EXTENSION_MAPPING = cuh=c++ cu=c++ EXAMPLE_RECURSIVE = NO EXAMPLE_PATTERNS = *.cu - EXCLUDE_SYMBOLS = "*detail*" "__*" "_A*" "_B*" "_C*" "_D*" "_E*" "_F*" "_G*" "_H*" "_I*" "_J*" "_K*" "_L*" "_M*" "_N*" "_O*" "_P*" "_Q*" "_R*" "_S*" "_T*" "_U*" "_V*" "_W*" "_X*" "_Y*" "_Z*" + EXCLUDE_SYMBOLS = "*detail*" "*RESERVED*" "*reserved*" "__*" "_A*" "_B*" "_C*" "_D*" "_E*" "_F*" "_G*" "_H*" "_I*" "_J*" "_K*" "_L*" "_M*" "_N*" "_O*" "_P*" "_Q*" "_R*" "_S*" "_T*" "_U*" "_V*" "_W*" "_X*" "_Y*" "_Z*" "UNITTEST" AUTOLINK_SUPPORT = YES FULL_PATH_NAMES = YES STRIP_FROM_PATH = ../../cudax diff --git a/examples/cudax_stf/CMakeLists.txt b/examples/cudax_stf/CMakeLists.txt new file mode 100644 index 00000000000..b30c459a9e7 --- /dev/null +++ b/examples/cudax_stf/CMakeLists.txt @@ -0,0 +1,54 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+project(CUDAX_SAMPLES CUDA CXX)
+
+# This example uses the CMake Package Manager (CPM) to simplify fetching CCCL from GitHub
+# For more information, see https://github.com/cpm-cmake/CPM.cmake
+include(cmake/CPM.cmake)
+
+# This will automatically clone CCCL from GitHub and make the exported cmake targets available
+CPMAddPackage(
+  NAME CCCL
+  # TODO update later !
+  GITHUB_REPOSITORY "caugonnet/cccl"
+  GIT_TAG "cudastf"
+  # The following is required to make the `CCCL::cudax` target available:
+  OPTIONS "CCCL_ENABLE_UNSTABLE ON"
+)
+
+# Default to building for the GPU on the current system
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+  set(CMAKE_CUDA_ARCHITECTURES native)
+endif()
+
+# If you're building an executable
+add_executable(simple_stf simple_stf.cu)
+
+target_link_libraries(simple_stf PUBLIC cuda)
+
+if (CMAKE_CUDA_COMPILER)
+  target_compile_options(simple_stf PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
+  target_compile_options(simple_stf PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
+endif()
+
+target_link_libraries(simple_stf PRIVATE CCCL::CCCL CCCL::cudax)
diff --git a/examples/cudax_stf/cmake/CPM.cmake b/examples/cudax_stf/cmake/CPM.cmake
new file mode 100755
index 00000000000..8269a8bf655
--- /dev/null
+++ b/examples/cudax_stf/cmake/CPM.cmake
@@ -0,0 +1,1269 @@
+# CPM.cmake - CMake's missing package manager
+# ===========================================
+# See https://github.com/cpm-cmake/CPM.cmake for usage and update instructions.
+#
+# MIT License
+# -----------
+#[[
+  Copyright (c) 2019-2023 Lars Melchior and contributors
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in all
+  copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+  SOFTWARE.
+]]
+
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+# Initialize logging prefix
+if(NOT CPM_INDENT)
+  set(CPM_INDENT
+      "CPM:"
+      CACHE INTERNAL ""
+  )
+endif()
+
+if(NOT COMMAND cpm_message)
+  function(cpm_message)
+    message(${ARGV})
+  endfunction()
+endif()
+
+set(CURRENT_CPM_VERSION 0.40.2)
+
+get_filename_component(CPM_CURRENT_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}" REALPATH)
+if(CPM_DIRECTORY)
+  if(NOT CPM_DIRECTORY STREQUAL CPM_CURRENT_DIRECTORY)
+    if(CPM_VERSION VERSION_LESS CURRENT_CPM_VERSION)
+      message(
+        AUTHOR_WARNING
+          "${CPM_INDENT} \
+A dependency is using a more recent CPM version (${CURRENT_CPM_VERSION}) than the current project (${CPM_VERSION}). \
+It is recommended to upgrade CPM to the most recent version.
\ +See https://github.com/cpm-cmake/CPM.cmake for more information." + ) + endif() + if(${CMAKE_VERSION} VERSION_LESS "3.17.0") + include(FetchContent) + endif() + return() + endif() + + get_property( + CPM_INITIALIZED GLOBAL "" + PROPERTY CPM_INITIALIZED + SET + ) + if(CPM_INITIALIZED) + return() + endif() +endif() + +if(CURRENT_CPM_VERSION MATCHES "development-version") + message( + WARNING "${CPM_INDENT} Your project is using an unstable development version of CPM.cmake. \ +Please update to a recent release if possible. \ +See https://github.com/cpm-cmake/CPM.cmake for details." + ) +endif() + +set_property(GLOBAL PROPERTY CPM_INITIALIZED true) + +macro(cpm_set_policies) + # the policy allows us to change options without caching + cmake_policy(SET CMP0077 NEW) + set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) + + # the policy allows us to change set(CACHE) without caching + if(POLICY CMP0126) + cmake_policy(SET CMP0126 NEW) + set(CMAKE_POLICY_DEFAULT_CMP0126 NEW) + endif() + + # The policy uses the download time for timestamp, instead of the timestamp in the archive. This + # allows for proper rebuilds when a projects url changes + if(POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) + set(CMAKE_POLICY_DEFAULT_CMP0135 NEW) + endif() + + # treat relative git repository paths as being relative to the parent project's remote + if(POLICY CMP0150) + cmake_policy(SET CMP0150 NEW) + set(CMAKE_POLICY_DEFAULT_CMP0150 NEW) + endif() +endmacro() +cpm_set_policies() + +option(CPM_USE_LOCAL_PACKAGES "Always try to use `find_package` to get dependencies" + $ENV{CPM_USE_LOCAL_PACKAGES} +) +option(CPM_LOCAL_PACKAGES_ONLY "Only use `find_package` to get dependencies" + $ENV{CPM_LOCAL_PACKAGES_ONLY} +) +option(CPM_DOWNLOAD_ALL "Always download dependencies from source" $ENV{CPM_DOWNLOAD_ALL}) +option(CPM_DONT_UPDATE_MODULE_PATH "Don't update the module path to allow using find_package" + $ENV{CPM_DONT_UPDATE_MODULE_PATH} +) +option(CPM_DONT_CREATE_PACKAGE_LOCK "Don't create a package lock file in the binary path" + $ENV{CPM_DONT_CREATE_PACKAGE_LOCK} +) +option(CPM_INCLUDE_ALL_IN_PACKAGE_LOCK + "Add all packages added through CPM.cmake to the package lock" + $ENV{CPM_INCLUDE_ALL_IN_PACKAGE_LOCK} +) +option(CPM_USE_NAMED_CACHE_DIRECTORIES + "Use additional directory of package name in cache on the most nested level." 
+ $ENV{CPM_USE_NAMED_CACHE_DIRECTORIES} +) + +set(CPM_VERSION + ${CURRENT_CPM_VERSION} + CACHE INTERNAL "" +) +set(CPM_DIRECTORY + ${CPM_CURRENT_DIRECTORY} + CACHE INTERNAL "" +) +set(CPM_FILE + ${CMAKE_CURRENT_LIST_FILE} + CACHE INTERNAL "" +) +set(CPM_PACKAGES + "" + CACHE INTERNAL "" +) +set(CPM_DRY_RUN + OFF + CACHE INTERNAL "Don't download or configure dependencies (for testing)" +) + +if(DEFINED ENV{CPM_SOURCE_CACHE}) + set(CPM_SOURCE_CACHE_DEFAULT $ENV{CPM_SOURCE_CACHE}) +else() + set(CPM_SOURCE_CACHE_DEFAULT OFF) +endif() + +set(CPM_SOURCE_CACHE + ${CPM_SOURCE_CACHE_DEFAULT} + CACHE PATH "Directory to download CPM dependencies" +) + +if(NOT CPM_DONT_UPDATE_MODULE_PATH) + set(CPM_MODULE_PATH + "${CMAKE_BINARY_DIR}/CPM_modules" + CACHE INTERNAL "" + ) + # remove old modules + file(REMOVE_RECURSE ${CPM_MODULE_PATH}) + file(MAKE_DIRECTORY ${CPM_MODULE_PATH}) + # locally added CPM modules should override global packages + set(CMAKE_MODULE_PATH "${CPM_MODULE_PATH};${CMAKE_MODULE_PATH}") +endif() + +if(NOT CPM_DONT_CREATE_PACKAGE_LOCK) + set(CPM_PACKAGE_LOCK_FILE + "${CMAKE_BINARY_DIR}/cpm-package-lock.cmake" + CACHE INTERNAL "" + ) + file(WRITE ${CPM_PACKAGE_LOCK_FILE} + "# CPM Package Lock\n# This file should be committed to version control\n\n" + ) +endif() + +include(FetchContent) + +# Try to infer package name from git repository uri (path or url) +function(cpm_package_name_from_git_uri URI RESULT) + if("${URI}" MATCHES "([^/:]+)/?.git/?$") + set(${RESULT} + ${CMAKE_MATCH_1} + PARENT_SCOPE + ) + else() + unset(${RESULT} PARENT_SCOPE) + endif() +endfunction() + +# Try to infer package name and version from a url +function(cpm_package_name_and_ver_from_url url outName outVer) + if(url MATCHES "[/\\?]([a-zA-Z0-9_\\.-]+)\\.(tar|tar\\.gz|tar\\.bz2|zip|ZIP)(\\?|/|$)") + # We matched an archive + set(filename "${CMAKE_MATCH_1}") + + if(filename MATCHES "([a-zA-Z0-9_\\.-]+)[_-]v?(([0-9]+\\.)*[0-9]+[a-zA-Z0-9]*)") + # We matched - (ie foo-1.2.3) + set(${outName} + "${CMAKE_MATCH_1}" + PARENT_SCOPE + ) + set(${outVer} + "${CMAKE_MATCH_2}" + PARENT_SCOPE + ) + elseif(filename MATCHES "(([0-9]+\\.)+[0-9]+[a-zA-Z0-9]*)") + # We couldn't find a name, but we found a version + # + # In many cases (which we don't handle here) the url would look something like + # `irrelevant/ACTUAL_PACKAGE_NAME/irrelevant/1.2.3.zip`. In such a case we can't possibly + # distinguish the package name from the irrelevant bits. Moreover if we try to match the + # package name from the filename, we'd get bogus at best. + unset(${outName} PARENT_SCOPE) + set(${outVer} + "${CMAKE_MATCH_1}" + PARENT_SCOPE + ) + else() + # Boldly assume that the file name is the package name. + # + # Yes, something like `irrelevant/ACTUAL_NAME/irrelevant/download.zip` will ruin our day, but + # such cases should be quite rare. No popular service does this... we think. 
+ set(${outName} + "${filename}" + PARENT_SCOPE + ) + unset(${outVer} PARENT_SCOPE) + endif() + else() + # No ideas yet what to do with non-archives + unset(${outName} PARENT_SCOPE) + unset(${outVer} PARENT_SCOPE) + endif() +endfunction() + +function(cpm_find_package NAME VERSION) + string(REPLACE " " ";" EXTRA_ARGS "${ARGN}") + find_package(${NAME} ${VERSION} ${EXTRA_ARGS} QUIET) + if(${CPM_ARGS_NAME}_FOUND) + if(DEFINED ${CPM_ARGS_NAME}_VERSION) + set(VERSION ${${CPM_ARGS_NAME}_VERSION}) + endif() + cpm_message(STATUS "${CPM_INDENT} Using local package ${CPM_ARGS_NAME}@${VERSION}") + CPMRegisterPackage(${CPM_ARGS_NAME} "${VERSION}") + set(CPM_PACKAGE_FOUND + YES + PARENT_SCOPE + ) + else() + set(CPM_PACKAGE_FOUND + NO + PARENT_SCOPE + ) + endif() +endfunction() + +# Create a custom FindXXX.cmake module for a CPM package This prevents `find_package(NAME)` from +# finding the system library +function(cpm_create_module_file Name) + if(NOT CPM_DONT_UPDATE_MODULE_PATH) + # erase any previous modules + file(WRITE ${CPM_MODULE_PATH}/Find${Name}.cmake + "include(\"${CPM_FILE}\")\n${ARGN}\nset(${Name}_FOUND TRUE)" + ) + endif() +endfunction() + +# Find a package locally or fallback to CPMAddPackage +function(CPMFindPackage) + set(oneValueArgs NAME VERSION GIT_TAG FIND_PACKAGE_ARGUMENTS) + + cmake_parse_arguments(CPM_ARGS "" "${oneValueArgs}" "" ${ARGN}) + + if(NOT DEFINED CPM_ARGS_VERSION) + if(DEFINED CPM_ARGS_GIT_TAG) + cpm_get_version_from_git_tag("${CPM_ARGS_GIT_TAG}" CPM_ARGS_VERSION) + endif() + endif() + + set(downloadPackage ${CPM_DOWNLOAD_ALL}) + if(DEFINED CPM_DOWNLOAD_${CPM_ARGS_NAME}) + set(downloadPackage ${CPM_DOWNLOAD_${CPM_ARGS_NAME}}) + elseif(DEFINED ENV{CPM_DOWNLOAD_${CPM_ARGS_NAME}}) + set(downloadPackage $ENV{CPM_DOWNLOAD_${CPM_ARGS_NAME}}) + endif() + if(downloadPackage) + CPMAddPackage(${ARGN}) + cpm_export_variables(${CPM_ARGS_NAME}) + return() + endif() + + cpm_find_package(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}" ${CPM_ARGS_FIND_PACKAGE_ARGUMENTS}) + + if(NOT CPM_PACKAGE_FOUND) + CPMAddPackage(${ARGN}) + cpm_export_variables(${CPM_ARGS_NAME}) + endif() + +endfunction() + +# checks if a package has been added before +function(cpm_check_if_package_already_added CPM_ARGS_NAME CPM_ARGS_VERSION) + if("${CPM_ARGS_NAME}" IN_LIST CPM_PACKAGES) + CPMGetPackageVersion(${CPM_ARGS_NAME} CPM_PACKAGE_VERSION) + if("${CPM_PACKAGE_VERSION}" VERSION_LESS "${CPM_ARGS_VERSION}") + message( + WARNING + "${CPM_INDENT} Requires a newer version of ${CPM_ARGS_NAME} (${CPM_ARGS_VERSION}) than currently included (${CPM_PACKAGE_VERSION})." + ) + endif() + cpm_get_fetch_properties(${CPM_ARGS_NAME}) + set(${CPM_ARGS_NAME}_ADDED NO) + set(CPM_PACKAGE_ALREADY_ADDED + YES + PARENT_SCOPE + ) + cpm_export_variables(${CPM_ARGS_NAME}) + else() + set(CPM_PACKAGE_ALREADY_ADDED + NO + PARENT_SCOPE + ) + endif() +endfunction() + +# Parse the argument of CPMAddPackage in case a single one was provided and convert it to a list of +# arguments which can then be parsed idiomatically. 
For example gh:foo/bar@1.2.3 will be converted +# to: GITHUB_REPOSITORY;foo/bar;VERSION;1.2.3 +function(cpm_parse_add_package_single_arg arg outArgs) + # Look for a scheme + if("${arg}" MATCHES "^([a-zA-Z]+):(.+)$") + string(TOLOWER "${CMAKE_MATCH_1}" scheme) + set(uri "${CMAKE_MATCH_2}") + + # Check for CPM-specific schemes + if(scheme STREQUAL "gh") + set(out "GITHUB_REPOSITORY;${uri}") + set(packageType "git") + elseif(scheme STREQUAL "gl") + set(out "GITLAB_REPOSITORY;${uri}") + set(packageType "git") + elseif(scheme STREQUAL "bb") + set(out "BITBUCKET_REPOSITORY;${uri}") + set(packageType "git") + # A CPM-specific scheme was not found. Looks like this is a generic URL so try to determine + # type + elseif(arg MATCHES ".git/?(@|#|$)") + set(out "GIT_REPOSITORY;${arg}") + set(packageType "git") + else() + # Fall back to a URL + set(out "URL;${arg}") + set(packageType "archive") + + # We could also check for SVN since FetchContent supports it, but SVN is so rare these days. + # We just won't bother with the additional complexity it will induce in this function. SVN is + # done by multi-arg + endif() + else() + if(arg MATCHES ".git/?(@|#|$)") + set(out "GIT_REPOSITORY;${arg}") + set(packageType "git") + else() + # Give up + message(FATAL_ERROR "${CPM_INDENT} Can't determine package type of '${arg}'") + endif() + endif() + + # For all packages we interpret @... as version. Only replace the last occurrence. Thus URIs + # containing '@' can be used + string(REGEX REPLACE "@([^@]+)$" ";VERSION;\\1" out "${out}") + + # Parse the rest according to package type + if(packageType STREQUAL "git") + # For git repos we interpret #... as a tag or branch or commit hash + string(REGEX REPLACE "#([^#]+)$" ";GIT_TAG;\\1" out "${out}") + elseif(packageType STREQUAL "archive") + # For archives we interpret #... as a URL hash. + string(REGEX REPLACE "#([^#]+)$" ";URL_HASH;\\1" out "${out}") + # We don't try to parse the version if it's not provided explicitly. cpm_get_version_from_url + # should do this at a later point + else() + # We should never get here. This is an assertion and hitting it means there's a problem with the + # code above. A packageType was set, but not handled by this if-else. 
+ message(FATAL_ERROR "${CPM_INDENT} Unsupported package type '${packageType}' of '${arg}'") + endif() + + set(${outArgs} + ${out} + PARENT_SCOPE + ) +endfunction() + +# Check that the working directory for a git repo is clean +function(cpm_check_git_working_dir_is_clean repoPath gitTag isClean) + + find_package(Git REQUIRED) + + if(NOT GIT_EXECUTABLE) + # No git executable, assume directory is clean + set(${isClean} + TRUE + PARENT_SCOPE + ) + return() + endif() + + # check for uncommitted changes + execute_process( + COMMAND ${GIT_EXECUTABLE} status --porcelain + RESULT_VARIABLE resultGitStatus + OUTPUT_VARIABLE repoStatus + OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET + WORKING_DIRECTORY ${repoPath} + ) + if(resultGitStatus) + # not supposed to happen, assume clean anyway + message(WARNING "${CPM_INDENT} Calling git status on folder ${repoPath} failed") + set(${isClean} + TRUE + PARENT_SCOPE + ) + return() + endif() + + if(NOT "${repoStatus}" STREQUAL "") + set(${isClean} + FALSE + PARENT_SCOPE + ) + return() + endif() + + # check for committed changes + execute_process( + COMMAND ${GIT_EXECUTABLE} diff -s --exit-code ${gitTag} + RESULT_VARIABLE resultGitDiff + OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_QUIET + WORKING_DIRECTORY ${repoPath} + ) + + if(${resultGitDiff} EQUAL 0) + set(${isClean} + TRUE + PARENT_SCOPE + ) + else() + set(${isClean} + FALSE + PARENT_SCOPE + ) + endif() + +endfunction() + +# Add PATCH_COMMAND to CPM_ARGS_UNPARSED_ARGUMENTS. This method consumes a list of files in ARGN +# then generates a `PATCH_COMMAND` appropriate for `ExternalProject_Add()`. This command is appended +# to the parent scope's `CPM_ARGS_UNPARSED_ARGUMENTS`. +function(cpm_add_patches) + # Return if no patch files are supplied. + if(NOT ARGN) + return() + endif() + + # Find the patch program. + find_program(PATCH_EXECUTABLE patch) + if(WIN32 AND NOT PATCH_EXECUTABLE) + # The Windows git executable is distributed with patch.exe. Find the path to the executable, if + # it exists, then search `../usr/bin` and `../../usr/bin` for patch.exe. + find_package(Git QUIET) + if(GIT_EXECUTABLE) + get_filename_component(extra_search_path ${GIT_EXECUTABLE} DIRECTORY) + get_filename_component(extra_search_path_1up ${extra_search_path} DIRECTORY) + get_filename_component(extra_search_path_2up ${extra_search_path_1up} DIRECTORY) + find_program( + PATCH_EXECUTABLE patch HINTS "${extra_search_path_1up}/usr/bin" + "${extra_search_path_2up}/usr/bin" + ) + endif() + endif() + if(NOT PATCH_EXECUTABLE) + message(FATAL_ERROR "Couldn't find `patch` executable to use with PATCHES keyword.") + endif() + + # Create a temporary + set(temp_list ${CPM_ARGS_UNPARSED_ARGUMENTS}) + + # Ensure each file exists (or error out) and add it to the list. + set(first_item True) + foreach(PATCH_FILE ${ARGN}) + # Make sure the patch file exists, if we can't find it, try again in the current directory. + if(NOT EXISTS "${PATCH_FILE}") + if(NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/${PATCH_FILE}") + message(FATAL_ERROR "Couldn't find patch file: '${PATCH_FILE}'") + endif() + set(PATCH_FILE "${CMAKE_CURRENT_LIST_DIR}/${PATCH_FILE}") + endif() + + # Convert to absolute path for use with patch file command. + get_filename_component(PATCH_FILE "${PATCH_FILE}" ABSOLUTE) + + # The first patch entry must be preceded by "PATCH_COMMAND" while the following items are + # preceded by "&&". 
+ if(first_item) + set(first_item False) + list(APPEND temp_list "PATCH_COMMAND") + else() + list(APPEND temp_list "&&") + endif() + # Add the patch command to the list + list(APPEND temp_list "${PATCH_EXECUTABLE}" "-p1" "<" "${PATCH_FILE}") + endforeach() + + # Move temp out into parent scope. + set(CPM_ARGS_UNPARSED_ARGUMENTS + ${temp_list} + PARENT_SCOPE + ) + +endfunction() + +# method to overwrite internal FetchContent properties, to allow using CPM.cmake to overload +# FetchContent calls. As these are internal cmake properties, this method should be used carefully +# and may need modification in future CMake versions. Source: +# https://github.com/Kitware/CMake/blob/dc3d0b5a0a7d26d43d6cfeb511e224533b5d188f/Modules/FetchContent.cmake#L1152 +function(cpm_override_fetchcontent contentName) + cmake_parse_arguments(PARSE_ARGV 1 arg "" "SOURCE_DIR;BINARY_DIR" "") + if(NOT "${arg_UNPARSED_ARGUMENTS}" STREQUAL "") + message(FATAL_ERROR "${CPM_INDENT} Unsupported arguments: ${arg_UNPARSED_ARGUMENTS}") + endif() + + string(TOLOWER ${contentName} contentNameLower) + set(prefix "_FetchContent_${contentNameLower}") + + set(propertyName "${prefix}_sourceDir") + define_property( + GLOBAL + PROPERTY ${propertyName} + BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()" + FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}" + ) + set_property(GLOBAL PROPERTY ${propertyName} "${arg_SOURCE_DIR}") + + set(propertyName "${prefix}_binaryDir") + define_property( + GLOBAL + PROPERTY ${propertyName} + BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()" + FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}" + ) + set_property(GLOBAL PROPERTY ${propertyName} "${arg_BINARY_DIR}") + + set(propertyName "${prefix}_populated") + define_property( + GLOBAL + PROPERTY ${propertyName} + BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()" + FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}" + ) + set_property(GLOBAL PROPERTY ${propertyName} TRUE) +endfunction() + +# Download and add a package from source +function(CPMAddPackage) + cpm_set_policies() + + list(LENGTH ARGN argnLength) + if(argnLength EQUAL 1) + cpm_parse_add_package_single_arg("${ARGN}" ARGN) + + # The shorthand syntax implies EXCLUDE_FROM_ALL and SYSTEM + set(ARGN "${ARGN};EXCLUDE_FROM_ALL;YES;SYSTEM;YES;") + endif() + + set(oneValueArgs + NAME + FORCE + VERSION + GIT_TAG + DOWNLOAD_ONLY + GITHUB_REPOSITORY + GITLAB_REPOSITORY + BITBUCKET_REPOSITORY + GIT_REPOSITORY + SOURCE_DIR + FIND_PACKAGE_ARGUMENTS + NO_CACHE + SYSTEM + GIT_SHALLOW + EXCLUDE_FROM_ALL + SOURCE_SUBDIR + CUSTOM_CACHE_KEY + ) + + set(multiValueArgs URL OPTIONS DOWNLOAD_COMMAND PATCHES) + + cmake_parse_arguments(CPM_ARGS "" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}") + + # Set default values for arguments + + if(NOT DEFINED CPM_ARGS_VERSION) + if(DEFINED CPM_ARGS_GIT_TAG) + cpm_get_version_from_git_tag("${CPM_ARGS_GIT_TAG}" CPM_ARGS_VERSION) + endif() + endif() + + if(CPM_ARGS_DOWNLOAD_ONLY) + set(DOWNLOAD_ONLY ${CPM_ARGS_DOWNLOAD_ONLY}) + else() + set(DOWNLOAD_ONLY NO) + endif() + + if(DEFINED CPM_ARGS_GITHUB_REPOSITORY) + set(CPM_ARGS_GIT_REPOSITORY "https://github.com/${CPM_ARGS_GITHUB_REPOSITORY}.git") + elseif(DEFINED CPM_ARGS_GITLAB_REPOSITORY) + set(CPM_ARGS_GIT_REPOSITORY "https://gitlab.com/${CPM_ARGS_GITLAB_REPOSITORY}.git") + elseif(DEFINED CPM_ARGS_BITBUCKET_REPOSITORY) + set(CPM_ARGS_GIT_REPOSITORY "https://bitbucket.org/${CPM_ARGS_BITBUCKET_REPOSITORY}.git") + 
endif() + + if(DEFINED CPM_ARGS_GIT_REPOSITORY) + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_REPOSITORY ${CPM_ARGS_GIT_REPOSITORY}) + if(NOT DEFINED CPM_ARGS_GIT_TAG) + set(CPM_ARGS_GIT_TAG v${CPM_ARGS_VERSION}) + endif() + + # If a name wasn't provided, try to infer it from the git repo + if(NOT DEFINED CPM_ARGS_NAME) + cpm_package_name_from_git_uri(${CPM_ARGS_GIT_REPOSITORY} CPM_ARGS_NAME) + endif() + endif() + + set(CPM_SKIP_FETCH FALSE) + + if(DEFINED CPM_ARGS_GIT_TAG) + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_TAG ${CPM_ARGS_GIT_TAG}) + # If GIT_SHALLOW is explicitly specified, honor the value. + if(DEFINED CPM_ARGS_GIT_SHALLOW) + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_SHALLOW ${CPM_ARGS_GIT_SHALLOW}) + endif() + endif() + + if(DEFINED CPM_ARGS_URL) + # If a name or version aren't provided, try to infer them from the URL + list(GET CPM_ARGS_URL 0 firstUrl) + cpm_package_name_and_ver_from_url(${firstUrl} nameFromUrl verFromUrl) + # If we fail to obtain name and version from the first URL, we could try other URLs if any. + # However multiple URLs are expected to be quite rare, so for now we won't bother. + + # If the caller provided their own name and version, they trump the inferred ones. + if(NOT DEFINED CPM_ARGS_NAME) + set(CPM_ARGS_NAME ${nameFromUrl}) + endif() + if(NOT DEFINED CPM_ARGS_VERSION) + set(CPM_ARGS_VERSION ${verFromUrl}) + endif() + + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS URL "${CPM_ARGS_URL}") + endif() + + # Check for required arguments + + if(NOT DEFINED CPM_ARGS_NAME) + message( + FATAL_ERROR + "${CPM_INDENT} 'NAME' was not provided and couldn't be automatically inferred for package added with arguments: '${ARGN}'" + ) + endif() + + # Check if package has been added before + cpm_check_if_package_already_added(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}") + if(CPM_PACKAGE_ALREADY_ADDED) + cpm_export_variables(${CPM_ARGS_NAME}) + return() + endif() + + # Check for manual overrides + if(NOT CPM_ARGS_FORCE AND NOT "${CPM_${CPM_ARGS_NAME}_SOURCE}" STREQUAL "") + set(PACKAGE_SOURCE ${CPM_${CPM_ARGS_NAME}_SOURCE}) + set(CPM_${CPM_ARGS_NAME}_SOURCE "") + CPMAddPackage( + NAME "${CPM_ARGS_NAME}" + SOURCE_DIR "${PACKAGE_SOURCE}" + EXCLUDE_FROM_ALL "${CPM_ARGS_EXCLUDE_FROM_ALL}" + SYSTEM "${CPM_ARGS_SYSTEM}" + PATCHES "${CPM_ARGS_PATCHES}" + OPTIONS "${CPM_ARGS_OPTIONS}" + SOURCE_SUBDIR "${CPM_ARGS_SOURCE_SUBDIR}" + DOWNLOAD_ONLY "${DOWNLOAD_ONLY}" + FORCE True + ) + cpm_export_variables(${CPM_ARGS_NAME}) + return() + endif() + + # Check for available declaration + if(NOT CPM_ARGS_FORCE AND NOT "${CPM_DECLARATION_${CPM_ARGS_NAME}}" STREQUAL "") + set(declaration ${CPM_DECLARATION_${CPM_ARGS_NAME}}) + set(CPM_DECLARATION_${CPM_ARGS_NAME} "") + CPMAddPackage(${declaration}) + cpm_export_variables(${CPM_ARGS_NAME}) + # checking again to ensure version and option compatibility + cpm_check_if_package_already_added(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}") + return() + endif() + + if(NOT CPM_ARGS_FORCE) + if(CPM_USE_LOCAL_PACKAGES OR CPM_LOCAL_PACKAGES_ONLY) + cpm_find_package(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}" ${CPM_ARGS_FIND_PACKAGE_ARGUMENTS}) + + if(CPM_PACKAGE_FOUND) + cpm_export_variables(${CPM_ARGS_NAME}) + return() + endif() + + if(CPM_LOCAL_PACKAGES_ONLY) + message( + SEND_ERROR + "${CPM_INDENT} ${CPM_ARGS_NAME} not found via find_package(${CPM_ARGS_NAME} ${CPM_ARGS_VERSION})" + ) + endif() + endif() + endif() + + CPMRegisterPackage("${CPM_ARGS_NAME}" "${CPM_ARGS_VERSION}") + + if(DEFINED CPM_ARGS_GIT_TAG) + set(PACKAGE_INFO "${CPM_ARGS_GIT_TAG}") + elseif(DEFINED 
CPM_ARGS_SOURCE_DIR) + set(PACKAGE_INFO "${CPM_ARGS_SOURCE_DIR}") + else() + set(PACKAGE_INFO "${CPM_ARGS_VERSION}") + endif() + + if(DEFINED FETCHCONTENT_BASE_DIR) + # respect user's FETCHCONTENT_BASE_DIR if set + set(CPM_FETCHCONTENT_BASE_DIR ${FETCHCONTENT_BASE_DIR}) + else() + set(CPM_FETCHCONTENT_BASE_DIR ${CMAKE_BINARY_DIR}/_deps) + endif() + + cpm_add_patches(${CPM_ARGS_PATCHES}) + + if(DEFINED CPM_ARGS_DOWNLOAD_COMMAND) + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS DOWNLOAD_COMMAND ${CPM_ARGS_DOWNLOAD_COMMAND}) + elseif(DEFINED CPM_ARGS_SOURCE_DIR) + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS SOURCE_DIR ${CPM_ARGS_SOURCE_DIR}) + if(NOT IS_ABSOLUTE ${CPM_ARGS_SOURCE_DIR}) + # Expand `CPM_ARGS_SOURCE_DIR` relative path. This is important because EXISTS doesn't work + # for relative paths. + get_filename_component( + source_directory ${CPM_ARGS_SOURCE_DIR} REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR} + ) + else() + set(source_directory ${CPM_ARGS_SOURCE_DIR}) + endif() + if(NOT EXISTS ${source_directory}) + string(TOLOWER ${CPM_ARGS_NAME} lower_case_name) + # remove timestamps so CMake will re-download the dependency + file(REMOVE_RECURSE "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-subbuild") + endif() + elseif(CPM_SOURCE_CACHE AND NOT CPM_ARGS_NO_CACHE) + string(TOLOWER ${CPM_ARGS_NAME} lower_case_name) + set(origin_parameters ${CPM_ARGS_UNPARSED_ARGUMENTS}) + list(SORT origin_parameters) + if(CPM_ARGS_CUSTOM_CACHE_KEY) + # Application set a custom unique directory name + set(download_directory ${CPM_SOURCE_CACHE}/${lower_case_name}/${CPM_ARGS_CUSTOM_CACHE_KEY}) + elseif(CPM_USE_NAMED_CACHE_DIRECTORIES) + string(SHA1 origin_hash "${origin_parameters};NEW_CACHE_STRUCTURE_TAG") + set(download_directory ${CPM_SOURCE_CACHE}/${lower_case_name}/${origin_hash}/${CPM_ARGS_NAME}) + else() + string(SHA1 origin_hash "${origin_parameters}") + set(download_directory ${CPM_SOURCE_CACHE}/${lower_case_name}/${origin_hash}) + endif() + # Expand `download_directory` relative path. This is important because EXISTS doesn't work for + # relative paths. + get_filename_component(download_directory ${download_directory} ABSOLUTE) + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS SOURCE_DIR ${download_directory}) + + if(CPM_SOURCE_CACHE) + file(LOCK ${download_directory}/../cmake.lock) + endif() + + if(EXISTS ${download_directory}) + if(CPM_SOURCE_CACHE) + file(LOCK ${download_directory}/../cmake.lock RELEASE) + endif() + + cpm_store_fetch_properties( + ${CPM_ARGS_NAME} "${download_directory}" + "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-build" + ) + cpm_get_fetch_properties("${CPM_ARGS_NAME}") + + if(DEFINED CPM_ARGS_GIT_TAG AND NOT (PATCH_COMMAND IN_LIST CPM_ARGS_UNPARSED_ARGUMENTS)) + # warn if cache has been changed since checkout + cpm_check_git_working_dir_is_clean(${download_directory} ${CPM_ARGS_GIT_TAG} IS_CLEAN) + if(NOT ${IS_CLEAN}) + message( + WARNING "${CPM_INDENT} Cache for ${CPM_ARGS_NAME} (${download_directory}) is dirty" + ) + endif() + endif() + + cpm_add_subdirectory( + "${CPM_ARGS_NAME}" + "${DOWNLOAD_ONLY}" + "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}" + "${${CPM_ARGS_NAME}_BINARY_DIR}" + "${CPM_ARGS_EXCLUDE_FROM_ALL}" + "${CPM_ARGS_SYSTEM}" + "${CPM_ARGS_OPTIONS}" + ) + set(PACKAGE_INFO "${PACKAGE_INFO} at ${download_directory}") + + # As the source dir is already cached/populated, we override the call to FetchContent. 
+ set(CPM_SKIP_FETCH TRUE) + cpm_override_fetchcontent( + "${lower_case_name}" SOURCE_DIR "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}" + BINARY_DIR "${${CPM_ARGS_NAME}_BINARY_DIR}" + ) + + else() + # Enable shallow clone when GIT_TAG is not a commit hash. Our guess may not be accurate, but + # it should guarantee no commit hash get mis-detected. + if(NOT DEFINED CPM_ARGS_GIT_SHALLOW) + cpm_is_git_tag_commit_hash("${CPM_ARGS_GIT_TAG}" IS_HASH) + if(NOT ${IS_HASH}) + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_SHALLOW TRUE) + endif() + endif() + + # remove timestamps so CMake will re-download the dependency + file(REMOVE_RECURSE ${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-subbuild) + set(PACKAGE_INFO "${PACKAGE_INFO} to ${download_directory}") + endif() + endif() + + cpm_create_module_file(${CPM_ARGS_NAME} "CPMAddPackage(\"${ARGN}\")") + + if(CPM_PACKAGE_LOCK_ENABLED) + if((CPM_ARGS_VERSION AND NOT CPM_ARGS_SOURCE_DIR) OR CPM_INCLUDE_ALL_IN_PACKAGE_LOCK) + cpm_add_to_package_lock(${CPM_ARGS_NAME} "${ARGN}") + elseif(CPM_ARGS_SOURCE_DIR) + cpm_add_comment_to_package_lock(${CPM_ARGS_NAME} "local directory") + else() + cpm_add_comment_to_package_lock(${CPM_ARGS_NAME} "${ARGN}") + endif() + endif() + + cpm_message( + STATUS "${CPM_INDENT} Adding package ${CPM_ARGS_NAME}@${CPM_ARGS_VERSION} (${PACKAGE_INFO})" + ) + + if(NOT CPM_SKIP_FETCH) + # CMake 3.28 added EXCLUDE, SYSTEM (3.25), and SOURCE_SUBDIR (3.18) to FetchContent_Declare. + # Calling FetchContent_MakeAvailable will then internally forward these options to + # add_subdirectory. Up until these changes, we had to call FetchContent_Populate and + # add_subdirectory separately, which is no longer necessary and has been deprecated as of 3.30. + set(fetchContentDeclareExtraArgs "") + if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.28.0") + if(${CPM_ARGS_EXCLUDE_FROM_ALL}) + list(APPEND fetchContentDeclareExtraArgs EXCLUDE_FROM_ALL) + endif() + if(${CPM_ARGS_SYSTEM}) + list(APPEND fetchContentDeclareExtraArgs SYSTEM) + endif() + if(DEFINED CPM_ARGS_SOURCE_SUBDIR) + list(APPEND fetchContentDeclareExtraArgs SOURCE_SUBDIR ${CPM_ARGS_SOURCE_SUBDIR}) + endif() + # For CMake version <3.28 OPTIONS are parsed in cpm_add_subdirectory + if(CPM_ARGS_OPTIONS AND NOT DOWNLOAD_ONLY) + foreach(OPTION ${CPM_ARGS_OPTIONS}) + cpm_parse_option("${OPTION}") + set(${OPTION_KEY} "${OPTION_VALUE}") + endforeach() + endif() + endif() + cpm_declare_fetch( + "${CPM_ARGS_NAME}" ${fetchContentDeclareExtraArgs} "${CPM_ARGS_UNPARSED_ARGUMENTS}" + ) + + cpm_fetch_package("${CPM_ARGS_NAME}" ${DOWNLOAD_ONLY} populated ${CPM_ARGS_UNPARSED_ARGUMENTS}) + if(CPM_SOURCE_CACHE AND download_directory) + file(LOCK ${download_directory}/../cmake.lock RELEASE) + endif() + if(${populated} AND ${CMAKE_VERSION} VERSION_LESS "3.28.0") + cpm_add_subdirectory( + "${CPM_ARGS_NAME}" + "${DOWNLOAD_ONLY}" + "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}" + "${${CPM_ARGS_NAME}_BINARY_DIR}" + "${CPM_ARGS_EXCLUDE_FROM_ALL}" + "${CPM_ARGS_SYSTEM}" + "${CPM_ARGS_OPTIONS}" + ) + endif() + cpm_get_fetch_properties("${CPM_ARGS_NAME}") + endif() + + set(${CPM_ARGS_NAME}_ADDED YES) + cpm_export_variables("${CPM_ARGS_NAME}") +endfunction() + +# Fetch a previously declared package +macro(CPMGetPackage Name) + if(DEFINED "CPM_DECLARATION_${Name}") + CPMAddPackage(NAME ${Name}) + else() + message(SEND_ERROR "${CPM_INDENT} Cannot retrieve package ${Name}: no declaration available") + endif() +endmacro() + +# export variables available to the caller to the parent scope expects 
${CPM_ARGS_NAME} to be set +macro(cpm_export_variables name) + set(${name}_SOURCE_DIR + "${${name}_SOURCE_DIR}" + PARENT_SCOPE + ) + set(${name}_BINARY_DIR + "${${name}_BINARY_DIR}" + PARENT_SCOPE + ) + set(${name}_ADDED + "${${name}_ADDED}" + PARENT_SCOPE + ) + set(CPM_LAST_PACKAGE_NAME + "${name}" + PARENT_SCOPE + ) +endmacro() + +# declares a package, so that any call to CPMAddPackage for the package name will use these +# arguments instead. Previous declarations will not be overridden. +macro(CPMDeclarePackage Name) + if(NOT DEFINED "CPM_DECLARATION_${Name}") + set("CPM_DECLARATION_${Name}" "${ARGN}") + endif() +endmacro() + +function(cpm_add_to_package_lock Name) + if(NOT CPM_DONT_CREATE_PACKAGE_LOCK) + cpm_prettify_package_arguments(PRETTY_ARGN false ${ARGN}) + file(APPEND ${CPM_PACKAGE_LOCK_FILE} "# ${Name}\nCPMDeclarePackage(${Name}\n${PRETTY_ARGN})\n") + endif() +endfunction() + +function(cpm_add_comment_to_package_lock Name) + if(NOT CPM_DONT_CREATE_PACKAGE_LOCK) + cpm_prettify_package_arguments(PRETTY_ARGN true ${ARGN}) + file(APPEND ${CPM_PACKAGE_LOCK_FILE} + "# ${Name} (unversioned)\n# CPMDeclarePackage(${Name}\n${PRETTY_ARGN}#)\n" + ) + endif() +endfunction() + +# includes the package lock file if it exists and creates a target `cpm-update-package-lock` to +# update it +macro(CPMUsePackageLock file) + if(NOT CPM_DONT_CREATE_PACKAGE_LOCK) + get_filename_component(CPM_ABSOLUTE_PACKAGE_LOCK_PATH ${file} ABSOLUTE) + if(EXISTS ${CPM_ABSOLUTE_PACKAGE_LOCK_PATH}) + include(${CPM_ABSOLUTE_PACKAGE_LOCK_PATH}) + endif() + if(NOT TARGET cpm-update-package-lock) + add_custom_target( + cpm-update-package-lock COMMAND ${CMAKE_COMMAND} -E copy ${CPM_PACKAGE_LOCK_FILE} + ${CPM_ABSOLUTE_PACKAGE_LOCK_PATH} + ) + endif() + set(CPM_PACKAGE_LOCK_ENABLED true) + endif() +endmacro() + +# registers a package that has been added to CPM +function(CPMRegisterPackage PACKAGE VERSION) + list(APPEND CPM_PACKAGES ${PACKAGE}) + set(CPM_PACKAGES + ${CPM_PACKAGES} + CACHE INTERNAL "" + ) + set("CPM_PACKAGE_${PACKAGE}_VERSION" + ${VERSION} + CACHE INTERNAL "" + ) +endfunction() + +# retrieve the current version of the package to ${OUTPUT} +function(CPMGetPackageVersion PACKAGE OUTPUT) + set(${OUTPUT} + "${CPM_PACKAGE_${PACKAGE}_VERSION}" + PARENT_SCOPE + ) +endfunction() + +# declares a package in FetchContent_Declare +function(cpm_declare_fetch PACKAGE) + if(${CPM_DRY_RUN}) + cpm_message(STATUS "${CPM_INDENT} Package not declared (dry run)") + return() + endif() + + FetchContent_Declare(${PACKAGE} ${ARGN}) +endfunction() + +# returns properties for a package previously defined by cpm_declare_fetch +function(cpm_get_fetch_properties PACKAGE) + if(${CPM_DRY_RUN}) + return() + endif() + + set(${PACKAGE}_SOURCE_DIR + "${CPM_PACKAGE_${PACKAGE}_SOURCE_DIR}" + PARENT_SCOPE + ) + set(${PACKAGE}_BINARY_DIR + "${CPM_PACKAGE_${PACKAGE}_BINARY_DIR}" + PARENT_SCOPE + ) +endfunction() + +function(cpm_store_fetch_properties PACKAGE source_dir binary_dir) + if(${CPM_DRY_RUN}) + return() + endif() + + set(CPM_PACKAGE_${PACKAGE}_SOURCE_DIR + "${source_dir}" + CACHE INTERNAL "" + ) + set(CPM_PACKAGE_${PACKAGE}_BINARY_DIR + "${binary_dir}" + CACHE INTERNAL "" + ) +endfunction() + +# adds a package as a subdirectory if viable, according to provided options +function( + cpm_add_subdirectory + PACKAGE + DOWNLOAD_ONLY + SOURCE_DIR + BINARY_DIR + EXCLUDE + SYSTEM + OPTIONS +) + + if(NOT DOWNLOAD_ONLY AND EXISTS ${SOURCE_DIR}/CMakeLists.txt) + set(addSubdirectoryExtraArgs "") + if(EXCLUDE) + list(APPEND addSubdirectoryExtraArgs 
EXCLUDE_FROM_ALL) + endif() + if("${SYSTEM}" AND "${CMAKE_VERSION}" VERSION_GREATER_EQUAL "3.25") + # https://cmake.org/cmake/help/latest/prop_dir/SYSTEM.html#prop_dir:SYSTEM + list(APPEND addSubdirectoryExtraArgs SYSTEM) + endif() + if(OPTIONS) + foreach(OPTION ${OPTIONS}) + cpm_parse_option("${OPTION}") + set(${OPTION_KEY} "${OPTION_VALUE}") + endforeach() + endif() + set(CPM_OLD_INDENT "${CPM_INDENT}") + set(CPM_INDENT "${CPM_INDENT} ${PACKAGE}:") + add_subdirectory(${SOURCE_DIR} ${BINARY_DIR} ${addSubdirectoryExtraArgs}) + set(CPM_INDENT "${CPM_OLD_INDENT}") + endif() +endfunction() + +# downloads a previously declared package via FetchContent and exports the variables +# `${PACKAGE}_SOURCE_DIR` and `${PACKAGE}_BINARY_DIR` to the parent scope +function(cpm_fetch_package PACKAGE DOWNLOAD_ONLY populated) + set(${populated} + FALSE + PARENT_SCOPE + ) + if(${CPM_DRY_RUN}) + cpm_message(STATUS "${CPM_INDENT} Package ${PACKAGE} not fetched (dry run)") + return() + endif() + + FetchContent_GetProperties(${PACKAGE}) + + string(TOLOWER "${PACKAGE}" lower_case_name) + + if(NOT ${lower_case_name}_POPULATED) + if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.28.0") + if(DOWNLOAD_ONLY) + # MakeAvailable will call add_subdirectory internally which is not what we want when + # DOWNLOAD_ONLY is set. Populate will only download the dependency without adding it to the + # build + FetchContent_Populate( + ${PACKAGE} + SOURCE_DIR "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-src" + BINARY_DIR "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-build" + SUBBUILD_DIR "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-subbuild" + ${ARGN} + ) + else() + FetchContent_MakeAvailable(${PACKAGE}) + endif() + else() + FetchContent_Populate(${PACKAGE}) + endif() + set(${populated} + TRUE + PARENT_SCOPE + ) + endif() + + cpm_store_fetch_properties( + ${CPM_ARGS_NAME} ${${lower_case_name}_SOURCE_DIR} ${${lower_case_name}_BINARY_DIR} + ) + + set(${PACKAGE}_SOURCE_DIR + ${${lower_case_name}_SOURCE_DIR} + PARENT_SCOPE + ) + set(${PACKAGE}_BINARY_DIR + ${${lower_case_name}_BINARY_DIR} + PARENT_SCOPE + ) +endfunction() + +# splits a package option +function(cpm_parse_option OPTION) + string(REGEX MATCH "^[^ ]+" OPTION_KEY "${OPTION}") + string(LENGTH "${OPTION}" OPTION_LENGTH) + string(LENGTH "${OPTION_KEY}" OPTION_KEY_LENGTH) + if(OPTION_KEY_LENGTH STREQUAL OPTION_LENGTH) + # no value for key provided, assume user wants to set option to "ON" + set(OPTION_VALUE "ON") + else() + math(EXPR OPTION_KEY_LENGTH "${OPTION_KEY_LENGTH}+1") + string(SUBSTRING "${OPTION}" "${OPTION_KEY_LENGTH}" "-1" OPTION_VALUE) + endif() + set(OPTION_KEY + "${OPTION_KEY}" + PARENT_SCOPE + ) + set(OPTION_VALUE + "${OPTION_VALUE}" + PARENT_SCOPE + ) +endfunction() + +# guesses the package version from a git tag +function(cpm_get_version_from_git_tag GIT_TAG RESULT) + string(LENGTH ${GIT_TAG} length) + if(length EQUAL 40) + # GIT_TAG is probably a git hash + set(${RESULT} + 0 + PARENT_SCOPE + ) + else() + string(REGEX MATCH "v?([0123456789.]*).*" _ ${GIT_TAG}) + set(${RESULT} + ${CMAKE_MATCH_1} + PARENT_SCOPE + ) + endif() +endfunction() + +# guesses if the git tag is a commit hash or an actual tag or a branch name. +function(cpm_is_git_tag_commit_hash GIT_TAG RESULT) + string(LENGTH "${GIT_TAG}" length) + # full hash has 40 characters, and short hash has at least 7 characters. 
+ if(length LESS 7 OR length GREATER 40) + set(${RESULT} + 0 + PARENT_SCOPE + ) + else() + if(${GIT_TAG} MATCHES "^[a-fA-F0-9]+$") + set(${RESULT} + 1 + PARENT_SCOPE + ) + else() + set(${RESULT} + 0 + PARENT_SCOPE + ) + endif() + endif() +endfunction() + +function(cpm_prettify_package_arguments OUT_VAR IS_IN_COMMENT) + set(oneValueArgs + NAME + FORCE + VERSION + GIT_TAG + DOWNLOAD_ONLY + GITHUB_REPOSITORY + GITLAB_REPOSITORY + BITBUCKET_REPOSITORY + GIT_REPOSITORY + SOURCE_DIR + FIND_PACKAGE_ARGUMENTS + NO_CACHE + SYSTEM + GIT_SHALLOW + EXCLUDE_FROM_ALL + SOURCE_SUBDIR + ) + set(multiValueArgs URL OPTIONS DOWNLOAD_COMMAND) + cmake_parse_arguments(CPM_ARGS "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + foreach(oneArgName ${oneValueArgs}) + if(DEFINED CPM_ARGS_${oneArgName}) + if(${IS_IN_COMMENT}) + string(APPEND PRETTY_OUT_VAR "#") + endif() + if(${oneArgName} STREQUAL "SOURCE_DIR") + string(REPLACE ${CMAKE_SOURCE_DIR} "\${CMAKE_SOURCE_DIR}" CPM_ARGS_${oneArgName} + ${CPM_ARGS_${oneArgName}} + ) + endif() + string(APPEND PRETTY_OUT_VAR " ${oneArgName} ${CPM_ARGS_${oneArgName}}\n") + endif() + endforeach() + foreach(multiArgName ${multiValueArgs}) + if(DEFINED CPM_ARGS_${multiArgName}) + if(${IS_IN_COMMENT}) + string(APPEND PRETTY_OUT_VAR "#") + endif() + string(APPEND PRETTY_OUT_VAR " ${multiArgName}\n") + foreach(singleOption ${CPM_ARGS_${multiArgName}}) + if(${IS_IN_COMMENT}) + string(APPEND PRETTY_OUT_VAR "#") + endif() + string(APPEND PRETTY_OUT_VAR " \"${singleOption}\"\n") + endforeach() + endif() + endforeach() + + if(NOT "${CPM_ARGS_UNPARSED_ARGUMENTS}" STREQUAL "") + if(${IS_IN_COMMENT}) + string(APPEND PRETTY_OUT_VAR "#") + endif() + string(APPEND PRETTY_OUT_VAR " ") + foreach(CPM_ARGS_UNPARSED_ARGUMENT ${CPM_ARGS_UNPARSED_ARGUMENTS}) + string(APPEND PRETTY_OUT_VAR " ${CPM_ARGS_UNPARSED_ARGUMENT}") + endforeach() + string(APPEND PRETTY_OUT_VAR "\n") + endif() + + set(${OUT_VAR} + ${PRETTY_OUT_VAR} + PARENT_SCOPE + ) + +endfunction() diff --git a/examples/cudax_stf/simple_stf.cu b/examples/cudax_stf/simple_stf.cu new file mode 100644 index 00000000000..2dbbe70d00c --- /dev/null +++ b/examples/cudax_stf/simple_stf.cu @@ -0,0 +1,40 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#include + +using namespace cuda::experimental::stf; + +int main() +{ + context ctx; + + int array[128]; + for (size_t i = 0; i < 128; i++) + { + array[i] = i; + } + + auto A = ctx.logical_data(array); + + ctx.parallel_for(A.shape(), A.rw())->*[] __device__(size_t i, auto a) { + a(i) += 4; + }; + + ctx.finalize(); + + for (size_t i = 0; i < 128; i++) + { + printf("array[%ld] = %d\n", i, array[i]); + } + return 0; +}
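The ``simple_stf.cu`` example above runs a single task. As a minimal sketch of how
several tasks compose (an illustration rather than a file from this patch; the two
include directives and the extra logical data ``B`` are assumptions based on the API
used above), a second ``parallel_for`` that reads ``A`` and writes ``B`` is ordered
after the task that updates ``A``, because both tasks declare their data accesses:

    #include <cstdio>

    #include <cuda/experimental/stf.cuh> // assumed public CUDASTF header (listed in the doxygen inputs above)

    using namespace cuda::experimental::stf;

    int main()
    {
      context ctx;

      int in[128];
      int out[128];
      for (size_t i = 0; i < 128; i++)
      {
        in[i] = i;
      }

      auto A = ctx.logical_data(in);
      auto B = ctx.logical_data(out);

      // First task: read-write access on A
      ctx.parallel_for(A.shape(), A.rw())->*[] __device__(size_t i, auto a) {
        a(i) += 4;
      };

      // Second task: reads A and writes B. The read-after-write dependency on A
      // is inferred, so this task (and any needed transfers) runs after the first.
      ctx.parallel_for(B.shape(), A.read(), B.rw())->*[] __device__(size_t i, auto a, auto b) {
        b(i) = 2 * a(i);
      };

      ctx.finalize();

      printf("out[5] = %d (expected %d)\n", out[5], 2 * (5 + 4));
      return 0;
    }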

zAEPg85!kBOB$ZY>XyJ*i8`@bneyN`{`@g4eewqI+`FilE`MC+65$FG!+uKd>Q*Ke$RvCE1L#n^GS`7@&T zaoo227$$~DRNZc6c3V&%macqXvw8ORDU@9Tv%dJLRHLH>%GL5*o&o;#^Og*MU=pf$ zEMSDpGApY+yxBDJEp3s#mp=HL?fi_X+T+bEa=QSYap{9`#2v}F4tEgQd*;N>8mhzJYk?JBa$yCpL?-3=^XIc3 z&574bmNdM05p`-Af4idv=YE;?S2n9yU+P1KH0=5?9+Hy+(Ib@qUam0M~bPf&Pw?cUieo@;vx+oN#} z*+)jUK6pCC|5xI(=ruwZlUV2B4;ux{ z?Ye#evp-t?@9XN?h}4y_XV4mUK;lDgajb_oON#rw9wU#(=bFFEb;G5VO!1jT{)AneIh(ozQVq~iQQIngO4jWX@ky$5^eO@ug z*ls}kb<(=GS{KOnpI+ zpxD2m>$hcEv0MZ`5P!pga)3Za)wcl3w;g~orOfN{_qtBh zam=gLQR}VuUb~sON$;~$eqRU)4ZZc?D^WW@Sp~fHc1a33(0Z@J#^=t9dR$z*gBgKg zT{p=B3fecVme~+`Fvk7&D3dcRcvg2TB3HxGJvq1_kN z_x)B*sr=7ni|x%fv9klD=Y7~cC02jbg(Ew*WLhnc{o8nLV@v=4F33f;oY38(C9mBl zdibLw4Rvaa=CC%X*-lW4o5>@<1$t*jBBI(dL`B@(#&I~PBf1RQ{8*F-x9;@n1%h3= zZzMQc4@jFEpMEn{krTRh<;nqIHjkG7gpQZ3bi4j}PZ2mA4w&c&%~HMl&xs4#@Fsab z-QnnUOf*ekF+f5j%X7)rY2n5|Ywvz9A`%~Q@-*ijNcbX|w4J~uhVN+;H+*Uhose%Kc7b|wTaHvn%bU_WVi`O@0oWDJ>15*Cd7ICeth; z`;=h9b#>3CO%;?T1+~luIV4zhR&ys z{J8k~;NQlt-iN(9{=Q;(Bhv0$z}+fq_DZK8H)LFRpyW&=n+)zjQ?XYFGl>~>Q!y3nwYGtEIm&6WBtBSVOzqw*s^DWHay1QCJh&jrEhy|fs5l9O;@)` z`*M9$GpKRw`RgQbe`5yCTkl)>HysqdGO>NmJ-O^Z|9&VVC7_xM#zaOQW%@3|vnu77(1~Y;o`QEv_SqGXEZmZ?ARScqZ3%>i&XuwRL* zR_YGFrnDNr^M)&?SI6Z3_g{@^lhbmkw~jwoJ$=*j>ezb+vxWrJc21lA^Xr0$cO`+j zqh237y2s#&tj-Ys$NO7#Uro7iUBhCdsh8`w#Xp;l?45Yx_^Pv0?@QVAsaj+4ZL0A$ z2f5N6Cp*{s)|MxId0!aez45`2%NEb!UL*+%*D@`t{R{`o?%Q9$_I(^A; zRAlNT=l7Nw?W5Mhh*P8?I^CGa@D`+Y{`$|yiHM)OkiP3!|A_=0wY<4!!kop@ z5og6s+E8B~?bFc2Le(mMmcPp664QWbuwiCdH5%>OwG(d&W<#qO11`0)`#D|r%l~Tp z$~39##Zh{C4wtC9)2PD5pQduY4}5asZ_qdf^wGdh&p&rg?UGZLsQ<`^u4>nRQ>4#D zURx@Z%F@!61aOnR5i{gb;xjftR~tkEex$~WU&&#X&IFPTGKMPLBYkguf%Dcdz+diL z>Xl(XmH{-U3B%~h_`ig3SZ^3R*6(;^r~E4hd4n?MU+k0;6fvo{jOVPMYy3A@>b0qA z7qKa;|C&i%XY_wG=91dTIS$R2_U3kJ=vbwG+DYZFPRg{0-u(|vdm8pIbclcdt|dm% zXJ$FR|MJ-U==sPWZKtg%`K|jWf5a58h>QPqNjW=1<8e!p!iQ}J@vSCzPTaJxk8rn8 z7&)Sf3%t|sRgQ{+7^exb*|g~*e4bi0m!AzS%U)K}&oB4p#1uN;)wA*$H}}L|s~_VQ z@Q#MufIU1wECKaz5=sDw>(;vn`oQ5oUpH@QMR=(NgLH$Z)=BMQy#OCjU!UgXF!W}fW z!HY)%y&^gi(XQyHv-}R#Ef?z0_EFN&H;sfyp8i~jnt8W~^doL~75?|;&CTr{dJ0W{ zN40r((DDfJ3sVdutNL|74t#ul^w8C&rq1yd!N$i_=Iyoa8Mxxs?GEkb_J+R&XH|MB$#SWBH9AgkYLdcFW({iN^;}qdi z%qc35i0{?w%ee7dyvO^h_8X?4$WVaYiTcKdvSfRj_%=fY#4>3IL?zY$Nb)DoDNEe- zdpzHip&^FjL6HF)c;25}7Up=|8w!h3LM38|!iwl(OOEpbn&$syi$ zpk71+)gXNIn{?+S(HK_d#5FqdP)KP12$}! 
z+_PnaYcu=Kg%HfjHRKrCouT3hrbJNbZOq7LxO4VDmCxf5ev2yN|=IB?$q zCCj)%YzT6dk-tR%s}O<@k?hheROvY!hRQ^*X+3J;h2rgVrYuR$8SCe#=cF~hr2iNX z6$|;`YXxKCyYwDA)~)to#fRJDJ6u>9_@URx1Sl3Ej~^Pn3)Q{AtEeACl4u4ue}B<# z^WPs!nT457qsvQuYr3Cy=9C!d5CWr>E3>ODedQskD1J-W5Sq$i!dID}=%F(6WzP7g z>$Y)wgS9I1HIw}gZCa`oD;y#|H{KcI{J}~6z`=W?bz+Vg^f5T$XArftF(9wNbLPye zR#xuzS^Zp!BbH(}^q{uZQzgqnrd_+w$O&M?%y`uR-$qeiXv`sTbHgniZvTV4PckDD z&+#M_mnZ^q!#~?ohT^yxQ{%L;bG&(JAInYHRG-4h)Hch0>Vt-|DkeOg%{pER+-LA2 zOMSF&lYF*_8~t94nSi8JV(O?n+_VhcRl&&6~SGKGCoV+fe}iF!}Khzhk=8{Ow!GmL}V;@1M;%T^Mj7dB}U- zQs-Q0$J{RS%#5U_7Hocg)Tz9=%6hN)&ZRkStDEkN97@QZp-1XXD1XJS{QK)!s}{qy zOluuOv~+`Ebwv^fHe7OQ(jBPr?@TJE-}k{CUd-B?taoEkvJkmViuj0-D0LiEdq>XS zN2NguhhPVj{ciPm!_q^}8o&pq9yS!rT1}=3EDgHqQ_wO(OT)x@idAw)wp!wr*1r$} z=H|y;2dSzCBH*O>yInYWqpp&2NTK`nqSBBxveJi#_M9!%wZN$)jJ(P`-z*qM@v#1fLvB}F4WSA+JFfm)B>ROM z%T-7h4dsXMy%<9Rgp?9Z0APu@ANx&oa=Ue!%efyg*}8X^^38?|XJxPqV^qSc`qF1i z5MKpxj+m3eL{(_)n6^AMAYs2E-ZJFf zm^P=U8W_tPw^isqG9$q-+fTzby}J6LUfO+!ylT%Ywbl2}o80%z%M15>V(SxN+a!_i z-mOsPhU~*du7{rW-+Uu0Dx+Zb)n7l>y*&2x@v09^Llr+={&skkB)8(eaqm?5!@d(H zNzS!#moYea?|Rx!DbqIHx<2n0VA`*1RBInK^HRmo&T@B)?)AL9dGk6tp1UZ@F?;mw z;}9n7vRTQ-q|V^C1ZjqQLjne3y5NH)lW*ow&aZcJGX zN8a`uH*V~Yn7Fo^QoryMts47Q|JamUQ4yGKv~`W|gjy;0-TTj0oo#!xY|($kgJmMS z4Cp?(@lu3zxAJ;bC%fLwR>RF)wIl%<`4c8x@Y}YO_$Wru>4JoqZP>e~7MRabl;(#| zHH~%-eSs3`8uy=S^v!H!v+z8-^-Zk%{O_B=QSHF_LoMV!z=o@voBH7?6jW?``(?jS zX@SWU5DYbB^P-u9dyrqG(T|2b%ST;_uRm$?Z$-Yo*|3p3j|g^5Av-y`evj$ks@~b* zn9A)nKQk$G_q0u`wd;;dn%dm@s`!;ucEx@Fc^79UuZ%2P)U}{TKv8SIeUg(B#Z{~O zX7!N1Ase}C|DUp))^Da#9}ksGJ00F-k0fcv;K)B8(hnRC?^3Hb;h6skbN9x#X3b#=>hGK*&c}{tE(&y(T;4MW? zWRXgUVJ?a>j@*u3}j;Y`>_NlS0@tnu)K&iKe720YkZUcPOVY1;)0W?#~O zd=7_~kH3Gc5}5`}(^ILeJA5eg<*+%3CAYqN>=;~Dy0`Xq^N;Mi)jKpJ+D(%j_npvI zOR25zgzb`0$?M-AnxD;`x=C`??(4#~2A_^>*)LI(86a&TIod7YK)K1OeZ4+?-xMq* zuerlJR<5pffs)ebu9=E5Qj+IwhINCz+v z4)ZIVeAbFbXu9ePe#&GfbYc|bDxzK%wQyp?c%Kx(iQ$c zsn#Jl8S+5MO;^th=|6dcJojRxS+b4!wX^uHBqb*oUccT}gtVMKO?pmH<+LZqFO=Q5 zQ2?ePY^hNy7ZU6Pw=1ao7r0zao;*2okf;19`%qG(iprUn8;41v1$k-M>lBr*^0DmR z(dnYWqN!3zJ$6;@8X}1_O;%GMWm|hD@Us4w*3X;C5xIKzZZ~o!j*oOGFjd=LaHIY7 z>Ciflft1s4Cx4_$K2bH~y&`V`Kz0 zVT>sSrj^_%+!=%u7Dwj#`adZLk!5o@$}?tC2=#iu~lfsJnH2CfOYyLZp z4co7ujZGSc2r}_&PhR-+$v3p`gdRP1wWTCm9=Z6^xPeja`D8@Uczy9tRi<;r7N~dY z-(RCy(P(6}K)hC|?B?gmiZFBfXEwdEqS)Jcez=Ihi-_m|M-$6Vp_;-OnU7Zp26+(g zFjzKH_C(mqjISrsAo|8I!MSe3hU#;N-^E-m=R=y#<_DrnkQ0O1NXQ%Yx_CH?MNwr7 z>5bW1khoG*j7N$oNTKH8jH^ttTQAc_N(@{GqV*%IH8Zh$Ligk5@m&74L>)xd z6f0gsh=YSgh?vUaG`LQ8C!*EjJPf6U6_lkH&z-NBxcGQ42pZuWjwQwQn}3Z5cIS~G zKe%)A@BJWh=NSOs3OlEY2OXoa)9?WWj~Cuo1ZQt=3f`WQhYuy$KLzFG5yWs@Jkx#9 zbZ23v)zV_jr65}H1Mxxb{lq%+oISY5RP`7!5z?fl9#Q6z4A0({t@`$|MOS7F59s-+ z3;iwOQ{GrmK^)fW^-}eGPcO$K5Yc5^X6u&$p)->{Smbhpi)7YEPM>}czr(7-qL7CWrDonV_@`ikx+prVx;b)o7JU4&(Nfp3KiNtyRLi6rQLw)RHT|KXqbzp9`PfzLEcyzZcje zAr0J44D<*!R+E~Mv4wb38xeL3-wiy?^TQK*AhMb*# zhflX|+gZ<~y{d8@Z%zcrs1{#eRaqqFim_)py`3mGDokG_J=Op7Wv)oJ=evq+z}RMfI09JZ_xzHQ z`7163pRJZqEznGWv}USxrMD-R}vEm+9ah2pP5cu?xwt7UjP9xP*lNPeUIxJ8f|vmlb>#rzwLe{8=q#6W;T5jp8FC3711WGcNga!Vi4|UH`qk55&YZ5u zEJTQhmt(|+n(l@-W&+l9+hrsaick-DNEZ1GPIx6u1Ks5@M}`^0zHRo7|HURamZXf2 zGBuL9AS)(3XtuLC#!;vGki=^kx7S*|2+ZuR2;le^FCvF3)x_Rjk!vYG+@$JXm`2#Z z9=k}&mfK}?m1|l$%9QWhmU8R>esNL^ii&hOW`K+VC7&~8$gyY9Y;Jv({M&4l3z;t6 zI{P)(8KVXkCy}si6{$`58xWPTQYFm2y*P}NhmfHO*Dc&6aLep0>=fDu8H!nZ@moY> zAzlh&g;^cZRiZwC;`uktmA`7Kk+HFlN|d&Xc&C(3HJZUYP9xq$|J9A?c>Z2b1_ZeU zlz04l?4ox9W|($%e!4Evuwwl6>9Ed;$;m-tj*WAbSf{C`rtL-eGFM<`Vw4*^eitYs zly~)d9V;xvBs2#vZn1qhsyZ99u#M`2jRuBY@SRWOIx-m_mBgw3*x`tmRHJD+sG_n` z#HnP1t0^x!yz&*z^O$ktrqdYVFkQ8G*3fC|*H7RNjuoDJ|9%#y_y9~$o76fJ0!$HT 
zuc;EJPWg9@9oS@Zp!5HHL_Mxs8>tB74ke73CZWKdKIl%gtc}n9omemuJ`zVwDjYt= zg-0{ztFlwuMO>=d=HQBy7~T8yDf8PaD@NQz1;NwGDc$@Amtv0EZdh`Q&lf5tuR|3B z;S8Tc+w5TWlfx#M0SzuYof5@s5E`%7tg{*oit`;T7wBdulf=$wP*|vhD&JO&s7Upu zY}j+t^=@TlonCPtfoti1e=aTZLXK?u)PdX_5A9G*Ho*%vxGt)i!}|Uq^udc5_7U5v zra4$y^o}$hBBqG_!FNd+lkXVLmFkSTSWJe14rme!wTX!iqVito_49aI?Y=5L07uwu zL*<*WIqIta^YtKX`~yP#63AZZhsad~QG!%J*|(v963#C*6~S@mi8erPOGMyOUb2Fn zqhlyYGi7~0UD311DN6K@EH@gO7MLkY|Bf4I$GGqAGRF%i5PVs;o8b*FPVKD}d8|lf zT=HcYOen+|*eOpe#P*h&p_+UP5t#@Nizw&0UJq6E86$dK!%ddHVjOxxF=X%Vb?^2z z>Lcx9<@2u7+N-LwwO?R5R78|niv6%8SxUO^xGhot2XfF|gUjSnzREip5PkDpICcV3 z5S_&w=3~gC<-2@x5QjxKg({9joF`YQB+ekrz+i9>)e$2?x$15ME4%>3F@`e+rPn0Y z1zUgwbfhB6TXg7tMuzR%u^3PJ)K3@H8rGJA!T)z?yLA6QKmSuPqt^fT48D+oPc^=* z!MrSOL#;|jTzeKV87|`28RHG0R~5=7uC~abR~4tR{ffg?Izey?hjU>m!QB0S&#px9 zoqbiJB5Mr&#i{Y0v(|LhEU{E2IBUy`_sjh~X2Tcv@OvKN95NhhB7*}z_?G4Ln%3f? z8o&Ip`}Wn_yUp&!d_6nnDuz>lDb(dJ^?JMrW?Y-=UD!gwX3CP3Dr{qCSBPrlwy-~V{wi}F`{6gChtq_C~JHc>wRr=(xE6dc5$=0pehoM z)u1AQFailDPs$NtMABuw-cSj-VLO}Js~3`sibK7axYFzBF;vlo{F>_&yPDV*3HRH? z2>q*kc#(VIH_56i&xV3dB8i=~gQ~k8cF%;Pih#bYLKrRnCxa3tGj1N_PYEk+9~JfL zT|*N@wXFHlcLGU8LbM@_jYSzn)g&e9vCF7AD7?J!T2)o0q<6oYSX; ztA`ktnZ3{0hP=#zl9F~}7D9Yf_M=cE2xyqqR>UF?1Kn5*MoHREF?0~wv2SnOxszF` zm1^BlB1#rE?x}Uni}h4fjtj$C(WG$GpkbsMOu8v9@d`yA2E_^epmXRkhAK;26kTRJ zWPiZ2TzulE{@)g`{w3=k?%zz|6?xNDar|<}=p~j*s1smd$Gf{LR8}22bci84N8wi@ zud~(gmFItwP*f2 ze@l6?5ma}E^Dq2V)pOOF7p?G)=2n0K0sEzMM5YbDNVul62XQ95={DAX{xV8Bywf_t z(onT0DXl{A2PK|O?hMtE9$^D*jx79R=RkHkRYR41RnPl`;Iae(Xf)aVJe%j(9bnUb zRon@1S8V0DzM?>^&_yvO?mc3gI5^ZNkB+_>ce3j*#4@|~?CBvHy`o|hXB@{bxdv8Q@-iFQx{xi& zitjSF&KkOn$_e5&0P%s&hVbLUA)GH9r@u?P5Agjp2w#!l%B_41_j_j9cM=UpaCVnd z_KqjsGfVNDPDm67SRQ6gH_wPA*)46-)H0qd#O>XTkDb-%5$GB=XPg1+6L4YYTr<T}7_cpO0?^>ZWZ5`Q z!T3DcHmI+%5&5Zq*fC(cbwr7~t`oO9(`911Qaw!Z$o~B`DWCJq2KNw_#^cAIEpDNq z7S-0DhaEhQ6SqfnVd6Q%lT=$Am-UQU+L+r-S0}T@TKVE!Vs@xeL~3mMrLvH!cpkDi z1K8uz>wb)lLk9K!1NDQIm=G2?G|U%GXFCz@jsezI#4GD2In6B|X7nWEmC`5o#OsSxf;dJw0z8k@fPWG)i-p@(;k8`d!DBiwc^E`bZ&b%vOy*X;f2MZ;zxjKm02&=mzY7yEx+egzW+9}e-u?yUf zyggVf1mgCrPxkzz{BLz#MEF&}M5oWj^W?vT&6VhE=ajJivBZ*R0B8h}sV{aB>&x=W z3d1-h8gt;$=ZueLo-PYF8pwer#bF>Om(1#&yDedvQt?>+{)2~H;`|qV3b-lnrp_Hk zw`cNZX=Hh0JzeB|kxqV&ffJ?^c8a0BDDxYA66V)+P`!N>d8n}3#n1aHV{c?1g`FQw zLY>*dm5MF`@I`aq2LHb4kFE^G;B?!pp@}uS+_e!XaHU=;RvP&~;6!NIX&)0h+)D+t z?JaVb#i-8w7tbt6%e?&i(@Ze^4!ES3_MDDmK2SLEmkkKfZkH;@?Znc4gg^?hDASNUO~U#@*G zZK1rIP@!|B7Y1sJUn({kUCQv~fmtE;Bc=o9ficDD@~yirSVZ4;2mr1QvXi_XF*3@m zYcnIs(h^Z1CnY64r~3Nnv_<>7jW5D0B|Ty#b-w4LnaNhnHxIA9 zY&LUdda(;HOvgO^ct*y8`ji*!BaU62Gv&8#T^H9L=jaQD!M*SPsi|@zFM!u6DvL9h z97mOKBpl1i>I$fBwnbS{@i~rC>1QTOK!6Kx+(;?*Q@#qeE)0}J8fJCWW53m_h1nSU zAh}{y_sUxt$3$f!v^d*=QU;C~A;&l`3BppuToRLjV0yO*Rv=DE5&&8z+>^hfYw&!s z9DE%q^|AEy`3<>K6+~?KZ?uod)G=;T;%P`r6d9Bk^KA@8>f8!U%l>}_W@2$oTBZml z<21r{_*ixN!tE)ERC)unw5%+H?NU59g@x3m)@7LMoJYdR{{_JwA)ZE&MYZYCx zPn`-RIfSxP!49+`>{w!=^q@h5{@sBSw+IIi_qkKmM@HMy}W-0_ZJa4C;!X8We#T(i_G@XWklRQhQ*-W zY(M@1hvDf1OY5>{^4=@Ckd4*_R7oMiZeb$#zOZK$Iz0~7jJ5vOSO$|$D*Hrdx@rBt z67VmW-56?Rbo~WoWt|{wcAhw~G|w)#XtB>Wr}vlnY1kxzVTv|Q)acX+&=e+=z4SMp zzYSA5D4%z3Cn00A$h3BQN&=`f)DI@~XNnV&si7(NrUP`6H>Q4jfUvowj<8V=(a5~Y zE0(9ULrYDT6Q_eQz59!X2AM#E04Zr%?Vkp5+xu9h%0o69P(YJ5M0G&KX((@<)g&b$ zhrn^!FKz>~D_hh%`p=y^m%4-5W?_QCiQ-dYw4iJo{+X01m;OriF!cO~U}^8=0>Zgb zf}Wy%hYp44?tH*Ez#Zq+x0~Y?y49ayF9HRM5ry}0-MIZ?CX@EcD?`jE&&SVsCj!Q!sD59#Bx0E@%$iJgTpj8*~J=?ZT+vXjuJ%^o7mov@5LAmnYy{Ndj zwC`hSq$oSK1qB%sP2O3#ZN6g{njL$4dsw?Rv=Jd!@TBf~Qjj)(Zq~ z8d){sR0H13OuXO6zx#TB0j;%dBgMz{dN;@{?rg~BCprs`ef{`R9H0UF?K+?}moM5% z*+pp`_KTj~`kKs8qcEB*aTBk|4dV_Jp$lLZ;(L-eUDtYuks3Vh(w1pN-z-qP_d4oF 
zht$r0zVP%)>Ajj6ad{3wFHQkbD^Pg(s-#y4n&Ao#?vzPV$SyPMQ{>4=;XIOIGtkz& zzz$AXw7w{F2&L~1wKEMxvY6zAmUG0Vw|H|d;-?+VRoA6C%}(6W(@;4a6oQjFHFwsi z{e#Ai?MV<0-A>l2Km4Xna4F*9k;=-Ly`a>@xt8l8ja&+i%(z#o-L6g$sG7Q}j*i0I zoW_pq|k=^R1<2SwhnG$Xae1D(-^f;^WnRM22#8ZBaA73sx(~lQ!nN z0u^Y|{G6BG;XdgtX)l%?xcJ3my!_);AK?(#q&xdRh|2?5+6?Z|0Yr8?hx{s)R3|zb zBrzf28uu=~Npj#KA#q&fM$;7GP!|RiiAQE%a*Wc>JOP|>c6GHrG5Y)V$X74em};X& zW%`6h3!(?CL3sHf$qHa$CSnyumT>aKWx?lBMc-R82T}I_wH>*14Wh1 zsE(j}lm`W+=qazpOWZmLFhtZ-j7kUs31aj+rse0tjZHI>tUSWD8OFxNB~WHF87@oB zEb207z@f8D+k1plf$ue!rNRcqRV9Cb_5eHij8pzVpYgiG4ke1hH(@Kw%X!HX8CD{Q zPlD0Fr~&Hg9VsEEF$cj|@pC3N#27@A`*vc42v3gyO(3QAA0z=R`LTNbievbt!yn#m z_oT-hMrI=8l9Q60p~D2wM%yI4%cu>ac_d8IPoF*|A8v{mX7`&#Mv( z63jL>D$;B)p^pS7>QrBSj0WZEg9nOS zM#5BoLfL0Mj-vK-hUj*B@zeqwM>Xp8u$(LI2<0d2GohIhX>MGrozFaB5=sQ7!W$Vt zF>&K2HT&O$pP`Mr<`-)A|08_kOTWUIFT!dBmR6_q>%nMzUVzYd3DR57opS&hC^q;DF8<$nQcLeklo5h>VLIGZ!shOYgR&J1GJG z83Tlc98=&8oS4)3j);YJ&|hn0EVQ;xT6dc`x;DiJ7O(!;`s+&TG&oPL_m3P%(?;~F zIVZAkB@&9yAdaiz2?h$y-+lr(H9l+#F!W<;5iGBpA$-CpdV^T zBk){9G@s&qGmp-pyA2PPK@%q)Y;rexGKQsZ1j{Hg(rCZNeMeb;__idQK6?E3Gd4o* z{D4y~KiqN#IE!=vNQ?5a{>(BsQ>iY`eeZQ1&iO01%owg!ikb(s8Y0#7TzK*)7`GDO z_^0lDI2V?%DsQ2Gk#a`=aX<3{09ZAInQTCq%EYf>U(<&R{umD0!DBs-NV))r2KC8n zLCs|YIxA@@l`2}WD?$#KP7mXJ3zS4jCYFx_k^OmvE8oZ8@#DLPW9dt>3(v+Ai6()GUGZ^QdK#`weHmi(*mw5NZli(XR;SM$KZ*RseYpKmyaI<=W8f%;BxQmthoLF2CA?KJTOedz%EMin8k%$tXUZrJ zr`5c5TZQcgb4~(JGW&eQ3lNL_Kzu?mN6Is`{=GTfPhm`qNAY4rq(ZpH_Q(%6u4)VU z963GDrKF2kVx%s0zPv1UR`v5o)_z=9!th9b;ednD+^7_JaB3@&HV9gW3$Bpvls8-G zcgXDx3{m~C(u;+|H!zW@p-W3mxJ7Pb0;*co5o4-MaE z=XZ4=RXh}RkcHZIKD>V(we_FeVX^>Z6mPwz^7YU{YeNU z=((Ey#APkGV0$Wy=}*yyBF)@J^e;s#N*;OX9Ssc)Hp<;H>%yf|tY)14kAq~plrvqzef^pXYghEb zREM?r5E_V+qE}gI>0-)#aY&IepF{7Ljk2#7p2E9-bd#r?!bZOaa)oDAj*^Q@p24@gh$Oiah@0#@| zZsWRjV?YF|DZ{@4`JD#d?uB`%ki^js&U1H9S-c?1^g1BaM!Y?eYtngX-Uydt_O;q= z*r1nl=8PLQu$iBJ)c(vKTWn7WC|w}QW;90z9bDIPapj9PW5&bDHZ3KJ+Y>$7l*{jl8W}?Sa(TK{i!rSAJ+hV(w#ZGbv}JD3%US@&Bv7?7uO2 zNz3}%On)m%*6{X|Tj&TCC9ht*IA*`*@9*`p0R$#2@OZ*iqe-=C3k=|~MS;dX7UFQD z7}&E`&+v^uFAtk@u6FAs{u~ltf3q^-Nh8VTxu}m^!5BmnsCM#uZY=Xu=d2H!b{Dr0 zvt{L=k61}IfI@WmzWa^-X}gORSDTO8pTj*Up0=B(XQ?Ec21X-e?%cwD z)cvO$@aHL)!;YQK;0#6)-Jo|TTV*ds*m?u_bV7%`uLox^yL_K^Y(W1IO6AjTO%{uu85ChwdftO5N+E8 zRyKZ#RGsz!L^J(~39t;9O`l&7FKHvImD1-I{-q+54S4hVwVFa1jlT=nRE*OR5$PeJ z?wIZIgYdIbiJF^J%5XdXaMtV0IdkkB*i1*pNwOxrw^>)csmBsHFp)&Zt& zH(Zo_IS3V_lmyPEigkLm@iXXT1XSbDM@yHXrxE!|z(zD#5@E=U+A%-8;;8!H(QliY zt^om0Jv-mtzCCT2qLPx$hNpsJIW1sEG>r5mJCUu`eRk%~lUOu8ehM}wY)6nl?Hjcq zgr!ogSM&4Wre2ry)(H5Lb5%S+UnrVn{qI?y;||8f+2&uY^8+%n9&wPrd42Jz#dpu* zD<))c^tc(m0s(_7z|OIRR^nK4vdxKE_fr)Uk8+aSt*EdZ@eF%kZmUI{HD8n|dKD$z z$Z3KJ)~LqZpc7}D@FlnL)I!e~DnUdcFZ%ZUha?}KvBTbO;d|Sp^+JwW`3|^7D09jAhCYtDyv}+OXWHAm<#UnaOlA4NYaV1R6BO|O>s2Il z2zAKN>-DB@1H0O0p=QLj5hI;=DWtp+0Q+L)Cp(k$YW^RtW6)|cHLX4VaP(U{j1$>{ z;0b^xQ{nqq0*WqTc1lv^5poLWHbB@q#5e&vcbdc`F1`|%lR9Ph+h1|gquc4Hzx}S! 
zv8DB}L}}t0ksN&r;JEq5?%BV!RQ~%S@Sa#Vqlc-S6Mijf|{o-HJ|c z{_|7kuhh!tLT7mUwt!L56k5xP`qj;Y3XS?WBx-l77xgV@ll85$^N;u)G%Si~XQTdL zm*+h6vm((Alx3Al@>E6>(Y6rz}@8uKQF$LSRk+h zlTBX2Fb@vU>cAm5nWx)lo;b0SidD$E$gD3*+3+cy{UtWJs@4pfc z5SVE>c11`7XD)(p8ITK`B^QLpr=&|?-J|QS5WK38c(93q1V}o~%3oB~duwYN4>ToS z{xH{N14et!D^^4Z5)b(fqN%YgG#|XJp&}ECzh%**iFkD}g#lVzc3i8C?$Mu z=k}c*&6X4HZ>KN``1r&6aFm^Pq5n6YxB1X$ozgW;l{+t`H7ahLu<{@UuNTvn6G zD=Tv$4O_%!&l=E8t@Go#yoam6kOu+-r{Bs5C^6!x3t7Wu$_WxRIU9IPmVNRHm?rJd)iv<` z>POPeK=}_MWft#DWW2+BjTkWCXyyL*KQkXa;HEubI@t5mpB^m(_m@3nPDUxBM9g-& z2>r5&XSatBA9~}lC~90$&(LR|1^~<9L{DyE(?1a98-1Io&ETCGC<*0H&Th`nW*rs% zq~CI-=UD=F6XKC0Xiv_N6ZTtRgF;X3m#GmkbhPT?b1@PB4k6HDF&%$$zUH1h2thHUYxh!Q%p-Yy_HX@f!SI-wo{0woNr@a<9lc0uZ0)QG5D)-&KAe`n$wz(H8)h z@rCo3=iE;{keQt(=0%%w!*=*K-ccEca?Wk>;;W&dq4q3Vuir{mCS=xo)8EgUIdeBo zf+6Ntqu-uf(X^6D$)Gcu|zhIKGBP5jAW*$g8P)-9+iL-M6 z=c@I?h0$s}&Tk(Xl7V&*s+T>Ck%cq-hd_RYPKnFVQemu2v0`z`+*1fslPvRam_{9O zmAB><|2DWPRC5QvT3z;&#~peMkqgb}#Nj42&qC%^D6|nMx0Q(GZgEKR$R1qwpC|x) z)YO?SI%%)g-AZz!FUaz(H8phl_bVzg=Hyh& zt2#6P@7-&`@wj@;@g|{~LJeTgP&(Sy9Y^)KqK1mBZ2_YLJ{0_e*QP1k|G#;e3RgQh zz@N5=5gf7nhfrD*{cdhF-D_KM_QSeTP>5W6Iv6p-ECBI@N}QLGJcqJ5gj@=4dOO+| zX1zPRxVU82cW;?4hU2-qOMG7NH|>>!RkvSU4CMgH5Za zw<(w9JmkIpNk2aopHqK;yHpT;=>UXAY`NKI7ACyo0;L9M2d^xcEI1|~m0jHooyy<_ z#H}fGdBPL!=eG~#euFD7W*n;+1$9znnrwB_A+IC#@9}}Ty}&uHaU=fIK_+T@fdb&P zI~c>n&Oq&4Mhp>MqadMuH{_N9@@bdpf{)rB1_0RsQ-VW63>lY$-)ROZ98*k2YM*~} zLGbpwBb#|SX53fqzs{~Oh`jfc2M12rAcYqwY5O&*9cMI5-m>?yF0@e5b__ym&RI@{m4eX4?s9H{oX zFR#1PILU9J7hlYYfB%fe@Zo`!^lZX|V@9gB7z(YY^)$@I0?h&b*(B+>NZ;9A7z&A!b zOk&^vrFQ=g{v;H}l&%$LS1kM&sq>>pn*rm;+r2THM~8qAFMYulB-w%kOH53xt03x( zIrpvQCzQN9Shj=AglRTpQWBPmoZR-sNB|#LEISC5Erw;mN<-Q`X)xB76U>w4NEmIt zV{z-BXRTzj^rL7jp^6DXo*Gb6@1v4cxPUQB`W#js`g~PSFQ&SLNL&1L5+UHJo;N6L zKuj#M^Tg!RS=A1zF);l?dkQme=cwBF`3LCNRL=XmkZtcC=SSD?-M%tJ3=DXtsXazA z9L%$)S*hAEe&&74k3AJApauRUtEr8#}OHgR8F_((_ zsMg-nL$EZ~3kg1PxYI-wjmRfogfhj^#G{u0g6T9N*6BrcM8MG?5Kwj>+CRJBv8WEv z4+#0HIJ?jMLFl%VmZt!3M@p3J?Cb#kXx*X}E9Pu#>>ks?J@m4nVR%38v5-2RTpv!J zv@>2ET;!N?p%E0x$3TP>!Ce7XiaKOd1X}zzJu2I(@-W zYO!_&H;M!-hCG~~3Zpl$8?bx+zM&A2-m6c3qbw%m*c0-kcc>NS1kZyd~>hBw! zDSE{Cfwj6Z90`>={~&>3i3C$2-v9m=EJWSSR^oK$mW5AU2|rknfvKq;dG8s77|^Y~ z{4)2Ks|~F?iuZY&vAY1OH8GTe&P0rL@{~C$vzL_hf+f9FdqxPQso#X~tf=>h=MAR! 
zk%=Ls?#pFjx$GANR|0|)w#l}gJKOm`d~rj}3jll4t5>@JGRoLhwAi?J#MZ5CRx>43 ztdS(<&;CEY{yQ$`{*C{~U)k%TY!RuDWXs4-G^|QVR+5=bk&JQ)87&G~(V!t&Wu-E+ z6Aff%R5pc5^}8RZ>;3tC{`h{sZr<;ki_Y_Wov+vPc|4EfaXcQ6gZ7*7b@)#5B%w@k z3L+Ac6M~#7cEOi7Z@jTHxt7wY&aW-__`MI+ywGm+fdMV}A!TaQt(Bd-Y%pmHktf~!THH{0_DH>npNDq_qick*D9-u{Ymceht}8w|O7?dsK+2sEy7g7w?Gm3#YH1dU<=F|C8kXl?Z&+UjiVlVd)_PfRHE z{keiia4opJa%ziX`a|Y*Qi_Xysz4CZpC@XiIV@4xf$7r2r--D185C6BqrW={7*G;pVXY zWReeFbT%|}nct>Sqj7mZ5*Xd#8UE8-J!^zL-@n?B|C6E(U3NCPhTY0?#+CXeB@GbK z*Qt8m0UMcrI%ugc83jg&s3b@Xl|VU(W%aX{tj5?nsHcH2S#B?-4=zuxL&?jKwx zikn-YU+4VLst@4+H@cTOv{u_a+oY4po2Un?#2I^f3e!e3)|Pk?u#%1lZZ%(s4Q*BZ zVZmQ+-0&g4zWMj8emB29;boRV=g!wCZ!_TN%9p6sVNPpALGn(dRK@oPu!E+I-JIn0GPxQ8Y?cON!DGajN{&)Vi|itGoov6kY}t`m=svJOBT^{$*OjH=FC3kg0Hz$!m&j z$B*Ag$)bxB3=&)Q%Pr(Xs||trL&97A!~_Cg0$|t%^1?=7+bF~vqU+_jCr4SPwL#rx zETw{=MS%wn+~axGdL`%+PhdRXld&u2XRb90+eqT4rtpkfb>J#Ls}!lG;nk~Ed&y}a z&n=Vv*pS|m6J8CfH`&%ZM~!b3aQ*bGyPCA@mT9f$qi$CEU+nL{HKw$(X2l*S$Jy#L zHLk9~Hl*2Po2T8bO_|%>=uD~ULZ`#CB4>3pSl}}EY|zfPo`I&MD~{Ms+@tlBLd z?Oa9Yc1d~0u9AM24r(u+y*uZ}x80lWUHtWQ<+Bjo?*~?W=~ufpym4sKJB;gS1kRXn zype|W{=>Egk$yu=?E(S%rGGt_GtZ8|i1lH58f3H3tqAei>SB z)a`nFnvPCbhs*!&_wlKBol0ZL*Ox=1of(a)F;GLUd`35lDIrRM1Y5RqYkU!zsxNJC&U6c{NE70!x5=TG4$F14&H&5r0|Y2+Zfd&S`O+BEYRp(^ zjULK$>eK^#a~l9ZVcu~}M2Fkn5MB!@-0HHF_pt z$T5D?Ak@2UVX9YMb(5^1V|1e##+}Qk{Pyjeb+Zrt4NXK^ttilRks4fnv^(W|!$7B3 z169jGFN6wqNc$ajXMG7_7}HM}$wPImDGyFZXZtp|f_r)|h;h3}HA|9NZQQtr6Gsg< zjc6(NUPB`=0z!F0{q{$!^kM2|P8`DmQ~i_Lob015U{E?OSkRfr^5*krKT@2bhpgn; z6$K!tptpX@unz|rM02<%ZHF4@=-_v!Zc?Z&>vYE1ii)yDLOHs#l*&#jGGc1O2rlZW z%K@hjIy&pGe~H^1kt1&7_WA6D%>|q}V(9Hqi(?ty-;@ zNO44%2iDw$)w(_ht=WLpztt7(UAKjUx0(B4;jjAWqx5A(2tcENEgE>~1EWbeXcdKn zYYsl^SGmOzvkl!8ZgI3EfaJ1V!evU-u5PV#ETU}hMHQ^oyx1_Zs=A|nUW`NXX zrqi)iqoRuYbmhvGC6t^!ZbNt>3Ebx8`iSt8W{Ym9(MWPek5ul;8CwafibgUAJj^jf}T)LGvYXNXsJGSUJ#o|VQ8FAbUg3c zQJ|x^4{dTPB)32eeakTY_HCLZ-<%hXa{zLoGkE$~CCFVjnv`)LXysZhTuV{E!DQW{ zeq{ZndH`feh;f^#R!C_8(6`D!vu8Q1%yf|E<%(9qi^ zn%{4lR55kS`ETE3lNW%LP{a9E@2{qye1}Cd^;K5yPUDZJl2%4_xeq)>>ndVu3dU>9 z?Hf986?9ByCe&jOiqQ?qW9;KHJz@Q>+R)+*E^j4OS6u}zIS_L(IM~j2Ut7Y;>C(F1 z9cI$5E$CYJz|R+@Z1e*$<=fkh?1qkxJK45RpJp5l2q>%v_c1bxJHBSxw1d|lwG=1$ z*HfZxr<2k*Y5Ir!h)3>Yen!@-&ieK{Zc2g{?QGS#?A4|=Us+JExC+uEGa-~ec)$jg zT7^YDy|k!cVs0Oo%v!|qhx*$a*V$0|1xjU;+fTqH=(t*|uN57UUji{x4A&i+GR;j1%XTp*x~9k&X=vg!}pv(bc^C{CVoD4=F_t zMrN%sHysPNMflft3sab0OR+m|wL>tPKa%lRFsNoOT=6>|JMHSUF_0>41PmhVW9zO6 zOPlIGBjd!a$3EtbR6+bfKNd~5-q{uNZlT}Wiibi}C;>HgYjq+>@x@0^Sg>vN^`oAA zZ&pn2hX;PB=l8~7g zxD>`tZrx|jG^V0j+1ge^uz`M3mVcoc{7G)kvF@YW1h!QLY}15=M7-ZGDl2Ldry(fC zc+UNT+=^1^KE2*Lxva*9V4g=KPn#C?t+SyDzj>}}J|^z%z2GuT*g!HQnjV;0G=;=8 zl)TPc-74DlPeZ@_%Ec4=5IZ; zX=-Ms%eSWeGyzg!c5lEZd|HMOJ3H-LyVa+j8S$Fz$?nzLA|TfxG(Ta1*Ojr1Joq63 z6~20)g#Q*IZOf$abz&yTXS9e~Es7s*6@?9N=31vkQG>mfGfV4Xvl(&9xrL z9@S}6T(_@L1q1{KS64(!*fV{=dgA4X*FmOP8-K_V9c;`zWZFLKIl3&hhA;JRSv2~P z)j|4^@SZnOFw^B^=~!*8z4-mZNfhwn7BmcIPq&3n`ySY! 
z6K32F-2*WGRSN8HBttsz)`17u7bAcvx+C9O|GQ+Y$9g%KGs0oKcZcUImZ^pMM#tbp zi;f)e<~I zqQeIyP}&0ANy3^X5J8Ozq z=m%TW3*9fj&z)Nt{PU1abY_|U8X880Cq8k49LadMzQLfV2On~pu;V$MsuJ2a(O15N zHb&5WYCUv9*Lvi#uXn9{!NEvy7Q+;fUbKM&Gv+ae(4_s2Lj;hlH4}_KQ2h=EkJP(( z_bo+RbC9AS%XVUIMQ3YY-z6p|2iRp`+%n+Dot_*w!E&Zy&92H~=bA}%{fRbB43I_M|v z52&XlHP&g@ZurSop3o$vO99&XhvfaN+lQt!<~EYE6lSI8qWlPlnkQ5m#uxig#?Jd= zs2#JAT-`?@Qaj^H?_b`||JJTOd$thu!w()DQm8-nE)>Z%rA3!;#EJ0xKEb3GNK{oY zNm&{s7(%s#sE+f(x9V&RL$|BO={#G2kn1q2WXgP8?wap6J20`$x^w%74McbYZ{yXQ zH`O@$F}ZK~V$Clt4xevmD+VS?vJQkRRz?KpmvLvFU*?JXHk<0M_T+OZBf&*cU|-wEkilVE%eJW0&)2H>h;mz=rR zU~HFN&dB7IgT)PhQ9a>a*c$$5{M|Xmi@^GSf4|-Jw&Aa$A}n00WASq+AwRo)-@eTW z1y`7p{Vz8%=mH}U`fl^}Wu$u@MI!}NC>MDl#o^b?U6)r7JEYvhNhLD3b^qn{ zn3UWcJ`KMzZ~JcB_7#g<#KLNXR=p1wm5`0_Ev|(d?fQkRQ>64;1#vO ztP3^8hRp7*M;@|zNmp9NX=l!^8~sy2Lb?nB%ihvDx)|0(b0)>_>9eTIP~-9)q>)gx z=mxg#5pvXK!OR{lppmI5G7#d{vsvr`cj)ut8bhemreg^U`NO#P*>?U-Z;ZY2_;Cx7 z6L3o2m*xm;ZPL}TnZ0qBQJCyKW!98}lep#DjvUjmUaP+9(%9h2sJY!#1-j0_S@Y&~ za`Scp3GpLz_geJ46}c9DAZc`c1ryT+2yL_a>pG8vCy**c=W3xZxtIj6V&>15Ft><= zCWow20k**fuVYzr`!2_|H!$XObh@})3EzeDVqX~3_i%S6HoO;yxX!;a{3g+2-RJ4Jw9gXTL!LL6odIs2ef zeGsQ!JN>o#Ctm(F9Ri8s1-IGm?Ec=7<#{woNtiPy2&O9iJnnOquU~d`PE8s9#}hj6 zOU4@wBZ-hGNAwkc33kV)(G>*ruNDRGpdj_uKAk4aQS;Na;!$~bUO zaY;E7U{rJpvW%pKs6p<5*2N4Ro<%*)Ugo{#J}^ZBG5Za(ZyF&hs<0T*5x$?u z_E*a5I2g1QADpqnZ+$sO*+wDK5H-MSh$|+A+2ex9A4K9)0i3Auld zFT4Ex{b3ud3VG`fjixC`-*f`m+hc=0E0XwVrZBm+0luo@rD!*7)W|414thO8*O<^F zhY!~w&vc@UdG}seE&xjEa_{)|@JqJ#d9iw&qE53o?tAxLYuDxy{UsiQU zIYmMpB@C)%*7mJYeh$~b#>|huTs?7hW&YfxKi_Xx@ggYA4snD2b!_RSK_omxZD~?hyohNxH&kQ#TGbVK zQof%6we*x)-#ubq7sJh5f3gx{%{l;9F+ecb-%?vaC2h(a!gZTc@i zs%WFv%ermwqu;>Ig@%ttGMC-w#tm~JPaLu}sIG{6lPAB3EPHYC!Dmq~w)}x2PiVGdA_t48R&Fd|dL|bYF=j+=t9wir@{9WnS zbTDEiaS@D9`d$YvmoN3Qk_v*po-`-sOl)EI6^xb3TUlUO#eeO1qvA#31E8^YT(;S> zz+Gt)cyoqGXy|=rJKGrqD;4GYDq~$83VNGHY?d_Ji>eqm*5)L4Hf9+knF}Q|d?~&& ztqYH$STEgt_}qg&WXaM*B;!pO)6b43ARq{B(_Lj>o?N;5?&1>bfB1r0f571VJzQMRUn)j{WnKU4CW5dr9? 
zUVkouT4`i~duv5}JdT{sR5*rvAJHIEdC` zP0VyHFOO}Ql(g}!vXoOA%wiov^y{lY02eDmAeBhhz4)itUWjN+pkhx5_?Z#F1K0O!FFmtN?sqf}Kt03UZ_rb_*9m^+Aa5A<1ev8NJuYJgde_-ZD#io52Heb9jHyf$xzdaORF(ObVQpAqO016W8lv(IirL7^$H3KdI?xsQ%T~s7rtdH65IPji?aS)Hp$~~ zX~wCK$hi?TrBb2^@d;XQt+t}5UX6X9X8feb&TAXmmF#nwh;cfDPQ++Rgy+%-p(KX( zfG}-_*vZhnnmOWE*H0-V$1{IAoIlD_HS$yvK7zvl3YfCp$ge!M$+{1ea-1&yypU7F zZA7UBcV4}D^9iqf-j+({&U4G0NUnOd<)WlRYd?Sdcyy4iy+OZz=boNBG_tF1my5lU zZGo^*9JKMB$?}$hdLQ0k44me14w>R=J4&h@r{U{(PqrYLiPaj zX#A%i-uCZ65qYRD*(RE|rSc>Wh#lbL#|{^hPKQEiZO~>-k%rLw=*8~Z>wI&2J7U>& zV8+Rv9e*>ER9(i6&_>$89*Excg=7fai_kal)`C9R9Vn?B0Pz(`!IqI8uFE(cNjI$q zJ}8qD;)(sc-JQZuhK?J}kP?0AN8rYck;6+QkfwA~1qsIe?(pUGc(MJ%F;i|gmm9T< z;{j=|mdvHzSYK;m%T3nv$QS{5H*%z8-t4Ly)imq#z#r5!rQxKPef#$pILlmXIBiXL z_p?~lHDWb2BQV*dMfZ->b%I^8702GqR34x{+A_~p4m!|{%-nml0TgwXaTDl;uc@eY zw6%v|ViwfG^)vq!MgpuEewDOEJP}@jkj%P&j!d|=B3euxBO^NyOmO^K#rlurxWE3h zQp@%8lGFbf?;=N&bf2oRy|`kq{yGw~uOZU$#;zi;!ltDWp?kDc{Ci`Gk+h$n&ET}k90i6G`u6;+*Nm$CY}r1%}0)GL}eHQ z4WTIy?&yO-Zg%{&bWdpMi9xzeTHEsL*3us|=&|JW)um?jcio~kG{t(|6xqh@#yv~H z07tVx&3bXP?}n&_0mnC>fvr4|x$~h#cjURZC&OoRZ;*FJ{<=-_*z-5`V#n=KMxAbk z$G8o^cg&Hy$Pq&E2}tn}#a*4Y6`jvcWkhmDl(jC7WEa}dusk2M}1=NXgHkb(@G*$OiNPz46$Qr^Z9oNb_%#<8cS7N!Jybr8Xo4akiH#Ui@n1 z1u#Y-1#`E-wBq!Y)w zws>zzkMVxI8In_+phDXk5P_#jZzN*O%7^kPvAVqfvcDWix^mc&}c}E&x-qnij%{ zBt&d;y0XziYja^Yw7)Rbg8NUKFkBpsRf>9}_s9Gr2VPhm0py=Qe~x2&K&JIA0L}Ovx<7WTC9`it@Twkcxj}GJL@Qua zHfb@|yj8>$?%~V4JYBbXG0y*Y1KH|x(byt)K!Lx81UoaWU9NI=-`>{iSOYODhcw=H z;J_(``VpJRIj}KYj5UvdHa$WRD_?IX~Ze|QX&iyGA1SUFtQMi&9AgUGtYI*H$U=C8ew5+-+unQ&OcLi z7%O5s!TPzvB?>+$z7PcADEs)?=%7{R{s9lR&KdaI>al1VGXAj_6M89Q2k)j zhZ=UO<5nYB7+j+Mf+;@~CK~P1SD!!EXIBZn!Y^w74Z@Nz&s?Z8h3U5q=h?5fBE| z1%#kf7a z?lj&qbtyI^q%a4q^dtOC5=^6Q$De%;bCFv>9()btKm)gG$?=D9(afOm2Q@p{JFH>g zffg8KQgCf3Dsokz=ylxFYwo{5t6}VQ=7QXxc)aSg6Jy-!yqAW1j5parIJbix62Pa5 z{WDa%tl2j~j{yv0+{!>3q-!_lK}2o(G2)}iK9ojIlZvo0rss!&s>2*b`|acJZ-xpE z*78?6E#WX3Cg!dYYh?~r$B>6iT58_%#{k_H5m_HgIPZg{O$`TNQ}tEtNNI%zJtsHu zQ=)c_3-- z<)6eCBdtL5MXIMBQ;j8VIPr8Xj+=dHNi~Ww0*4T4KRCs+GP$(gGqy?w>#$C)Cji|9 zbZKN}FG=Cn0)d2)7vSY)LJ!}iw3|=S1I{2(I?_y#(I~)pLTjYl5LPM#;6}qX*2{RZ z_C`QkF}#W>-rUdF0W-e!(Pj*!XEa27)SnsxO2goC>lHyV-y9U&ySx+j&l6(2b<^Un3PR`C% zBWUdj)#*v+y;+C-{rqPi9aRKgg z62;}l(|5u)E%JP-M^Wq5%7oNH$o0?oUE|;HcIT+tD{w@G88nB%__7z|UT>k6y=D)&Y zk3G|s9sx`MAL`i_MGE_^WMJpB6?a!*-I=y-|7JIjv*Q3Ro6@E}-_+-6&diJBxgL{4nu6Z~3yU}?8 zt;udT?%)5ckz);T+`ijd_W67w7q!1sD6%Akvq)PIs>YoI16xwSjCXAD^~GRtuuf0o ziBZ>o(EpCDJZ2O9w=GStQIxWnFl@_Bz|_glq)-;{FIND~7g zGg|LY?Keo6gZa^sM~>t@96R_LE3a?b01a!Szf6M&UBcCIoFaJo!wdC;6B$TI_h33O zj*VGcotU0~d1)3jxoRW884iYj9LJ5~Kd(x|&Yo>fxqe0XX3oyK5Us+D*Z-nN{y(l- zznZI>oV#$r86c=e_nKv%&q(h2EP8IL3cn^E`fwSfZ0^1AsYoG`%06EZnJZ~|q~G|C zfV)C-?K`QPhIR4l0dnugiVOWCa#2n>&Sj7G`uep5Afl*|dn95laI{^2NdtSA?C07137oM+@n+%jE?dT)xn*I+uTXZP^`PRIWr_?*Lg zkUL&x_C|!?(1ecwM&w)9=_*&{z@SE}L%Ef9-04b$-}rD*Zz6YSN@W+UT36RsN`sc; z?lPw794Ab%4>0z7{;JGuV+@2YP)TEOTwzQ{=Vfjo5q2$jP+bZV7BpwCsqL<~cc&L} z+fD6Yw{4^tu+R!0FQc3DahcfarV}jW)OZ`kl(P*5QrtXHoB3{W^{dz}Ui z(n@~37oc4pXp4G(_msl!#dDOLXByowG28(O97Km&->ujFmq5evjq>jgE!2ncsNzCs zv`B(FFpaEL-9+Sm($R+06zLbaj81t3RN@&{e~Xp>cQjWIl(-!X9O*k1>f6?aTD2Q} zGGazw0{4jty1m(TQoS`tn)xzrMT#Br(j^HYn4&-bBhEF&z{K1O_K@ zs2s`+EV2|*GK;icYgOgwB4#YTAchI{X}o19L9ij@S7}-Zemo8E`%VN7^mK($p+{JL)-7L&>Gp5lfD7vZFBk{e_wsX zP51wcvAJ*Se)IINk_hDukF;AO3jtD% zCScnJD8npXG0-2WHS&uQzfZ+FtY0~5=+o+tL@mr+~iOb9=G{lC1 zX~%(iR_YoB>cUT6-)qK=fI2mI?YKFE(GOL(R`r+;CXJAF1oR4T^@(Pinwg}MFAUWW>ehI#=mnq>=6p3&MPibUJ7UX zcC`5mU*_#m)-BzzY%MtIscD|uh^^B>?4<Sy=oFSl!|%WE1N=o3~~R zg#WhGJ!U)b_ob{XyLlN?Cr`dJ@^zalr2AZmv?*or`RGaivEF#P4p#iH$8A_f&}1&% 
zHtEE)1}!vvuP??cNM(-m7FDd_gAPSK<~BH#5=7MaymBLPm4EBK2F#5HRFJ;qs25(Ndxw$$)`W5+cnAR^UQB#nSt8>WHAWH za2?i2EQ4D|dtUu_;D3ic8KYRF#O0Y;eR{Bb66an)4Y`W@;^+u;78{DR&xuYe$LFqY zO|q)frjQ20o9`zpvos$ZTm}J9pcl!9#3opBZ?SKLP8A*AA_*Yx5bK`#GRdmLvaY=M z*Qp)_?TQ_&+E#<7puI#VrV}XC`hWm+->@lNc0ooMaoN^xMc(y)XsNv1e8v&+%f!U_ zn_I*%=<5_L7>11JzSYg2OX(;Os;ybpV?#e9XwH zp5<-=zn4eC0>qIxSuvUYtX`vkfWBK{l*XKH$!dO%F$e;(Da=T3VVQR-_)V&jraM%y*;B zd$UKzLkASdlvLkf;#i78r1kpQk2Xi$I|qY{jA|1FH92n>J(}GER-VlM#2CBd7*H_A z?Fzumoq^|J9I57V|5OoXdw={g7=iH#bR#frGG4x)9v@XX8ZQ7A2b~|=MQu2qS%Ils z9NCm{RwcC8tl8Nk7qz58Dx?}fx2-h$^XODD>td2`(jdDOLT;F(zjVfUyjR3CE@pZ~(L{33d`4rCZLwPVpmDITqyufHkRbhyxqeuV0_N zT$HZDBST>)`cOPj+rTkTxOZtvv;U3+o>G%!dpGNurJV5PUGkDqiF;sEKjHg43wRRwwnxEm$yoQ-`okANH zVOzI-mK3-i{eA6cW@i2S?0)GBxJ?*Yr>8ajkeV|4aV}dL4q0YN#b{Ggy}+y_KVKhQ zm#auYP1Yp2F*^#%i=|eqC2*v}o%|NVK6!%vtPJb7X6y*Oi>lsR4(F( z95;N9C<;W)4rB8bDKTKmxy%fgtFpkcj`{;Vi>#;l!pdB*EIB3c_3Ux&Jdps+sZkwI z^hKa1;!G%-Aea8VdH+k@KCUF2pDipso}dgWm!=*NBw5w4!U;ylU_aeRPu~Qp41gzd zMN^+YpISJfMp{6bBgg^SuhGt2qy<@7O=ye^BeF_Qr8t~?hR*DCKfVpt8 z*Qr+#0@L-PjS(*hI7L5SwA0W~Yvt+@$=sLuN;WKB_D{0K(%; zR2LC1>{iw03Qm(Qlns&;NC+nRo_J=VFTG}Gs-3^@lSc_? zy9$3l8O=nlhwiDKdRdzmTS(Et44A5FDp9v6i0sEYPE21ZU62x~1XHbgbo=NzLtoHQ zI{edU&k|mP$mKK!w&Sp|?IdB`$!Q&JeUitBCqJ`T2=bHEH6&xth02Yweb#x(DpCTa>A!yk}bd5&`egFK&tEsqF7vF`G+v# z34Y*c=Tww;5-e(a8?%n<%i~M)T`I#X7lD@kW`Q~(qLbq~riMH$7uzLEy338hO<~%P zcXzYSf06so)Qu}FcN{Hj;>xnIAxoACT-212eql^RZOC8Ho?J&z%UqKAs4`~Pv7dvb z+puN*Jq_!-PteXtfq^ZrZCNslRlQ#K2d4cRrRVTlC`+l zA_$fS2gvsLvgEB>x5lmsJ-?9LNbKiLh7IGBoAF7_%GwYaJ_*Iss8GNM%${Q0Lb=m4 z?0!QURvR+#r9G!k)(6WrG7yeYD*$WpQ{frbP|W||_>5NXj5?iKFq**n-z;C-*!bA^ z|MPjB8=0DVxLozosN%&|%sGFI^X)Y?jt-_5`%o-GU)c!=KDOl7V8}E2Vhw$7)F}1YZLObi9ve3^(s$1#OB%M{a zzqe<_YCLzFarsikLX9M~TO*TQKHmg zBZJc_%#5OeR}h8mLPw{Or7f%Uh#QH-)3{CnCnFGJH>C7`zZ|);2XYb=!R+Q_aW6WjmUpkeM)s3QI0Gsk0$Fkp>Aux=W1~ zF=KEssVKxD83{Fh>K6+*R)XL(NjBT)Q||9_3pnr_aDWpukN<6=EOp7)7L$8&B*C`F zM5uvt-WJcs<7eeQk^g|#FfJvc&tDGB4&#ker&Wd*$|N+Ng%iN1l%HXmS3$mSX#2Jv zs7b;=bJ8@#jl;nkbtiYy;NpS>`WOF%RNqlCy)yjB5>7*rf0FAZ#cc42@)IqK(J@#I zg|%jio|IZIjrhXA8#TnMGV5Ac&))#+boKO1*YrLmHy!Lzwm9fY!p}XW+sflTPE=aN zf}i!Ab;pYsPKqr8LY)|=|FpVUQvfD6@-MhCGmxYn&TvTbZW#&`t6))b{hH%=2+}pxg(pR;w$azS9I~| zer%WUlyy|_=%^8#6?K7jhtHpXbj(xF*e{qK3D0+RX~7Zq;w<|}zwjTb^&eWqB{!&9 zv*v2p?W()FM-qJ(z@0PqISXZkXV>K4VVO&cp2L|ufhPo=RC;ku(M_!U=$W;(d~6>I zv&%4CMA>WmWBd=go-x({G{Z?i#e^DoJvuMnsBJaA)J6i(@d+L#ymC|d*cHctD0;LS zKu6lBOZV;?^mx+H~?a30ZlT_7^a*UxCQm#OZ)!~XXm%*epR=Q6RuOm z@4N;{I7x^U0ZweBpJ`T<>yR@~_BBZ21)Q(un1yr*n|Nw%xd+sMt=DX{Y(2&DYlNznk%fusBCx?2gxpN(hh1)V?u)>RFA(uvt}7a4Pq~ElSU7zwxAdfy0dHr zg;tS@gF4Kjw#S2t19qg=h6CFNaXv-~GCj^Pi?lIowV=K!Xr}V%JQzlW9+3f@#)Uc3-a)CEU zD(Z!Xq-*lZ?oOAh;p;t`G-)9KnE+JuIi)WR@P0*_^b_oQ0OX+O;$*bDTydnbJpLF- zTurX+1kbAYV_nfP&wzg8R!b+$20pD2A;mSD_x%UAmFx53FN(5+mWqW|{F8EayS{;e z35OU5^|VUexfmLV9F zGvo~OonjO!**g?~Z2&sD6xd7@p?9?};#}0M6P2V**8eyxFm$Uz$Dj$5n7}5(IS9~< zNHZMyl*(g^TTL@?T|$DJ@5Ku8Z9gjN`t(THU&A1ki8ZP48xUdQnr7{LvA2bPxjzfv zrH<;Hl5YfnDaVFX6Liib1Gs$IL?|b<>eb79@)MqcD0L8kKrkNm=H0t03Qb4paP6Av zQJa=@)vsSJ-WBi}Ms(IE1Y~t!hp>3BK;v3*`s?sr=9n|zAG1T^iILo5?HYM8v2vX(W##H#>*A=dK{x3x`&QeO?B1;gJqk%dG*T4web<_lQS&p^bhc58IOSt{E(JX>(Z#f zin2CumaC%Nh~3h-7s+R&Q??UnK;!{_?24Q`jMLaVX+BcL4NI1 z-=L+;pTP>(>^99SqdR{UFxKjkTIvYPApJv}ohF2`p2l^Q^_{0CO{lE+mE!BCrrgNW z|ENsU<-oJ2r_u}7R(N=G7jVjn$*a$0bpIX%f!T#xhMT}^=f9*hz`Qa80g&qIRIeTe zzxF;rliRyHEqHZ%0|QIK!HBdQ<6xA>QRaZvwA8A8E+{bY5T#KpED+6}v$ka{QIwUf zNs~k2}@3PJiW50LeQk zL%dK`G57Cb?AO*{{Lv_AJuS7WECVq$Su@n+x*n}+A^L?usb7{O!GOxi%0yA3jh>;Sgy6idvAuCqMh>X>(~1b zA%g<~rnxpAW*a+xG(Zrr^Ei~3zJ68vUL_53!wgqzphr;LS4G*&y7AHyoyG{WZle-h 
zvh*wQtba$7xQ2J`@$Z|>0)T*Ld6Hhw?b^fOMQ{T4S zH2F2g=W>0ezMv1By0`P2EAwAMZrkGP%k5SUDhIjNs?5WlfVYtqRL%@*<;`VsuuN>U z%0Dgglm+zPPXVEfK@mOi_m}zYQX&X*nwBerg-{}1cA%aiG*-6xRaNvAR^dx+Sx0=^1O_aJhxfg@wQF(|a>Zx)%~xI~D^B1f zRc+7ek5O}G&`0P6L_d&hVMLmm(pa5+us8r{EI%P<>O-DE-K!4EVI~rPZ^=6#3vSY? z)lmfB;o;hKn{FKr_r{unx{0Qyi!2W{*;zIBfCz$6krU{Ihd-J^`YHpD!kzo>=giEC zvL4pN!1l(UhiqCJ#IbZ}eqC&>u0G9mvUw>96bIiiI;Eo!BdM-QTdwIKeFqm1kSAxC zglAZPJVqPs{PItGXt=T1kG%fyAr|neQ~UNKSo2xnYE5W2z{29R@BAYH6v=4O^QZY| z12O?c#fsw3FP`$gn=T0xee0_!)h(7TJz1xTfgbu=E;x(raJ0h#cN1n%SwRKb=9g!S zD`p%C`|ar`1{9qo*NEY&>szu`_Knh?O?$On;H`%bcP#kk02}EDZ(4NU znZFz1P*XYR6@I*`@)7jM4qh?f+rA9zeSS!(&ae2=WyOkLhA+C>Mg_?3N!!TM8?%XL ze|#^aFK`Os?vI%Z8e=(jZ2EUtGoPbjK-3>ojSKiEN8Yu4$BvsXxXfly-V|U|TN9JZ zYvT8gYHtM9opZXUehZ>p!^uR)<85wv{AB&}9RH*yx)TquSiO4C7C0-l&GUNdiee^v ztr3k5BjYOd5HWkzOuV-D?yTxHYqkuEnzq%)#|o6=G>Tsdx!Rk#%8j`^mKgi^V+1-) zt?Km|*>TV&(sJh7PpoCCk629oY8*D$4a>?104HHih6!7vn=!F^yM)fOkmhb;^Wj-p z_QKb%O3h_UmSoVG?s4WCU*VtD10KD!tqzMV&$YbNO$Kr-b#pWNKQHbJpA(s=1EKc; zPoC&e^TUf2&v%LLJ8nh*gtXij_3|+TFL3IQp)JH-x0H3I-Dt(;no5VoxKU>vn=*;u zor8m{ygGJbLJaDyC_3u-dlw~lHy*QYDIK|iC2w}H$ad}Q>bQ`#u2k4ZeGo3U=Kd;5 z9b@ielg<-uJ~H>}90F!?zKExd4D|DRQrX~!?_V`p`6>b*CMC7cyK&WyHy zPh-Eklg}+34$zn>5(4}rnro$frgz!|p$h=~2f83G&(0^n|KIcJ8~J%QrWLWkA4EER zBctNvUmZs8iNMQXR#^$H!L=ISci)+HN?M5*fyiT~Tu79d{rjR7aj$P6RbVuSj4^V$ zWCQhfmO=N?(ey$z!Jc_4dxm1EfDXJd7%~l>b~MiHVr(sVCidjuM(AAzPo|FOx2Kzv z$-A60i(ZT;HnKw^xbL&e&YgMgF_H$(IY^UXZ0~6Ht_#e^K4YpX+F#f?UZ);IODEvX zb*;CzPOVy2#NLD%Hyg^QW0H9UIeQcUEU(!tCQCrGH*<6Qn|+vI*!0}lH-Dxc=RCzf z{UN7k0;){jel+eb27?h&B=*z65lw5&(&3y*nnl!@fnY8o6`Igs$U$Pc!kYRx5PoR~kI3zw6GIHPmRSkX>b1+<PGC*LaS!Ugg~9)r@^75hA0IeQk`NElgl9M_o5X2r_G?r06v_ziqh};PIqPT zQXZqn>&+c9IlAthzS5QrE{pf}!-o&^_5C};w&j%1COo?mst1%z7^DnYmnJV<4h$=4 z6w4+8x_WrtK_$vI+U7u)58I1`-~}23WD&3bLGc=}wdY4Apyuwovn1feOucwsjjK86*-H06JMq$>l#M@!EdECzqgt zw1TNG;d!ZQ6-7q|OESE&H7oeL55-$2eSLH2e0~|$ttUZUl6VQ@!r1jQAQ!a8NUFBm z&z?P-^#OA9c-%#{qiKN<&Tqo^%u~fIt4kxCYI+TLgJLGfPz z(fqQSO6pLKC}k6OG(=Kyw)m4Z**x_u%KCxdt~ToJm^;6B?&{IAb8tP8>;@*8k+J1r z$rAYE7JvUf@@#sWslEgG#EcUY8f$1c{c>PUKcqLc;QTwlKYGolX|r40N_sV!qpZh^ zvQOz86^&Wl4cl4HNd8HsZ@C@w@MzMxDFjyQLna$+r55`9Y!XNOs%f*v_$74U%!54Z z!e7+izjr^He~ILlqelmmXGkzV-j7(GdPvfI=8+dr=`Rnp1EQ~8XR%=DP92C?D4mZ&yx6ACsZ6^eF80 z`WO73nL{?T!NQ8fDhsm3%o#JHX58Gqr^n1K%tUOq2C|9A=X3AhkomzwaA7GBJt@EE zztim3FIz(z{TAmJzxsFB`-pCqbY1BCYW{nxMbWh*|l zrkL7Tuf=rNdKW+)9d^bqdUDzV*+t=#gb!=tchTyA(jj9JT?y>1+I*-*tO|n70cii~ zcu2|_mJ-)6k|$@wP5I}xn&Nfwgl9eF`*EIL6pNmU&zh(5wxD3HYlPVYG}Pg`eND&b z*d0N}_FOv-1B}OOe{x_O*u`Y(w)qe>JN3fZvo{_-9EW7Ne#oy?9A6#Iv2(ed+kda@JthC6G4nGGj|8H_u; z?aHi*cmilFXZ9WxU~PY>DU0=38gG`cy>igY#xWH#A2a+BX_s_C>&^fGou-ps^Lbg> zi^^k`!3G`ak*1{?UtW~^#n<2?g9{#TXq~;;N;x=$0%*t}nM*9^Km4s;##cVjOiqf2vC-d)0wn8z`AgLlZFN-Bv zlDf*FJj!?L)`!rJ;`!GCs7Gvr-3L~Gx3~yI(O`03=nm&koiScuvc!|*H5Z1xucBBf z8*Cn_X}2jLn@8zbgu`xgt!o`cD*NQ*%&qrA2d6i6X-RInF~&AJ46vyn%id9yoR(-w zVBJBt?M>|>t692T($CvWhcJuW#8H};G71Kq7s zDD!r46O9C=n5Lh2wW?AtE8cN1j#g=(b2?#EG5fi9{d;rSg>(sa%Pf!!&vOBH2K-Bv z9`ItgnHgo^_59|K>i}gE3WE+DSeVlf*?f>&heT$4`Zn)wBJ7TtT8dYtKY^!YGD)6j zf5V+;bi!PL4I&wAhP5^;d>BRq$$$6wtWypwWn1W4h%9?;x?5dIeM~;49V!Y+=s!ov z)^FwOI|vBF;dcZYT1&0S)C+`68qCF@(UP$_EyI2qP`FEZUYkl9p-Uf4<_>F7!wpgE z=9+J{;6?Hj(!nLS-k zjXsn~pOf_!uN|^?)vOR>-re8dUPL9Tlrhg$JShpHIQS0-dweLkf@CtzkDme37vJfb zE-?pn)^16#NKaczdi=+8{^puqRU#-@rQXHRtP2HEcwS$vt=B{%J{MXSq(oat;l;$Y zj@`O-bDTP<9H*#G+wz?N*mb7#-uq_h^Jq@ao~ zl@()Y(pv+62kqWH$F1^5y#9-w$C|*+~iYvMo{r+hddag*(4_MD0N;1V)VBkuo}A`TKQ+Ttmc4b-sfBFXQ{prBS7&K<%rwn z_x%Q)Li#a!WX$CSV?Ay7+e{=(?8K3hcEkSSl$&>^goGj+^YpQ71v5cgY5y6anQ;5~ 
z$Qo}e(5~2Lvj)eoV5fpLKJ?MhC7o2>D6)CSI3vD2Pmp3V&s^7_R*W^iFJ~EhS&I>8 zT=aRg%;>6%NrUGBAsSeRneGgI#<4*)iJRwPo*;}Ti|wx1@^FmI<^5d1V-rtteUZRA z1B)}qxiFoTdgtT!6{{zM)C){Nf!tvYXyMW&uYUGZlmR-jq4%Xh1!&x`AsT|dD$K2e zJs&MaS;C?hQwf+^R1BeSmk5-_ZX#bIQ#eeHJaYS%rkxE)=`Q}b*T3-5iE!QU+&hva zkp;32V2>_XS&`CFQR;zX-WIeE*`pYVf8Ntr&l~* zX+}<$O+WLahXdeVNn?Mvs$TzoCdG5X5eNZ@uV0wJL7-Mv2;(!Xmv)BSCCnunPUE$2 zpVyPKtN)765!7g3catqzrILk?A$t%MnXl&S7Y-Zy&AEAxPMt(zA*DanN_XQeP9)DH zII5EDz;IHMD)$(k2;JrZMR_k8IpKK-2h9PU$C;S?%O}qSAJy!;cv@ngy;Z5qb-~8G z7=IkekGlZX8ZUn7b;yBTsDJA5z9VO@eR+CroL~;%d;&Eq-@7d|uBim*5%JyVLvR`y zFMZ>4Xi^B%5e|ua3?aLh;jo25fn+~rfGR9tdtZYB zf#?pFaT{UwxOw3z|3kY}{lV$`T!#w-Tub$s=~Q=$km&h7cV z=OLk7nmKF)J1K(l|HkwaeP>J^BaIHgx*L7cbp~VabikvqW9JFSQ&k0(JIgOsx078= zkl>?YGsJ+*`XP_m^is}|?k>BLw2wCJE`}XPqXyH|y4L>Z=VvVNiGVYVh%4?P)pd2L zK?8IlMu6Pxsrd7L@CQ0yr^}LPphbZlK0(6&Z1&qsIzzJ6A;fEHwabD07RxRA5HkBf z1_!rS6mK-9$fh+*_SN#yXfQZ9~UASXXWixgtjEuGg z>q_}!ub&>OrVMCm7VKsbcu-8spEP3LN}gzZy>|o!MZlRJy}R3`eEsE;w}tDzfoNS;gyT}D-o3Nsk!WvH zy_{xB26ynNjI|(8oh~UQhuG6|wx9Tg?2Rwif9~<2GlgSrz+?YBrpwt38REs!`uYBa zQm!JG96*^bi6)hCHh5CP?DWf0 zphu>j+=;*Hyi;zh9Tk9p>+E({x=}Jvmgx3gG=CG z*pz@0ZFR_m&cFd#=BQ97zvZMpD=`ORsbnnoF*Gs)rh=NQk0R859#8Jg5a~uJ%d@Zo zoPXPcw#EU^ij~}i--qUbgKG6#YAQ=T*ubtaT^6!Cc7){v&o*b6njH8c2{)-&J*kVp zV7K4GLR4ZWMc=upEm>L}WN8h9^&P``N8+|a(5#QWSA(WSQx6L8iZb{RHOO$A>~vF_ z8CJ)Z5eItysHA;B*Qw*6K~upK$5AAorVm9)|HgxRliCJ+PHFWSBCEiBZm+M_wy@z$ zpa$Gz=+DKU<&o*_xEVClqS7(3@0Hfgn*BGlUC>W>tg2CzbGOOrh`5hW<(lw^k8R)uTQsbY}&G4jnQj?MQj*T0Y* zyC8wxUh!k(I)1VvtC}{vpq}I-QQVb6?zx;wqi3^H-%Ez}$c`;9DV@E2&ZdP9XC}rO zbHo5b{kUH&k>Db_C5MH+>7RD~KDw}Phw>xlU2#%OANXtXvFyr=^f9PJI{o;rP!h?i zW8~dVK`p^SIy0*Pr#a|{p?^P4(6>i+0KUB#DK#fY^}W6dw?je@PZc_cG!S<9s+Wf69?xH! zfA{^`ai?aD7E{|_QyPWlUS3Pac*jvsfM6c_og4K5TnLPpy!JM*KPIy>ID@+6#(hdT1L7dv^~VPsR)EJ5Ul83D%YjW= zFNXEw#ttkt|4@~DbOSrS+X&%!k}GlW8esj4VeWmDTvA0z&kykU0h01I`u8ZW?vus5 zr5!>J8EYP>BY--@GIWC;H<2Td!uSVWn%}!q+qOfA8(cbcrBl5~aE#F3=cdzoGy?`n z6n0Yi?3QbxRM-+oOUlYVkPYS_1pe{=YwB!(qOQ|8zE_vdm@ji?iX@<;kYm`(j*uW3 zrJ{?-Auoai5jQ5lAIMB;k7Kr;1oj}ZW-_+%NYc{?F>}QGTfhDo>YM3w)W#wg zkYo*$dSQG3xRY)P2nIpRtq&&=7&~u(F3d9;7PD#zj zz^;x^Dfk=EnN^g6oUUfQ9xQyyOQ*!BDu>5>%!&zQrpQ*NhH$TN9tyRXh(OTPG%(qr zR|JCYqz3p01$_e}=kA*V!rxXlwGnjA!!9u%=oR~G)%85@KGOLb@S-O-GL}W(L=UYiJ_-`zxh*%~X$T&>>Xh(Z{c1nsu*gC=E#Eyb{FYUh-)dUMyCfCU@ zSLi=xMBR6p0u4_H8R!V{@!1DsKgmioe=#l0N1{&XbLDPh3bN$tOr&@{4+6`bmNb?E zF|$(&G7tSf{KNy)=ejN9QYk^w=6W}DC=~8Tre3N#gP+yJK1I4ec&2+D%Hx_*FI9V` z<`Q^qXisv)pA_9O=BD|7=dhN^${@p7rdV0!qB4@13FXc=A5rTI0j`IIxr z!sX*(^lSkvC42VW9mIyR#nj`Ob)g|kA7^t20}Q!4M&bT!3Gb4ThF-!cQHB@9^{Oef zmyAZEDahb~yyZlwUa$~Kt~KyS1T2A+ir3n|k5TxfFjOEACUj8f2irkI9t0w)tt;HO zM3A6Z?Q~ShDJYD>LVbL0mu^`CqY{kA-1yq*!TeF@Pod~bS@S}FJS%W%+0vyVTM`EQ zKTH=kI9bgQ|x$IYky+~qeWHN(4mnW8nS`cwu}YA<7Dct;A}&BZXER=AD~s;H^FJGX&y&jf35|)Z z!0D%YNDr$E1Ti}4{yxizU+7|#fBTtN0>Vm`{3Uq z2Fj=x)*Nb5XJsEcgq-qM`FeOjO9I2BXWTdpp6qgxumF-pU!uOxRCi%sZ7bBn%sMGR<81=5 z2cNH~$Oj)L{@%0uJ%DCQS9|*gzVCJeviCksk;n^Ae&I2T%>;sRRks1sq%sIFg9ptE zDLoDI&aN7|_f{9rdbYkn5=QKI*QB9RNPr{YkwG^Bk_|*TwvE_J(G@ExK)fJKRkBud z7cB4xEtK|@$`J7Ng(f_Pik?2E{F~6Ax&Uei%nVPsEanC$_-N zPo=706N&tKVdD$fBOq$`#XFd(Cfn6QSkE!ILsidlp~32%@CgiB|k5@<2nsvEt+2=GUD0M;-snZ;;QebIvEToTg`rTiC8AOW4U-1`2T9=}wBYMh`{p+u5I7^}x zv+;i0`Hz4xhV8@0mhIuSV3f{L5mwVCR@$XsRx9yi8fG~~$9f_9@#@4Sg|c~)j8E=xW1k5eqPOKhQa@qy)10Vr11}nX zB{!BU89Homd1uI}*+4mtBlUdMIUMF)W!Il^=ukd#*M7&f^*3gKg6b?rf`;3yctSnp zKeQggbKpqc(|{u@x1M7-#6BZlhZJ6B9vr_U4deBhHa9S9v~UWXknf^r0XE))vF~ zzVQTs^Y^&_LVKKd-G|8x&+{Wx=%3^Hs@SqDlz0xy;nf@|yT)D#$o7~3uj{Kc*VspZ z1q4}vR0ItYW|h~5qBluxM~+xtl0ay{ln->k<|(hA{S}^MM;@V#xN1tmwdUqP+QHR+ 
z@6M7nB~fent1BpKjo`-sK6=2sm&8=iHge&y;kJu5x4B7h4kWXNfV=b{iHLDdelMHH?OX?LE8CL%?BC z0r7m|_ZPzqTGF297*mQK3*cs-AQiNj<~lc&@7OH*K1W5rrgBJ1RAIX+_uD!E(`OEx z_pj~!JwB`$jwm_?Kj4_8jA|r7>ytGfszvSer4oz21wnpe;ldVEkh34pY?V&A4870E zl4hE94e~Iqf`=JPm2j<( zyJ#Xps%Sd4rRA>tNh*vj<%K*xZc$FfR1)M!w9eUea`dhVF%*&a>8Nw7WsVn{mC?Uz zWuq`yf0ez1gSE}eE=-NfMv^kKVl6N9^gBSLDW9=sn;4%;4Ad@yG9`X2Iiz)49fx*F zP@8&AJ327Y)=fyOwP<=RR_`8uQK9}Z?&0^O_)qdEH2g!4nh&N5LKeEo0aPR$c){l= zM{gh`WoLsRjN@{6GW(Vj*q5+}gqxk{R+8m?E;06hU;%pQ2P9?ormen{Pq+rBh5C9F zsz4r*e8e88fnho<7wHFt6-iG>m-Y+Ln=Urn%qme(qJN{c1><;$EJ1g>6j?lP=1g}6 z^D%VD7Z@6@)TO5BHgSH|MD9bfaNyHEbPW;2M*-xW2jVWlQtPNZZwZeTO-Nuq-_Y9T zY6PN<3CmC_bLC+7qC^JbiiwXM+s8byNJ{X~PrT$6f{+RT&VZ@8P!PV;<^F|5w;V~1 zjCF{yWx)%q0rN|}r6>s>q9FKE*!EiR_{+kmgM{(G-*8|N};p!8e02mclYWg?~kxCq}2S8L<`d6jKPB&*OTeB z>HX9AFb(#y!TGwA@L`#8g8bKL1fJjjAYA`4XTFmwYARo)*LRJKHFP^BDq8+bqFA9& NE%RRb!-wlW{~v%~s|f%A literal 0 HcmV?d00001 diff --git a/docs/cudax/stf/images/dot-output-axpy.png b/docs/cudax/stf/images/dot-output-axpy.png new file mode 100644 index 0000000000000000000000000000000000000000..c4baf5f92375ec17ebefd4f54dd70faa1a50a5c4 GIT binary patch literal 11351 zcmZ{K1yEIA80V!!8YHC@1qlH`QSwPE(nzP2gwhT2XaNxsX%P?sk?xR2Ktd7eZs`tb z_S66F?C$LBVFunj?z`{YbH4b+2~|;&BPOILL=c4dzPz*=f?yE9>!0|T@H^=tmNERn zeXJlSjhz4eNv+L_Mi3U{zVtl}_jhZ_9@>?9{R{dniZ5p05DArrpm#Dhi^s=g4y$KD~c zBg6ka?Zjhj{FsmS?Sl-p>>9TXLn)~Mwd}r?J>9(hv(w{MLSkZK+yD%ey1KfPlK@g{ zSfG|IeK{&101FXM;iV2Aj=4%fpxBFCVq{Ey_YRRCeJ#(XmaV}-<+v?`cl#@YSmwVW zdm~N@#-m5qsR%RF+UXk(Wb!oKCSGS+g-SO)rwMQI7H38W1O&*lVWL#uYH&Wv=@(8W zK`AJN%cf&A@V!*aHmj)iJ+ITuS*x*(lK+|*cxqPf?%{EGaDa`AdvtV!i;aD7a8P)i zf;;f?o`ywK=lo!SQ9*8Q?&r^+^Yddn=WpG*6%!LvR#ujmm$xdMpP!F`fnjK9n0feX zxA*(^?+5BEWZDW0E7Y@$OZQS-TwLHvxAifTiVs#JZEbCne}~bv=)=Q99Bk}wX2oez zAtdyIFWP2x6eiGq?pS};$GL3MKYd#4h~c_=_3CQSWMgCF`1m+{th>AWfR2J zGmp6%RU@O6f5S*aLqlI*UsqSx$cVI|v5{5q-{9tcc}jHQ!p_##w65Lc2Mtbze`8om zRZUGzSNF48Hd0wxS%_ygRA>_QZ``|gPk&`i3%5>a&e5ZepyJs$=G4Z9gmoXs0Fq1T z@9)pF)%h`MWMpJ!=J5b~Mn*C!?u|3z$rIoKjb<6u;Q=anWMd`qa-mx+7!)^2ZU(qQk zzFRXbv9YnKsj0B_p_J@O@9yx!8rVc=HoU6+dzb%4baZrFoLsN7w6wIXt*yTP*vI?P zW8>ph)CZqZdzWq)*0|4oi&|Y>gt<$VtD_YgQ;h);5fQKj0v28FM@w045e#kRl$5q)wT7_4zB6A# zPmljtnww+9^O;CKemv8aAP_>r*ilx!USwtWHYO$`E6dW%?A;v)|L%phmX@p6u3cki z$HT>S@1U}@w9L;S_Y7Cc>iqHJ$G30t%F4XguWxN`$}1@umOfGS_O8jh zs9^hzVSavow8Cz($g~Z%(?1{}#cRE4Zf=g>v~_5N^@?`*n>TO9zqoCTRS8H)Koa;i z`r|0Xa?=S|5Diw37*}S(ipCi|PZ0IEDWkf+zOE2=L(Kc2u+ex*mnQ8Tme{EN`RQz= z1c@;O>&jq3L0;axE>&}e&<&%ylMb!|`3yBb6rb1rDkP3|nJ&p5*`0;HY;_11XBU@? 
zgh4yMzl(V8a)fBzC#gy6w{}|_iFy4RVd3Vsg$;pPaLOBCTryVgd)~(E`!YNniw{EV zdy-I&#^8Psq zq-6hz7W$l%Gnl8x4wdQg)~H>*fuZ4q#1aKA;amf$7+XeSq+BTKbhyr&L5*tE#7W zJfLTH_wHRNK(-S#RoU6u8^11#US8Zgf`yiomi{StI_A`EQ0rN?@oSmV?ulSxr1;Ud zmX?;m!9U@=bu9TT+B5-;g*Jovo16AbkrmecpT$3S78MH-;Wl?moL{1%nwy!~-fCkO zx_w*M?Ht>myw6D%S7*Qh*33pLtIZh}SbT zbkK;$d`CotlHiAzmd&yxwCKL%+kcPzT%FVNi3Hc!F!pZ_+$UFEcZsa@k< zRK`@4I~CEJr5q9*9M&?!TYsXK)w}mR1iPs@>iv7BOP5yH*Xx`XeoSoSX-)jJE@SrI z#7OB*5q7aQH#gttC&RCugDzE2vrQB5?^9BY zpe{hmnb*@9VJ|N$b6Fp&f=X&-Wu+{(*C!?3K#qUO0V3L}H&d0$<>bJ@$jC_HcF{Km z-8@&gT?i?&xcyW^AOWRkUNmY74RuCzuTR~x_6%xP>`?l#CugvUq4!oxx?Ciau=5Mr z0ONXhFE3_cXZeCbiR0ChR)W&%R4+U4-5xoZSN-Y!=)u-(UfOHa6j8!M*MrSzXrc=# zzVDOO^SP>TygE5rDWH-$T`@Xembv!=RmkJ$=m`B^T}`dpFptY_tlC9^Gp6ojyHoR` zEQ@_NgfmLD4;p5*_JW_eLqH>+dB@x3<>j4)?vm(R6!=)7GU3$b78Zup&h5?3&8vi@ z+;-l7#@&X8hoQo-x3Un)pY0X+_6XKmd{4y7Gd`( zPg2egQ`{yzLTCf{3ez~OPmlIs*{iCm;PYFpu{?T1S~SOo41Nv#vu%;gLeIuv5hgsM z=>AQQOEZ^tGg=O|D*6LQ*auhuuj;4P*4EO}fzi>FUqYIin(9=B{PJ-ot6gj$lUMtu zPD6DbtqkNTC@QX5UM2m$Qc(XsHr93FNAjx+!hK7>7Z$MYcnjvjlQPnXm7vR za}8F3N+wUIL^e+oN}?DG%M_G303XTxrn1bTf-SjICnw&@%H0L^XVuKu@fiW1p<0qM zi5?XAo^okn%RcP)DO=Lt)BTfV^;+F*~({zV{xgA9D)?hYh7YW0{mJ%)LEY-M_I9h~ zzHBJ+>tvHX3^37ELH^ULU1y(ibFqy_jM=Ee?ZzsLe9xaN?b^bN%*@}Xe4lGDk>^z8JWFw&SR;9K_{N+b7R`y#~by` zBCfgh^%8@+!Z$5{T&mKqvzrvp4{}z9)PZ2|@Tjsf6S3>->*JsbQ%#rMD=5rASWJ%` z`oJWjPJo{DJx`9<+TArXb={ayEc=^GK^JHM^OBNALUV+4jbC4bfGM%;j+JMF&PRFe z;kO?@*55rgFc`NfsJ;EL)9?-q47_3z8=*B#^mK68^2fO#Dk?gZLKyU6RN6(@z}kBG?R9l;A0L3rvac?+ zzq-=f+l{Es50PHw>z=r(kBMj|hvQJ1-0kY4^r2 zEb5GmRH%g|jFlZ9P= zrryg8`UXjbhlh83Z~#a|IZu;@n!2^EO+;Au_wV1X@}HU)bCR!o>F<528;d_t{*(c& zEPnX0ncP%8GBQ%)`H8!!=`0jb4^cw{1A~gt%&e>uks7Z*&tRFfjC^EzcT>(!7MK`i zV*yGbu!xJ!;!nv#rMEYH&qoC_lZA!`8kzR8*OYQ;{9M%Xm$LI@)gxtEGQ z=VzykGhY-1#l*x|SsR_BAyW-m$N&~==Xc=CsKPR=U9t2B4DQy?b92aMVB%SkLOER3-_pQO`62sJGC-wBp4xektxLxO@5p4GwgC1z4mQbzFC)zwW; zPt)Acf89Ah@ZTa?q|7imcL>*C>wEekG&J;Yp%^#YUF^}#EC2NA#i>;&=M``9`t0oX z)ysO#jg4s;87+drc1HnEH`KE=NIh)^_xYa|QxOIsLzeTgvD9U{Ym1AEfVZpbDk|LW z-zPY(1=#IF#5cOGqpM3#MrJG{W{?Hwt3S^eu? 
zHuhRvXUu~Phiut&Kl6|ts$`{wxj8evfGc1jH`i##HQG75gr)EF*m9 z&Wn~AhyW7CppA)Io9b5@obx+}7Fmwv>x-P@ndYI!$n)pVk)cbI$I=*VSFfI)ZulBi zIhw-+MX>3P#c`VX3Rn5)P1*t;UmUqZ)C?={JRi}}fbv5b?_)Obz=t0_deoW%(CgyG zi(=#LyGy-2`|=GRR5-%I!xabbD=7^uIRm=MjAFQLi%&>c>?!6`VWo4~{p$;Igxhyw zMOYQUadhQ6PtZb{+X{SU$mMJ|OQL9{Z*Z`FZ$OWgl@-&kp|jHl>T|1YRBY_Pw5JoiCW!)&RHUy= zW2JNBfYB~a-3;h+t?2P-1C*o?Vuk_M;#oiFXXac@#|jtIDmY>uBs{dRkuroLu0mC2+D4Ke zr2W-2Qp{o`xcOHP1JtC(#_15|=ZoyB*fy`@;ufZ-x4sKa+VjkjJ{zkfr=Zx5t3L^{ zVWn<#<^NlN$?PnM-b0c{Fp88LX^`j20_T58C-9QTd92uVP1mc+l#i&yb8i`H=gQs@ z$(BV;V;xiPuq<{{!|X(@*Nq!DVBI|yWPZ1)xdS8|9~-OmTuMYO3W=X=HY-bhCXWzPvN5?gJM9zq?~Ug_^XvLUt$p^Ji<=%#E5IB}-F;l!?^s$Gf}D!NguK5>jyy z5!Y2?QYGatoi~X}piTCq%Q1*38`XJN@lv^_+B6E;O+0+?px9FtlBI=H{i4&i4>IAB zl9Gu@N#AeyqzZL(bO41P$Tt|ZY_eti6;orgnxpP+>AE%&w9os%Ej6d;L;d-gS7K%) z^b)+LRdmQ^E0jkP)Qva>QxHhx<-b7+<>mb#O2!cq6vW2H-ZkjO8+~CC=`;qJ=Hc${ z>|<7)U}_9=hU5e7L-V2qvuL2uo9Jkhb;XFYw2~#CADJxnqBMGpd?pPaSNcnXj*-t$hG|$HhIQ9&6kk zpzg+SNfsu#991@PjH)wH!(cPMxp`&y~D$Qfp+=6?$xCSXDp3kYYz;KjCiRC zvlNqB&-$JSK6~~IdV-6i^(p4*fI~4ze36Op~MB)mbJEHi1d~o|(Dn zvaB&1Rg42IOHna`|2Kpyw*A(O%zln)bvgkZ-{-BF3PT_z0jU-}9F%tMq(B)%1Ho(Yq2Jvs);MHif1P~hg|wDOvx znGe$sbuHRuB55skaAorO@zbymbw|rwofXT^-)mp_%LrhDcnxdfU%ysSS2sNjzTG6{ zd{yYVrI#)P5iU$40rCnjwY0Ra+Ot{-8s(yimSZf9q5Hjp!4jW0z0Cg z^>nh1u>od6K}jjXMwqlgHSz0Lv03{YyShV*< z)_m>u#C81}aX_j{cu_pMrJv+R_q!MnNDE+K62K7&D6iSV24p4aK}ww5^YG=l zSGzFP38P(-=@$bx9x2oApFgDd_;(!VSW5`ZpBNcb3U%EW}rDLgDp zxWG0bxv9Y53pFL>6^c;d!JfSrh8_VXv_X>mb`zqBndis#=cBU1$>cA0U9=qaR;cp( ztuF0Q`5-YP<*k$Uk<{#^OM^%H0YAQruP^@{oV zLE58r(}6AAWL&5EpkGpdxeSzTSNW#|#qHhYzgP1}JQ z$?*u72TqqR8$p4O4ZC`Kv|5@)I|`CrG3oaG3^fUfdS>5~p3;|nUb(9f`Od=QP#hLk zR#txh-jDP>NnaU669k4xW%Zu!_9{brxg51v`)9&yvE$Q?!a5;s4bFbAjaqi<@Y>p1 ziPK&62-#ja0&=EXyPl^%l11Rw4&o{2GvU1Z1`8kRws92zE$Ug%TTuU}v4ojV#};0DlOGmz`x;u0gz2JeFCupS~!fJIQId)h){ zgpGqkfbIX6n%OFZ_b-v!6@JqA%v*OLc`ceCsOfNL!R<-=WwI_uDCKY8QUPuT6#Xbi zb8oM<>FwDLu7Z=(4(vNK9UTG7wQGNio&H|_wV|6ppW$ughYvqm3G?x7Mk|Xiom+RG z?FUCeBP`G@>*(oGy&RRRTgJ)770z`PJLyAlu_G`@SRCX-dOB#S9gpnPf7CRpK>`3x zFli3P_zOxbQ+zCh(Ztg|X=!OD@NK~KKw};pbo-uoODkW0T9cw}g6~aI(wE0AgxMMo zdYvUDn}CYS_fDsyWz!G$292OFf^fum>5@hF`&!FxCI-6Lh7;``{hnXH9H$!mppt&f z%35)*237Q*@W-}KqCm?Bya+@mb2GC`qV6i`J#1`jPy#sa-W~q_J;KK7JmjfI^EZGx zAY=ohn+U)ogoPvInyP9P=(WJ^ySlmn1eBGRlfYeSRrKMg(#?g*R$(+_Qvuus^`2l_gQ=bir~boiT7P>{v3T-3&t z#AkwO6AFkr-&{8WW1d(YznPEO8h`{^d&#!4yT zc`6(T=&Y`#46$HG!217cT~R*r9~jWA(l>SK>PVvmz88JQ(UOF23m+_|VR7qwwEIO=a=*^>uy63ZfP>Gjo2> zAL4`@E0KXdN)x|6|_XoF3RRxiMSqDxF{UJpApWN00m+j(V~xMb0}TCd_6T^=|b zoN=A=j#Upb9&3>i1_9`~nb0*knQ{sR{LlV=?1K!7(7;o{AO7v3XTVO)8Gw>Qodj(Q ze#1j#W`av)BS`OR&d$!B@bFr6doT&3wV>3KC!fIX(mHxdJQJ~QWo!GkbH2a7f7SBQ zqnNE~J*Z?Rf`)Q(a_Twqqp2Jn{rxdd`)ER?ZK;9Nt;T<+l7aH-O8RGG($dZdj3z}z zMP%2rwkPF5#v2n_AZZ$$I_$I(B1Dm$vs>QY-at=MQkcCaFAx!#)gK=pM^mRgW1U#! zX&oEhqd_1$O@&l*`d16fTgU9E{ZI`gy3dv`6=`=Z?6f>~z&lQO?%?EPS|5G1GJ&cw zp#*Ulo^NVu%HWGD*iun=40>pNEn7^c$hHYOz@IPpZLw#FF&jm{GGP0A{BK*3b%KSg zYloxmf^iZ^x+xOA;(~&2ZCC>(Sl0et_)XxVpE1+__k&V3CR~EIZ@;1w_zf69{{rg? 
z##V41pc&}IKY#vgC?jiP{^vga`x>d5do1xG?g1!1(kd5G}uC+>}$RersrM~sx|`al#z1og_#YA6Sug_NE;v;!aOelK>MngNXz zygl;pmecE@#(|>h4())=Il^v!d6BsU4h5O1J^mJM9N^5&dlMi&9WJD7Sc+k*!&A3` zf(TR^mX%}+xCJ1(F1uFNxw=MvkXY6(T_?k*=GI2NY~=a-)`N^+qoYa5Dr0Z2$h~@) zC>Tn4ADh~7a*8%vL-^^4pm@E3d38vL9F#G+FnnKBO`{leJ$@cL{GcZAXWm#4|6uLb z7z4*^b$S|<^q2gAQ$1!v+3&oSMyl+FFyS$res_`MW3P6DZx|m>`|K(?Z2A>$z=k}p zE`ZYB87J|nC*WtwYfH`~R8GGqkmi7CYHI4>kPxT57pQfHazF4TYTWtJkl>W2L-=!I zV|6kaYF2`Yt*x>dYQsZApL(A3S-XHM+-Qva92y!L8R^K+?p;E_10`m-UC1w}sK7?C z|3!oIVkp=m@{so}kE7)0z)$=AyS2sFQ-h^jqxM5xogp~DVD@Qf^!4{&rwW6V6M6J# z&&4I|$TLVSdu{h7KEj{UoyCE?qMA2TEB$2sbdLuAWe_n>^O&88$RFWbB$$w0>X)~i z-Tv%uZ}0BxZ13-H!;4)a82I=g8Eo(EU6qLo33hM3tgEL7+N0`!CMmd|s+yWWBJ)0T zDk@y~;-7_@QtW+?8<3^6(9HK9%|3@xk8(nSr#LfEh#S1(8kVp=gyOFuOX!WD`h`_s?gBTa525}Bist6 za~3uLp9IPfAlI1AIE8%uDv#sx3FA~Ws-K{e8$pQvodrna>odk5Sn@}+u>5s^G1&8O zCH*4Qz=%QNs1RsiR=C%wiPzVZJ!<1a%nS`pBE|NHUXcISIj+dQw0A7d_BEy?vqR90 z#vGiS9Z+JRYe+a{ZZ7v{4rsC}Mo#U>m6iu%*y7_z>np4g>!uQMV1)UpO@*QCqfGjQ z8B^N4Y&WOA6`5X1|Iix2K*PwWXz27ceKi7{d+2Fkw9H$W&;Op|*qC&i-?_rUsg@Ig zT=ttrqiv#mA}}g3sE|XHcx0S%Z<%ghx(Y4j#fwR=$)JNXFdTo2Jz&23*Rf>0>&Pa? zv-Yhg{kv{n3W_YRw}t;9H{U)vpH#&dt|Qu(rLBj*J_82@y(=2fg{X%kSdB$nXR1?C zo54z1z_zt@bw@Vn=!E7S-TaKh3?;^m-3jq$bdz}0NzE|ovFdjYE1ZQ%wFa;EW`2Js zCIaYDmE$|9nH?=rFj%|EB5r5pPL|1PbUb7a2d%TPOfj}!*(g_GUP7KP>?kkpNv7G$ zKAv$3r}^uMKx*kLtO&&WeFIZ)(qnsJ@MdhqwrAe zU!xAtnH({*l6v1|rVWZgy7?&LM-B9-ud|0<9zHwYg}{i?w~Uy0JULP6xo5pPR7AxW z*Tyv`?DV<;xz(_Oz7k18LF>NwlQmyJzMxRc?n&7Qkmn&)j!Rdl45x&pUjJm(z}#^c zC3JTfE(YAo0EUL{#oMrmnv`~XG%jV8*S9|-kaHlBI{z3%A z3d|yc4F5r;_#aZjq><$|bFoQRc0PK(+jRQBkLq`Cj^SAJl%}ILce*8CVN}~ zgKTb!FQVIS>L}NZLH5rW!!UxAi_zEM!qRY7+lY8h@t`pW028_FFVXGKMdapkv!`v! zBo65A&E3T+@_Eh9)WlGLh0N(M{I0_{!lK$-tC)054)sKjyUzmmI5RX9uo_v$AsXG1 zy@{lTu6U2hO`l^(il2k2u5Wo&p(v?{YT)d!+dIQCr;!hB6m+xK#}(PqVqwxQvej9* zH~W+-|GO3WV7@{6bm%3IH^Lz<#g_(?nL&OV`yn( z<>Lney~nNP``H?9YHtSaw-1%exS{(|d@P8l-(6H)RbGc54jS7Y&%)W$_aR0NK_DWD z!hMMO91)~uGZ|OG#DC%m07&1GBfOB*4xD=;3c@`fr=~7(^$a+7W3>`}dJ%=?dt1i@ zhXsQKgFY4aKmeIKz^X~Zn!wCN+`eesyns5k{5*={o5EVf>O--Tk>0_qN-j^tnZx|! 
z*NmnlGcA_9#Yek|^9$pm3f2iDvRTz~Uz@)xeJN{Zrt*ghwF&ctJK$%R$#?n%6)5l?iP|7x8^MQVj!BBSxm2&bnNu8 zn$SzJ*Oyj_2~bJ2Z-`Y9bjQA4e|+xdHDu~TyAtw=>Gh|pFwr>-1Ym8|5A|_VW^;#R z!^!S|u3N|k))fpPw6%>ZVti@ETVRt?1r}A`&+2yv(I50M3b98Hl)G=0BE)En*QHj* z&CJ6&ryUg08u2xU8%+!oaL^C`9O_rk3l-cEO_UwgToENiOyS z1gil_!9r-G^I|dOkgJ6Z#%Qytl6{eBEIG7TgS_!o`}~lmOUU&Gysq*EQS&V&Hk3f> zdI_87S03~Sj)aE;c*o^8;Vj4&0fF36E=@iDlc&dLw402nc#eqC)UAe`@1lxp6>l&H z5d_niu#h1Y?XxyWj&-vEXXrvDQsTGFWUYYfj8Z!OMvNj$EpcaeLXhK8IoWbCPo1j- zop*_d7;Z{)#_TuN~gkorxfM~cq~OX&o{78~<@m6jI$zQAxa6=;BU4f%m$DXiit zCUK|3$HqpJ7GBaGbiHN5WlxBX^D{x?+vBxjoImqA8tFM(-nnc4*{AD`61{NTBc(E) z&v+AmpElTsUkqHDB3$uaLorKRvy(FA1mq85?D$-6Qu$Qz)hK6<4Erd*pq~!PmNj2{kB_#r>r)Z3eRq$EhNRVmUD~Dvp z+-778x3kP<(9mb^k(Q+;JJZJ1MDb>qOuEf!+(MGgnH^YH|0*W|0fDL?m;+`gj{rE& z2L~n0`wC4Ee;ofo^p5^Q8V2=T0}Uz{g~F&~Vglg?oFDxZ*A;=+3z>Jp=msm4ceJ9@ z#zqc5Xh%;k1?R)+{J81ptT5zhM0fYEl@=wX5-^~jZOs5%-#I)TtIABF{Wf6?ivms& z9CH|}wHaKi7Pj*<{;VXFY?6ujUf-s~N3g-cRaYQw)6*vB>#)a2Nw;`~`xc1hcqjVB zSd$Ey|MLLV#|%I<EzrCZgR?MdS%d1R*DACYe1S{OCJn}bGfNLC!s$OD zoPLChkwK`Sj>aL0O&#+xEIK;ENLel(2}aUeJ zGduI1J^|~IHzp}87t&Z+O{S+KNg1dITGRRD-0fH<+-VaN6M^o}IacZ4fRjg>fIQba zI+i;g$oSOGOw?v-&JGbxslScycFV9LXTcmJ!i_TAFy^aFSpD8V=DqNs!ToU`N;FZv56>@JwI!BJ-hp$$2y$OW LNjhKB*#Cb3iB(R> literal 0 HcmV?d00001 diff --git a/docs/cudax/stf/images/dot-output-heat.png b/docs/cudax/stf/images/dot-output-heat.png new file mode 100644 index 0000000000000000000000000000000000000000..bddd6d47ca208de7f98123603e6cd44b51641d6a GIT binary patch literal 116112 zcmXtg2Rznq|GkwN8KGoE_7+7p*^11{%qrxYS;)*Nlo?UV2pN$mT6QEMp{cTGWEUd- z=hO51zn<6Yd3w0rpX#CV)gsv<{*zN%c%6h~FNwaEhIzp4neYBKmgl!x7d};~ACze(RbUay5iC;J zU-iw}(5!vksNLlBjeWYOPu}|e-O9?{y`VTYtsrpZV@mEb#_pX$clVOLvHP^Tx%o7l zN5SxwLvVItYvA5b=O6g?v)`TYy|eT;G{JzHH;G?Agj<1^$u3Z^2pV_3wvM zp4X!z|NS;B({hR=g+HzC@ngB|l;k9BZEfFF9v53xxt}8setO{j_s{v1l$56->j$5B4hV{f6u(|^b8}0q6RQC%mKM zh>wqtG)KI4TvAg@+t060o&_#pH6q)-dU<;{_V%(E8yjOKy1n;~x^>^(b~N*##A{xC z)9Au|2FA)zZHt%<%HtKNM3_8kiOBX{QQZQcV1SV?y5*uf!x&St(%HEdydx%vBd zt~*ABuA8Z;J0IA-c=h4_@z{g}NntNZ`?`)VkIuBceyv_$YVzcq?X3f6b(Y3E7k>YK zH87C<=hw8@E!&j*Rz5l^eNn@Tii)V{XeJI0gA*rq2^rnqD2c4Pn zg(n}KR!~$FZuUWzo>9hLC@69z|hcXuD^EX+q3K6U-(0?2FUY%bb7saPfk(|QKm0j%jE#+Lb#1Nt(HY~5%aa=IHzeks<@4#M3_B0))fbT9 z%}*sb?+mP$}hg_G{a#>T^{DhtKa&wR$-uxcJY{G~ojO+!PY zxuwM-!u9vv){dad`uPZ@#`ZB^HaRBvV)%e9r6RgFV~hP5LdZ|>5_ia zALHvn)=eLsK3cHPQaD}8{`C6qpBulwdTuPV^6ZM&dG0sWRAiF3OTjCL(?dl6O)?Hq z<-zo)@<)M7rw^X{_BxhDsL0|z+sn)JBWn}ctzBK^r`RIRl@Zu8pDQA9`V~Pd=lm48 z{;sc_S)CtJUis!*IT6L5CV}1SQDf!g?3l<7d)uY9bq!I6J#(3Sm%BXkg{920T1Vul z|LivhEU47JJ_~O!GCbUrA>))G?Ld~5m1WR+J-(Kfj=$BJ}UYy47L2w|A6U zdyUmP^YyP!_7?8AapQ(T>-`iWgb*kvs@z#)V`H~-DkOcf54)4Vr6kIAG(}XFSj$U( ze*T963yzj$R?&9lXGUKqAGswL%O+tZ%ysn5^~_8hk@f2ERVf9ZQR=$7x{g~1t~N$e zwV&LbV!lWm+KTVQTdvKe?5%n1gVTTib-ph)3mi*39xxm$l$s!@(wrT(*>)y77)8(t{>q|pdczUYN9xrgR z*nP1!YmfXnMi!RCetvTDp8fo)s#*E@{LG%;`4+~HGP5v0O-t1>(0N7vB+sPyp_JUY z*OmpE$0x;84tlmpE_0Cb?}<-(Vt21w&nVY%kTI+3R7=*VYcVscw8EUTzUSknHvR>xTTxF)X%jah%L zcA=%2+5Hxag1gTf!yk3W&<&EAcdffqeJ1}H*v=AYZ|J_9#H6>aC;Z4>a=o|fzRXF~ z-^iH5`KxYemEE97i`zpnlU4+UDVwt4K2w#T(`kzh}RDD`OJVH zBE~2SxpG~IsDu-Id@;`hl9dAe?W%fQUGqQm?3HKV!COTBl$<+U=oop?Yqcro2slNC&u|m z&IAQnRl94qANhqSF-9mJINOtJ$lgaq@eY*|32&MFHoUD#qvYb|pJ^$|9o$Du zO_>qfsgdI)dm|eguJDAu-dJ1OeaMGj?D&IHX7Yi`%1UY)8V6K#kFS-{^^*2F_f3cL zJ}~Ee>Lm@}@Ul=imn?y7GRN0MVX>ngKs=WRETX|{bk>7Ox9o3LO$L?E7L|<1; zCHCG0Me!P!5?feUSm^2y)wgfoQWMO@r>3SJdwgM*l9CEt|0&<{#H@Q`W2p<Az-%VOkqSGbm8|Q`A?kAo;AF}z!g{ZchUV_js~aFg=?r8ivF`l{QMsGBpYaHYLeN9{9-ls zpA5h9Ldmz#sAZvnw63F5fK4Gevz;tQw$k-j3g!H^e#+k*@%%TBc(6tk^6{-+=;jL! 
z4tD)Bdc7lCEqiIAOYHXVtu|Q()3o^y$L(djhqd~{wnD}`Zsz&jrfvE7kZ~|zlH7*Vr>+{c9<9x~(U~cGZdgkvfit$Otjx4~w1BpF(Yhu;2EbBuBgig|1Z5>2zfG#35LW=dG#9Pk3! z5)}(+OlfK9x@ssIwv5MT>7l{Fy|-`Q)6rYvZ;UKLM;LD?-tl_x-m;U?WN(y>=&uwCN0}H(#XHA}&F>m_ zWeuaoxUAhz7o(#p?xCflV?q0K{Ifh|petg5{wd|tP`|#g`FH*5yLZQuv$7JE;<_t~DEW5p%dC(4{dliG`RiiA$hUp?$&s7;wNIS1qV>OH z6imaH$I1C_)U^64Q}YGJP^&U5!SUGE<}jL?%Ys;j7?rL;nyG;S?LM7rIo40k1uu8& z8nl|c7b~yPd~x9?x>zoLx;*#yeY8|Lewk2(;XQa>5{g)4tiwtn21J;`n1*u6`6|y zJG|=7(_O;`wFhtMCyoeLiwUIm0t#t6w6XQQy3_ zrXtr6+&=xr>-0T-IVy%d+fTH->d8BM9M!{i{~pP!&-SF4vzeNiX;-;_Xa?9T^Gn_D z?Z356y#^qTPh5N^tMyFhErBS;73KaH{&Q64ga&~4wOSLnq7fqV^Ye`D=~BmkNQk{^ zZPl&$-FMcdvK(DsYx2GOLeOrbpN~kHj=k!o2~mG9Iew6G>~Gy^5#WS?KmtJMy#Q0Gk$eTr2R!?g>;C?>eqdK?RkHrfZ;H9|mfTQC zZPrx5^TN9xGKnn0?fp4*>U>_cyT0xs*`RgiwRTrCpnj)cFuYE`XP0VlfcHTGPagpb z?)t4wrT+f@O4lR9$b?-Kp(ld9QSV^y$2k2h0&?ZMOn{uWl22&}5>8qSi4b3*ywfA&Tu_Zs*MO*fWC4e-5a+PjrSA=)Qj@$m(tT!ow8GAmLk zPx4z1s;kzd=NcSyy7yIQZ(n{DY{@6=sn#OH5<{sBnWMYcJ(xT#an{S{JcayFxl6UElRZJ(2nKv0EJEc5G}jU!Rnp_=+*i+$}#pe{gcL z$ZvabijSY4_y@~6RE(yRg+YUb9tF#!4BsZadDM5@7Fj;AqvJ^g_m#EYu+6d9rHreI z>xXu%I;rlG$aTJVXR~SRiL+!W<#^zqpwSSCZ%*gW_J7J+2@GKzmt|{d zJJxq=>X(h=X;N{bw_yHvl}mG}eqr7}>8H4z@@3xf;3F4nxor=5RHsD*e$%AuI77DB zN?vn)CHQpi`E>H>@OJg%4cxvm6=~})BG-H_DUC;?G~druY3ugV&F+ zvmaHrvE;UVu)Ix`8h;U*1s9qBxwZ9$=o<7U78d&kSx#m1YQ z#m5cFTO48oz4F+Wb~7!67qmN_`%3znlKZ#XQMc2=4_xW!+=Bv?FSc9(fn)CWzQp1x z$QzMcjUY65DbQ%iPN5-F^X`&9rJnptbz}Z;6rG`uS+F#OExkSuG808>AbNi`Yix#_LhB^$$22D_aDZ2ZHWD9M3Q z#Xepj$~woU+TjuxH*)-9-XO&#qKYpy%dhT=3^<6k1 zUFE4pGBR@VtF^T~Y^ruJl&eD%F_|`dEu~oMCa4PW)ZJ-LF}>f6%Q% zmWIuVn?EBuu;XPMmB;jfy<|&Mf2Jfl=(hRI}U&~(lDmrEc2BWvP_Vu`8VTSw{e#`;We$lJaxcCF7L z++ho(flpKiXTnb8o=;J)*>*S7?%Z+4u87ZOu6Ndd`daq;`m6Q7csI?gQA2aruD|+h zL;$JQd)q@>MwctChBXx2svHqFEq@Pmru{3#y1H~}1_BWg5vd8LrA>{_`)TCmpD?Sa zr>CY!a)R3tCRspkrQ*{Z22$1D+Wgz@Y#&LropDz2&eMX1Ms3M7UzzX#qdcZx-@a?R zySsabY5R@=ZuiVDFXsa%m0CYBni6jD`SD(4cw}VG?eu|D4dHx|B6Rlq31lyt7KcfY zUHLaa8r#XOR-Q2N#lPUDb|3WE_4e)CZHZ-ewa;HGehJJgSNpqchEB12_wJ~usGZ8? zAJ|mB+)7Gn*yCBF@%{PviiZyyK72^ih@@oV4&AZ~1V2ME(^qw0EA&DEFW3t(bL>x! z&J9$%cXEikzt2gxG#uk#78B7RVKEooeCN74RG9tnQPAh$qQd zf1fkU<{RobJM*97c0KtSm8J;?OZxet19@ZPR@j+K1ZWyu2sXSS?f|Us_p^_cmjYl( zgIZ&2b6hpByzY-Ei{O#Hf`XS{UfGe??fNh6&o9>g{r#1i7a*8-?_LsK3e6b4UKX%h zuD$o@FI~D6yf&8ptu7=pl4^&~`tSb;;_<}i+S1f=>XeQfH19CbQh~$E%N}2=%U-9xKR^9Z1{CG`>>1Q8 z<-g0lAectx<`NrEHMF#hRrq$@K6HI@(p7bRMi%fr8YEQbtpg*#)#}uZ)8zwS#k%_Y zxzh|R=Cj%}mHcN3c1+gg&CU`Fu@_&34|se|dg#>3NY61?-s@e{@mt)2jYj==*$vx+ z2WP(3W`_PXcpy|vwBVMerieqn;}3nt48YDVO|qw^Z0MNljJ)*x>05u#VlAD@Pw~6T zw9=tN-S>}6VX%4GnK^Dv^7nuAmMjKNVjRH3NoiFVc5ui|)~!nx8}qD%u8o1c_@&aT zaQ1zX#-r1n3ZWaTAzc)1RaTXiv+~c31AWIk8hd&c>$HvC?wb`d7x9>SA{io^l+ZKI9W6X=_mJ6h$m}!hY)yI|7lwj)}@%>1@{4*5SGb&lwmQ z)fMU<-W0N~wF?{gbd6huBd`1QtN1;nC+b60FbtgrP3kIlE25nh-Quosw(7Q%v!|V$ zmR1Jd?He5(O-f0LXmf6AY#hoOw-hqU2pY3}>g8Fk9N8G2mSz#_A|j%j!0FHu#~QrY zCgjqmnQ?pq?wb zSKR{H?mP2YM)l9+KJEQ-?#^{(T`%8W#IwB?hPAWszI=(FN>8#3URu$f{Qg27F-Q=! 
zQBf2OJgObxq~u<{z9V1GWV@hZfR$eO?mwV1zrSU~R3%5CvYr2IV2gdR zwdQMyJ$i0-D_eSHanbQZi3Q1a4ms+`Ms>Z{$WYM5QH6yQDk~SCdXG>x zsHl+(+&m%(4MAH+hjIXf5Bb%CO3QLPl?!tNT`PMIdX*S)-75ZY&EmdE_3@JYe80hc zISH2SLKiL^0d4YnX=19sHfU=0=~EU(fnw#Lse<&V+S*!$z(wbqb4tR(dH`R9y08FV zBNh~`gY>RuK!|3d`QUY+O50V+Wp5t2i_F~dGo6uxLsYu!F*6kdr*MF>tgPARFJEW^ zymrKb5};Fec5zYhTHj$+sGG=*@X~JVZx3}Mz z3)^A=$fS~PTiny74b9x^59i4#^yHrzK44Lql?Vr=srPzaQ}X%b6AkF|+I3!~@+wJ+S@3wR?->eqdTH< z9Pwxkkv-mGS<;wsHnw3y#CX`&`}ngB}yDbLSoeK;pm*AH6zNn4Own<^jcq34oWi>-@CfV~+TT-R>=P zIO?c1ZaM>=?PH7c0(8xijOCFa`Q4USPTP~BK4jKq%h;74Ddq17)Z#K$nvs2h1>#X2iJkMfX9I(Z0$i z1Kt2Eb_X!3;pi(h)}sI>>nn3YtM7Ot!nW=_jiL}gz>(GC?v{D|uweuHJlW&K&o`D& z9ltbRGghQ?xX3`evN622>cG#wYJtJ2!oWXMg>Ii7d{>c=7b_(8-WE(K!4(6$$$+V+$>~sni6E2UEW; z0ty@s2zVNJ=whlF6BB=7A^EBM-S7177nq!rw9|Y?0_|i%V#4o5!~gdJu+g5Dq8A(# zWTdxg4SvQ;tKjL-7|5&FahA!xvnk@Vo!#o6wV~ZCeOO#BLgp$IK_*sK7rVE2 zM!rq;+SGbS-mZx(q%CrAxI{I0PI~a%{voobBp*nRHyoM^VpnhL`tC11pufN39dq`| zFF`LXXxHA4r_??7ljc|!&lj0Q>csEqEpju%=bv9({3kwLym;60_>X%Xg#6SLy7ku| zJSV{r?v`_9#F`4-dhbE)+2jSp0KU7a7N!bGoPZ=XEB!$QA79v$@lt@58pSW0o13Be zy{tO>DdyRQ_|KEBQ2O+0LNfE$zpZxK-EXlfdu3HHOw(XRU3ye-hMsapSn*X8$HQQC z%6(odALmGJa?sy&KTzy`z)k%gqcgQerCrx|Ds^iyu@a?2Lsqql=c>+{722tM`~KbA z-@o;LMgjMarGW!4K3Osm>tFv;{ofv-E#Ce6Q<(PYNC*A)2y}lzCcSIB$Ct|Jm4UEq zP;yI)?NX@YoXX0|P*umEjq?i%vYj+FHGPd7gnkN)nRN4}KKd=(5&8Xf@fGh>jCxGX zWRwTrRuq{l4~?3S^6DNxURqX$t!M-|RZ@BpF_#@MOvA~=Rcu$QC@CrV^y$-6ZfiM} zT5tBjM{tpmm!kgf?>&#sbd6&c1j`YtCB-zf;=AlMIje1-%P-V`gKuSu+P5sKU! zcn(}YKjcNH1oYvN&c3*VhK3gs>BEQ8k~UAnt;+5p%}}X(OHaz)lJ_)%9R+IZy{5Ul zqYzKtW~+7Glsyx3?b;|T0Vt)(85uEYX{-drgN9LvV(SOf!|Nky&CObyzskcF5J-uj zvxqPce5I3;`tM_kfP6oac`zQ*(rrW^V2i!-vSKjADhCG#OvPURv(&^UCQ>!DwZ#J4 z63amh9YHB7#^3_W#F$Sa_;YA0Aq0+ z*2Lg>-?D7elB$lF0z)?f`X0NzL?R}>coDR_Mod3t_TOPL-_6Sl++3SD6!cr%KJ4!S zEOEo>Kb%@JG&kShGbfF*Rc*ZnYtK(xHY)!9{Ttg*n3F`E(uG%5o!2Pe+^9o_NI@!m)43Xv9r`D>!l{+;>JW0W z)nnIOKYslZQKC9nSkzwpwM#8j85R#S#WzmRF8t)cAT`(=%MuJH&yg(UlT&&r=B$%0 zK}z~5qWth^F>out2964tE_<>|IY1h?4Ne7qi;Gw^e{oZfdsRoKd}3|TiqGo2LvTn4 z4FiKE@q2gTL)8yqnDT(qGPSFR+;6HU8dKWP&!0yD4>ROFxqYj zro|b`f%|iFbH6?HCWl%~sOJSv`lbbh)9ZO6rcVqxEqKednMYj&8pSE%ZI|V-@p1F> zUOLxyY1D`Q#hdsvJ~l^X2Gf(`mp%}-#l^$JLnJAn9dF^9#W71uOL5!hsSOS4I1R$` z6?^TPIkCV1Vu6}R1X;}P9RYC@9uq?iH&$u_KWi^)DD{pVt&NSOYHDgXByCHGqj^Uh zO?1%*cf$Sw>F(i^C&v{N%r$?!FMbK)$nM>e0}{V-0+uR4!h&Q=JMi&jiSlBQ>P&xa zW>OL(K!(1nYf(=HPK0n?>020JHxHfvu@f9i9^eIbYRC1qd_Qsh5^3ULrfDNTe!Kxp z`M1Ad)Q-Rh77NLfSUxqDUveAnyCoFTsbg3)!{c+`Z{cry^K^Hh0Jo2}U2p5`B;GfTi1U&tI2!E&hPGi0 zdV_KTt^Nupg$V1zM~|FPe?0nYZenIgfFSBS)U=H_w`cr>kf0Q}$X{kvHQaEC93&}$ zZ{NGG-Iw})78?_FptCm_UioWOJcxd(py#kcRn z^9U!yFywv5jn#$idk$Vh6Z!X~e=_Hz%F5w8{{0Tvkf+a{iE!zb>?Jn3uj(pnCd5Xc zIFUhYv<<=A7t`~o-Ub!f{=&*2u>kWd*3z4=&w~EKAEc_sj86=!XAyAtBXseItQ}n{ zE1aO=fwlNs2pr2l#e{b!$})U+MEcOunqsd+&{RyL{YPHW7wprsXf`ueX;~O)C6WM{ zc0+|HUsRuXLAAq6N>uKSM77N#zn#T#2%HIEg9!{dE}-5iC@2UfhWKnZ2_=YkY;;r_ zQy@GKZGfMXpUSg)lJiARFHd&E(6K$QTMrw!y171=mcXy?@^1+&TwF)d&R5~|`%)7q zi(=W_)TB9e!jyG%Y)l)n7Ff%{f^<7ayhnavp%|B8ia{CzgLW+4KD6=EC-z#qF6id} z^l3cmEm(96vV>s~)Z1a?%%Q-=d}2G_V>e2q3Fje>nBXAMtq2ly2EO?BGMRrzCb}&&H1xKDcLd~;9`J-;mNZJ_ zh{Q=TH<#|I>=+ip-+K z!0vc`Yhyw!803QZiHHBTMikTMtDXYGFVD}5ZLJS%1^g&3M1eVu82Y;p%h~x0@i2m{ z1y@HyN9VLS+D0tH^izFUSkudw-oCzh|9!1S8QaMlg=3iVM4aSP^HKm6-bEWF9Dnd# z4o^?tC-zVl!Uf@g!HBu=R%uT9R09b*Fi{N>k)`@funuDs&k!)Y@en;6F!^5~8I@Ve zBI+=KYheyWMRG!JAp`QS;1u&?z()`cRfs|PC^V+G>+99PCJ_EJ#8f1V3wfWW)9{ED z0^Z16xNyNhH!0ueWn<$dP#dgMYhJJgAjErA{&zkJU`lThUXeR@j-r^Z!LW$RtFQep z+lp_y_2kol_#8rf5(FxWs`&%_N|dM_L=YRaCgsx>dv#smQo8W&-qAg985H276Y;hP zza=p*u?QJq^c3_7ea_GN5;P`J&|6xxe+MVM*@p5g@B=kZ`PX|hWHmr$Gpd|Pu{rU1 
zX**xOj3lRGkPi9dN7TO@jY#e~hYlbx0PiRGKE3Mg?IrY?rlz}t zGG&&Ar%vrf4jMG4Zsz_UsDvAvNIb~uPN129Rd!!*i?S*^jsJ5Ee(9+q?f;{mi6)II z6B8FV40!vkCh$?t2|{2yp;k}wu)6vcmWxs29o4JuZl*K?_&Df+nRji!96^SODdpBJ zT6Or5Kre0ErmIQ}eS8Ej3LY^XsEt(|gRkTZo+B+egi+mslo)|f142eGVjje^SmJfe zD2ww`ynttkMn_nRXTChTNtg^HAAr_oVqyYzF6wwRHa^bG&HWaI{gD4GC!DFk)=XSn z@!hUkWtN0PAG|VQe}x(Y07-uQ_;LK%rOeDd$mP}#AF7()-U1WnJN;2&czF0J0h+VH zvBV@MPJnR1dnKo&kcF3({ine+gj!8_-(RyCp<|cy+S~{vjt2!ABl&x%1HeHj z>TX}FB{1liTTESs3I()kcX3MK;D3F~?}GN?{k81w73FOHoeM*LJm=FsT(@y44O6yc zcZl6{zgWTok3)f~a0FwH;vEL~Mev>rD=YAs?~sxzr#u3e&{6}8j098M+XND;E~DhJStF#Ie|;#2Cd>EI3I2aF|n)i{Q;41HhN9<2!I1A z_T!(Df!n_1c==m2zPUi7!d_yc81Kq1SKnW(fKkl=l^YC4fUA=*k_c!&CMIT8H=PQ8 zr=v(IOkA>f8Xv|A9-iz7a!0u44gWIhX|wR~bRkqfV~U2&2^TeJ{Qdn2q>PnLq3CF5 z1>V!u>p6_LqCvZK@br{Ii9ye01>U(HWmL%0pknCkoZqvMANJaLe=Kmj6$u3J{*}3^ z`MV$7TQWWC)?WWSefo490CP8J33NMNmVJS0nGC+O+o^MM5m~Ef{UL%Xm zAAZx(;XF_mf)V5nJO!{1#uOBckt^wzn2Q2p>LQ5al+W$Iso*t8vge@3AQlZqA`<(s z2nYXuufc|r{qyDT@2CXbz#(8ldlm6WTdHoA4>MB(VJZ&@yP`a@_q&*ql6B3(t370l zNLl8aH*ccH8pO0YGoxQ>t49CuQ><|9<}1k47ifTU6Dbr+OgVC%{Wc_;dEL}iR<`pi zWtR7&Gcz-BZ>O43GX^NR0up97uq0^=0xS;o?2Rm)%i!k_xEg@! zg6jc^d|e89D;L3O6Ig&4uVE`T10rwg4_UEQg&-U8g%wgdxG^(ahjEG27nlWbAhrfC zPD2UfjlwjGTqhi8An(th(1MoGN#yRFF0VJjuBIi}j*X6wSEAGqw*^4QxB+zITm|tc zv7Vf=+ewl83W98F0+@IyV1moo7_x!Pl6c6 zjHP2_#DGTbj1`PIl>TF?ME^|YC1xBr%f=CZ`RP*y8(Ukzm$VdgstORfc;Vsef=4sJ z8xFXuNv7?9$w1t?`WmqJE@(?cKSzki78DeiCKE^`%>)$zK2v@H0S7sbwvG<*T`t+T zZ+Afb3jF))6Ye3M1_Z`+F4m5pPo6x{&FemeyBUuj$%24d>}O>?h6a3yK_eg?5Wr&^_phOV$y}H@0ZZ;p#q*+|k&8=L42u>4hIh-K z<3w{NkU8#B&dL2f}$uepC_3kAo4X;^HrN z1fBsV=aaP;4{UC37Svi}(Snqb1iye<);;|TY$egbVESAgy>3jzH+W?yoY_CA>dBk9 zt5^3aC}f}jcu&6L1qBBHl8;h{L7v0Ya}N48>ssAj2ur|{thWyktiqKYJb<3tbai!s zRWX5+gBZ;L8aH70UGfeqZ-F1E17mUET0BIAhRR|uhU3(bt4@L(Fciufw%1%?)M1au5?$#^$&tw$ITxhIk}x+9MaR- z@!-8+SR`$qo2OoQ)Z)1R<|-l)S2K9=kpRsiL12Qx|JR7WK9RzOpj=$sGSDq!v@AW* zm?~mMSZOfECD$G=k>;g9&=bD@yM(O;RK#WaJvUxGa2`5#a^u%0`__$N#-@d$3lJ$tSe77AfVzvnqn2f9{+;Yg=1zyM)10RShG0>F~s zfYAXd88~Co)6;vH+V9{t*FSjU(#{3|-igE-YcQvg&JGCxIDY0eM23~Q%nvpS)Y1i( z$GuT6|8x~aOm+Uj6^?ivM6C5xhJ_~DZ-#2f^CIN0#=sV0_$m&ShTwf_e54@bW06O% z?U6Tv+0c;WE;8M*Ek$4$lchnc7(2Mx=<@P%;C5Dw+ZF_sLuky{d}_GA;BBZ)*{l~j zj>Bru3kHDDxOJ_=(7*t~yDIOz9G@)kwOGU;|^^AZheUTmB^9*uaDnP@lF6?)37TC)yMb0D;42AGNSpYhbFij z1aZ-tIei1Mmn>oPbpBGhFfBVB;2bkt3wKcUO%#2it33x)fj;+;#t>+W;90Y5j~&z0 zjL1+6s{<24T;+hh#=5~s7WdS9R(c6%0SH_Le4+!IgVJ!~_R%rRwUeya zYjiC9Ko9)~NXFUOIqB9d11l>I^gHL%r+u(55uZl=KeiWkbqc5)+8rb$Bo{qyJng6G z@!x->OYimINWi};FlXVq4oU<*Jl!nK;P7z7)vN6Tx59$ZvH=Ld3Q7cYfDeKL0V=V-+G?9J87&BjEtUI zmM2`z+#`vBuGowUH<=-F7Tdl!C{?@6EM%OeRoDw{#rwjAaZFWbaLo#a-;)Q!MMw!P zc765*;hsYEBCupu;P`Hk9y^kflmDs7xIA6S~Ta`+P1-K^>xcfwB zpB?4G!5Cxp(rp#=TEa2Hj88P2FEP42b&k-Z@Mh4GjFwn`LdE$Dm=yflxNWoJJU*3bx!uYe0KjDlas%+(n&qM8~!Gv(uZH zf24phg&$j$P4*OAt*G09FAUiG?|T8B`c0+uByUz=e5D2w?SA5E4;lm}7I(YAF;?P| z)`hu#iT$#&Eok&D;8i3@yayXD@lC7Fm}+}?lopv4CJ@=EbYX@OS~K7N{iejhn%}o1 zd{b;E2XJ%b^XJbo=+d$Oci}35TSXK(0>4Z&TE;22+A*IYuu4sD-x}2{_ZTKFEyl^b z;uRA!B<63&u{UW1m3`NC`r|MS_zLo(DxH175)v)aX$Bo+6|~J;>RJ(c5+j;9kJER- zQ*(lLSd39teY@#>Z4_gNldd(9S0f`QKUbFp0GYNx&T9c@t7T-A&=Zz6W`~b3IZ$6u zT-<`D!Y3hdbV{$s2^4p#>xYuGhn1Dg*qKskGX>fG`?WBBY9S=V{+1?VdW^-CHLw(b zsc6$=Xuii#ibfr3TJGx+aNtcnJ*L1I&tH!KqmeNp{>2vqw6vl~q7oCkoN6-T1Ylfq zmE)*zZc7RK)3bP70h2HA885()B)A2ETk_|`9|vYX{`uIdyRk|0$Qvcs*=^osxGLV> z?nvRi4gMkf@Br4RdLyCIrO;5tc%8={Jz5)QJ9ZJc$s7F4n*Q@7#3n6DR(FxfJ_7%s z_Xyzjzb>GzF5#Lh?%k^JA8;_S`Oec&|0u1FIzl*y&meiPs3Tn*7d_*)mc^0^t@#5o z?!niSieg1L3{$eZwVdVzt7GD}1?8PM)<|}@$VB59sawy`#wN+Zy>t;Xv*KQAU9yP| zr!UN&tem-xbo$nx3X`(Go~IoAr|w`VRRZS2=Z=}5gN{lO4(&)-S-GaKKkWxI)y*S1 
zWS7*3Nan%7v4qnm-cA3dAZ%(GP@1b&eAwKSTu+?s)9>|!t!7>+sYJX+7ngT)cq1v3 zj~q`YNxGz`6{*GdhdIYq_*vSqd{Uu_KTmg(`R&R1Dg5kcuP#5|_a5iF#Yc4Jf&$)Q zI23^gPQ(AIWx>OTNyMf3VTASF87UVS zp}Ax4?>>E+LEKb@r9p^`i>s%*n-`=)AM&RlxzlTGyPx*YS@o*B9Uh8TPUV9#%z`O5k#08BL@>)o2 zNrdgmjh@m6ifFZ!-9A7P&rV%bU~WL7XKbCRtfK4+(mDR_33He|YPUAm`O^$QC>#yG zVpdSU`Nwc`n{b-J8bhHS+*vu`^HWeg5fzJEMglZ6G-!ZYAS_3fcG`_jO!R`Sb3itYZ)rI>{$xG#5e{~ETya6E;9+Z(%r6DTH06)q2(5p=i@{O*yLHWk zW2NrT4>2FtPYEPR5A?L`_?XJbCNELm*84&hN9wv;Ju*tOhnYHNMB&HipgJKjv@|yt zP3PS`2m*=^2HpMg@`Yf=7r2b#m*)nu1sKjBGdI^lxo!6548e-4m;V{boi2E z09<2>M)f(sK)VCX6S0STKf(e6(J;^+Ra8{O*K0hWZ3UL&L-HUNJyVS>g!f}9?x2Q_ zUi>dLTLHxJJ5a^NXA6zYFVHzss^^lho^7B>mZ-9)B~_$5WR*=50i?F+3!x85ytr6{?z;sm72N>@?Aux8R{F_ujd4XA*|hz@VTd4k{iD zWNr|3ok5;S@k0Iaw*nJW@Rb^(dM=t#@`DGGC#|e{0DZ+gjP><%VeT-4y#_gbJvNpG ze5Brxh2NK5SAb)D{QZSOLPB~zeiQ_9rlp}VnF{)7j?GoeHMYSF9EDYFdp6F z;7HNhn`5HMdxvt*yU*`#1cx(_FW0H7YbNBe9nI{4752F2gFgGZV0}0oh&w_TS$qS< zy^lQr?Zw#Y{TPU&-|;herC2RK2&piL${qYbpWxB43C1}2=FKmelrf)FyaSdenbwSN zYVzeKbu>HDk_2g0R*2J(bW*+65*Lis)53ttvGw#TD8*t(NXx4WPIZTiEn&i-=Tc1F zzJ0r%uI?4kE>lqF)0lBJVD-`)%A+@U6fP`Jnyg)P!cF%-7`R2%dNx4#invvS^-wKkx!h~K=#-I}*$XsMY+oNr{JO8N?>)ZR z;-c?ZHTWQAY6`!>2GXi0wn!$R98c?*D}jdi{Ngb5J+7*nf|sTI+>!EHT1FhX^aB=J88vOZ*?gw7LniuD%h4J8Sh=*-vf-uW$-7jY!Vm(zX-B z$Eq+0UWa?-{I*{AYy}Kzz4*cdJ$vg+@fOEFtR6Z0tPO4`rz-bZZ^v|976a|wbX%1#u%M;m2N$RG>=EbouLH>!A`g?ipB>|RsMmt?rTKXCn zxO%)f47SyM`(|@-;*Nh2E+x2^Nl!t2fa+y#9Tac7mQ`8lf_QeV z-b;GN#Kz|Ozc|0ILS5ZHQeeA>NsaT}eDUh=5!d-}(q(vs-v8~jq0-jY9-Wyvg4H}= z-}YeHu)SDJ)c+^0H@ZFT!p+AgCL?f5V}<#0s{Ul zi{M{<&wLB5`kXRu`J6vrfN$rps5(^d(bOsr?8C08bclDZ?|{}$_K_eG&#aA`Zr7Gc zN=dvfQKO+AzViDg6Cj4QGP;lk?j{3}Z*!RPD7YmmCA9+s(>shb#%*?4xI$?9RaRQM zX#SD<-tc3>rE&sQ_%bE$XS*HRW##0|pVobj>15P0v-4k<*il_nEnt>>RUUPo` zK5HnNrfF}_18RcS*bXbFwHIQfPRh=5g30RY_YdUI z*;rg^n{UzXTa^V9rKaq|Z*RkShpEWB+O4g4rUc9)DQ+iwv628frnZAN- z0K|G5+ztg#suq`)^sTKk;rh#|JZN5AO7SO?))l+e4e@&Cxfh{-6%RlgBdkUiQAOr) z+>r?&)!;RN%a7aGq}UQy65K!_#~ri?yYJXv6Zp9@P?qfKRT+!6-A65!NuHJzOStrr zhm&+@wFNz-oTq3|3Z^X37$^HoJ=hveM<@YRRWICn1&KQ(z*rX6ndg7uI)2mZ*VHho zH36+BrKLrqEfDn1&`&y&UC^nsR72tj+Y(0lr(s*Fr{CT#zHvX~=8?-#N1!fqmfO{$ z^HP9Cr;62zKykcZ6XLQC4m>F-DI97`acMlhCkL%D2egam*~8aeoShrm+v(6$TA@3v zF-(+!&edmmIpp}Of6jkoMiz>R&%R* zw6N!1xDoMdEc=LC5$h^W&T?qwz6VqS;DUP}g%nbYAE*$5xj?dB{=#|tH|EC!IzN3% zhlS8xxzRGD0^A&cg{e80f_=X5&DZViFIzV^=f@O+_3~?W-Xj$oRtmp3EJh+kdL(oI zw)6+K#PSOa{MV2^_h(iy{u9nriU^KGV^8TAYJfOh4AJjEm({&2mWA4hNTmWP zd+r#yEf*(eQfB5=B%8&L3X9d^@Ax(pbgczL*1RH%T#p>AFn878%VDkK9-{y`BNdtS z;kLb6x+LwSHQa98;hrR)N#Bv4qG+IN(5PACudlUPcQG_&U7wZu9-6Z}QF$r(d;8BN zvWD#>Wu$r}86@+hjKzZrF70Y0I;7GRr?xk+G=yyXSxAx*pJKhwG;JS^w6ZTr6}k)b4_QfML>qe!MyNJ3Hxp}|zfWNd|K zrhzDmMT4SJip-S;l`m7BqFlw(){J62g4&L zOK&qTD>G`VJLQUvq(_b7wF4Dy3T<}JRSTM<{9R$d;|0-6U+nwkllA$fg=DZ~gN&oS zch2dg-(Mt(3yM9LL=A4QHzj)d{u28Iwy(2(n@#z+uhSEK-*J+@QjW8(sZ=J(_>R-+ zayIq-P6fSYB_SAe?`d6R_vOOO&pN)tP;D@;Y^z&(cip*FQDc@bahds{({LPR6!-i{ z-9iQ0#4UgStgNe3&Ze2a^wwjH1s{=#1KWwYXF3r7jZx@giXrq<{z1RLhjmw159J>c zsh&vF1<=+{80{qAYWInII!Al(UFeir=SWq~D##ll z`E&Ei`OO9jL!{QWTh`ZVNx%vBN9OUHLW}$>8b>dfcSLS?w>~fR!@GWd@U2VIn}z%4 zjm%p2c(Ck;PHmUnpH~tWzPP>Vi@tsJb0=3*Z3irljJdeHlX8=eQhTE4G~LowpkPVv ziEs8^4E1!pxURp{al=cCV@~zk8m>MnVa<2{_Fb*={=#x3a~vY&qqhA=jUIiiT2@we z=8OXeu{Jeiw>Yt6SGY)@%9{Ty0ZxGL+;1s{UHI&b=XKbs_$nm{b z+U+i=jOyn*yu;nmQhJg#mmXa#kQCZfXnS_~U2`~iVwv&YxCQer6SVKol5(ikkGLD z2SfJu`WGD>>}WP-e}6y=pzp+V<5syg(wn>;eoPuZe7K*VpA4+u&Zefw)rKg81J1}n z)!NwD_&z-Md+hRkk4?60v9<#MLP#-dp#a?lekLKXcfQ@$$N}ANxm2ZU24j$-RV%EuG^Q z~h@GnW{716R?n{y@lx~p$ zKMqmN%e4CAcj}C&FTrw@m{HTc%M;SlcG2LcJiFImvq5&jubn^FZ{EB)c-Jm-YF+&k 
zt7~#bNph-clkM~a;^Q?fw7PU)z_rmkYS5PN(LRa%D7SN~?~xmC?9%*va^W^R_LL7kd3`)@6RKbQ&4nDDgbU2e!3xSsUtceG#cJN1)|CuvJfB*fr(91r$v~fn= zI8-o!G~G3ukGZPmoH}xCRlmw;xLl4x7qz*~Jy$}VExUh8mw%pVXXg)Ydmtvpr6g2! zD3M~}*Z~*Qhc2=#n^s$WqyEu^K2KMq~C zYN+Wj-(E7^+U}N%dsVUWr`3?%`=b?|>Z+!zBopHob^mLrWXrsZ zwL7XG*x5-M23oas+A#UX5~*~LHP43oo{;j8xz)wDLSiMg%*fQ__?XPB)cQ92GA=Jz zHDAlB?e-DvUMZTYO2$epQ!~9G@%|ibaNN87pe;iP+V=lg9oTQQJu~h4-C8QU?buctIvv#Ju#%2ifrqBH6$; zArQHRf?>+!$;S$prjd7aUxp@4eT{BxAHq&r_hi1l=tIM&AKLv65gSd#q(!S%O)rh; zeGKc_i@4ujprB7lagXxT{{7?A3~5tFInFdUy64bzb(NUF1iv}hZARLBM@P%>4I7>f z?q-`6J41h?q#+>k*nojjbLXsj*wHsRI-|$_!|MH{KkNQ_yMD+YW_?brTll?VZ(ycEd_pqOA-2J3}s|-y0O61LoEhf)A=r#3R zZeC5|qc@RKu>p~7%t9k|Tn$Y(F5GpxufcJ*vUbOJSz7$`{_4=&?l%gbHohED8mogX z*S>Sq7Xccc!p-&m2fim!4(pH3yC_CoDBdT79`r?Esubc+sd|-)6>USB&tpt{^|u^W zwl_+D`XHLQw`Fk9m9T|MyNVaNx|#tzr0$kejJp1AnD9W$&eq9tv=-w6G!qqwx0~OM zN_;d+YHIg!{LF#fDzuD_INqzs+^;Hm z?cqMt+B(!@e4~ZHEofrxr=LokzJ7{2G6RO4me?#}JlN1P8M;yJ1QONL2Wd ztt>Ge{W)z; zDJqm{>ZYHh&eWcDo?7JCK`Qp9r_K}OZ#1v=dwb2ULl?;v0jWqi^SHVysnq4+*_8ms zKR!$jINASFzrSC+n1q{gJmLE`=b2`;f*bS-mFAV zv+sBZ8=KuZ>6Mjspi8ppj#)c^+C_?2evk_$r>dsraK4`pGXwYm?#_8LcMlGY(x!06 zsbwdpzeNz4LeXRAT_l|v(cnkx5E&FSIl#=i5nYSV?LL2~mE#<8 zhyNeHY*y7d{c->(jgSB{wE^8K2FM$C$x8Y)aEO%m3_Tt2u3x{tuYSJu)bcpsel*|i z1YVETm&Q1_pVXR`p8b_FGtw$48_Ff>+;bo;G$Dt}AG#cPaFM1|szbX;O7;CK8vg24oS)-~OmFR?oK@o`?Xpy=4;mK4e0o_|U!V9z#W=i(hAIn0H}z%oY?*&w zV(aFD{gVkQYHEd5RrYmX)c$&Hf@J+sc-X`sP4ip5 zeUF&>rcQMo8a7AEcaOAGj#MZ0v1;dB-O|RSq!dy1*S&ZV2vH!Il>I7A8QHU)oI1r= zJEUB#g!zz|{u;Q|+uJ0s`9)75d4~7@_)|cCk~3-9vo)rj$OiQo1u3sVI~-uAZ@>8u zq_KUpgj()>6+${}l#HhVFM>GtVv`p{v+o{k#k3|&rNpZBB9@l_`(YAzdQRV;wufW!=j#BfD% z<>7wk-aQpwga7rctXQyq-~77I98MsR?Ur`#4l)6TxmpbmQ)g7!UL%+&Qw;lo`BaV0 zeLHqH*;$`(v)NdbP&a zhrHlXC~mGmQ27H&sDF{0!!+z?iH4DWe&*{hRjydGX#KEZ!U>fEW)9UH&<$EffG z!zYVMx>pV9B5Cf}=BReJ$UWPoZX6lEuzvF7@6F9K5e({?n9O~<^0d_-x`yc$KtG=- zu0mCVE_UpY1|i<&_B4O##7e#d!D+~=4m(;gg4QXi zYFf&WD%+4?nq>YIsOFei~QL}%E5b5ral*@G69@rk3qEFq;(Fu5} zPbTH(6x$?cgqSxUE{WAqh@IQTJ|;wF=Fj8NR(5M$7B61hm~0%i-*{jD!Gn`gx8Amy zF+-_+0FR?WZ@cf8_>Vt2u1M)UwY%vy>CYS;+wHzbT@Au|jf!L{Htz3;h`Av0Xi^_v zrPBDP_VQmFjg1QR-}bMC^b4+!KYR6!$Q@kyN|?h1^?g$RcxqE~P^sOtjNsJ2uf&hD zn;HgK9dFZ=T+>#|UPs}1E?^(?5uF?A8?2JlIBxN|=B?je_p`0s_4V8L zO}~DO+JZ-qNDky~Of6=8{PHCk5NLqU$=DGYFV}MiATxS5kS?ZmIFd5LQ*Vf-=Ju^W z8)rAXd@%{CqK*3G!AkA7ghk$0`egKE(ftGa)%RUD{3dIvp`#>i`ar3@e6OLt;)zAw zP`Y{yOr?QIv$93QOnjP#*49X_CHX6b&gJgs^smYvK8SzN345(j0}&A zF?cCsXsV?-D1NuZrsveDAHRM*Mzonl@ax^X^BEL*NrKNT^%N{+RJ7`nrYL9&6kysi zGJYLqpNM_R2Uvvcp6Q1AL2cLfRj9JJm^NjKW{8CuR$9>~uid8T_4C{`A7pCfz0B?WuEmJ- zd0iz+Yt+~8wp^xVdRcm1_c%j&*En1|Xv5oOfHK)2 zlHN`4HEVmz)(B-WOQiLSO-K(M*&N@FTWzJD_xsl$(x%)S`}14qnxLWj)0+faMaEL#;F5byuI1?SFG9tq zC4>f4@7I;Zt;fFP{fH1h>%Q9CP2)1FPlAcI@jX0dv#t9<$ANvlyguo>)tPVII_|>7 zi;=mxTcTzx?(aS4Myl8G1B}wp>4+IaNj|m`IMaxcTBnHM{P^Y7yAK^!Q1Fe4?rx&o zEkEzL+y==Cs~)NmAC~X5lWdfF)c@z#v!=e5@rn^s2PnjUat9-cGb=@u9ip?#M2 z&f)GmC?E?M+JTDp2aIsYa0#j`@ju?n$bbEk{N?Uloh~7Ada5}U)&u*F%sP-P`8?M2pp=QM%|NxAchpZxcD8+G zY}!-mTlcsrlH37%Z0=N)YD)=#Ok>4L_oLXg&&>`N!l1t%#55!wJGPh4#0h}!RQq~B zWtJC+uP)NcVMa#wj=jxiMa`EzyS#e(+uVha!foOn^y`4$V#?I1Z^DM(yt_D5rpwF) z3j}vM6K>))`3RkZ&bCKh7$k5|UQ>)dd-@dU(KTUY^~jV}y~mEt)!)`GFfdR@Da503 zSyJC{Pr(O^BY1B0$+uID>K%$7ibjuoYr^USyyZiGF_2i{J8sL>WcvXX`WjDWpFI#< zTsc!eslZG#OlDbk(~gq49c}uU_K;j`e`|}ctyHHOBP2%$JmPX6)m zn0`6DyP8xVkb_u5_eh^D_v^!$;vs6p4~@R4qmZ1O%=6LueZQn6P)r6`7@fQ__i*x9 zd*Kj7xw~5Znr*Ai+osJe24=2aEAsO;9zE+)^q?|RTHN** zerP>2_R#7+Yifx9ileq`2cKKF$I9K;@x+?fLzL|ARnO5+I&0g1j-^f|RC>nFFOuDR z^2XgiyK36Ii|22BKYZAbI~R+BI^F3)B&tb0ef^#Ed`-WqMs(myW_yY{#Zx>P;trCm 
zIkw3KDDan&!uUF`-oCwub5*|4a=jAcz8H0CLMfc-uHY>mM%omOpa%T}rKmBiqqq8 zTh>SM$LOE~2qXaOfOoK&Np z3PjJZ)3@)AU}q1HofKWpeAzNm${v;ey zd%lOewre_g5SJC^n?t!TiFkQ@Y(#8h0m`MXaqLIRDm?JsfBBM;xrKEo_wu%kmh>4h zqB|$AoXONsWTnkMKZ z;RtrO@!e6E+Eiy1`}3@>nMQ#RW*!q8D+DITzRXeXd74ymn}M$>eH3>CxY5HVR3vVa zzk$(*CH*KT$!YbAfRvPP3%Z;!iM1WnNVkOAXIZ|lRC&{n%q!bnYVH!Mm$4K@)Imxe z0t{*B9-LadYtr=o3u=o?OYfkBE#p%9MuwcFI!^eow{8DBR3%rhT=9qUo5TQ&Pf(&= z;{`2`^@j0t8=x%-in|!Sk21P?FwE2Wz(wlf5$dO~tiOo^3$L_X}xeZ^n8SiJl>r!=0EFD6uxBGB-+7m$a!vvk8a(*T{n8*BpESV&q(nk9t89CAxND1 zUurL2-ek;{Z{5%Un^DDxi4*QS({Sow6+4(gLX0dwd2(roi1#Bb2wqHcOt7?+V{9}6 zaXh{Q1?A-(NPCvxs{B5W?TXdb>U@eC13s(a^1FCCtgvZP(Yj+y0(M4%=6HUMCT0}7 z2=xV+=jcw$;<;B296ui7waa;?D}+4{5di7lguZ*@Y1naevEco2RNG-}l=JrV-JGdb zR)xp{1%kvFnU=nO2li`CnmMxz4eu@J!{&;M>z7M(XIH>T7a^V$&O1-A)>!er%x?bJ ztv^&yKN(}-lsw}|xbQ%j=NfjMgZUmH{^_ru-yMOp1w~(exhGGkQ!PD@=cgCboX)S2 zA|ih5x`Kf({RzyPV`RGBSo58Kr+YPcRCij^IePD7ZQ64Vy+cK7ew4Poq9z#*EIW zF#<1L8cqJxk7mLbF|`7(FkJBb^oT_zC6lUdoZ@jB(&KmGHubyg8WA2|f=J-Q`#SUH z>jBc+B~&h^@a)di#|sH9JE46GK_-U9SdZHkfA;LzoUaK!i|?RK2kny9i>poP2Q)5!#`vF~OLW3gjxe9KUBEg_Fo{pncgEop8U4;XKFuH6I1P8= zfB`?yBHJ0K@T|0Tb)SdL(ysQm)JfgFkMmxH;(|h12%mG>9kow$(BJ&Dj<<*O%9s(` zFv=b~{jS&bHjwUxqs~Y7;@?Q*sy3^wm^9qN!orzZH*pB)``@F}Ok~Wdq0@Fb~& z#*0cB{cR9mF-1{9M%;3QbjtV?7a)^wi&9i%Nfj2Z`5kC;!hqrOQsL{wtXBck9YXiP z6>07i!Y3%I?2TiV3f3LuBn0qMteg6%VVk>$lnfEj3l~QJl$8F&AVJovsH%R@P)p3` z3<)*%ipLj~A?}4MknlR6X4xdZ59^;wbaV`;Ji5HCH*NBj{OvN$3MmUNfE+TvB@7^> zBm(flx;>7|J!W!AH)Z8IeyWMF7f&2yj7N=f!OXhpuWi4IzIU~I-Yv6d0Pl1Fq@VG& zUpYCUUzwY1;!lcs6~^X2fL8h-ZXz!3z@ixQ&Yh6m<)wL)78vTX0V$r{6z>}w1*?W$ zH2|G5`@g2_jbvQlji<9pat(=e(70bsZlNj$(gY&s#rtc`JUtDBev7q5PY_)V9XYa8 zz>5q~fWPDwEzhJh_k4P#y?_K6F7&m~DiEu<*d5jpM+Kjt*%}+tcDNk)_LjLY@!A(I zjAEYWxxHS@gI&<^q$Sa;;GrAOSIZ9+v){#K<;oiyH#IKcr3(uSGgm&uDEUHHv_N8@ zMhta`ob~P7H=iKKt^>sz|8;)kk^-ZOB0L*#-wG- z^mgvt`Q-1vJCTEM`?#1lUjB+wyFeE;1+78?uK~p_HYvXgh?UT`H#R!Tx9w5~bMSGm zEBP^44*uj-Ray_qfk0S*0#K)SHdV=SaS}op!yZcP4h-d6f6BcUY+k?Fa3!`?)=@sr zAu>fn<^Mu>DrROt$m==S4jZS`Owt}ZR)eT<1z9p9&VdIGsBjSD5xELe+A(_=DVe&3 zQ#+}1uQ9v(wFmGhkNR}}*fEKwkqYOmENxy0tSF7+(n52woot-?rM6Loi z>Y+dXtmA2N7I8yJiSOItq+81eoz3?u${xPg6HXVD#9&XY zrjZE(KTHKo3Fd-329CH=Qld3#l&T~yDJg&<7F@8ExXa5RGN|62U0tOS4~Vo()5P;k z`4ln=`+>iXjGv6|EK9h@)32bUq%Fc+ z@qpa&WJ_8e(AiU-ofoomiijd&srkU}pEMj2_fsG9KNqh{Bo7iahqxSdD-qbw-9P#EA zCrN66&HFA-((6{<&vd}SxAv$mf&Q-$Yy+v3638e#BA4L-c^V=<-`Rx3L zpoHG9Ss^54I|2wxok$an#J?MNuG%*AOi|0jRHlj*caUHy_9hBBD&ncUBG*)?^0;g< zPBKGP)5xPGRNU^;A;6JUS-&7w1;arxwyF7}Zmo-f{Z*XgKXux(_~g>D3hPT-{&YGz z)jm2pZQw(;6BoF0#TzX{rJI|Z|8@vy>mrU?*j!Ae6uWeh0>FSilG*nA=SV7!lP)2C zOIm+m0RtN%M2e&;A{-252(ksUNb$QeIwOhemHyzzxEe(Ko+#x5Cr2BHn^Wn-rTBALpJLcvjFP=sA*}%9#~=0VC(kRc1_#^Qw-?w6LP*!klnlfZx;Q<>_*G83yl<0qeN_? 
z&2TXoNtyA1LWVEOu=Q2!^7Hd0bbw-*UX&}=f0`(x3iAoFACfgsIX0l>Q|W7U>yX8w zP%)w>I{LPqRD97M5TkO7+D`W9-tv7OzehS283W-$B7Mhs??(Mv&8h+aeaPe~a=97X zMPR*gL!P;2PO7(`uEM`BSkebMq*I?WKXv%tBO z!HL`l#;Tu<)(N|^%6?hvXL)~|d?%EH4Ce(fA8^{UbBD|;6HW&=Dfn`Dvi4df3AqkR zLt%i04Mk6JH6i6{7g=zB@g>_L<+5xVcf9sZ_sD`M+YBf`;V6Mcs%4dV0 z>xhC$z#kG*S`>Q3K3e2CHwaAOG*?&;l%tJDfO_c!gzZzOb_CNNihwOIZya(@x+tXJ zVnYP(8!4Yw-d!r&a^Y_eC9?_QA7Mfk<|JbM>$}~RFZhNwagA!uABm3c4yYEFn3%Zc zJD8pUsME+^rG6WSqoqeZz837xw7(Q# zsZ&T7oocNn$$$9pdsvyTa!iDqQqeCbxE|!H~ ze=%Ao3SWB-s;IWx<39Gc>Z2%-bLxvA>N@p|(YVO1UJ-vL&_Nea5 z-CC060q<8FhawS0a32v-1klMR=;%7WnMqr>=KWa8Bcw>Y+o19;=hO6nr_83dXXfQ1t++bl z8X}6_LA1(NZIDVXlK~GQ5 z6z~Hm5xonWj{9`q6_3%_ktYqbIl_DoQvE#!6{6f0MQ3()cGK3hM)`x!b`o~t*;88r z{4A7x;+}BFKW2EhrcV^#?@cC~LmXm0)un4!Kk9Z-%o-SUVe(TDufVs@zVEcqk_Zdv z=K)8M(Dv!pAwDgwV~B+`126R9+k+{+LMWE7qtRvth{%LLLjf?Q4WUjmT$B&A`QR-R z;S`*4?FDV6cPJ_^Pc1Ls8>Z5dxYQE(3y>IHsFL<!*II+BPkMnZXbmdb?NhV%IVLbrK08PSKhQw<_@l_{T8!z-cX4*U*s4h!KiEl2$c; zq{5&n`CT&ExE8WWMKnR6fGiL31c(;#3FZ4%*SGgi_e09Bclo)^LbQZp-UV_(PF|jQ zlQz((E-O}aVsKJ4vPDJpVRv>^d$oR4OL5={mYNu*xU4qctX^7&;s{%mbo-us(lrnM zOAC-&NVn$+=cGQ)M}?;<PhLm`FvQaHg=ef(B+A}1kY94F&n z5LGxTyc-%7)>1Rbmx0`hwE-9dnCU3RKSKf0~o)-Vph4q!SwLo}nTJh4?9v+2DA#=%dNP<2KoB&{YkL4SC4 zTU$7!2qPqv_*`9(I56f3&G zl3dUQ2|5cxKa=9qW7p=Co~@zkJP(*P8|pZMiQ^;U4MlztGu!WOsmL;A(?wCy7l)Oi z($aRlw|wp*beiyJdFC>$b$r{mPzl1;T_`NcS)x}L4}W!pWP}ypcrise22}E*&}0?0 zaFD{SY*SbZhBQ>(h8uhL z?~f1NJGZ{|1H@9)1YN~b7UcwVN*slbPf&5?ZS4CMH8kXj&Yv5qIzo3r;2eHX-`Y>f zoLX^z7&d934BEqnDM9!v%H5m2#a#6LSmKf)KeQ7B4p^KPim!Bo%o$xpF5JC;zgSTJ z1Qb^2M#F$i^P#8SWvVE1o~X@mDc!*oGpp0K(^I9UZ)q)7yeL`45Fa>S7ZsIY%AU-E zh%i=nDT}t1>beM-BQWjEsh70QWEWC(9=}2c_<&^3T(8?x3KB6gMi1?QGlB4y2MMj) z_WOHpV5A^E@dT|s$4^XaZH8wsGv5Z7To`wF;xrj;Xz0qvzXB^EFbpwth`d6-6~i0a zc#EULUgR)J^e1HCp(9363kEI)1d!|4u?VPPD9ab%bZvFtw)Dx#hdE-MMN`1gs-l2_ z9S4f)5yMxvJAYK?kA9cq2QMhbcd?}I&D>E`P1}~Qt6`65Y-mXjz(;b-+K1OSDTLe} zDy|{PZ;Shw8ZnHSy}r8}rGN z+kw*p-5sC1tzGJ(<;!>R&5LH8Sx(!U?IA8Ifwz|t#>JpB8T1%B-n^ojv6OMUt@|d5 z+3^h3ZXR9QqNuA#TThm>iHQ4ORs)Sb89oPM%14v zNL`7$r5p9Ekp4~N;GD>=DJe;I2q~iGmXo|?K~~{6YHM^5d;}+#T$p&NI8cSzj@!7_ z(o}QPSASS#_>c!}Zkw`?JL6Do{`~YjWip1^Kn6;Az@qoT5%I=GV=E&paJa+;AS3{V z`)UsdhlNdK0?Wha5LJwMAH{rJZ413Y8R6=l5KEycPG(M+vG$YI@D>~wr%V$mrb1fd;A|?HI%E)yp#GddT4h1 zPH8PmMXm03I(m+NJGJ!X67K0nr#(|O4EZo=Xvw2xC$bhKR(p5tso&iu^thv|+3C1A zQ@x!53TCfX2HtAej5L^({G#@ph|*<)}*651j6XZB_N zsJKP&6Otv&FCJ%~jqTspOiuI0->6{xPn{Zu#HnPx-k=?&^&+~9-E%ZJiYh8{B`Ty9 zY7d2&Q)fWIw?p(d`Hb^7(^PRgvlR=_LH2mMbq+O%`Qo7z7~3XctiiBhg}h}^iBOx4 zyr^NNE~jasfXc2T?%nxO@c)pQPMOl4J0>)xo5u7K>co^&bhGyM z_K}y-{RN;-kd}z#hQMfxwk!2ENa%I}RW9HSkxr&+{6WW$1XsYq=H^99E=<~tO>&4B zgU2N^GCn>q;mUSJvuB;$Itl+%K@3rHi1+Xf4~P%gP1rd&2Y@>1Z;ajn$W(vrsU zWz+j7QZH`)*^wK8*WIP6j3XU^Y9=^2sq>WXo^Km#k{hXLX8DOpayLTB-&7DzK@|y@ zm&9OW_~y>B&Ryl&SkIolAFZ!H+LGXqV%L;Y6lg1+UEO0nfBv8A-_`ty6U*!dAK3JH zD2X`kZdO)S)-z^Aj50AfMS_jliVMSmJ$t(BiilWBm{=Mc8*4pnS{M_>r*GZ5)jfB~ zW}KjlL&L)6(4?5BIebt_Tf93+(M+zNiW}+?2~_eLFxHGB_x-|~5W)7f9QWIMuzgEF zUDgDV83vAMcrS?!MshyeJ9&NCq z03eg=YBK(Y=b*s)z!mVPSKEwjIWyBm-$d|(p?+$zZ$rwGlRK#o=@$Jsr3|Dv1r^7P z+w#pBD#Y0P^D@7N+v?)ab`^Xe;Df-uIYSvmni~0BjVzV{;8Q+f>eRGx_RGR$H2tsc zYVEQEBHLDYaK2{R+lxlC@{)qC0?P{9@ogvqz>%caXmmg@u_$`giw~PXxAy6uTXI7! 
zW|&b&lb??X-z-1XJRNN|e?gv42|0I#Kt*|oUuh`NA5i#Ok2+}wIK1&2zogNo`{9dCFL#-Y0)7Kn<5Mn<*s@@Tdp#ze7&^VmGi!WTRa8xL*@VDt9yH zKP=pHW*|lgY$TG0tBJbaOZ6izrW2e)>nN@p^=E{LZ)m#+XQe;hZjX(fn|p0o$r_p} zNS}EA&L9>0HwT6-Zuw$HyosC)V<{Z9Npf>fe%xn_RkFI!jR`_Cqv(k>jf80K7BALa zpR?rL_1GCuOhRx$g*i1mck53zi5QNDqbaJa?8c<*tvh%8sHwSzNe{Y(ScvbBuPBI& zWHxP@JlUm`dTIImi3XaQQe@Cuckc#Xyl6s*z^U!l#sxEONG||5SzcaU;lWL#^S$OB zl_b7!V+Tja;K)cNcy3RP4okroq;g^oG;idOIENOHdnELK0YFGvKp+S_Z0zjXykGJz z02NCSd5%t8C~SqJga9@XI}n|AXK%*I_d4G=p{oE3oDg~n>@3?1a5#}qx;*-A8}258 zvqoe71geY^)1qYB`qO(1S^DOil;)*#yykzrNm)Zh*Qh^LJpMH&va`EM;xaOrVtRy8 zG4JEjn~ihMcqmh|$!<55PzwcHXtBRXmQyH*9?>u|7{Q#G485iCmT!hCZbH&zH=43* z2U>QaHUk%P#+qNKa)m!$quMJnvzWU?tt=*|d4Q&f*gjAhPj$@*16$oRre^fDt6k;g z<;yMyy)$opZ#g;YrbRE>WCzAxYHny~*chFbkrBOb<%yj;J4k#ZBP&K<1FgU7J%*}* zT()MzhG>%-9if#Wp-~0ulrqNL;mM5wyS5>?9|;PYGu&a^ zks-}X2fS+|Jt4Pi(6?#pb^R49T)TCvc~n&PP(h-mrY3n`6L5f+h|PIncEildEdKqU z&prOopEYNW0DmO!wz}@2e1Jxy%M-Th1uNYqS<9y{0=dxX~qZ=;QZoiz4zaEVGM7rREmHSa0 zNGz3n18x3?94#*qJ(VCHVIBDM69#toGYAAOl5YClX8Od5vZMoI7I`?t`ZDQkeTf~1 zuH?Y8i_hlInWHK`dAaRVQ#nobv^nZ240F|69*sG8P_$qy|FQsgNMFnp8S2ZRtv|mR zrJRiYW>nN({yEk!G~H+pW4wgylA&1wU20mIpf$KiVo563OeUgK#&>RzvPZP!r0S{T zAHFp%M_74Rb;b>gqU5bZ(vx^=3wEuO5_ekeWo1 zCZeIi1Ax#O#ehIINU=IagTlzYvOHP#6d9)e0-SOtfQjzc*jT8EU?xiiGsK1s0Xz7N z>rLbr05ZMhd(2dx!pQ3l8B$8coyFjL%m!deLDl&nfZ4;?kn+8*%1;l8kk79gof|vj1g7FeNWP&GGiW_{Mw=d~hx+YtOzJ*4B)}uChkjNM(~nR5?wHKD=GK zc2k;xNvAZj9ku{nsUxW~H_t62B`xhV@w@WaGUW(^Q3u1!bSbKt?PO`LEV3);!J6N{ zwh){5K`HWy*C3Go8%Mb}Yt(3AWO2xc&RyAcij0K0>&|>7kyV3hgIbddCE+AKDV)oB zg@9SW4Wh~d>8t&&c6^Kp)fk36zG6LshE*SbR1p0j%RBJh9vZhL_Kl4sXHMwrck|{}`_Q^4m8Z`+Jt>*B6g#Z{ z7_u-=SS~$$_9$Q@D0Bf&jzcn2$SY}m&F9RyP_tf7gn&`;85!Xy)P#^}ZPx05tSkc+ zH}S&0BS+5~Rsz7-*Zv>I_a$zfkOv7eL&fdY<4fo4Vn8=Ux6Uz{R#Tmz$P!^a6JW9rI{v%^>V#lNtcO)|YAr#B115_i z0@a<({P{Q58&9JV^h0zC0XRitqnFp-SnKAmntmYev__ivj;{TC*O0v3`rA0HDVMu; z?P>sHDNI|}{*7?UcCo&`zC^<84MEME&D993bY3?56n)kt8T~{(ljzgzjj+5p7zYRFD9Vgz}1wZf|U2*}d33|6oY;wjOM^ znB(w`a|3oAb!fEbQ~zh1!XWzum4zb4In)Ie|Np6^;P4)_i%|pIZR6I7>?vf5DsIl9 zQ{)XO>Rpi>fxDH0=ZKh!%c4AKL5PaAw6qpwR3S5JhjtJ%PxTG z6wtEY-O2#67{{JK3y(?#Y6z8pJJj(1f0<#e0rT>H)0i@&Yl+~>T4=B6wcOYBY&pk= zK7ROsxj=uR|Ck;$5FM7(lUxJ4*|VpX-#xC{i@gew{ZEoxC}0HU0>LEl1wh1GSK-K( zFRI|aLO3Yd{pbUpP1O(1v45awaKl(`b9`J}pHFH_`R-U(9)eUGK&7m*{!=q)WA$Cf z*lR#0nYeU`ikZl~dfn*OHoa-IE(TEI`tPvVeE4Tck{vu!E%+I??Bd(bd0q)Fe_65RHY5>F(X=Igc9t0|>-i zpK|pOhHC1~spqp5fbLM_K~C7coXGCm8DW!kWV6 zjQ4Lw|A}ZLG0a?^d#$YmP=;6@Pkgs6UH&WMZPvK=FH*{(#Q|)?#JmTB=E%`spxNRt zUR1*(U4uaq13vtc9DP%W>C{xKj~mI;YZwQY_)_pwp>SyzHLSdUN*1l!us23DoHD-G zH|C!a3TMVZOL!bE&d#?!w^h~jSG+KL!GfFEl*a@F$X)|%Y5av7P7GOj_{fnn5pzJFO+_5m~|IVGh` zk_kFlHMg3L_{+5kw?b}FiU^FLYE2M&oq`RV`&C-GCZ{gEIOY5^>(~V~DwHJrcvM0Sp0mw!S zP1}Hh16MTuN=!(&N3RgWm6*a zd>GNNZ}(kE;bh7qQqByQ*_MKV=#-Vzj^EZpQL&HLRuJmQPyYjSk-GK&1YPW&d$NGt zXb3s7sE|!go;D&7gz&O2>g(?ux&ctJ*Y8g|IZgjcRQMfO(J6((!YR+^!p*WBACH30 z(1wW2&Mmv3aBLzKU_Oi)<1+^cMb&jr=;;oy6%>GG7Xl+gkN9IIa_#~g$kwE25u^J9g%2*reT=hr^pxoB3F*ur-s#!w=8C3=+X z34B(h3#Q~$SmxppS)H#CP=Ltg$B)14@ObW}B=Fb{{N1&C4B3I>!rlw9?1C=8^4Ul| z56z>Z(0FiM_-0icoubFp#TLTF&8=v?rN4HlfzTu0WAT|#_WWq7j~KMaP|{MotlZCG zU|_m4*0%^j%sf`NtMJ?x9qmd+5R{adAx55;n}j$yyXDim(*l5OToD`=m!*JIiB-#+ zOv?dNX3Pizd7xVL14&{bV0Q8M*7s$MK$2w`s=u56+9qVi=PzEg`m(jp+iKqA2hg*4 z;t|elt3R*WFFd2|Ib7b+sk<=a5shghHVX18TWp-sVwit?o`)D5U1(V$GnE$ z^QE^?$AA-D8^*NX8U_o6$PW$5p<;`c&*vsIpZq69Rn-v4W^_V|glxmjr{u-hliQ=A>ahh zLg0Q#3FaGB>2ty2SVX2h*g2dSg|gI z>dOafdVx{$hI+``(<0nq|BMISE@a$+yUy)I?d=@F74wpJ`5>*z1 zzXD@IUz43hF?b?WdD3>yi0#3{p{l?3<2@4m3zAx}z(f#|X)K`jfZJdJi_9Igc zC;M}>>OeQq4;P-8)8MLj@jt!*r-w3YE;hQqc^|t&fxF1>n%GbQziMTBH*CJlm++#>K6KYXs# 
zr3I$9#JpmN1#o)-)veggBGfxFZ96SHJ^mWzP;0Iga#7qfhph0hJ=BbH`WfnV|=P5=Y0R;Hl{GK$q@yVxVf#zev$Q%|+2c~|0 z`>+QKl+*Ky+~$iN?-0(K%iayJ}&A7( z);W~&GVbn0s-s)g)j`nNwC6fg%2D(6nWD7p@lKPO;w)Hf<#$C>OK|XAG%bWOf};`- zf`>YcgCF@3z=^`JZA7UbBn};;e_%jB-NN7BdVpKE*Vfj)upnmU&GyYsllg1@l@t3Y zn$DWl6?_Ym;T4GwQ%;;{BP^X`A_Mz2*bFz}XR^mqrvf;++x9eST&^Tnv8m{-q$ zQY}pLNKD?!Pk8xP#<#x340{I$+oyddw=}a_)v;d3cPIrVCyFc=?39+8>T$N8f?;X= z>wo|Ivg2lXE$7@K!>H6oR#)LhOZ%_6Hhol}{;T!ZU$h{BiWerCD}M}VdB$tQ0`;G@ z#BsA{bG`EHCxn>a`S+!bs%>N=N)tyZ-0*GvQSnE7N0!Nq^;~8m!m$6FA z2LJmp#HX{h^;kQauFlZzcBg;e7CNJD-C}=v71w{rS6LnyY%-$d*VoB~8gRfeik5x) z&*ytHg;ROM?>}!%<-gzBSlzgO|Mg!Yc+>4#j_)%ydHbt9THlRPta|^2|9KMOem*U9 zLGYu*_zukzM7H_gKetl5X0w)mYt|6<@9P~l{;xk}rT(9nR?7|Vzp%-vo%oJ_%-_6u z6rT;<9M&Yk<>gk)mi0kR)_P(j5@hTst`hH#5EdYG0%lC|(MwAtS&pGhq8&|67D%rI zHKn|)?8Tr}a=jAwyS3*@hKPn8rN!~t_X%T6Nu0gw#lDGr2CP5UjDY21lt;`rj?lRt zZf)t_U3?mL>Fw~2dh_7S02mUdm(8%+Ip^eiMc~L3Xj5^^wDn8r$@EbX@JH<^pSYt( z{h9B|@yaVCx5OtWf4P~*pU%n2QPWsHxuxPzyl{#oAshS30?Ovj=mxAQre5DSub=Up zlD2>Z(NkmTe|~frIdtQ%C)7-C#Fuv~mooEmoc^Kl5am3ixo(Wkl!l1bhdU81DhQS6 z8AlyEdbCaF&Ji2?b~hrPz8TmLc{j`3`j$TE(t7s(8R&Fg5e4mw^EP@ytd{!#h&vxq zC4Sp_UKtcSB2w?I|2w$#>)uE{5F?RZPJI-`)}F>yODWujrA^oG-#>R_FWHoyt%pS+ zqO=G{{G`AEP_j%bc2ZW3@;=7`cm2J6XlvaFKR3EBpZxDF$woZ}T3=R+B0zp?#{>!x zp^1Lh2m(A2iFd}`W2#5Dw!VY?;1mX~SLDjnWeFXaUBnl>MGikG%%~k)S$iYJ4F(2=qkp{Fk3}M=zqHd5R1|1b4JFG^lZDY1i&Io}HLIy051HZU=3p z5K*?!TZ)(9 zmT$e4jLKpbFD5_5^Mh>@SqcD&&lnG9BH;Cf7o8Cw%`8P^A>g(x4k zHU#rXm$|#UTnX+XrjNwX8Lj484UefUAgtw2=&MfPF6L|23>9??Rk!$Hs3e+bh+NRr zlGrkeR;&jBL5n{LFBA5@hd}cR4KZNCnzd_pph7{(@OXieUdvmo#ALj?n79#2BAAD`~aNS`A^xF{uDuPA)5jOQBQp(RD5g;&j;*zP(Gqoe7Ba9qp{GS zG8QAydaQv$=kcVjPRu+5w5`!taESvE$Vgm0u}Ku?qLzGp>_{Qz7hEK#pxG^{>8N%G zdHG@p9P}ZqJ1JgxuSyP9E`ukQboas!fiSQwe#IjO53#I zEex*8SZ_l@lryD0u^6CUjlF_(CWapsP@u43wm;RHATt1f3$I`A#AmsRrrNM2;~&-F z=KBGp>P!$c7)}I$+WDS~j%uPL6VJ7&X_M%Wdkl6m6&)S65oHV1(!*YeuVOUTciYkB zxiP*5xYqAqLlV(vvO!Z<$S0Wy3+Ah?C)*ax_d$rqZ9?d=P^-=+Eyij236`)jT z{o)&YJV>EHMS`mh`YWcd5%e&Qaz}0cooAM2a_(<1cKpD+ksIkmyxie2n|JV#_eBP_ zh%*4HO6l4?-##!B*9X91WmMklT77=ANKm5G<>J|Zn~3>o7gyK9^^QlQ00^HTS2=)x zi8#_?be^5s6aR$=7)x)1*vbho$2 zY853^#V)Kh7yT$^+dDY$B2_sBO*9_CY%Sw&inN9PKoXG6x3zxWYWaNTG!|``Q>E#z zBPGH}Ju}hKTB_o9@9fH+Kr{vn%Lp|l)y|EoF}e0IJS&U�IpMqNhQ}Ak+iZnq5W- zOLK9H8CFFKeB(^#_N_l&Af|gcrR<;X%r)I-xch4?A&H~68@C$%BMZ>d=R9boBp&hn zGhthW-c}e71U;H3N+kvYpO}P>K!#eQJ z(LYD@Z{dzeG`buRHwq|6R6^Bt*`nfuap_LW638kNv?*K#u~D4jL1xo%>!SJ(IPU6i zes;6|!=k|D%%Tes4YM6)zq=G#{g|l$adj3z*5JXboXkT-(Juz}(KoGNY*nb9etn48Hgx66l^am((*%}~ z`tNc1Ra>CZlv5BDD@4#dyBDldVuiD%*J6mYT=~N2hD1{o384?~}9$UwrWm1X=Q>5O=x)6kt zOdqAzylvCAtzfyC60$8NxQg4zvf{Fwozzj`L_sT!H?^0?)6R-qC~ZSt2x>+9y-r9vHq% z?Pj&Q(ek^_h}kJMo|_zUD5(D+M%M({MF|J!2m?haJ?#6y_nUI&KK!2q46P!=6VU;C zm$!%vbH5A4847(d(%YEPDhLTR2!y~E*A_>Sr@Ro-GC$mDir*!*)*SGCr zS!pRUn$EVb2)fb4w2?r^nd>N3eVcRzyBeX>-D9b@@eH5Q?cfox-fNIaWY;|K~)ehpi8{g)R{(5?uL^dxCt2Fj-Eg}33WO=B)on0m| z`^(8~KX*wvWB=x7H|PHv&`&vwChq{ux!0OCuXeAtuUtah6Zm}8zI{0-^;$xflm37K zZh#$2TNQK;Jv20K&>cCVkUe19$NA#Liz_J8{synH9@u?q)HF*=XQ=)~+_Do$^Hu-~ z+P*|gaF4-IKo`|nwzjU^`s=uY1^D}4{k(kf;$oo`@bMYbbNtpZcPc9vK{8#hsW}`F z7}%p)OfjGLAVO@(Try=T@15hAvX1I>AM{q7Az3+LrC~NHLUy?dO+7h zc9oKdnqKHhq}@6Z?W6U(LXS=GYb!hxx$x;59It+Heu1AG|?%dETnLr^QQUi-UcPg7crFrnb(Q^Y;SCgak~Hir~;gV z?&yi9B`4=gb0_90srS4zY)^|NWi+L;-i-1VkX8EM^mEjP=4b`cSakhwb>Ox2dedAN zFAfk|hqRM3jp z@PE$mlBUP zHXIiFzBCH9XB#5Y6- z6crWMxa$d;4Ga+}JMLPWN{1l*=nmI1>wFACpS*EP$#4 zrYZ6z)s|00L`TA~I$WA&G+@9>ju-=1#2<{cXs4??&g~u2L`9o3ZLFd2kokVRH|5(Z z6O;firXJ~$flkrMDe}(Zm4aB`tRZ zMyb2xcojvjc>e2#1fx(g3HQZ6D#j>`_cRXX%y{g!JmmTj1IcxzPv+k%GAxNt{c@2^#QrT#ukRVOKG&{R8&7LX97*jDpE 
zTWS>gff^=ZjSfzzn=&OPjOwh0f_m>H$B85WKxfj3qZ*-QQarz$Kk;Ggkt2lb=s0^C zhT+zpJ4T5`kBp5T$is~6z4eg5wl*P1itapHXzPHajzU5#Q0&V1T_6AX#}CAXLbS8{ z<=16<^1XeTOzCId!F0-$-FGe!hjs17jhb}jydp~Bq4e-lTteorO?_Eya7#s{ z%-y{=#sG06NbQRik*NINA*W8CW)zMIg9bG+%l=;q0?RJC{Er}Tt2N94V#YNH@7|3Z z0exfT33qiTCudKI-k$CRbKVUd5KJnf0R$GD-W~eb5So0zn&>mnS%|{-)4^_YWP>+c zbLVIu_>=i;Xmo%9yc2y?MX%qyxrux$JPlUQ0B1%P(~X#7egov|F>ck6prD{|PfEj) z->5D*qf_uiQ_yQ@8ZXzoM5G_p#Fib?5XJouF<|Z z@w^J@ittl;{D|DDGz^6A1g$#AE_@JFl50f$FLWBx1L=aZ^9_O7>Z8v`rI!01##+=S zfJJvEG#k7`=+jITM+dm0q|xUZ>Ghv-K%1lgiyUwhYAH!7bRK9Xl;icaGFy=h+pJpt zb@Ma0UxX;epoQw^ed$9bn{Fox0lGH;L6m;p<`+nU;(6+!IkZWiq}0XkCb|lcZZSFD zu|wMqTc__UN~WZw!&%={6w*&zuZMW~4gzfg_+A?|M;|oO)ye*Iy?>htV5$sCaa+|w zdz1IMGO&WKgvk z98v{F%~RAJIaM7L(r!GTdXIVx+lcw=3W8=r0G2f2N(nV`fKMtb{me0NoU3I%h*03> zOZp%hPY@8^MBnqs!INWT$5(nEw2DX22BI5XFdxNvnU2Nq@xqw2=l<{|Z&_-#*r84> zWE`&qg|Wz2x@u1Quc9EPaek#^0McWOlDGFg1OSJ<+w;^~BVz6&da8)Zguu`TC_B$?@q%lPEsa76DO9;s1#Y zn7Zj7$A1?OSYtgB{1%?)@7PFWQ*v@)XrWS{v1lHR9nH>l8EtXo96|60!q@8iVvbHi z^99oCl6dTVU%+f$S3VxiLZclWglemtyA#mrcpmxdf~svjTv(T06_exw@&d-kKTzQSfP*- zn3@y=5vSw!eXD$h*N}Yn7clhPG z(h5;GRYERFo7Ix)STo_--?q&(H5({^U3VF^5+R}Q|7{`RkPb~pru`pF3GX6#lsG`{ zDfj!LDZ94Uv)l7NRH{4y5fp55L>YB{`NwvmHxNRd zxQ}g1IOQ>g&yD#d9vU$o0B(eaYQuvI+WzJ|ZL35Ph*4ew!?~}!_vj(MX}uH?XoMJ1 z^W=S8zRhJI%_53ie5$_ynqWFgwh9n_oP8mXPIzo@{UPqTDKF z;s08xID0zZDEdrD_w(bY46RTp6AoIBBmFp;x&P|Ag{q(b^n4ST0#Pp5CamW8Q7kVr z9Yf2FNWs}+I<195ad5(vk`*{*`PgzqD;A;5BpKa-s91a_I~rO`&MmmZw#P4y#Kt}k z?^hZ7-y*|?CI2ZhJT_#znXN>eN{Rz#X`e1llv1HrHvw_-Q8>^CNZv$D&N-+LMH^k+ z*n0+WH!^#d+|VcCS0Fj6DcEE2`@=vLiF6jNs4yDT6`VeLdU|V+a{xN~&|BhYjJr3} zKP9RpraD(9w%j%8MVOpNy&in|S4;j!%LKR#y_62w-5U|G+z`*4f#7{i+~_s?ex`Z($&~o2e|35gx5w?@$qio z7uOgentO{H^D*14N z-MuR6I2N_xX~>H#NAru->#KkEx&L{3ZZ*Kgn6LXWGNJ4P+&lUx!JhWZzg$gwp1qtv*P=Aw>^de z%(K^o4kD~Q=ascn72cZ-?2I}3AWjM(xXI^Tnwcpvgy`IY+b^YMDYf>kyX3L zhcB(F^oKqa*KE>S1gSt0$SaFFBAw%t{&M##-S?WziE{+JqZusfR z1FA29OG-8UyzrKKDX0r-eJOj$~7^R3{}Tn$wcYKs=q2C_|bEvncX# z6vE-g4s9QZ2dy?i7Gk@^#AwC~g+{3cZ9h3k$!g~(NmM}-3p8RKe<z_^)n=lXF-OXwS<3PGB$jf|DguRXs!m8M5RNpN1ZN0HB_O@Xp( z+`M_s9M3f#oPu(uvhPF&cnqeWn@3`w03*(;j$ca7L*kAQKZBtK)EU4hyc%IMZh91q5ae-7Um7FMhvA4`TkuWqKYHX%F)qTy zygWs#pj7t_t~LX{c`5g9JBh!7Y?F94Vd{9~j{Iv<)M06BfXN|t==^3-?v3H9h(r)Z zn1V%w&U8(=#m%bJe3x@!A!KFaRDa2m=1RXop^comCb%;wNBqu9g)OX+Oh9iI@T?_d zkX5!;a7?NrA@%^FUU_yY@Kj=~7OS}hQ?SGRi@is>hP2{_yhj(81^{rY7~`c>M&{@c z*Rhxw9D1n&KEFr>DZos=ziB_xp*L|sR7}wLOO&c6^Oukz#w)+n6(*l0D@%;fZ_ykO z)d(ZI799bqpoR?^Y&u&pW!fklu=MUi`LeL``RjzIdb5l~LuhMjONf>zCJ|i}89m*l z)J1K14QI3#ju+;nZTIdCxdN#nYlNlyLC8xQ?VLHjC``zRYRYwh=*jPCLw5j9NaS|7 z(U2*?U`puts~QR@0Mv(DS0Oi-N3~dMMRe8IxJ~@&2C`*QZg51)wDdo^3lKX6A-J`; zQ9w8nvBRS{+*Wylhz6SWTiUDcHi`pnsaW-Uzz^N7UDr;Kk6DYWSRxltT2V;L5sc(g zl!7z@B1+`@Py`|Afz{Z(d$)*6*s2m>OiElWNFpr8@$Z5=y->cp>w)}bZWVEJ$S)Z; zZu6s{@0F*Ql`ZZcR@wU6^nVUs`aHyF*`$U-w5@Ec1L{{A-o*%H8Rnp0Vja z7I?h#dY0|(X<|+ET{}DDAPQ|Njl`nQZG&%DQ_$K3n$(-rrg4|*BVGF;#2rHWTI^j5 zxkbd47pm*lv1=Mt4hC!3zXEsFkkUVvK)V->aL8aBRtF0}J*d?@3F z!{SY4H;)aR-n~V)!{Orsm7ihfh{PfB6BfY(9Lng@?^uia^z){cd+@AWyts1H_hTxm ze3Y~eU{(fF!c7ssvnvcW%|GWybU(;T6NS+6Gi+u>Tr`fnScJ{ zq}TRp&9!Vc|M|Dut48@${l_E6^RB)8*ri{WYPCSW+v<#MnOIFhC-R?vvdl^`-c0+= zt*t%Gs($gLlmhd-w-4Jp-8fj-MTm-tP6iD$X+!?^&!6QtuvVSI1ImX{ z_+Z2O;;R1`@xao@`C!!}dQf2gudn+b-~4di0h>RcvALFIgQ{;CxFFTCN7d&sw$X|- z`tuhItq$6F+x;anM#I-n@?>T?<@ZqCU`c(scI`mo5{h|>gK7#+fjVL7NAnJJFdp%8 zki5DZ)E7-!wya0iU!+R$1IxTvL2 zS69!2Eo1Xf{Jr<>?haKSD}cgd2DMtx`A1T!kA5##}q#HNFr}kbtr|OgMkc$N(R%Nph(*dMA&B@8>`7ZUvi^>bthIQKBwnK;T zup6<`=yW*Og?nGt>%k)-b#fE7y0%d_27>zbz+u` zX}8tz?c}i;dQ~gX!ra_Ew9cT9%P)`k%ec=C?sNE6|Er(BDY*~wldNn)L)lugMP$Sr 
z(NSRzf$QvaQ+IToH0@#-yjyZyO{5oxhHu(oGk5M#>3cxRFYaiyg+AU2<&7E^Emy z12C6}O4=i^Hu4=eq4~BWJyRSMM2x@NHg(dZ_0(=dJ8kd!e*ZCh7*^oN5L?k=K5ZiNT?^?i}aDa0(mj)xbC zzj?wV-;HgD`d%aGn6h%Bg+m3jScqCD9R>jBEacn5DsRxFvFJ85`jLjEX zWO!^FB3seggTTwV6Hb}CBd-ZHQe;XB+d>CQy#bZH9 z?ZDw9ze$vD?knyNm%U_eFJlbw(ri;u*x>RazQzdQ(fQXY>`4C?)tC5%=E6 zFQx2;dM+>5s;AuK`;7>y6c0ZIY!b{hnnW{nk!Mo&>Kc@S7tkArza-OXIV_ z?M|0}@;D-AU}y+?9nw_*UomL$rlyct!m}IjDYTjnJVSL!`4iWisL3RrZTa%^QDslN z9!wy;C|srz%LqGAI4Fk(Y#Av6iOVF%r@2Os=%i$%5ie2PPIyv=(6cwLBJg5wd+5qH z$hdVQH8lx#9VA<1mRne*)Fjt$-P%}^VbqaIMdqzwF;>naVb4I}x5aA(JZBed-ALZZ z+YSKRBC({smY7fA2??Tt*FCQrs9QK|MILzm zDQ3S&8BMJ&{RB95SG{9h0KO;$WjY0Fnd_|92l!WpnVBuI*H9=G*hX7 z&m_j7G|wHsy&Q>OBblvtcHxW7V&EsxCPW$FI?D$y=O$Rc_rxI+ascWtH-z)8-98CmI43j9_@G zEE74t%U`%|l$z-8#!WZYgie+SP~jaoUj-AfZ!xhQRixx|NHq?7BpL*C5%EqcvSY$) zh?SB^x|5zimdd7~i70JsJ9WD+gqlRwE?TlgekCn<&YbKc$y<4!_nx6w5(tF49DnAp zbjLTtP!|$%AY-+-8Buy(h+mc$uKe5k$(L&gZVFyG7}DOycbqSoRpgFtU;+}`hOK~9 zIb(7{CwWayM^OtZpIC0LCa-UP&%GVCPL>tL=_RiSlmcm;SG4^m`POkm#m9^W{v;_> zGP?zS+%9-nzdm$TQvVWQ0yBWma%r@kpYjKKy;${G<@w5z9`Z;;7(F1_*OHJpfhe+P zA2a4;7NECE(w9oVzQapJxr#T-9+i4prYevk7+m7Ep*{!Qb=m$x^P>#rmK#+Xru=uB zF=I|9askl=lIOL^UUlM3{JKL{tuiJBRXmJv(-B|C$|;evmbZ0h|3J+@d8^rlU4~N< zi>i-=Ct|u9&}K>KI+IjX8HS>GjMJ6!6xd>3CIoZzPYHV zs25lM>sPNVi_U|X*SN;pvMj0~%|u-RN&Glq+2I@+!)XAp%iOC68L>AEMHQ>l9i$zh z&~;WRGPvvR{YUkldA%l=yd(3{2=PDeuu|*=*kc#C4Wet~Ad>lnOP(T+Mx6>75AaNm-;={Vm0gmWJLY#NjOxG{GFFV` z@QhRTB-a}7KW6tDtWL22FgWk=D-&J{*h_70PRN{O4d0xwtChbsuRTn48+2Q$|KH9@ zA*aiRO{fL3??b9S1*PpH<*_<(B1ba#Y%PMp2gZ(eQiElawj3dO4A!8M(&C#o@qGTTQJJqKm93Np@OtxG^OWRFYE0sw2*!hunHxaenK1dwx<-@Yfaj+`kI z^*I_*mnAYN5$kJ;d;b_`&}q>t%Coby^yXY$qwmW_oaH1^A~ZlR*#&Z)fiEbt4l90s zPRDI7LBRzeBEmXCH&K**ywy2u23Etb3tl{8@&*GKM9oJdC6aptUN{{gNGI)0SgpJ2 zYo@hmTzHpzvWJWgwwB^Jme7y20>$%o7Tb@CzKHSwm4hS2`)s&3;sUqu=>mvT47^qN z$;cI{)DtkK5Sn+7!kyNS`DMkIr=j_U550#;0nLO)uReX|P}aPI75o}of((Mn`5s5r zZc-E<9Li~JcXUsuPfLsfJ>i(-GQtNK0<TXB4MQ1aI>$k{9O<3 zvNhP@?m2v6kFPn8Ozyv(%92LM>+06#A})j@V?x$p(22O*diD?Yi(1(~FB%ouAl`x+ z>mT*5kV&DiV`|jiuAll>Z3`jb6vYE8HZ%aMLT!ZZK(U6Ti?#Z-nf72*DjCzr5SfU< z+vg9#6q8G25E9K>V=T;Ed!6im5T{4ySr_z_(6OVJ`$i0-2S-PYX65U&24uMtIRAWg zrKhTn%YLxxmq6`f7mSx{$5QTNIvLCIq|^Q7uaH_$fO&=f-gD*;zDz|$#SGkbQ%=ts zscCs^6#@7HGLu*+xwmfiv{b>ILON z?{^K8?tOo&coUBNnl)>R`i@*3>u!!n2CS&&F<)yhEf}BVMFqQmK{~@qTjNHm+fgfQR8{mCXr4 zjM>Q4-=DPY-!~Z`Pho7o`&)M=aqFdsnL;s`vToYyU+$9eh3&*=a2tiE=8%?Si)PkZ9DX)7uIgx_ziqY67DZQj2=dEoliMuMj2(N-@2vzyKo2eMv~Hs z+`5LzuP>9H;)l9Wc9e?G{dsT{AuS6(Y1EP3cb$L{9fjb7sXG@PEgw31`t++0mP&EV z(I*`=Z-P;p?5MVk43K#XlG=@BUph!GJ}KlA)rI^n`O=FxQi-wPrqQEV;?KlqGQXv+ zbsPb=aw5!+de4Z0wQW3;dml@$kCqv z7`jh!;CZiV=O_Pgoz`i4@0@WU2swSM5xXe{j~yHHU@5gH*4d34s{wE`qdxQ{pf(KD zHQWwrmu9Xcs-J9&uiqE_{kQk9c2LU~3q-~(LeuDRwhVIrD|RGUw*TsLQRC=;n#`C! 
zEIxQ8y5G(dE*1YC-uk}lSU#&_T1A^9oKP)ZjIhz%gW%(EsfNZ*)NMn67S^PgbzkuG zyyUUaW{ZmVI)(jWBHsui$#Sy(arY9CEwnvo(1f45`)4~itYcpmO&hPX^U9;@6Lm*M zWc=lO*2YjNutKm@eNCrEYUg(5EPRyiRipA8M?OZ&12o1xE>14}zBxzn!2l&BAq}V+ zm?cm>Wy<Q>L-a+qw=Ynourqk*5@`SdirO}= ztH;t6`x?2>a_k=|a2b2Q zdgbKMLCkP+dJ>m{$ER))rsyZ^%nQ1Qhd{*5La8?!Z;wP5qgR+Sb7mqrtoPtc_Ipf& zAgysfFP(O%IDYz>IV#QS`-n!|-w}DE%KjHyD9g{H5}iZ*H^BF99{lGoUuLm2PCR@M zhX>Ueknkz38^w1ZHNk15CTB=u?)IHrbBnn2v@{2fF3Cl9a&|s_aH!vDAe~;!iX8rf zfdldPddmlqvH==kyH~2+wp+7!?&~GCyzAX$wlh?W?>l9Zwe`YJD;p{ch3Y&~d4T}|4P!^=KLYB) zgzL}iy7HmV1gqxIVnUfk*n1viyU?}zmcf}EjXRdXVDTaIYwBsD3x68Yzm9bGq zA5aBL*4CbkeagS31|*EMGk(*v<|9*OAUjs$ZVLvu)~~8nq=ctE?fYYDZSTv+A=bp# zoSYnSt4K+esDXK@U{Qw0T5Ml`&Cy1VE0>Ng$q9p~GAgC?PFU$mJzVm28KY*1RgbWb zwV!tf3bF|v75A1s$NmpW=oMMCsOmRjvet9S>du;rx^YGA-{r&p_hW9UuRn2OVdzp} zUDX{CR;03o5YLB(+K2W5JV=q*)FAr>o;{|Gpx+`v*?!^YZnH76{wWoHO};e3MT)VH zlv_l>R7TuW!qYmhNKH=R>OV;XPS3UHze|L@)As617jnUFT3K&xZEb zC=xBtCI5=wN%BEc@S~my>2;Omd0VhC@364eCXC&xJ?LG|VG3SiJ`{}T8nSn~EfMt; zJ;TsvM^8%yMj}vAOnZ#9ik>#QrA)@=@SF22QHh>d-jQ*}wA0FrKnojc0T7xdz?#Io z6kXgmxex~{3pV;^sD6Mx7n5M@>L@5jhgH=?0ie4`+5DK`do@v8Gk5!c)DMfLEZfvI zEYv{swKOe_H8rFAm2(DUI$Z@`H(JfUG9|m6VKY(v%K1B4PQ=%!{CB?|zTCb`^Q_2^ zY|`Z%U*s?|kEJGuAfnH1eNKKqhrpt`0!Dksr%Nt9z?70GaX-L&^e$PYTyN_nqKAHD z&{)qdL23M38f%U==neGVFp z>I#a-57@t?&yjI-OpU@CS`rr-&&W5u6p66_ht*5DrXJlZS%MEaA|x8Af}{!w763t# zya5gkh~QBP|KzYNjLN2}aZu;WDnW%s?8|{UUzMbonbA!SXhOwzYE_0vy~6Y5FI9|J ze5!f%SGlI5%axBm(CjXEMeH5yF{x)`nrD-+>1BP!(XeqO%Jef8lVhX1G>>}s#R91b zUfB;kC>gAI_q`IDI@ZM`EQq60=3@b;ygIJRCQ>ZVU%!`Tyky|vA>4ZkoWvT&oss4+ z7yMM?FJMta(7ut#QETns1G{5|CIQ@4i~qS7!`QCQa$b#0QNgSgf2^nS;as1#qEL;T-Mp)nzZQnJ{dJZx91Zf` zUa6gq@OC}W{mS&3Z?a-E*Bo6^q7s5KTy8{e%RH90EGkyiOvnyIl``K^oH+=*_752Q zz|OeUiA2IVibcdo5C{y=@wFBI`~eiEO;b(TT`(}ZJ|PZ-nMpt`F`_-*2g&t+>^xYl zBO)T1*G+t2?(1yAeAlvT<-g>eo3D^s8RY8$`V6B~m&)IlfAs|m6IV7s*vmnc-x`rc z8g3hhkD7+;yUB8i?%!D#wIO_T|zrW~*Ez!)elb%kginX1N}TpW51e+X@7mdCUt zwiIP&*%}w>TDj)Xjw$TzpY2+9Q!q2*NXsvkT{vp+7C_y!~)YgN%?!aKRRGg)#ubob| zlunIn=wUuR{T-G)2SJP|8lNS21b6J*x!1u7H|h4(9Rs)-3{XHxjZjcfFop8Eo38F0 z3K+>e9&Toqed!*wCzfIw>!K|DnX&e_(GYA3478dY>>z6(V;ow3eca^bY28koq^&yp z?Q73zmv4$dQCyd}iUI2GFdlkrm`NyVr-A=m#^b8TN5JX(;b|Yvg*nK?#?UbJHzEJ4 z_L_*JN3;9yr*)~Tnf}|jgbMHLw6I=tHy3R~hTb$~(&>aib!7jWfuQdXYr~ z$(q8O4FQiH;PrF)`ZAg4DgI16n}h!GJR3o%SM4=Y6Q;eJe8(~1)UF^~iWH{?nCh+r zny|ek5ds6}V*88+995hv)t*0p4wsO})v%Ss2aJG{l&6eDES!P+`j;xjqj z(A~&xlyF6$j}L?sF1Bx)e~pSsCY~}xTN@Mb{JZxRipE#Yqqc2q)sYaN*|%HtGcqa& z?E8E$F*I0BJ0~~O9GSYbFilM+!H_${vl@zV&i248u!d^&;;C5kO$>|N-o?Tc{2P^{+qMnSs zlh2M;f5DlOhf0!V23rPGt%vB_*-jF3UR>ICNX}6JG!ZvpT}Tq=9pgLC>aJ~JM&o9d zRP^~2;&vejh#^T~1%9O3$lTTEUEbI|cw{Hmz4HW7YZUe|LN%FLH z0}V|n@OXL>z9)bpZ$yX}NnOv55I}g#JuwW*C=YIKQV9I1ryKz=u?@YvX!xloKu0o| zgS>B2KiVLUqE%nV&diRero`Jq(~to!g5;Gs)KCPHGbC}{J9j3#JB98U(mE=A+pa5VDkiA7F?4y5Txl^Th}PO)jCbVVv%3qW);2)diS zvmb%hfoug)bq@vNVM1$ZV%Du+FX2{wJV(tmliUlD?D9L!0EStHn*jBZn`i^J+6Dh} zS}BkTZ$|T$h^%-q1|}}n84MT-JUcfapFa|J_M*IN)ezvEzjR5)!f`V3GUjSpJ@*1_ z`3a~ZdEStgTgl)8p`va{Y9oqQ5}5*-pZ5LgcP-3%uyn# zNcR;*;+#203esT&z!@=dT*n?NVOOXgC?Xyp<`a!0Br+EwkkJ}+GE(MMNn7w2KWdvXOCTK~3I=G|jC z7)HFD!xzceK~k1Tqab^lQv@XsGmwUm2Lsm&FNO$0o;ysWrEaOe z{^^q@9S+!L)Vr#va^(a?&ygzhE3sNUgxJx}YK8?>qCfKi5_&2F+}ZP~N5j1Q>5!Rt zwyET_O`t(m#|eujv{Dior8ha1@}PLBoMZqeUA0~Cqhdlz##$X~T=h?h8i*hys{ek= z-fIEao!5;Vdtda2a;G`f5pg}tm!y=1>JXZ2bs;ETe}GwL;Y|PA1wLN#aGncV2g6Gen@IaxIwHIxc0%imKoC9m{}Zx8sS& z+-~YNKnUWsHJ-e*LvRa^+B61T{e_#v-CaP=LqAg z`AGHTBh^i%Wa7Qn;dO~>k?(I89B#2g->RCiimGA3eK#cutmVIXvoF>Z@HBv>QLJEQ zJaEBl3yeslKr{Je%C06036)$=Ua<-bhoHWt7@<*WKA;V zLqyR6PEh!~x)?rg$kb}#XOuvTYnK57G=b+}8IyRpxW%r!kJK4IwSBGTT07PYA{CB- 
z$E_yFwn&oz)4CcOHW{m5rN_-kW6%({moQS)19_m^(xhe2j@L}n=m@`3sH|q3hI20L z?z^y+SLLr|$8FE4N>t3n@TD7%=vXRme5a0(Ft~JvlD9(z<%{Wbk-f#GVUyINe3YL- z4MEs75`cLc6s{5X?^loOXHli@?U>7KI3!g85VCEM3&FGG@w)@F{%D2|b-?Q`|2VT- z)(SvJ&5azP;No%BwIj4F>%>UZNJq&~Vns>fA?2w9EDWoA=6wxhx9z18H;vbv#0WTq zTx(6&Z+jqrF1(h_3<_j%HK1Ok5~;zF9Xq;s(Y(O*Rhrwo6X;$gz#FSSEuk4_0j{?5 zy3t98BI{=k5i0RjN9s8l8bRKJ#7RKNP3L#q|CfAuOR%uK5*Ej)DO?TZKz2v><^R&^ zIvqklC?Y0Ki{T(L$W+;ONw+v_Q9&rh*3Z^Sqz*E68UL*~KsarHzD!u>3|1*$LLq?N z1<BQ#1w8Y3>?kMurcqARyVoiO)hecuT8o zSir@|2~}H?pHaEA1bO2H9ONo$02kPE-yR9Erl8;d|&|TpKcvu7+`IA!fxW#BrVOO7bLV=J63f7Gndua>eVgE1wdM zVz1TQxg+F_YJ33QxsHa4btK~lVd2uf&W5RytB2qLaPFUO8CL#aBsvz$LHJAtMbNnW3~XGak^z@^fs z0(HE=VYW8TM~BImQ6RLD7Kp1}#t*+4>f^`>Fn>`Tx5ec}dk*cQa+d_tgD2l_uS1mt zpW+83s1uQO&zIK{{*dCjBsL;?Rlv(CSlhu>>r97)meAEJJ5N|4r?Aco@|I*N~DSA)hb8jlwgy(fl z9RZUx=MSP8ig@(6r8FadkD_RjfhbhY8!%-7K{JF~TA&e-YN409b!b|TM>svIkwCzj zeaAH!T&3%tF>6-%Bbx!uxRthyD}rPZQSKjW2%9l>bUPJlNuS{0V22wr-IJZ;`PK{D zV)JUlbaobie_A^MUH&tqn`cZ;_hYC2JJu z{*@0Kw}mg3$}7~gbY<;@3m4KRw`6d%WaC}z{;4{ZHr^fYF`Zn>w+{#m2*@iguCKt1 z=JX|U$17#(lDVHXBM~7{WRx<;OOi~l(%ooyfO-P-2nP>=z*`H#dUWspfd@K+^GtM; z>W+A^)=lX?0y$T!L0)!8!_Xcn6o0hH2M?}A1f~#GGy+eBlt#OEAE4}$7;p%kfoBeH zl+|k#2V9T{=PiNw)ad*lr^!Qy0MTk;iyZMM85UtBO3YV zUdG8buRIQXv*+rbr9Ps2{(Fk_figT01x`MVw0|S*jPK~unDH=N1gG@$6TE$L8oBVU z=mTYd4CNGQ+qKw!VFRjf*|n>-nOSO!%r9jo`?d3zleCGXXAuG@I7N{tlQBt3W^IK; zlcQx`OT+?<4_9J!Tie`D0~1fLKj<1?udVh&&O4EJ0;0d9VsiewtVV_bs3(Mz^oGp` zA+%mUrjyjr{Jg}I^6VtrfbUjXu+*F?fC8`Eh!GNZ3lIdZM~rf9BzqIRH{=Y;P5R;7 zR+jGO2>$d$Lb=A$_fLgdDtNZ+MQOl``nJz#TgA6uQL$Vks1)F$uj7&Azk3%jGs!m* zLTIS&V6{!jC>@&0kqY~l&%wZHSz94-b7+ItpwOW!%52j2ftUy0_5(yypn~GaC;;nC zMEN222o6$G{{z}}w@X;5#swiP4;oH!LNrv@%Zu1w{CF$Z$4eY=!c=e=2hp5}5Dh<` z9AI=pG65m>wq>+Sq#$+EC_?<63X?~mS>fA#lZ${d8l`VdtjZsbAR1wXcvZCEP2??yUu z$tif9Sh4ELzqA0aU%OAsW#jD9wfCpiZ;#uM%2mti$#2qixpE=bpppJU2`_?8?EV{N zQb1aoe@W9^Z6??=BvU$FBO?uVrI-K_+2}c@G~^U;voE+UaL7bYKplp(^-|0b$Zgap zfl9bMr4&J6+3Mey=YE%LEt>1=62U6KJV|IYeNy8|*eosQBKv86ZMWYNGo@sO1DxHU zGtpb{v>&N0RNV&#z6+L;3twO)Q)E-f9WGCDiz>hzgDr9s?CTpJ%yingIA2cZ@sO5W4TmF>*v-WoS)B7v+j zMQQEYwE}g?Snk--Kl&>F0V`s8iGVoBK7Z~)+l4R+b!z^B)8&!;geY=}7zhApk|8V0 znarZBV4Yr74)pq|E;ts1Dd1Br=;q_o5^Q$vU#@fT)?m{OX)9j__H91?{g1v?;UFZM zL2LSIy_N|=69>d-e z_W{b!!|Z<2!iLY7aiC!4q8tYWs~ z)`F0~kcubL>VuwKbDuhFB$b%VMh!CgUY8D}qB!~lb3uNdU3J`u;>2>Jovs;gS3-^1 zEM2_zho<&5Fo^m*)0KQNbrqoB$v;1zu~q5&m*W`ybsyi2=bj4v;KqfKeAD;Hkxslw z5rhglMZ77Z9?P1+f63sU!-pLc+SF8lR3kv)ED~TJJd-N=OpB}On;pQemjjihK88qIbZXCUBHzOmC!yPFE}CEo zuzoV9lh+O=eRANS*mG7-af!3;Q)t}<%jEN!G8nt$w7KSa6r|HUzgh_pCKKf*$J#w9 zXma&0&OUQnV)GOE5eZNd`5{IMV4{UDu0VlzfY6ipN!ZH2ce@^FZM^(I-6j$PKWdaI zkR~;YjJG3Vdj_YKOi`uI383b}ntZd+jb$uyR}RO`1-G__$5I-~>>No5Aa>Z!-u{B_ z#yEXWXnsnI0%|=1hR4&IQB8})lUc6j?Ze6*Po^-O|;jLQi=W&!V(M_r9FLd+C^{4XzdIMj~P>Lk26zKE9870IaO zV%`l;;vb#3i)1`>Av0k{#c?ifWxfXmEMn1}SNiv*0}GP;0sKnR@dxP32?cFM)(#qpx0#bmOe(kr1vBKUH__5+ zh-&`q<>6675qrs$Lm6MiBbM<>a`U*HNT@9h^lg_VIO9`n7bf_y48gANKdR~Ui1|t) zUF0B<;sBJ&a>Ti&dZsr=Gr`1xVM#OD3=XSI!p83Fx3%=i8NY!mK6zvMInJ*-4e`1~ z615o1#w@mLGQdAr8ij8=-(dk5e(>*c)rzHD?AFg}I8R1wsW%>FCFaz2J zXGhX*;D{`s(gFive1Hm1RR(&!%RLuV@@dBcWTcXaPAWc$z&G9dhihZF8;SscEy199 zpR1L>FU{80X@8gTQ(zfq!J+5iH2^SpOcYllO~W1csjsHd#lahVwph?^!j3l zPfa-@xY$~siHnGlBp)O@A})%i@(5@@&{c{&Mar5#`PrXd-7<33 z)LaZ&?FgVS0~Gxo?C^0?SJ?A`7cO1ug(+>#DS_z;ov2HA9HtdVd?R= z*d)_m8(xfFw*_Mk$wATy3o`8Wt?;keCbI(<5oAp9vx z0eCJUMG19;y`E#-DkxBUz}b6?u}1eH>n!;N4k)#@E=2P+9K3#$#?^{~1|r^gR8KA5 zMGL|%{9as2zC=yQEKcZ6!-)R1>SXi}Wim5=c#3AZ&o^6todbCXw211O&E}fVg|DOk z=>vU**s$o3FX%>$&8&SMZQNvL1>+Zgg5C0l;$*tXkK^NV0e$uGA}vK($S%4A@^Jg+ z%^57mDN}AT`ENJ$^CmV2E%I@8&hTNw^cWYga#+1g`S 
zD{IG`%4$EC>{{9*+-JhDcHX1fZR^-_xWSfv2I7x9XsNj1{T>}9rV*Es8__-eeWlEzBrM+BH{e|k}I)g{exEA`{(ID?yFCn zn8M)Ysffa@>`9y%n@&?NVdAK1=T8_}!*NHyo0B-Mk)_V1`FQzv&{oulC#ft>GT6!g zjQaVenwqy!QcRJ9kps=;!J%3ubX>XdF4h;i^~A_M17G|MygbLTaTI!K!@cfd9Pn=U zH}}`d#B)M;Ul$ZuCo^Xa)O|h;_ZoQK?6#RVA$`Sn3dcthWpnyIJ=O(ynanaFPjhub1?bgDhVY7f z4I|)Jlhu%H)~-P>+M zOjcEFk*G7a4ku0jT^JVe9oT*w5iOjFB$k8CQ465C9wFZbtgB= zrP1qlP|_G3IeOH3>hO{DYI}Ss^$bWqV%GUR_byhzVC0rktIx`r&g^KqfLD9^VND%% zpDcDAC-B(BwiZ(@ZV0>LIY~O9Zn^d*U3g5Rl^_rAPJ4=bjyX)>$*?0vkTx+LhT*~uP@vj zY!$zqz<%uzNw%Qi;(|zvYf!Vgd53NFQ|IxJMRy3`;eh`cb-~1nRp__pWk#vdLmc~X zp>AY7uy}#Wz!TM49lf#E(3$0>Y?(150V9%{% z=nCUjs6=Lek0FrKE)0BVhh`a!o@je8ER~e>cyyBsA>t#}i`!bCNyMWC?B_q`6x&~w zu_5r1mdp1wt&?$#dX-d=x{wb2vkzM4!jUR|mhBoIB0)wSI#@0r2YV;eyF$()@h(6$ z;$QF6KnglG&Nin@&@Q|JvZS#I(5Dzr`?3)xg2I^nl4k$SqJ_j%J2oVvs_U}( zkfFj7(CXYBGRr@LG#Dmp=d%$1yns5YS~B+S!Q~Z8VuwbaMKgGSpgN_f6KIrOEJh}f zgUAo%wG;&Bwdvhk4UupI`T@!4kUn^tS7}Lcu>)eP^Jt)-U-8@|8XweX)L?+`(btQD z&!Lz)gkX4tW{*r3S96dgL`!^0gWKTBoH!=GD;&iuT(ZbH89@c>Mlcq_1Jm65xn-lt zq>D}{Snogm@W{G-DT%6TBlj-7cWv2n4IF+qeG~z{GhprBLA1&ciHyuV-rJ7y^n3T_ z$noRCLX^KkI=BQ|BLOx4@#FjV(;zQa=c9~ubfWBjP^&+|opC#UHRVvo3Qd<f zQxH^!=XK(&`cvewhSf@oEZSMlN-?b)46xFWGXR))P00jjgqw%1PruA*e*ENBw0C3& zTg47EGP-D|`D5^@$9v9RJwd}-xFE)G7MCBu(oe<4VTZOjZ$rn!RGl+VQ(fxMdZLhl2*hUsHrx5N?@JL3@FvsbIPqp}eh^bXW8O1nuL6%0?r+>CTnEb- zIk2W~Zl_IC#`D)`QPm92o~A|^O427{li4|y_L+usm^9Qf@+~fA{?64Pez;E*u%K`( z*V%?xUcsh!ww%Jd=A7~Mn_K=dd+GX2$-{g?hO9{W4~}F3@<;@bqt4cQ48}dQ$D~Ah=Y5xzATjw^xCs`3vOBzE-@!n8SWYW0K~v* zwr2bmqb|A8#Pq8o&Q*X|GNgWYreSH6eCpDS_hqtzSlk<3|XA`UVzJ4c8^u(D{ zlgkv957S-_qP3O%ij762P?B9YFY9HOW~VJ(8sohEf{EvM z%g3$2F39i)BMMmxj!K4K+fJQETr7#8p%;h$f`-&n80?syJS}H(uzw8D9~I2ba;FWr zv?tFj7`&@xFP&kVT8d&7>;t;Zb%%U&fD`=R`X-to!x2P!TJ8i zJU?Oct*qkh2G}od6DWh?6D-FaZ)lbCBh1&*iE##)*2d4YwRdrH8hJHN4BGT+ke&Z=s#dU4F#$2vR0&ag4txlVSnI0 zRQ&$u#st@HXRDWO>zp#>u-%zYEUNm=y2``PJUFxtzkRVrjX$5GjE01tLV{Pg9EHySx=!3&Q3j2Sr(Bzt1z zXVkJPFGqkmejwx1bG2uakBv>9tjO?>sq_j{356TwEc5YrAp5TeLMQ~aR?(_!PahuT z@fEeYD#Jo&jxU72kqOMKh$}Ma4blNk-8DY$R(J$_0&plIZl#h>q5;00De;-U2M@M| z6L9Wb#^ys)zgg5UC@ufYf|TA8e6$*EqcFOJOoF>x~6Y%phFa*#^5lOLM5-6w*+8rKjhGH*wxG~sAl%bdLk<<6Wt zXZ^SW^}}356O$fSe8qF`J2u3C0HY9nj^QIwQE6}_NivFP`SOZ7>0oA4FRV^iv+Dip zzV#y>wv7iXy2x-q5VuGz6T&Z9eula!LHTk8xP!<>L#+W{{J#E%ZR= zRU^hBAZ;0x>IiRS=hto-rICP9YT7ynZPmgbG_6}xItW;OjW4D3qvEWu>Q1}70X|M< zq1E4e__%H&m~M0KXL1UL6Q3f&Big3L+T&u3A&)rX<&=VtIj?H=o~e1i6URO#(=jjy zEKKxNoL_vCn28Yy-{Wa;h!V2;_Rh z&$Tac%I;m>qoomprzB#oizM(;D~#ct6K+=LAkFafqcMd(iVRGUFh z6Z@fpvPoT>6hlJF>5BoCIxPw6+o6NEiiY8_r|Oh4GtvEY@7&oe=XGIWvI3lS^Q|wQ z9}8*$l>NwyVR8>#-Esc3weh{3z~?D1KbllbO?${Unk}*PmVC&t`v#?Wkwbre%fp`Cs6~P?WEkFeSlW zWk*6gXV>20&XKcuf$LG|in+A)d&{1E$JCuzof^XdXVE+S7B6Rc;n*g4dhE6~>O7Mrw7M z%L>0w!OGAdK=h>goq?gqF1g&=B70?fju63l1O_J{0*q-w;Xe5S+^(}KzooBs)?Vg{ zp5qkT;4}rH&U~3(n{~z0<1A#Q&)v2QkpRjVz@6>~ja0*(z7+;fmvLq|0%HETkiM^f z5=HNBymvov3qIjA2{G^Bz|l2>)$2&;QQ;NumLArrgs^^C@2(X!N(-1dguh;L|6mPK zv)-nJ#YQ!Uq@Nkc3#T?%=xpniy_YZ|Al#i;Z?=b9wBXxya^j?d}CHcQ9@9nUf+9cNB zYu053<79jh+aWW1e708~#$~%grbou9=*6U!!ir4PpTc#>zG0M;*fvs0Sp%h^Md)od#CX>Rcyw zepZ{A{0Bvco<{;X$2lAu?elO?)bV!CoT(x~7CDj3e_FFFvtemzsXnMb*(U`+?TR+f zJdz(iJac`8iRHxjvsE8wN{_EBJ7u)>vuFD3Dd#X5ZKjY3DYVjmm*%K(z97?`=HnA+ zp5#Wnh)jc!Hr$H=WGK!jyw#a}MOs8+cseu0Vcnq0pJ$#Iolt(RsCMuplb8Z0jH-eu znhK1U{%D3w5u3v4Db`%OHW-(P)s)2M$Ic&ecFl}yTJmE@j7=2!6tVJh4RE4ny|E(& z{oCpooA~Q6Rz6rlwMbrsxw!@u$`sv4G7(e+L!7S|3u=mm3o8&q5mVjgS?w{HU7zsA zKwy{AS2q-ow%6I0Woe~05tw}=XH^@t@t~?QNXgHy4#khSfq5ScCse`Dm&!LNX@2{v z=bINJ$I8A)agO9+o07~eI&V6m%Oi@ISAO%VND4oG{3w~z;$Y)+5p^Cx|4|J3QC?+8 
z$Und<$oQOO5SpEzvWCqT^*z8yBH987#h7y|?%uu2%&3isee198UE>%tA{6(;Nt7@rN&L+&@rZQw zdAK@yoVH(@&LM-8DZ_?&$8CAc5fNU>*}_PTOBp`y>5RLpn=1w-9oE@)^8R97$EN?LhQxbEoPQoSw$+q zUE~bpyS;sT3oE_!!rX0BflNvUt~kcWqFAgS9a8<+hpqwsI){GSa(+yqYf0PrO4q~i z^NXMplaC&9CTH`)OB7wZTCM(&SM6WZqNqoE{PW68c}(tdjugJe^x&2rQ%{HMCeg*1 zyJNdOjUdB-Yw*aSL)XO#OaXFUHF_i!paai!4uTrtSdlYXefP=QGH~_peelq|Xn)F? zaUYwsH9EzjNxVUtMGto#$(S~cPx92V++}b>WF$Qy5U!WFdKG$j;+BG^uY57}l+2B3 z`GaaJ1bR(vG7K zpu3vAFXa^UlH5ux)gD3l6xiXq!rMv)Af=EC=vc|zUR+J-9}0nuT^@~DuJjJYgM$po%E>9BOuHuo8vG$K1UBtz)KtxxRKz~=|Caw zhFTn?+uFgQA?l2-95Q0~nYX;jJLpg=iSGt*q(SL8!p3bmzwS;xnsm`Kk|$u)n0+$fV=ZHza#d)D!SiY(@t5QD`mR>cHDH17BOkWJtdi(%xww73RhS^6E|d9I z^Z&lrfX1)PRA3s(VtltnM7@+eTFi;~e2BtHS31%cv}un2wS1Eput5;P`eO}L#-R(< z%+1YhDA04s%B9R?!QbwG8gTdV3Xcobt*wE`Hjq7dRQ>?D79glYW0~WQn65V9CL6Cl z5;!OC#0);SxTGWiGolv#m$v{@{#brQ_Y_wz+9Kfx3GWoB{EF|Y_R&j)ZA-&dQ?3Y< zKs3zOMEtxvoJXN%%qP?l6FnZ#tETh*Z0b94;+~Ux!Qrsf7H23Lt=l8fOV(5knBT*i zX7#IEf#7RDkSgEy5fS9}?H!k$U;fkOn^D5koTnAJH0EX)+~F?ep#g^ukzo2^g&T>U zm{AHZku93<`Jm{at-&Ul+9cv3VAp>DMHzZS{agFD?laR!JH<|skJ-rOfD4r=}M3My%!# zyRD&J*wL$VW(eV2ebjQW3{h_juaIoK(C@e6gQeVnOfV7x6wOr`Fb15>hj|S(*i=mN z9|*$AiDL?GDqO;kMFTs|N*Jj=YBBD@S2f z0Q0>TnUT7trVXbo<7ifk^S|x^c+T}u`DN3t7Bwk8A7l`#=HYiP9bAcH$g;#9m>Kgm z33wAi?oJgv7g-gL?J@{K-VxW{7!xHF>SXIbmsIuoY)7*pkHtAd3tWE7GU!x)IB~P4QoST{nCwG z`tE9bP}LW`&XqsP zIGkjz9ohKKeou|0XURc=)jK~KzYx38T&C@q+085n*ft@d+Yz`IiJD_Wn$I_oq zp^!IelS~L8tYc0S09GOMbJ?K63^Kg~7qhl(-mDNu zTRFFwU&VNyy5a)HzF_5g!U@UVbnz@W`K!?~cm@^|B+LPTsT}(iq zHLsVAbEe+8<;sotP)Y>x!w|hNc+$SOIEI&v{2k2?G2}qf@h(qk4802Wrur0*ZRy$* z-f1XEC-OwJ>j z1`!=PcaF$(QijLkw3mwkcdH2ig0w}twf>Z5f&_|K0VXXUNsN5tHau8IT`enbP9 z;kitHIxy@`US2wx=rPYc1DU{!UYu`u-mB9){CH$mOTi2Vy*-m*(nGhj*H%9YL!~C{ zR`{k8r7L)CTve6bo-r+0F3%e{Y*7yM`f-uZY(ofTyMkd1Vh{7mueA5+aMo?$m=>kV z5%r5UP%Ogu#)Dj?+5&ctp2HT5d&B8t25P_S0&FY;iG_|r_nN_pR}-I^#`RnrZ)P^0hFZvY8(PHDtPqL-S2NIH(;PkLEv+|X zsrW_qv43K(e0M`FI2Wdn3DL2wEdU4yLmN=1vZK0v7a3~7M1Vi0A~)z@jmq#0^I0(j z7WtX740G7uZo?rX11a-$96C9qi;^UnUt_8q!zD?uN2?R!fYi7gW%YLGYU zAJ{r##0aI-ka0U-Uhdx@!^^u~rpO$%Is3)6X7Et9Sev7>DO2#n`dB7r14js1jNC>a zClE<{*#w1A7P?Rf8`d_o8pFrhXwi|Dc_2T?Cgl>8IM~p2oFCVem$<1B6c7opsBTG6 z<%SCNEi~TQwg_FSd`)ww(rM%T)zmdtCtJ#~pKe{@G8{3k#b{k;1G7{5vcD{0hYdvGd{B4Bl+ztE^q~Tg@0G9x#g@?&nIGXTxa?@S#-n(9lasyM=?8WQpzxA3OvDnTH{@n{ ziQ{`5$$ata%BKY9Tb+=Hac=@|bv<%SddO=rKEY(aE`_7UzR zbo#7iD8d%&lf#T+0A01CZT^?MvfwvF7G8$+hyqM0DyqF<$>PP%J=Hd3TN6k-&WP2Q zMQ?MJlS>kJnT*NW5pQg|k4G}&o6^!-84=+W~;6xYJ!{kC~X!2U}~ z%uH4)Ma7=KneVOy$q^?UI}mDn$O@3 zEq|uhJb+IC+3w+ahetM$sB&9D)E4J(i9aHeBN9I)(FODADnB=;QP*Q*CnN@KNIToU z9UTJ&Ev7ePX~W^T>iEKPr-3q-s8oFVbeTP{4#}1H<&akW#mz{qLCt(E88rv9!ej0~ zC8<6&VJEF8n@cX782?oP+gQ6YtRiMg*D<08_5Mo6p}}!(9w#Hhb?HSx@;{X`M0t0+q=^9WON>McLM}_jL8dq>OOi!e;Ce zk6Jqb!hjfa*@?OvOsCyw_9{ZAQS!d7SX@3BEURorP4Z~=O+C^n5R4|YX|}ec-|-vZ zL{ zhX?2Ig||AjVJa59T1slVVTu9tkz{bEblas=#a>P-f3h-Zpi3?ZxY*ZYRNoloLD{Cw zrlIA;BynVvq{zp|lzP>FVdM`u2pBev-G8Xjvq7DjwCUNiIh3bt5E>Cl1McdReghqt zEvrZIK6PqMrc3l$@=0v#v0Mm&_sl*P64CaWD*onCJRJIdDhjl7;xz;wJ%rr*z|~<> zrbK<{6J;L?pd--_0Y?zv9j;7y2*Xe=6y}87>8d=K*1_y)K>4*A52@`5_0rCExk~je z3BATTf#^YdBz2(j68Sk^lR7s3y%S?Cwa=VA zJG^QSv6c9LZt+JA)`t`&Bp?UsjKmAce_@>wqzB9dUBlVFpUtIncblhyM1YN#J^4(D zl_>|J)-vZ3nmN~Jbh;-)Xc)>D#wm)@Na$sN?zwq+B7T=zpR;6WqgotYDK`_=|>ZA;N7(HX+;o8M>5k=>4=kfNMYFAi>_ocn3sy?^DGHgxTbALSrH&LBzT$`bCJ z|H-+~1{Srfc|7UFVh)}t-OdIAsEN(@>(|B)7e?z}!@oqpy39;D@yg5RNR~~+rr*#N zC{hfww?AmVPE)y!wu%0q0Q{x+$i}x&8}7jp#D#Oz>k8msrSp|wsT}Q~hPg1rMd=3- zQD$I4+bet5M7J@yI~ZFrP_g_ReK0092zJ9mm-J7s*ZksMBNi#+vF;XNGWv=NY1#nU 
zOY?{i*7*L;uMgAEhhF?)Jn!{=3uIaK`N2U{PoRs%fBmHfe++(}$Nf3!sL?iIWsu7ak8@<+Q-J9l}Am8g`8&_wr40V%c|i)+_72GR4vD zjbxH_(86+#LJ$OMZ`AT18jPf=i)+YapjfsByFZ=M5YC(idw2vkcR8^z+!e`dVU$z#U}{H?sZ-@sWrnuoZ?Lze zHiYOgciD(7j;LaaflJ1w%r&Gm$b~f`OvwBl_Etof!_oV?{=)^Z8ag^0*E^`N!d;M1 zNoLO3g!xs@ut8T3;`5BHlu&czREMx$0q|7d)joIe0ObK=NV+PZQIDdfHCR&>y*uU{{F{ut&jomd_IYCrTsKSDI;l%)U5Mhde@#=BF!V4S(5O)EMkGSPA zoiH0nHD%qJJAY!`*&&N%AlOK!amQlqAaddeBTV-GZe6fM}C3=Bq#F&JxSd1AxL z=H&NKrUaY5lDLRR#JQzk8xkhOfAqk3$1TAL0NnD7m;b03BB>ai3I>@S{%olXX@d<& zE`lk zxv^|xs4BFW8e$lay{QOAgb_}HQgd(8mogPc84<60hOL3vKLaNOF(Tpdrf7xb@29UU z-4IFFapOj5*tn-&fkfQkzlBpofi2)O4JwD*xi!(IHds;hvxn$pF!byLw>0W~bz+lG z|7)EQ)#R;l_AMSA2y9lm0?l!G~Gv}Zh20{Cf1=K61MUr zgnh-hL^M_MC>PlcJT2y3QC}JNhQigEUA^yW+j@|SL=Igzr9)qW`mTPQ z`_Da<4_0j|2bEaLQ2us>bPHd>b|`QLdFKFLo}hJ5gHqPpDVMvps5fP9lhW_F-H2^z zzLqvZijX1Z1)WWdiUR0Blz!Ob2hL;cxUTyx@7p)r7-}D(pTJ_@#=*U7_;o!o_7weJE?gZ zbrnid@kIe)UHntCaopdhYLSzX!;W%rFxVT=T#611KD$lx^t?hUo=HwoSaRj&^u5b2 zLup22@?tDK|0Pfz#v?3IPV{PfXABN`%-Fh^;q!m;LQb$|wpW7IiwSa1OUtADf06SL zfX0&TY@;V~cInsS(T~mAgMDX;3hVUg(|v-CmE~agHYij;>Ql)yJUEsixQ`*ct1n79+fd@nE?n->hPdV8Vky@!Wo^E z1=g@f_zJMeve@pqG^S`ppJlz25W@XX=Mp_s5#o@d8+7Opd*7XA7HCIpP();8w=Fb1 zk|;J+6*MS1%mbdC81%HcuCA*8gr<#*)IXLf>HykiNXaH^ICi@ z6bg>2x}QX}UQY*!9_phqXz`pJ(U_iu_V zL>DF~rtj5|pedG)+47?Yq0ozOj)2uwT9$Yf006qY?S=_IB)lAixo5!#xvJWh17-`ozYUB~B{xBLG>4X&I-)Z#d&S)JP zRZ5bq;pjJ$BrTX8D@%p{NS82NjQ@|U*%&D1$ONo6E!VbsRKGB!pG4aDjg4S6-I zh|iX(kwnoz{I=xBW`o5b49{e>j@tiTLa(-=7~*mai`)2wklP(1UOP3wl7Ryoh-wOR ztR{q9YEEYzDwFIJi(VU~zZJS!C|tM!P)$9pf46gMn?3Z*r1Jtz1yhv+y$<1I2o%wT zVnPNhb)3=q<22qrdW%lJHAai4O{&!v3@s+>7?G0TM zocvmW!=(a2amY^ebZ@BxZy=fY|tB~u*T>`1@6D%^*gh?A$MW6F+Vw{INb|6&OYvxYRuBL*NY^BoMW&3heW~%O*0xc(dH*Mw)MIpT93dW_O<*S7- zgtTXj_Zz$_#OjN#A28;-C8Ot>qj^C>f(7^7Y#CL+Pl_vf3mfO&azxW7N&YU47JXYW zZW5DDbR$XtHk zKFmn|Hs)<=1J044)%aC$LZ)b0A7MY%K!WM@xrj|R@xs!uKQ4S795AZm9uh3EY?ic; zuvKtjP-X;sZ&l6_N!5bM=BJ6ttaz#DYALtZRsVX*6KU-W|*T4a*y^qIhL}VYwAcaK<>0|H*qS zAsiNz4wsB(&S6kVg>FH{l^v*FYLw4DTXv0gDW>_k`T2qL%qUoZ7VEIK$lw}_SBXpj zX1U=Grhylgn!6@#)X3)3`4uz-L%=y`-wc=bG1XlL0y~eEHC)-U$lE11lR|*R%$gT0 zBY0=r&_T+ovLQR+hgL_M2?Rl8v9sL7!I($D)g7)09;W)0fhQCo=Q;+G>3se5G8h6{V zU^kQPmFpjj&#F59OLjZF|C-Mn+zfiI zSoI#kpKfRE=MEI9_*WzpL>^Jz_O;og!k10yxaRnz{I^xA|5d5vyCVUysG{{qg_jjZ z4Pwfy5k#Mg2V1>@q(SeEBOJh-EOmgNYX@Nh`* zgLzjN6@CKeDD02~$pBIj1t5tl$>=c#5Q{Mut5R+6q|-@_P^PhO(d!*?ev#yQwh0)y z(rA@F0OSmaS5uIrh&Ag`+L~bAl+Dsl0oTitVdAKMh1bh4JyPZ&DL#eFT_!=FEMsX` zRQ3^{DMoP0EUAnKO+Bu){3Tn8`vaYQT;%kQ{gbYTvkWB~(U*@Op=3Gr48cC%d6TCxiNp7&0V8_}DP57cf3SCOFTi;vBqo z1Dl*nG`q$kmPqdBZX%Kl@|Xp5olEC;DKY(2)ngt*SLRG3x{G7*kqIdoz8!W^QU)-8 z5m?IT$HyXjWMxs14#d&0x*T$%gK14h=);pPedyA$x;;@7>dhQNgHfwxGiBh482%8Y zWB|>CkkQ6MY@wpT;Pf|}+X@AUBGD)O&p%I^Nh}C$C`SR ztl51S(&9i%OhciROvfwz)~zG=clw}*B@0kpYkG3lgC>9&mzCPh5c39WO|b)=EIgFY zpObENoS(pa)o>!vD#)_DkuzFB=K91W_$*ucEpSmb-$OS;`H+s7QDXf8p?zxAtho<} z;Q0c>L?tE{-4+u<4agy)$(azMTI_0i@bl@DCQEA7u6_N(_+xH^Qu<+7(V@p<0S>(K zW7Sd?M453zLdGealFuusLq=mt(lDdJ;Zo%Z5-!R)f{F}Bcy-Wjg4e$ihw|CeZ7(nTC+o3=*few&020~2N*9CO)gY^ z^A5Thr4IWF=r^6O@S5LJnG)H^)H(JESk$kTtyG2`1ZNvR_g%gwN`Ve4+~SN3#5e_>-J#rx>!Q*{ZVB#e5d zB<@_imka)l5-|fVxlS+XXaAElKpc#kch#C=MJRWq5yvkJE1ZnxFLX%Oj5|PWc-6Oq z@8xE{ScMxnaOUZ)w_4Y{n_AA^{@E}|QO`L5Q=t~jCK2#D+2MHgOPk8#7nbE={M5L= zl3u%dg(;=ci&MIEK#siqQux~hnk9AeU~_!?A;wL406oi;(a$6DQm?# zohMyrfv8p#;=RIqr7Zbu;=izN=c1-9T6F4cybMYgfYz`L5q+KJmRH?7`~GO`waOL6 zZv&>>tJSng4;%X?rac@T%rDjIVQ+I~+0B?g;vx#HUF^P&b=fiF&;2P|W>+S#F2B`C#iZ*THNT5Tu=c){lC!Me^dGBwyy*AhLT0A# z197C{EQsa~wXMV+#7bRGbA=dzp8Is)TVN=fAya0~TuYCr;jETRaJXz8=nu#bI<6n+ zuN|GGZB5(~rxRFmlbJjwOAs@TjewB!3_*ya@{{{5cP_55_$_+vT1|=WB$MrE_!#Hc 
zUR9&PCY9sK$(NX=7{p>KCQLg-j-EnJqo{NVd9qjXg7>ij=H#fnP|Fo-mFRw2)#0Z48x65W1FQ<^a1N89AJv_{90#- zh`zI0E$P#TY3B~{%T}q?lN1i2fdQwW0(yN*jOk=1!4SX~8Aq$g{do!N%n5&`a5Q{d z+Vo>%A|hj6jXvrg0?*#sKPqmh0U_{_OIyqoa{dxUJzym(E0W+mW+_ko>?=+K1ZYLeP-kRK+ji2y0A2*X&`}-KtI$H#VXK#Aj?ER+UHuM9UT{)Fr>Mh zK~oTqVo-$C>8&fjR=j_2?DLS4sC8hj>xn2oGzVe?!ZF;*O5ApTzJp+Q1~|xs>A#AGE5PaCg=+g?)#{V(%y<1Bg7Wh-i0y zUh22}ho2x0Fet28>6EtnvlCQVX0|sX#MY2Dn5qI~DJsts6mm@I8Si7B?oL?u&}!bd$3Pn!iwunF0P4E&M|f9g7VfSG2t__ zNX@dy`3KRr&jK_Zu6L-w3m5(Kkn1|1ehYC&Wq?9@BL%Z^a&nr28>srzoil!akE*AR zBAR{su=2WhH=02}xoxDsL6zR++1wQR7o3x!RLt^~IL{j3)6*Kal}muCS>T+Uyg_JO zmc+eh6{#x>%*>8IbcugZ^|Pp|4h&-2llziP3N*F2lR%fmCfJ!YU<)^p(nv-V!QyAN|Pdm$y2hGCb-Q28k5gzM*Tm44tbhj97StcI&$tOvQo*Y8biY#jAS1NoH=z}!#h zcE>c?VrF9-(-5|Sy1IHA4nmp+y z&mxrOgjUNw;>r1e`HX-V5vd0b9Y1z#AlMSVAu&5Y#C1=Z`&@%XBf_&)>Pm92*u?7? zJHPC0u2A@82ZY9|UBhe!RGx8tYe3gZf2OC$&!ICXd^GUQg4B}>oFGpM?ZdEsJ+_8O zDd>s_14anYPIGnDY8yoB##0cAvn*)dF~qRvzLh38MA|fzMkvYAoYMS`&;eue%^xhz zVnaJ_siS`sdvJ-!w`QaG0-kR{W@_EkX*K4+7}{Dmz3DTD*X{ z%I7d5{=%tSfn;Jb{1M!n5>LJFMi?kCfC4Br+^Ii`)(eXsL{gM42Tay{7_dKYGULHy zG>G(<5RHhN|NXMbb6Ok32QgQ{hmVG*AQqMq_Fx7X`;K!Qh;4))zE#I2r>lzf7n%ku zGGW8GAmVfERc~)DGAMUsSV9%2N$lk1)6Q&yn5>gk$z&n`4H+@N4Tp^s<0a0u z23bZn1OD~Z1Z8J$DqWqC+V zL6e@G$mu6@P;(EC{O#P2ch_Z%@9(>N8T!un(1ZQ_<*%K8z%e0ncAiY1NhaNJpD`vQ z{fS>$;%RYE2e&CnJpFhjny%Dy z4V$qPO83z#QEG{Jy<#K*3cC+Bi9G+k0kMNc0f8J!m`!L*OLz^!61(uN)YzPEUg^7Q zSsf3Bh01CXGd(i8_ZotVbKZ@T=) z7q8uEahv5nVo&0$#9-Xtu)N43nGuR4!Az#Uc|JIm$fq>pSqenX5nVz5PbdP?%tMSB z@8KF|l*a3@v0DH`9|fU1KpiGEA+(Zsm?3cBF%?QHZuxl2eHNq-@DNTmLbwM`wZ_1p zD<=;AI%yDJN*fwW9um4|opZtu2Z8xlrbPIxpJ9V$~}1I$2O38)ihnWm3{5{yd2v=`LTo{2dBu%-pQ=go@Aylu=%j<+}Wg}}jqmw6@j zZq8ig1N}AzJ`T-0Z1c-9Y+xoR_5pg*aIg`c)|)qy0LXTQ__MihmncUg88(JQRrvbY zou$JZl<{-e{ej{MnTW?1Cf;_Wjh6k^2X) z1GG~>$CaBQRqlI0(xlv8P7O)1gzRnu-pMHBQccq$}PX<>dq9o=Qqos9Tr=HvHZB=FJEZ z2Aj7!#MyA5!r-5OF&9DiMy^=6ALIz6-T~d)Xm$Pz);k-W2w0)xUSe=L@lJ zUO%7iWeJ{$D=4+?Z~paj@qma-*)M7%Tcwv&xT;0i#%$PJYA$zc=J~DJ#A3bLKd10^ z3|1XYiDNs`aNr%rI!b;sQz=G*o>oHiTs-eKD7^;QvHjUPaA30 zRon}HG5elE6QM1yDe8+2gc6tui5%ry8Mo> z>Z(=Ce_fy6zwTgCdcZ%wf8kBT^7X%dB%NC*k>ki<)xR%wowZJ|95tbuEklenrG{$I z++544S(EA~wktdRpZo8o^?&`KrdIGE6JHgQyA22r+h9q>>eRL(V^qNY9)uQ81|}%{ zXj|laS#V4+p51j?e*q)`PGJ1hV~BZ>l?t^M@i`q=IG&!M@U$Cbqtl^se&KP~(=0sz z&byhEzn3WT6Tbfm3t{l!6^~l<=-ypTLF}>KpVykOjb)F=r}qqJ7z^`@S)8(?C;d7smS$8=7;#lKF*PU_!saUqaNmp z0YD9km+MC*6r?V1x#jwEv5>dUoad%|rX|$|_ybJ3tZ4g)Wz9BU?A9Mw7z6h9-g|?| z4>*l=;!l`Vze_V1D_m~m*^9h)xxtkiSTqdwLoA#Rja^Di2_om?M^WQxDy8U|{EiIF7{*v(Awz%sg(R=Df=F?1c zJD8pQ$7lvBY~XnDPf1x6HjBOv#HqaTyM3##_5ha`?$`PKxW@;-w5MyY+1$*ga05~S zsJHc~u+QIhiL74XWWDK_n)v2)5N&E<;pdd^%}w0`-+a795C(P~l;nHYLiycCWrVi0 zF9H3bcKJQYvVY#3-tHSqQULtXT#G#JcC7wMO0ZQ(#Zl#)$;dIxPZ(p@BkvBboM)`2 z8TxE8CPs#HT5>$dgc=ChfWVI~zI~XoiegD<#Bd709{aJTiPPl%Q3YO{6~4>1Z`<~A z&!pH%7yBvFXfm)FL?I}rE8mnx$BdaK5)nseZzH0;)g5=iD@!WEPlj2$U~S$JGj@H9DV@{9?d{tVtpOiX+;mUDxe zPDQ~Kiuwv^vl4=j{!!GA$0#g6Y4~%7(1jWweVLOu4U!*uAYx1h#LLv)HE67)PP)8d#lRj&aL^|(03!cU$1i6of}%MEBH|Mc z_hW^Lyp&Ul7Mq;=IJXbt>=C{I;u(zKnz7@VTe^}3CVD-_qyl_|t9SObX&~#`=^9Iz z&P8lp4H^ENlr4E>oWVht4whgVHTj7#LI~0+PF&^b8P{BR?v9 zJOyd?^XtBBwx&FVA+wjyq}N6s2=!P*eULd}{rCu3GX)@nae@3tHO(N)aE z%ZA)0>Tgo&fl>C8@iP)K7%3iFA${L4i`#$^^NY$v@OfZZBVorAmJ3Fz4G9cw4LTCQ=$GHHI7F*Ye zi>fJL)h`b z^JmYB@7VQMmba)eo=mFx)>PUZ)0=8O_3VtPY=%kIEvHX->E)%aJMNU~+PDe#|Jt zMN5`Er5X?dJ3Z#yxY6xt*=f6K?$O z10a0pqSdno9IYVmjd}33xs>;3Xv~N(Pi(fz_g4L=&3al1(JS9$l+Na>Q@3GA1VmAn z@$-nkrm&pQE>4NqyHmb2__n9HOs^EV9nVJQYynPn3orZ>t80nRHZKVpP8(2>ApR?0 zv2D&(c|kSA_mqB$yuxE%1x8@20`Qa`A8G*HE9zabe4^SEB5iDZWV^4u+BFze?>*No 
z1Z@Hu0AGl{o!y9n5$XZZ*Pp<>&jy+}ifJby_tpExOZB#d#<)m5#X1@-rT0p);t%GfYwjsNMZczwDg1fp zR_(TwaHo*zWTG7ZOqlrY?PD-s8FL1yhO~*;BHbVY`hM^CXSx6-iRw*8v9c;LeE`;1 z;oXSYW!Xae-7uNaIIBsI7Yky}qr|a8sQNXpaT*7WXxE5iDKa(O$bv*o;7(?MwPnK& znl|mA4Ckt;pxVxBI1BknP*Tz#+kaAhjX&v`X9cl66bire6jsB13S(gj#0gS5DH!2yH(-W>w2AATO5K4q~Dlq&lkUb?jGQm+fmcO41Ns!&6d4sk@ANkhoAOyOS;f zdgs@(1Hg|avCnyE$4cNqBG0#XJNw_;5BH|ysl-zYzxOGlZkK&P28c8`mL3&hfLx(WXaIQkVRij1yD) z9|WlgOWx2D4-?gomRL%wPOB1|t(ok31 zsLMYFeyR?QJ#YSHt-9YxbkYfB@;juQC8!KgSGAx-0hdegI=kGu82&L@*eQ@EBq83r+<#B2{I&+s zFMuBOWvn~82P5B(1#cikyAl8KAQ}x-EtI@8l+iq+Aiw|anS8(DtkDh*Ho#o6nLIJV z5fWn9PnxZbSV0Xa{wRNJ(8UQgiDP?w={Yj>rVKUE!LUJ_YDAmvMh2pLVD;hTbmwf7 z{fdLY?g65lKauk>3>X5)`Ja07Syq8s0PrjYOw|;~;V(73S4oA74)EC--?AoTv;Dpl*ND4r)HBa1N2y2fCrQXI`@kdGq*jXTQo%H5OJL)ljgSnI%UhN}s7I^*5N2 z)(sOnAnmFxscT=%1&VeU-Rt^PGnQwB=jZg=t-`(D`$ZqRcoreN%vOZ==2&RhClX}& zRcY0W%!Ie+Keq%PHyP#Ol)zG=4uPFn``W`CIf-wg?h!ww4{wVml{)Mh4pE?WU>Yfv zih|fo+nBo3=B#Lwt<H!L5Ab60TN&TkabAC(_$Z4deHQ%v^l2L(!`ss06G8=aBky#`|PwbAELx-eo=@6-3N57^x^g-}5ucNz2RXEj-8(B!%>X#aIsrAny9cESdz0 zA0PKUFG9mSP(5nNovD($G1h1fLQI;n8C2p5K^H0a#H2_A;h9)AO2dI;!oqC^<8l43f*AEiq76Y+y18xzqsy*!2^#xm_sNN3L||xnO7I6 zEv=QX<%>VRS;wtLQ*jAwO?D9kd{LZ`bit@X_oW0rG>PJ=D-u3j%-igAlGp*(ILG>>K3@4%$g_go_VYRU$CLw@{?nnWqLA((S*1Iw4}nH#iU(?o0hRy7$K`$xlVAzK`M-Y7VlTsW zXODzl<*#kE&^37&FLe9B)yi#YL7V7v@PLueK(qnIg*t#*`bKwT?*3@`F)beN_WkMJ z>*H&e%=mXt-m6F`wa=c_Buj=9K`tqhO_JMSeuucM2w(Jl8MvaNvd7eldo%?$A%&ma zm-fApk8AQEx)hKjsQkxzwNk$Rc?%Ypy#M)S$5iec5=VyEp(9mjjlct91|5U$goPjy zKN-2v)y(Lt_dx=VP~O--LASmm4wSj(G-U_k0K#YU#|(ne74vr@Q4mc8Doa_oKvzpE zzrHvFV~A`-n8eJ1O?A8|&YG}J%IW4w&d*K}H6l6in7<}H+R%4-t^A^*i&lm1v6RWw z2BSZDXObny7r^vw3#Jldvi~;ioGkb~%=bM3)T*NqWQBfa1nx$wLA&N|;E>$I7{Dh4 z`qWu;W>g2n*^Q%x+#YAOHYgkeWh(fI6OAe>E4hMGHfPD|*;kq}r(_BuNQlc>bi=Zi zygh%Ytmwgm@rC^?ESv~gVf1zC0C9-y_!x**9zqG%?_(w|ipi2u_Jb(sgIp4iTp8ne zRI+zPC~(_~HxAbWzf`U2BOn_YURRC*2-8KAoBGAu{;O5r;eAro4#AILB5}6X{#baKhBIj)0R#8bvkLj(b4Z0TDXo*y$*8aTAfgsYF*qoZ$69dWoN6uT2% zl1PXA-3)eJE`{tx2Sq&E$I;Qj1`mfgF@OWw)= zOozyd3AAC?33G)m>hm7)Zx&jIg}&^<4>zUp@dP}7OKM}(0~`lB3jEscHF^NvIh>sD zNd70tKWiun#eHlPu@oNJkNkO+urQ)=eSK1vbn(cXM>@i_mSI+=$R3P(XM+L`0)@C# zyVX*Aj~BV^6P=BF`Ife2Y3$7(akDpvdqbbOb2HG2$n;K6bKSV>aVKFZ6bZMVMNMDg zXZ|Om#{ixoKAZvULuc!tsy2cC&Iv)z37wM}(5HbZZtRDikkCXvE;tegT*RBCcs=C9 z;w`7V5ve$ay*&Ta>0vNa%)pUSS zUppS2P-4jtAMDG7e{xCq9hB{uYxF+dfObe%4y>l9}>C*D_ z+FveKeeZv+Jfp7NEJ-5U9^(t#s8z%!``QGP)b0R6Efo70T4i_&%a2zFqqNA^fg63l z!*7p%P{3I=M!0B+_ctyW4Rnbdk1VlAbQpv6R62=7cBXd9k-mp;taGt;cQq3*l#{M~e| zBeKWUCSMEhC~P$9e2)G0zCE2x+O(;WHmS_{#~tWD`|LXij`ZjhyYDa?BJ*wY@&QT8g>2-1*!d7dPr`!x#tlJkl=DT_Rl^L~nlM=ko+O8Z51hLR-Y zo3eobOYXRy&<|rGDq3sC_ zOOsa81xheczRM(f3L%BiCs|wR>^yP`)gPQ%ihbYL^tZG;bK^bbcCv$SC7iqe4IKD4 zH__1<-iCob8%o)Kmb9_6=g{V~U@=$AHf$tduvY*S>+|}NNG?msY^-CrL{TW&hs@k*9DT5xJ%Z&^J&A^5RbSHiCfCsPpAG*LV~ScEzQ{3jqQ zD{F4kqWk}*CED&TtEljJbhoyGh9LO*6UqWVFlkd<)*emz8<)EwbV3A{qdOxIKt%lW zy`+fyoD+QepFMll{k3)Y={Emx0q!R&{84Rsn(r`c<>m?wj3tZh&B8g1*;d4USa`6D z>>mh#{%ks)WuFzboh*C}ffjdhzLMKFK)!BZXc%+<)^Cj(wV`(v$SAgZ=a-y5v+(U6 zfDTC%-0(Ju20C*?DPZ?n5_`K{b6pY&gnR&wKYsSk|3Fu?+tqRj3|m!yIw&1ZSG~KB zsNJ46S7`sLAEUvqfNo4oCY(WD_yp{Uyvd2n3q@EQ3KAeB$z#acC=K+fU?-OL0cchA z4_&*q)9J0H$02AkFy9dz$ z0h5d-E@7VONbHMPGGYGwlW&^-?BCxbaTcukf3X%{pP)X>xO#P%KpPL*kK943BD$%2 zdp2&g|Lbd&33SQ)N4|Fr;W!JIJfl17T-pU{N2EN5KDtlot$YNzQ-n820ShR57CEKQxQNr}_%LStI}zMOBqelg@u~@kljNG8ffu zkOoPQif-P-#jgy-=+q(d08RStLUp3DZM$gAfp}MNJ;1GP%zmC4Me%$3@#4A|E@V+( z82zTJP@H=z_&xZPpmLz+mneijR<;WWm(rNS7-541eBF1m_N@$PuaVNpICbMoNHI@v8)U+(YTg!>k zs4+Bc_<&ucQ${wie??WLTF-T+!@Qv=OTUUU1H+xzoLq(fdrtTQOXf*lrk}NvrX;+^ 
[... remainder of base85-encoded PNG data omitted ...]

diff --git a/docs/cudax/stf/images/graph_01.dot b/docs/cudax/stf/images/graph_01.dot
new file mode 100644
index 00000000000..387b4e0c4fb
--- /dev/null
+++ b/docs/cudax/stf/images/graph_01.dot
@@ -0,0 +1,10 @@
+digraph {
+    T_1 [label="T1\nX(rw)"];
+    T_2 [label="T2\nX(read)\nY(rw)"];
+    T_3 [label="T3\nX(read)\nZ(rw)"];
+    T_4 [label="T4\nY(read)\nZ(rw)"];
+    T_1 -> T_2;
+    T_1 -> T_3;
+    T_2 -> T_4;
+    T_3 -> T_4;
+}
diff --git a/docs/cudax/stf/images/graph_01.png b/docs/cudax/stf/images/graph_01.png
new file mode 100644
index 0000000000000000000000000000000000000000..b269fc6067f2b2ab4c761041d4b8741b749cab14
GIT binary patch
literal 25117
[... 25117 bytes of base85-encoded PNG data omitted ...]
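For reference, the dependency graph encoded in graph_01.dot is the kind of DAG CUDASTF infers from a plain sequence of tasks and their declared data accesses. A minimal, hypothetical sketch of such a sequence follows; it assumes the umbrella header <cuda/experimental/stf.cuh> and the context/logical_data/task API used by the examples this patch adds under cudax/examples/stf/, so exact names and signatures should be checked against those files.

// Hypothetical sketch (not taken from this patch): a task sequence whose data
// accesses match the nodes of graph_01.dot.
#include <cuda/experimental/stf.cuh>

using namespace cuda::experimental::stf;

int main()
{
  context ctx;

  double X[128] = {}, Y[128] = {}, Z[128] = {};
  auto lX = ctx.logical_data(X);
  auto lY = ctx.logical_data(Y);
  auto lZ = ctx.logical_data(Z);

  // T1: X(rw)
  ctx.task(lX.rw())->*[](cudaStream_t s, auto x) { /* launch work on stream s using x */ };
  // T2: X(read), Y(rw) -> depends on T1 through X
  ctx.task(lX.read(), lY.rw())->*[](cudaStream_t s, auto x, auto y) { /* ... */ };
  // T3: X(read), Z(rw) -> depends on T1 through X, may run concurrently with T2
  ctx.task(lX.read(), lZ.rw())->*[](cudaStream_t s, auto x, auto z) { /* ... */ };
  // T4: Y(read), Z(rw) -> depends on T2 through Y and on T3 through Z
  ctx.task(lY.read(), lZ.rw())->*[](cudaStream_t s, auto y, auto z) { /* ... */ };

  ctx.finalize();
}

Submitting the four tasks in program order is enough: the X, Y and Z accesses alone induce the edges T1->T2, T1->T3, T2->T4 and T3->T4 shown in the figure, without any explicit synchronization in user code.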
[... remainder of base85-encoded PNG data omitted ...]

diff --git a/docs/cudax/stf/images/graph_02.dot b/docs/cudax/stf/images/graph_02.dot
new file mode 100644
index 00000000000..37bf0f817d1
--- /dev/null
+++ b/docs/cudax/stf/images/graph_02.dot
@@ -0,0 +1,16 @@
+digraph {
+    subgraph cluster_0 {
+        label="device 0";
+        T_1 [label="T_1(A^W)"];
+        T_2 [label="T_2(A^R, B^W)"];
+    }
+    subgraph cluster_1 {
+        label="device 1";
+        T_3 [label="T_3(A^R, C^W)"];
+        T_4 [label="T_4(B^R, C^R, D^W)"];
+    }
+    T_1 -> T_2 [label="A"];
+    T_1 -> T_3 [label="A"];
+    T_2 -> T_4 [label="B"];
+    T_3 -> T_4 [label="C"];
+}
diff --git a/docs/cudax/stf/images/graph_02.png b/docs/cudax/stf/images/graph_02.png
new file mode 100644
index 0000000000000000000000000000000000000000..3fa4b8708266aa3de67285a2d1571adda2ec453b
GIT binary patch
literal 33089
[... 33089 bytes of base85-encoded PNG data omitted ...]
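In the same spirit, graph_02.dot depicts tasks pinned to two devices, with each edge labeled by the logical data that carries the dependency. A hedged sketch of a placement that could produce this shape is given below; exec_place::device(...) and the other names are assumptions based on the places layer and the explicit_data_places.cu example added by this patch, not a verbatim excerpt from it.

// Hypothetical sketch (not taken from this patch): a two-device placement
// matching the clusters and edges of graph_02.dot.
#include <cuda/experimental/stf.cuh>

using namespace cuda::experimental::stf;

int main()
{
  context ctx;

  double A[128] = {}, B[128] = {}, C[128] = {}, D[128] = {};
  auto lA = ctx.logical_data(A);
  auto lB = ctx.logical_data(B);
  auto lC = ctx.logical_data(C);
  auto lD = ctx.logical_data(D);

  // Device 0: T_1 writes A, then T_2 reads A and writes B.
  ctx.task(exec_place::device(0), lA.write())
      ->*[](cudaStream_t s, auto a) { /* ... */ };
  ctx.task(exec_place::device(0), lA.read(), lB.write())
      ->*[](cudaStream_t s, auto a, auto b) { /* ... */ };

  // Device 1: T_3 reads A (transferred as needed) and writes C.
  ctx.task(exec_place::device(1), lA.read(), lC.write())
      ->*[](cudaStream_t s, auto a, auto c) { /* ... */ };
  // Device 1: T_4 reads B and C and writes D.
  ctx.task(exec_place::device(1), lB.read(), lC.read(), lD.write())
      ->*[](cudaStream_t s, auto b, auto c, auto d) { /* ... */ };

  ctx.finalize();
}

Because A is written on device 0 and read on device 1, and B crosses devices as well, the runtime takes care of the corresponding transfers; the edge labels A, B and C in the figure simply record which piece of data induces each dependency.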
z@-4qSH!R;+FHU>7i9k4>JYmVMZ(v}6`}A6zJnyd}5$W>j(qp-CKy^-90)bjFB;v82$RSQUzdNhzD-)gzf4SWxbIr?p@HHEo zXZQ|DQ$yntF4yZdbHdwut%3_*qo)p3ko8?xh^S4FvU})O|2gxN?9Qh6gO?m$&=FEB z_}F`%WM_{zh?moE-P&E`_-lUrhlzG%Ifje!sZ5INT8D>EUrfyr8hqXJc(aJ5H?o?e)Q-`j}k; z0vKx5sQCEvQ(73my+-Mpv_sQv)?D( zzkfekQ*es+3LZ(^tfYd+)6;WwbW~beI$Ztn6O9Ogpy-&G{J6b)_pQ__Z_wU~a*s3cXhU%Du^iG~+E_~GdtG2rOkd)MX%Z*Dh!Y4??>qY0Lq6-V9f?5`* z25E=w`49536C#CW?tM1Y(xM~Te13LvKuAk#W5LK{b|fl0+b*s=B{x@`bo02deb?J5 zT=lqcOzq(I?c2|n|1H+k)zwJ6yNNlh8bdugGBS(HoIZV;$j=roWY*Tvag&Kt@>1&AoIWzN3{(GF9)UB60KPlSW1nw{MqtE#0uN==0>t&CFCK z#@@R(n|V=^CeWYh`HL5<;+fB%lZftaZeFW_6a#MRZBb)it($lxIu&{7*b8~UYyZ-H3kh*4XAHlTTAIhsPz+9MCQlcO$TlDLG&3!S` z=TDwEuFTs_MVq|2wrAfyapTv|c#~PN<8pOrlS1ChhCF(-lO-%IBg1uV*_k2KarEnc z96@?#&vL{yq__Vjrf%PN`rKQG^R;)mdwO~jb(!O0V=*8T|=Etu4=VZyeBk?~Dlp2e&0n9!o4@9*DVnCK}DT-{yC z8-UHP$xL14*;V9N+udz?>QszQ2Cs~RMO)gjt6yIn)7IWbq|P-fonBbT(9fiyp~*7J z&B@9deL4R0M8coZ(HdRsobsigT%v&yJ7wFtx*o;F{Tdv6l9hGU%Ifvy_NPyu3fneD z6pFh1{1l(A9p|$${zg?*HA5$n^LDx43xr3Nhs4Jl-0dTGQd0PWn2wKcPP?tbSN-7w z4K3}UDUqN?`N$G+=l|~z`u{N?v-E7DKk22E+-NN=B_$;-y@fj9!-u1`MS3E(SFW(+ zKF!X~PMh`Z&(6xy7Kl1@_^`9H^WnpXj|ndN(h}HN^Bg%?8yg#s3(w4re#5^T8X6W9 z6kw8Y;FF?k;f>agGLI=1#yK}Vt+c)V$hjuD?E#73_Wpn6ZSXfwmd_3ZU zy3!Vc#WQ~k_Ge$ee*M*){A+j^D_9~P+vV)()AP;B8y2}&Y;4#$IJWXCbyZYUw6X{o+5V{Mt5 znohN!uL_y7-p3Gn%FImcu844qJ7QOKG>ceVR$7{RM*H~K7(4w3gp=LJA6VGfbWBVW z2)F10`Ue-i)(2}N4jwpg;NU_0->lpxJ1y;y^(S@{lOMXuYo9-SupaBkcak$*jo~%F za^(sJ4oBh+I_iM7wl-{1(N8Sfwk=HbinG#TY;Z5QLf@Z1e};wxckxmX{KSbNj28~X zXKu82bUZ`O7#l0OP;+Z)u(r6k_*AAr$(AuHfhe2HL|_6D+Pf80ZgQ+sj0xzj7>qLek9dxEG%N?W#>+w zOifQe>^?p8yFJ6x-5mkB-?W<#A=>@Ql|PLM((Uc--)##sGk-slb)=-C>h9{QjyuwS z^S^GKS#~D!>>|VM784WOvj_3*V`rzw_m>wBUTRt$iqQ?JGfeGh&A{TESz1L#22|KB zEPPa5ot=}jDM4CXM#gQRI;>;Qb%Nh}E*8&Hp3k2@pDS^jn*H6rw6uf`fnYE{GsE?f zR$o&vI+T3WF7*1E>*+Zld+amH*(vYx3Bd-lne08!ap!1N@lirW-}?;_3Ny?ZZd~K; z&Z0}rN%b+ie+><>va*hSe|1SDgS3O5#&Yy)_QK!phNdQ<4JC)Zin^#>fq{X%N?tcz zFBAM!53vUJf6X>&e|o~v(b02dejMq2@4kHzJc?HkiCs-!HdI#fA2@*YIQ=WxS4>k-an>Vf=9&$a-6a<=@7P^oZFGO8k7Z9ZQgRmTMN91Vz{P`_c(`i*hLuU*W2432} zXK2dp)YIefGS>%?fhTje5ctlk39($I9Mtj{&p2fT7Kv%ai76xm3yxgW@YW`*}jKT zTKdiG?Ci|1=H0t@M{pec`964yn-Mi3l715-4MXV6_e;t9&C9%MZf$Q)Ru0|LFBGoA zw>oOR;W0H(lacK6&gpksWMrg2BLk68ktkrEvy3?~H}6Gk!Bi|RFAtdBec#s7viLhA z`KeCgJFlhb7cX953h{vfQ&vLC25Vuhp0V+t-@h|oym*wDxHXe+laG%Na=swTWxxw; zZnrop!iaJWeMp?RxlWQo{Lv@f@0_m=)kV3*9W;3G-~sX?FrmAL$AS3cB@GmW*%(S1 zH4P072Zy|}G9T=~U+1${`hmBoDOIsYkrpQJ%9f0?c0v;-?2nlh(WlaB$H<^ zUfd1~JDX?WpOEmw(@12`9$BwNhqz!;YD;5dc4_IFTI1jDVxuERhOr+8gydbuDzv8O z=gV9tdVajISKe54r1V$eWBS>gEME4DPf@WOSJD)`){*Dm{0m{+?nhxMMqb+YPEU${ zCXT9*ty>rv{*H`98oVM9GXGbAR+d|~tfi**Tuz7uP6-^{TZF5B7FBnbpz1oLX= z=H}Sg*bwHQ+yTsW>dAYBX*?3n=8O<~)@Nz@Vo(eAaqkZcN5_M)TEL1udFR!B4dXU7 zh~9Hwj0S`pe>EK;t}Tz|CK|~*|M@&U+)$&-#mV`zG2vSRW1FYAc^S4|*E{E|G){G5 z&4&+K&kqw+npyX=CnF6042ZEcxS<1 zp;ntiwngRT8~68{$d$iyn;K{`*&IyT8>_{~q@}H$FN9<}_~Fjz`cn!D3Z$RXCDBqr zRyw4$9e~17U;n|(V7b}~>sK3#gAsXo634bOWj}x3S(RoDyli!#$anw7(ywG7`$S30 zqYH6&d6mns`D6DP>1Tfao5B0|@nb-BJXIJ|XTD`^*tXqdfRpR9{5yk~+Mc0nscYX` zmYtI$#u8Sojlo`BTPrTljEHDP>J_a#_<@>|UsO~SakikqP~VP{kfxRz(BEGbdQQys z>x;8)JEpH=WT`1D>+3TcrhV6!P*mK;Jeis*;%i9IyLW4>Uh(u?NmUC4e)Cs!c5#`X zoLp@7UEgJPzVr3v4<9}t;8Zr-mUpx#qg5 zdUtoq3itq|eCg@&X?i3(AXMx!{?2m&frD2_C|yYV{-j&Ut4qy@`fS^`V~2~K$xC_n zZuw*EzV!5T1)k=vt`bkh<>h5!YO(8t*5pA>4vx!Lua*(N6A$IVvghbzsq8;L9tmZlxa!;&=CfbkZOihvIVq;@#w5v5@nv#{v z9r{T@dYYO<(g-TL^3~}E&*Cjaek_2X7LDAp@qNg&?1EH;q9=NlD(U)MTwK8<{+?j3 zol%;s02*u`YHM9(l$u&vB5N)=ej zwyKYA+CoJ|b@`An0s*kV1Xr!x)~2SWfhSiGgIVcrG>5SPF`MR_B)nF-)09ZG%<^$XWAPT?d>s*)~-F>8}mQxJv=;2 
zUAjJg{D>K`&Oejbe8}rFvIPb!->pK_^qs-kvqjo}oXDV%FGxW_!DB;=cl(YV(qIjM z*Ebh#CPY#O)6rmJQCmEcx&FAQ=ft=nlG@dE&UWeL#RheOlShxz%pcmb=XX|a*~j{NS@&tHUyX=Vx9x89RrsTjo}WIE zY48W1l{3FDXWNl?vEe>ogSSo0%EH99F!jw0+C85?v!c*rVR@XErq?S;2%!wn6#TQ+ zi3)+3s=9nkjCt$Uttj{Kly~pm{ZM>AVsdV-;PvZOq_9abYW^T>!L9W4I*D>ZnKVdA zNsk_#GcX8B{K~@zqK1oWOZ#O4ANQ^Kt9P~1lN3GYD0_S~I3f8@v!MSHGo z+Z@L`4~Ll(KQkiN*VNQ>+dmzB6BHDLh@7b8Z7h&UAVeNGvV|t7vzD`lhI3Vn%4tuZBHL-DRbu!MJwaKNh3+yU z5Y9gJtMp^Kb@L{XNPHmcXl-oF*;6oE{q^g7D6fx<%NV=arpcvMT-@}nLuE(D8G)@m zD9oCho4&rCjQPC8G#^1_evhpLU(ikU0e>o5Jip7@^4#^o59w)XeWp~ zM4C^4qREP$Y&<+k4<5*XQ23H%r0rn#Fm(`v&d<;P^XE_B)v4MD<@EiWxMNXLLDndmg>AgI=xah_c5$byRr4AnxCJ|LAHcW4s_X-dm zmQ1qmy6ib`YQF&v<4QmNn0k!eJ`s_d;o-Yj!nW)XLkZYj=Iwd@JbzDm{p0uV-;e)z zgE+LZxS02S?CsmP0|NthxTN((SWC322=upd4&L0#tuAnVeRT<_Pki4#dB>l}k01Y3 zzOiPHui5g`vn66&0jwT8SX^Cl+_mco5ME>?)2UOZN=7LO^m0eI)N^}LI+S`Yppp?c zeK-5Fk;~C>hU1|1@nj`$pf+P;V>hGSLMU&5%m8LfeAYZJU;Yg^{R^{(qQ%wK)rWPVsbzYhtPLQ_eQT929UxBGIGc!Xim+UniHV~|S z>QsAGDC0zTaood)%ai?Tsd^(LBkZiKhmc5hm6og|tX-%GbuuH8vuQ}3NOPFO{rkr; zQCeDBsK&MlA9uKZ{rZg?FZ1(Ho;+Du{6Xoey}h`osMqS^bySL|C&PD$2a!_aeYQ=>IZ z33gl(=X~j+b-!WwK>R0UouOaf^@(w#p_bcIL;vzkGQBOjGvJjP~D7?@+G;P?nXI`I8Td zJ4d-n_Q|^|n4yTq-62b1GTi@Fb>Mhn>R*s~CoFt??GCrRl!OG6_DdinGBPyEHA_5r zi6M=WkaCb1vd$N!Nlz14X(4(hk6_Z)*||JDtSuZ9Hhvd{HhFqZp7d{Iy~Cm8B?BG- z4+_u-tcHY(zkL3jrYB-`;H;V&l?l1h#ijW_x3=>4l%%Hz*A5~9NhP3;{B$AzVM4-p zTNf-b+^?x=;-cy2FJDd^KR)90?#Yuutkg!7iZ~h@abkI2>Z?~rQMKhFGGNt!#9W@L zIrKR2r;pecAvYtOjRK$TQ3*}S7!-si^@-g+*E`P_z$Ipqq z$OkA5U`P7yqId6-k8;YW|k$;&JItg=tsA`n=p zD*4o~W*i+~iSlawIgU^T5~4Xt5g^jh*}1#W&cxGGzUK%9L68uj8iHzla>=V`MDk|q z?^*m{2{fwg(}hF~;*>lz>>@-Oj(~=i=TyMyr?C&?;xsbaGmgizaB_CGwFRJ>_glv_ z@M9W?{7Zi%Z^l1-*aP-|pe7uN4CgzctzCuTQBhWvsAG~HmvmxdWxbc2JTWpNgG2*Z z34X0)A0=UygF28tYLqANfxo}%g$oxDwe>I2xFefzGG0h!DA!$X(bkRt?ttfQ6#je}Z2{JS-U zo*VhYH+O3NxeaZh``sMaHxvs%l31kuQ*O~$6y)SOtE)j@>~avM@>3!Yvs!j7pY^3n zaZyqEAf_fK9WGt^m7$x=P|}DW;}a5mmxkj|Q3I9mx&;#Gp_jdU{NxGrB)eyG|89I{ zdKzUvSRWdt@n&36+OG5UjbW|5u^!OAj(yo&p+~6X+n2{Y8)7SAfgp;SlAD9WeXK)r z_wEdh2%fpz#Q$?`_%fX`r2-?gN$Tf(i^9Fn98%-W0BQ8}^j%$DQrEgVIy*bR9R(IH zKqwoVTAmx-!}>95mxiF}9s=E#0M*kf5via_U!mF|^M4>4z^r+9-me|x22EF4NoGrk z)Tmy>;yl49y|M3Q&@Ui}uoG#{QYeOq5vkt;%7fP(8yjOHAqXB!X#Cq40`}6R(3aWx z#{Vd*RK++5n<(&%2tBBt+S=Y1Gkw2cWAotNy&xu(ES#WffIK|S%tcsvxrT5Bz&_L9MctV}@h-4idF~qFx{~p(|c~Wf1Gl!)Z1D^8duV3Ql-cG)1R+^ca z3DiDG5y|(TK`ukKMr{DmOI%FsgEf6{&u!kiHMod_u*gKDiLYpPI(zlMdjT%w-;0gi!%Ei^EFuH!$|Zdbl(wX#mUK8+_RDRLQYSx*C$H?g0*qALSe7VGHmo8}t$Ixa<5v=^7gn^^; zELWPvR``0dF&Igjq>R2{8>+FvFHDGjns^Ys-hpr==pFR*~j=>eHt#Kn6(q_wL{C z{QkZ2cB%ytIe)W*Vf2gZuCC2ZO=hpG8?h{rxXP4KN-A2A2vw{Fp}`vBRq?EHSQ`X)1K6jT*p zS)#9Oo28+qTDLer!-08wbFF(Bs|QjW7C9(^NjHU(@Ms0PVA7LTnvOcMsOmal7WO%i zL-s}*wFy-zb)}#3*fTLq04Q-&crHLBysxTCOiV<5bqgDio{dPesv4ncW^pj&=kW0F zkr5PnA{DQd0iVr%mM&t$0#YEH_H=gc^$BYj=238y5EQKb{@s0f_BY@iF*PWN8q-bb zpXc~X$F=|whbgQYmF8IyR&Z9JRJz*Rw|RnAoLiYFHwVxE;SLu@dM7h;t6GCf;mu_M z3G$Jro8T9rb?lfb5rSUHmMBwVYDY&0V<{&W*N<=Z(}c~z?-5|ZsDOEg^l`bYfErcI z=54|+j0@;Ifg-g8qN;uC4YN*@ht_j!d^|fhx3{ZHlqKvgkK!#R)cPOHoKvY+Geg9q$EEy%(iHcM*Nb^+_ZYkjx^D10#$*w&Q(1O)TA z7n`U;>a9H7-J>+qwsJ`Ud})^ZQQW$HTb-!ryMC^ux~+|6Mcew~MduA>_qjiYkjh$H z&tJGOFfkEf_Y&L)cH;Fd1Bo(HOdkK62Vpu?MS@;WdH#ok$Y$UFWDqT_$qiD|gZOwb z7UbFu#A(5sH&9)qdbZDP{c6a4_{u%Eutn{~k=9)p7%}-R=H}*Pc7TlKu+gBeM%3=x zyZ45J!$^-CZCgOh>fxK6*z+f2L*oY)F zekgv{f>{AX}BaxehN$Pz$5-D&KOip8!b5`ucjsH}qE2$3F)uk4TDy zp!mVQ$~Jk^^y7!Th{yO0DP}*GP1<#Vj@8cC#C|@gWKX)Qo>;PfcQX^gebmS z=MnC7q-aH44F*`RmI5>!^s~N=MLH65eX@Yq4Fi1eSRR}PM zkhXSqxa>$@U!HX%r*OJ{{-fO$Nk)cn1X>EG7$)di(a+B$L0Qii+Nl8;PkX%DLs7 
zM5Lu%BR2&5Hkj1CIb>N91af>GMFJS778bs>w#uTK>j|zC`^Q3Z?cBNAV>CE0z$SX8 zCBuH&!ZQC1ydX8!^ol;Ki;&YNbBOe&YaRA#5fr`>9DxI%?f%!&(C;(t^d@W~{Oa*T zaE3lU>v$ODm9c}$%F1JV`3P3Lh;My611a18D}0E}R5K%#>>&U0xnQvL^?_>O9@HHO zpa3r0xa24a@tZ3B4yNkWS-(oHO_|KQ?EAN;)a2yJ^LBRJJUk4p&J^Siyh5Rpsz+1| z;eY?-%N(REAinQ__8A$Mu>X)LCV80XWdiHjLpc8{iqNP@l4PyhL!^0BX+^IV^S_mT zC(oR@iTJtdOa3xXRb=pqm6@UXZ2Xfj>LPo(KmjW~Z3+zkf@2Q~^0(25bL z5GJl*6=U`2=WL}vi?dw4as_rVh&^sQC>@@prM>o;`vXve(irH+C0WuykOc~=f{YA% z(^0~2KZ^>Eu4;mWlbj+5DY({jgN6U`-k#LdRFGOoUFKd>5ExnLg0V_B1|yUw#kPqkksqDtk+KCy14R;f z4I>ny0zpYXbfc(48z=kSmJVfg@idW@hnlJhsk(*RU*$~xg(Iq}6tP;Rt`j^FwN>}G z1e3(#_f9x`K%UlPXh7P_k-#co?c# zw}$2L%o4VkpDa&BpFbXZmc7E$^qtclNy(F5Oqsg-2kn>Vf=Hp9&Z{%INKfiyFD}@^OEW3&;SR5fCA(YtKCr*@t6ULsAmy-j$ zTJkD$%e$tOY;I;&2EPRIp-=fq=ooPgcK%^S}ZOBW1W=b1AhSa{`|a)sGysInU|NB zmDLb%dR{itT^3|b&{CtLXGKq2TJ8$3VNtGFZy3G-PSstZ$6`V1(4j+hRxiOv29sJ5 zfUx0OTPt~PZfK|LUq18@Wq4d3#bSU(#Wviry?rPT)d)v0X$q9b>}*%5rz?mc>>;R^ z@v*Un#>RV@h*Gv44j@~>HzR&E#pQ$Ja_QN`$R!2;0|^2+cs~t|(Br~a;aYtD90V|u z(}DQg-i-OfiM?dkmHP6AJe^Wjo2dV#LP8aw&qXl7OxV_Pp9dD|zU%1o&xy!(%Z3)_Nxh>QCo zr-I7H;q9$h7?6|m7cz!{q2VSJb>V_~l9nKgV-{m}fOH+_x zcE)c3>u(!TTc12dG@n2|6SbkXW#$;fZpAYw%ofJK-z{0!$=4C?t8y^;%5F*RO0%cAC@} z4~qjK{NXhhhJXE1Mv(&C0@RC*-0}OOrKKeoR|0rU@V8hc^76&0t8-ME z;I)CIfW%#I1sbR8?R7Xj9~TtFY6>E&bar%DWLzR!Cg|xMGS7kc?Az=?Pc}Z(qRa%| za{$`KY6W5nibVJd46M4%aS(9-XJ4O6MtkhNKDy}$5Vjv&)&_(~sgdN(+I zgFXZHsK25SuR9L`lgPJ$<*|QELC3unYHX6t(U|i*s5K1DA0*a8kP@wSnhPpZ% z`~lY>*+bRX*o|4om`#dp)r^jcYDN73)r>6gw*LkZsRvyNKOT~n9yEP8OP=MB)3IB2 zGW>$(h2(hPzzvKiHXAe+2UpkHVlj!gV9aMxj2Iah*mk{@6c7l|$sn695Ubq232kMf zcMHghw(5ndg>8yk;9_I@V0FNCtb@XcW9uHc1i;$CSBE9k^XgJWQPC3UoDkQOfeZl@ zx(*`8?jt5|qM;E3&Hzw>zo6H5L(yp>MAX!@8%sOx{{7{Z74fq#1!ZK;C_GFhkt6jZ zulr{KXfdDgfN7mL0c<7n=nM^!AHfG2NGc~oDNIdiAnyH)HB0Mo3Viw@2J9?|J8L;cX8?FIP zux!hLF-oSaM~NP}JzStvq|63N2%N5gtyvPpa_)C@)R_^5OdEjTBBCL*kM zF$RH&|1UPESD}qtwrqiZo&c&bF|pL?w|-Cyq&vCX$fTq)o7Ts`KVn?-tIk)hU3;pZ z37v{d%66n==Z_NzzGIiQwHp9$j|sM+tohQ=Y+__2eeokJG)>g3wjh=ieAi2ol6qh~ zQ8uoCm5iBW?mZXlRu4*!nS|YT_170N2@0$CRC`EB2)5kZYTUh>;Hg$GUaSg*R8~Gdj3Ou@MYL-}h9j8+DWUh6+#i+TO zzxtKCEwn1N)u!WyukXfwlR_t-0_|XcSw>P_O-;J=c40w5z98_Yi#CF4JJIzY4rZHTPc^7T)EO&hcFNZ{{uBI5b_Dr+_`&fXT{@${`QLz!SGC2-0A2 zIV&fp@yU~|ZEd7<{oOSF^!D?{ylrhN;;7rwEJ}_vQ-oOK)3H3Ed7BSMZ1pP ze(Ur*+N~Z?0DiC5qW3^=uureUkATI7{N4~JKCzzeEe^3HNDp9(an$QI!CwUiA>fdF zwd05wTDrR5T3SH-Bd!G4{SA15qL)C?R zKs`ePgLk6J(9y|=xfYd4hMow#xKSR?YA0P>qzbEIuLHO!!P0{RP*qd24%mp91Q3D{ zDyaYMyrk#4T>8_eoj-mw#~;)vv4Ro zraJMu%p;z!OEwhXg0%08LV`hYW`{bj(RF3K>>mHEiHKs?-5G=T24gP47#W(|F`6`tSt1T(3d3k zs8Pw>6J!bVS(`HoYC$wVBETF>l00~D^yT}Ca-9J>2sg|mjM4bo<_9oB@daJ$e)naR zan=V0TMgpbGiS&QhN9C*q^97(2i|y`(9id`yip-m7N2%Qcd*kjd+haJZs-i+55ff@3XbQrFF1Al|aH zUT=5|F?Y(VlWOA9s58`wSP1iTrSI}yzFdY=8v}+&7n&IEa{c;w33d?5J?*SLN~c3=YWL7$XfC253*SXp?gnB)MtQ-rz>vk zN%J%3oyDgTxZjwR~lsu=NoPvVw-eXK&JCxUEPeDmLb!r>w z8JGjqdtinubojx+z5G7bRJ`Kvo0!Mdg-Qzw1xXV<4oXU9NX@`;I6CBzd95Yb;qMqL zD~gV80~XoCdMh+kLtmfeYbx0AV%UG7U6L(a83ZRy$oG$@qF99D_3zJ4152Qvu( zpufLgU;LCX-{tPULW@dDF#N#PVXwai0*!mkf4|gGUU~THS9Ja?SnsoS(gnCdH3WYU zVD&BYWxy+D5@x!i9S;iTJ1>U%Ofax%)qTtIS)>~1yXpe1V8dZDzkU0*(r1Q*Mg~U4 zg}J%IQ2C$%IXEmLH1&G!Fl#1}NZ7X?W^LbL1b~&x!aT>=#Dv_UgLJIw9Y^t})+Yg; zGLK||4ueiW5SHUl`LCh?1IR=a5CoS+@sG<|Nqtfe2IEXM!|EM`SL_0N3ya8y51;4c zC{5C&*P}dZYnz4}J$ibDZO0CAAh^C?EP@Tlt`!x1*n)uzY_?LKaI=C2#TNSd*qeSmmTMND>^RiZ^?~P4o<1#g@#DSDPUGJ@R2K8#%}si9CoBwWq*EibKSmUd zQnMo`&vI#t?c0|HWW8b$z+wO^E$TXi0@MR)_fMtz^07O>7LWjUE%XxjS^%I{pa*I# zb$gC<`_85)+&QX!{P^lXxcIVD&jr5+$A6-90*FnYzsrj#`9Ga}3~eSjr{(J66r!Oo zTKzmH9!I(|(ZBVtXScc^ACC^4(<|3a+=LD-$wqE4@B_GfN1@ 
z#SZ-e`T0{8ubcZFpxMg6?}hpbsvCk0{fFW{0AOi}+ze3^gLiM;x)l+jtD}QejNR^~ z#0w_2=F_Ja4Fww%S)&Pm!vGuUNLf$Lng0`->rXQ?!Z2AJ6 zw&+%BZft~~8MWUl)E~$?%aZj%kvleR+Jqocgnt6Sh2a-f3HmH$+0TaP@)$6^`jgn9&S;@}Q zGSKz1J{U~$dBpd|E)U0N)Dbyqys1QG>!eaadkHfs}eYQEm=sX?=ZtQW=xa+tF>; zwv&UMo(1(5=v^FOydFOEK=lfEiVAn@);oJnJ%x7?ji-YJ$G#K!(Q5^?hj?V$Tb9^p4c4p} zRw*3C#kvn9C_HYpRa?J$_3|Y+jLE)As<9*&$O>q4fpA|}TU-0-6a56`MP_O$rM|*T zHE_RG!Sne3JxuO!CZGi>IVy^oo10wqvxHe)y(*6t4+RhhZXY*UGHNJ-{RB9RVi*U! zeT~h{@C-dUdgq3#EBtrmpcN7mxuwQM$Vx8x_SMijg|vX9+WLeWsu6JXh>ob%(Kyuw zuQngkKd0IglxuhHG$OpiWYN#=Isw%u?%qB4jUh5AAew{YdGX@7APd-GS#&zNyMqF* zLTkwiYeot!*00+{Ns>xG5DymV`6|9$LEx0~cnfi}5GlbsA<4qJiFQ_0+1UM-A8zNr zetif~@4x|pnEFMw2zj1}1uRb79Pl^-7}?;CUg@|vzmrzIGz|!bAB^GCL?wn}PDDb$hf;m@|=1#P|f|Gi7^2zay+Pb=% zN1yEJ*^vpR2T}(Z5v(3i<2e2sLH6QB^;3UX&Txx6VF|7d+lJxKuWf0JhV{TJ?~G{h z&g>oZ)WIYUsOzA@<;7oQW>&g{U%;&3Ea);d@jouyZgU81BM#0x)*v!-b9K}8A@IO# zl)r)E1}Se~D!vDC=gdFeb-ouZ3xEhnJxHloRQLdyE$cLf#Wou~xM3GlVdy9?S7r-E zU2}+YYcJ#DNklG+-N~xraQETBDokyd#!2v9TmlEaZkub2;0yu}=ftm2OE-;XJ$q&$ zn~DL*%seI#gR2=TvCp%s&wi6 z;+DHFG>`!RQ~aFHQzcao*!(b=n?8`JQ+xWfk}4Xh@9ROBnAXJv&m1+>%CG2#*}mV_I0U{XQeGC;uR z%YYRXwF@6RSu6SXFXeqaZ(tCipmn%QQ78Vk)=mVn4#iF_%IPhCuXEAyv8QE&rL7%= zu2Hu~=H4Mz6$wo)(9c7h^m%gs^M`*A&u2%z-c|;ot?+8L(J0dJed~LkMz)`H!pjt< zt`DBOn2E4Vg~uhId24YPOjCX6QeI2NTMLk+$-Y2-3nutpx#;gv4{=JJV{zhS<)RLJ zU1`^>l2a+;p+@mPeyIELpC5=GDmlOZ{DII25mt&wb2-qFUL_|=QxKt%NK*qBrycp> z?fa@{BR7{d``pO)<&~C*}o*PKN{nKYsk{@O+Yc7)O6dCvWRql*1#YQS_lr* zhBh{Cz|ha1?*#<_h=czl&5?s}19i%qX9j!W!!4$hxCysxB4}dd1nk+o466f6=np0G zr|O9?5uX|SL1uN>&*WEzWd!*4f&7g=y|$DDNy`w3U&?6f`Nw*WEAT{U3X=EVvuAy0 zcnIE`Ldo}7mE!2?y3umxIi2Utj2OQ>TCd z!08U0Vf-Io+Y^s|F<V9~>9XJ(Ev)?F!WmEoe5!h1DWd{V^r=u*)OI&OfLf%s%^y zrq_eOPQ+~G<5yuQbO3w9$W#y1BXG|(uq-Vz9_=}8i}@QI9SvtfCUJ<}8|msW^XdZb zBncKVI%-M~$+)GGs}vS07DIM5G=U@DdT_9Y*Ico+waPy;K#Oe*b`>%TG96-R?I60g z-@JK)j`DvIC-j|ZcIEJmADx}mx>y0BTe+U5&2H?1T*JY2Q0<=s{H%#d0yL@1mrshY z0`6kHA#dO}SQ|LTD5OqjL4-XwN#{VJNK+%_6qt%0XenUOk0uiN>qrW{&ecPxTsl)+euhx&z)NZzWej%%9tT$)V3LL6Fn0+ z+TqRQWM^-&dEKNuN&AH`VT$MtzeVxHYg&vVh5;emkx9vC^=(N>394d#tRdLP@LV5O zBLMi(^+2o7#jcO6P$S%1+0xPR0D2DkMQ|h}te{_H?7?k}SYi3h!or)EFCVx6##0Ua zAhISQ36p8h2%ho3e|v`0QNN;R8PHW!3BC#E>7ipULRHiCQ7xerV}dNy^i^0$q!>{s zQG3weI?)v1-)N17P-~Dw|8yz~4OueJjgO6wmqIy)D+NA%1eub3u_sX(;~;l#mP}&p zoSE1wExmxeg=UX)Vn%1reg{YZl>zP^+7Yb6Iz|M^y|K4U9We(XuKQJ{hxph(HZ)B1 zmTy3hfUpGv+h%G?04-1sFe{xex6`=I?(ceMn#|&XnSCT>$2otm`4`Bi;Z=qT#ks>KFaqrC?8hI; zNXp510O#|TuF_~GgM`9I@FCL8dMgYF4lrZi&`OJy-l6T_pxghSH1t;D*vWob!K0vG zL9ZV4n6*I(h6<3Dh6cK%k)a{@6|lhc`l|CV&%*^&TT_D;6G0)NJDi8Fpvomr!hwq) zAsb@2r^J}(+3?zloE!#4nj^t#5-5VItJUG9ghUTv#~UUfNH!j@>Y(_B<2%iqkzEEs z16(sM`VA)*6dV~HB|EPHzHqy!s2~E;Y@&}#t4QOdM;t^Mz;*bFfrEoYFsaC64!I2V zN(%PDp$0-A#CK;Hb^*V=@Hvd(wcaxF8&XiJg1Ek^E>Mz16-~YHc8|oKO=`F&+{JGv z4|VlwG{c#Ja);XkR&{3{Z3YoEh9;sL5*5TMkUemqd7ecjd}7e}#G(4JPz{dk$y4nW z$NLnpyTJY}SNKyw-LNPhREL5Of!TeF6(dZto{cVBQPdUF?SseedkVP01cS2=(bpd;3hdc-e>ot8Ao3 zi>01L;C6a}C~|i!>=O1@uflDr;I$}myOl<1X($G%8!>4LIthMD7KeMb`>zM`1X6O{ zvp|7;2EPM0QU(qbf#Lg)AB-Q039@jbqG!P#U08qr`oQCjuKqP@Jf%cEEmcJA79z3Y zngw3ZK#+31LygwA1iVHB9pXQJ6u*6o6pYo2o?3Z%dHjXapD)R#Uk!rZ;K&H{jo*EJ zOxw1#dk149U?g4PcBDfCQeLY`?^Wz7R9kSnFpDpN2Sb9xj`vr2Am{uX^%Gt*@%TtM z-Igt$OXtyiC}Z0pfnN&8tgWshu=kIRDWDjG;M&8QXIaZIWo_}1fHd)Pgjk)X_h%SlkFT0OC4RWGb_K53W2fXY$B zv&Q(O$!Lzj)h}{6nyvdhu4;2<-#oKGn%YaZm*GC;CTgp_yUuTmUR{^@wkpuz$H9~| z!Rr+rv!FN;Lzsw}DZ4aD#$fyQ@85o~P6>+wYZ6T|$>25+jniHE_!JX8g|NF|;6z}B z8IP=j;++p**YP%2_H5CY9E9RkKuh3Gxg zDE&SfJb^$E-izhh+K8-4!rO>+Va~vdgTBt}-h-AUW)$Ce)q$dY2-7lNsa5PbhkoU; zaq|O9$~|phaI@?)KLRo-s0CP0X;P^;*S}#HePZWd{+1;u 
zY@~zefK5vXG`1@74zIGA(*lfkpc`7FaSH4Qywm3<@1N(uJcnD5!Vt%4ywBZwlz zRDc7eNVpx$@pzKaXK|BSxahd_EY~ zv~T?{vf;u@P4(xUS@Ob)A!~CzQ=p~+LGF?bwa)7vKVZbMkZ^DZH~H}JRdw@mrhcew z%%s16|MGe~>HZ4T!A>876!3te@0fn@d7KHEo6B)1*HTS^FBKrn-y6WFUkN1vuX`I30hZdR`B9J3<*2-`_7%@*C#>qKFCjvT28-m(Mlk19Hslb(L@SKz?cFXWb!?N@lw2+bFSC6wmP$EON12ygW$ zEM$Ar2a0rvn>ljlX6qyfdC0DF&)-LcE_Yt&fa3!12|A*UM*w)1vqiM}>h4Sd0uElf z1*!L#{!RRcAcczAn}?qY)H5((CD{POT=+UUP_rc{5g~_=DFfmEJ03ggVDWBI8#(zm%F5aeydPi-TDY(Q6RxDwA^c#x+S9dHD>y(Md{5 zD^F(+`5gp6;26sz@Hy!JYUqNb!u@wRN#=S$!|=c<$6Qe~Ggk&|+Prx)w(7>S5$5-! zL#uf4%!O)M?8= zE4Yc#XMS$(2oWauI{BS^lF`TMlYN%=gsIEj7`jP4dM^YNFu5}vrlI#Nt8?-7R#0Zb zuGRmVfvU^R!XjWSWBCES$K*GPp{noUSq2uzg`w{()wv&?;wv-D_P%4U3ig<8!59rS z$<4el)}vh|uDGvX8=gMiZO>&uf(q*<(a+dym2tamqN$0#d2K5zK_16l&kG7LIH$$( z{UiA6t5cpld4<;#!QKrGNjtBTr|;{rT(N5)LlM65azMOj5(in^N1Q9edjUvwumzTt znaw((yZ0E91H-QqbivbjO$7NddM=MJ@bsNNdv*^Kk%=tKVYor&;aoV}5F# zKr7IBu0he}_p22}0KBS3Cr?hz&!ZQpqVX?Wp_oS9_yZR)deaQsW|ORP4N(%qGf?#Q zt%J-XvoBq=$Vn#f#ry1lmU2Q%Yq?Jh7CvC@@yX3Z&slK$wERwZL14)ggm`FYsEQE> z1&xo9NC7BlNH?|8|8^E+g;qh7r<0ErGkteRQIR`ta6%97B|@bL!UgZM0%2Vj$)9-j zW<*4?q$N#GB?tsab7+yptI`n4(1JeTt+Qc|o;>_UG47_E(+E?TItznqWC-LEye9{| z(0mC;fG7Q08vp~s(9|HRN7G`L6Y&QiPJP1LW6*tr)iieFg-E32QiK;9ks3`dSOKVu zsHW^Mz<3D({asQi#T z~>_cvlKsQ(t>}#O^$C z$_PEQSlN(>PH!b8AJA1V$L^Wx=}p4hJ)fmSro}O<j{NU$K+3+J@;WgVI$wP;*cz%*Q=f*nvhlhiU*XW7ht#O8kyLayoGpAo)AAf~^ zfMx`@8l3LO3aw#UM9(PNps{aYsX%hCc-%|H`I?Ob~i>O~(o4S*94MUU6iqqH|t)6>6ztdwt}I3h4N zarOUd?99Wl+WR&BplwQsNXk@-X1t9EnWEb2)kK>}iY8Rg-i8#S6b+)#h<0=BCd!x$ z4WvTTTOp*Pc4aIY$CqCF=9QnR2B0{$OPnliPw}$<>Ef2Xnzrf6` zn}SXlr{&~moBg4X7`bQ-_kx-bpS#!JDs!}8 zD8(gWn!-ac(Zpo&)m2Yu4S;U$6^R}`t*#D55MNMW)wgeK?dI=e$Bg+v^NHRaeMe%T zuT>HGyUa{kcd3?-%6f&fVNa;g{a2|uGt~n0?2(qi%$rdd` zK5Dm?8UK2(7hnS4Xni!t5?lc#(J(_p*(+PT5Xi$*$32SjUw9`oVuG z8HzOSrpCQa(TL)^Oz-?_3JgK zlEeBBgKnD^8Jfa)07iu+EMNj9bcwREvWC_mxC$g|%ES&TzK#x>?tOvDsbPh}f)T!z zsj=7-WC0Ht|Ci=b%1Ll$@1x)6z=3dgZ0xxU7wl!Z33T7&3YgOkGQcMlXPd&y-X!kZ z=gpV)@fipwaKHdSG1-AV&_fRa^$`o)-P~v^>K;i=tAODhyIW*u5~mVr2sf0zm?0#U z;IODZlGEvNfYcacQ()ncz}dH&{$u{AC&Wljm$+ctC!uuT^N0J}aQ;|~z#ONeu>6qE zXPeNASm~>P-h%N0&`jpLFD>4N$<7~{2M3GXNolZ`;sOU)6-lr1Y65+Lkk1jL&VUIQ zHWKKSOP5-Bl)#1|fMU{eEB!iVib{h_qDsB-101BO$*5Tv{7FgSKjLiy?h7D$_~ouF zPMi;Bp>go*{K=&Npy3A|e11TwP*-zzoq#r-JjsNfuL(CP1V%BC*VBt0%^yCDY3`Je zHfq{`JZ^x#vDgvTJWXnGu}#J$su!M6)B&6v7YvA*gCqqqFmn9(f$r_7^T3&fW7Vpd zpIoQo&S!Xjs0@P!_hL>Ljvd>wakiX}Cq>$^srI_cVi(tg_gB#5L@7rX*RRiBGbtB| z49;vsBtY!qdIFwMBXT@>A{WMgt~_6}bT~9Jd=xYkDA18=GQJ*>FmjFl@4hl?UgilM zG1RR{5XPjWf5FHR8lkQJmJ%z&50%S1?p3u(VxniU2-YXeNs!#I?@TG*1q{VWmxYFV zEQjzlu;ROTG3mXtDm4nx1mjXJX>uZMke%v&uHDh2+D>w>Ql2wSK*n&{1!S=5>gq+f zKqVwR!a2F|-T|8tN;#HtEyncp(9XL;EvjRk^#jy{_ArO2z*$aFYt+wioC9tvX=xH+ zg)?LUK?Abg>ZPZ|ZvSL`>XBVfESC{1#@c~N0Hid)dL_f|XOo9Xn@=m{k)jdE#GG&Y z_U$}Mg5pRzR%Fb{w|)&(jFy`Abby@xwmn-7lD&;I+7ypB+wauq5B^E@KWI=k!aIIj zUe1!4M zSyne}PCateS0Rfk=u20OmEURtnn&dV4{*VvMGe6UkZ?tPM@#5QMx>rOBR5w17pWZ* zp`uS!A%q`_pj9hZg36dqor(`*%$GdL$U(#`^Ei$0tbh&mJhkvTfzmFc3?dRNTlObu zZoTwB{6}j$D5M4s(b7U{Le~Z*hlJv}YL#aH{t-S#eIivAHZL^4u6?J4kOAd)*02Fa zm*``7iUsF2Qm){+(%Qz|G`rQ$|nSB83Gj# zO51Kp2j^xL8#lMZ`(BBpw35oEk=_tH?gM)^c#9ITO;Nr?l8d2Ky-XCJhJ7wq13DSC z#IPtyNlD_2*Ro~7=122vq#piqJ$`_azO=4mA4!QhFVImhBYwfDWsWq%-u_xfMv2)3 z-LV1fuB}#HBIk%hebFu-HV%p2N|}Pd0p1F2sLcZD!8+SN=!KSc|2HrxiC=GY@^Gv zjsM{SY-l9nfj_p)9?aG4&=Rh$F7xOD)%IzjUCMLr4OCV(TK^jHugjSWY5k7d64+** zu#TO+myd3XI@FiyYF$78;^<1sZN8S-n7#eJStiezQz*P3`cPk*4SKG3+~0Mz3#U{n z=o)po3jfx|$-n9vlt%6Grrk86!FPTMk=!x8_x1gk zh;vd?>lk*Bx36So`doHV(l(C^bW2?wFw(L2B?Fg;O-GOZn-U$Jz~_#RD7#bN4SbCa zHCTiKcbj*k?6~DL+9GFfY@;x_Lf5-I5*ms;hqhQ1Uqc>f{8N 
zC?_xfA~*1&tzz2gmMPL%H%}P}zgvn0VCUNIwzBasRbrIom6Wi)_*t^5_23iRS3N(+ z8xFWjd4Z206C=wjD{f)AeZQe~%8=c8QWEhJJ0ui587>iA7{2%F=*ifj>5I2S>n4awy$xN5P((4U!I^tnYm2n6OF+3s{lII`z7@um>h%6!Ns+`CYGKK_lq0XAWDjRe}ecYipGzJMc*U#?EmGrR**6 zvG~{{&W;&JHykSjW7t70I`48B5+2T|*DuMKk~)X(l}(BtAkAkc2h=|8=n#WCf{-9s zx3jgyR%6|l;R&TcH6$X=WD3|A*Mq<>KrburmR(dNJ!_Wl#P$~tUm9U@f zDa^{SEr%1ztfhIu@LpF%1QN4MjYr8Q2mdB4+?b1;BV`9q1Pnd34V{8SO_@4l#ZD=0 zB9cBq*HbU2Nj!6A%&uMiVs8ni{Y{g)7DhGsz9&nuY?1t8IO96QJhedVC_J$mZY{?i z@#LEaesTlKVbHiW)(KoqVE+l)1TPqmIK$m6D_afKh#)UKISi5mKXhyAOWRo|28u)_ z5=WDjIz7mxtmF_(3bFB%*CIRtD*C%-7a}A`F??H5 z(4PE!i%(TC1CQYMerNKzH(HCrPJu%L=c2$Q)b!!9rS{cQKXMH}xyvq~wQal4^%S)? z6+Y3wCE6ho?zdx{X*V`}l)!wbnNj<--Vc?j%m0fPUuqI7dA?n1C%HpHw66Yb5wZS6 zMpH+)kRntZ6hs}nF(;#7^z$3HM6B7lcB?AN*t0P)@@_Mo6*#}9zDz!=GX-~QI*cs; z)sBuk@*3g|nl7K^ff7{&235YJUxa)LxK;>a{VLW;Ntcr-3g?^OIacXQPGv=QQeLAM z=T=d>x9y(w)UzdBcuBZ7=gE68GpBtje443ta+b4(QSK;F{NtrBpVEC1^D#tT1wKV6 zZQJcsL<0(r4zw>Et5;7Q;sl3c-^h8~?|JU-_v-_&&qB)<5*#fT(oaQ-%_A9N|4nP9(9a)hTH}v@-cz;?5H`GcN&@7D9aQt3pn2YJg4J{{k=1+R6Au|Z+mR{j zB=K@dN-}89(if;G?uiKw$MM35D-HP7%W4bPVP_LKo~S0DqZ{H=R}oe$T*u zoCo_MB&{>Da^{Nb&YL#Iv?h}Bp>#yvOL{802#QUuyGV_rva){beY-2Sv3}5*EJ6Nw*kPFpkIhE-6jRQNFeZa(y zC=Ip4`ZTG7CK#y?62;mZmSe{3EN?6`d$}7|69Ctikx6A+|8|h#Q51dao7W_Ul!N?( z845KLGG~y@O=e(*V&tnpo&L>Fuw}h^b+}4TpoQLE2Z$PN{p3Owgw;Hrk*(kf z2Xw_mm60p9;Um8-XWx(>Iy*so;a}EA?)zVmc2Jr!WBPhq%7SPp`Zny}@I3wNAAUnH zABN-&{M~V`8v7+T8W6#w11ZmyykZHRUszS$^HKT%BUQh7o^q!F*^98{#Iap+UyHgI zQZOf#anFza&x=_KN`G%dH;)wmlFNBT_vF{8_!I;bo{7gBt-f7`MxI*XKe7E^OSpAc zT&;|=Jm8aH#fPEGDF(DoS3%1e6;p zMi)1CcOD-^l9Gn?F!phCuvP_6kn<=z=(>Q#j=UXrYgX z_i^?t7A>g5TIQG@xp4BN{hyWa;ogZ&ci9Vs68dds<;t?J^43+179zzS9!yF&CN#5{ zI1p}Et%{mFm{#P#?6>V-=>Q0uL|CT5mH~BEyu*Dn#spmkn<3~F*9YX7A3`}!iqhQ0 zC$)TC9OR5-kz~QZ1G*_j*QZa<28p5x0Le}lgh)1rl=_~vaTS1Z=TWR0;u9r|eCv|_i@j-%7r_qE7AQU$Z zCQ3&x_4MR#c?;Y#Z|0rUpXLl7TaTU=1|ElTW@G!;q7}^a|GSIs?VRmsmg%hUz`Ue} z431l|s}}OHa7>z{ZQ||Fcp!B^Z-_{RdQ6swS%bFpwh}gNEo;3cBMQ7tT3uJ7T7~tKxf@IBQ%AMnRKX|;)HIOgVzH)4vnI0gMu<*cJif=&#NG+)4KY~7>dx9 zQ~T3R#z3rB7l=wu9AtEC$(C7k+^|P{{rzFR z-EkyL3*gAJ*37V>9mVfzB%1b9LzEw!I7>RL6t22J7jkmSM|F$E=`MRv_Vw%5F^l-p z+G=dX)>|S^U7i&#tnk9#1BeS>0}6k^WCK4l*m0w82EoLU=*pO1 zFkFPfKvfT3^0lk$Jn$*&OXwPQoRnzV4VeQNBp}t1b;|@3*Cwnx0zbYuk(O5f_upoq zg=9ZMO}72Fp43|+Y4epC8fVX)TAN z4{{1kdLtAMY!#UkJb(7gp~bMmOH}tKa}>5d;)H=bqiu&`mQc@(3b|=OvB)8k^n`yF z2H>S{)(e)4IyyRPYWQ4_K>fn|jCHwGKuUMT~M#sdnXjrU?fymQrxsCh976SeZpGC9C+3ThG8E8Rb=C2FzL|P$Ukko zM3I$z8Q@4jg2s;@j~qI*k*^7xq{Pfnl;i2;)rN67rhXJXKfc%YK$Lx!?lT88c)pT6 zjn+7FF8IQCO(&*ZgWolwr*;r@hlsK#zeMORj(-2Bs^1xtt|QF^c=1dSYGf5#EfgQ5tl5G%PpgWoC9V z|EECR_&pI5CYLdZ>FMo|I6zKa;^I4@ef_(he{ zS%2RNFUYY-Xh>P4$jv{$`mEG)6cva%AzQ_?Cxx=#7ul1yre?jXi(We5|0eQmpI6S9 zrDeUT*4>2ocoqrd8*(zay}vd43KR<)(LsDs(aZ2D*RFJ@O$IfDCLE(S+P189ePtCDdEfzERNC+72e;pbfQHa%;zYG1xOoHtHd-6AB+o@!qAoq1FyGR|x<{Ib z7?q2x(rANkyp(m*A^-gTiRhQq^aLx2SU+{N-^DK17nbS9C7-{$ZW`Vdv1%F0_?C2Z|hBC5@Od#R@3Ow%S;Xs(*gW-c0xI|ryfC1bEly` z9y18tC?t;K7ek)+r33aA%+(0?t)re_5yL`aD?q{Pq47RQMqyLr4tO9q5(5rrUb}X8 z47aj^%)%@&^hJG!FmAi?q4NXdj1}q6x!1><_m(#@0Puf)#ZFxI;FO(oBr1Z@no5y{ zSs?OOy7>0Hq_6l@DYs#HLv>A!rAC^SZo$9K10kIn=do+go}@oS?oY$DriBd9>)^@V zu38M-)8D#lL4+0l4}?1=gqt^8jAQh|28*9Un^@6;?U~K~eZ(u!uFfC*qw z(9Y0T@yH9(OOMNW{AT4jA2LqCq7|n4+o;j?qF_E$*mMWUM?r{P=+efwo)zTGbP_uD zMmDz#%w--wEQ%c-$2cAC^+$)X_&M)Dhh)?&XiqR4CO{2@aWc5ssXfx?iQF+B4Mr_fis#@IG-g=Ju3eEyOqMBcGbMp;?eq{}{ z)I4WrVOJ6FQuz1(Hgz8Abee@FtA`VM@g(PdRxUP4DMksUjJZcY_uNq7*o>tU`DBWxWL-Bmer=KT5lsBv2c)(p~@HqzK+WQ-KB z^u&SM=@(UOsPhTXQ9U2ER3b^ECr_?u{KiS*VF^KV9a2AT6Q*;JKQ%p!ZveI%q=XI) z`IZ&dlYjBz#g;bB-d!&-u$ezgq8)Pg_iU2$a*-40|GqnkfD%$9^P_v35Bu 
z5a7UHm94Q{IeX>|)QA4EeK5Mg<^v6&;6&2V@u96EsI(mZRp+F)dy!E}%nkbwd*-@)Gr4xU z?5{X+M2}C?8nZYZ*GtNv35^1ec#?Hzu*v4W5!kyv z@6?{AsYg^=v~c0JSyW(neL$NI^tjhL*E?H(8)n({TKmR*)-x^;O-}8CQAzR+tlebL zY!xy+W8(>{*UruuW(x#srbnADshEfFF7^KMrHSv9k)B?KI2dio&F2>Qf8M$^+{8ph z>6G%57cZdMjs7UgA(~%XTD*-i^8#u8$Mt~4>wo`UZL^r$MZ<k-fuL}M!SF8%ryft@8PekMrv z(pv_ER_6Hxg|chk2`gPISvKDSUjOHr$HK_??b}ytzkI4$?c*ctJ0gC0#6e%!VE{a@ zs>)1HFXB#vUKE0AN~vi`{LN);&Pvo3%UQ~NfQtrBN!L7W-K#zas$;MEfO`IK`iz5v zLtt**gR3uJzI>(grh&;WwJYPSUsEjHixza|)JR%c4trbDh*khdhQfJJ)Z;=1-|FMs zw)Zr!MccuB`yN+mnpTJAZ)jAsZURs5TFIuiWR?KIQWk)kzJwgXtm?tNdp|ti@Q8Jd zkWd`oONcSQ>uo{@F-BPyti@hmC0b$7L-%Q1AHhi6WO)qvj<%TKBUow&mmHcPnS}*k zzTHk`!MIwe8zd!ZbhNd!YBJaPK=^xC;rqAS(Dw}2a0^MSSpb#MZ_OHW6e5bzkJ=Lh z_OM*NMo33kUE1{h4_L*_SFizCIb--*bks|-*Pqy_bTA>I;qBWI4Uu2YiE?o0uweJH z=R;!48-eKsw;^QbukJNVc2b6y(vA_e!q~N@7wdHaNv;i&t$}p+F1QvdfqQyMRP z7E7oKc^b7H#tdAqlxoH`pmT;ou)pv!G8_UE-XpU%oIjiM@_9KMFU(U-M{?JM|1>< z?w#AW1-c2hJ!D2OpGlE~;c^mp1vAsbhj*Gip2-x)$bpd)`eVzW7AqSYp{gLGz>sHN z2^jib3j+kjb6O%-gt*jq`<-(L2&S-3a&y1a#IadPAWSUeSfH6Jhn%BJ^PCGIA(tKW z0PC-N7H$D>0D)7F{mkcMh6gNZobKt;prkQLjLV4scbg#Fn z6nI4(f6IO|E^cn2RM+HrH2Knz{gstl$o%ZcQDSClWEhZLO)bnuO=9R+oWA4i*=`s; zIXafYna19=>dm)J4;ed#_n^C@!WF1_jMK&92`S|m0JFEqPoPCw1_ai4GQHLsHg^II zvtYl>hItVYg&=J-B3%L}u$P*^z8Hyxo+ZOaN&fgNGU5{f9j<2lZ4Z;^;%wRp_K9Y2 zNT0)V&>B)i^HrSwLZU(jfj^Z2y(|Y<3wN7Ro%csOmvFmwfuJRky;!bya*2YfjOV4w zii+f3U3ogtCk+l-qWxS*ZEf~>^7ixlglMG~5qpa#?YTh%3v{XP_t7vSZzpEadm@a6 zUqn&Q4Pwun3f~FAGF{qTtEKCTiL+>(p|*TxDOK9ZiT1m?U4Uh+eX)5SB z1*c%7*2~X9{!7oq7`w7IUrBN$1tU!}E%O8V#aXj9er+2LfM;?5m~pvunRleUWhn;5guArEnMouRljG!(2DF}0mD z2S$vT36QXx*!DtC1)M2oM)4NpSGQsrclKWci|U1eg1M$$(=m2Ey0{wL0%9PzyI`>e z6$va9^P7HS1Uy?EN}<4*S!(?I`}s?kvOM5}7MgJz%S%fOeFm`}L1Nki5lwDhg;9C> z-d`gZ@->)g|2)C?SxyIuZ>Z3{dEMRJ{6JkuB?kTa_23(0gqgtuP8E+L4%2-bva4Al ztXSX=cwNW>h=PDY!u?K6jB$FA1{)tPM~#}GpX`;{0@R*4GqOPaG$X0P<#v$O6QLf*f@|mo?VZzP7_;1 zJoKFa16pxwkg>O*!pkzGc32J50}X@6vsTHO6bsllERbGN`tk2SK58d5#48SNX#TDxpS#9h_C|U`y9K?` z=)HI_7Oeg9lRaZZslNr~{o=N9(3k+2@<)qL>PVfa|YIsEt`Baxz$QdUDtZaPo-IO(t6GUn`rN zo72eW%696Ql$e+$P}aQ2aaB1_U{E1Nm{?B!SN;8+-K(psIjb*MQ&Z?^X=#DRxMBbI zp-V*e-;h8KVx$hN2&L98XLK= zkU=6jiNNw_tAbf8^xna;go!{WzUn44>@1F>{4h~tM%8piw{l94p!pdbT5sydQBZPn&g zjIvOp2N9tqbadK`^Hkvx5HQfuCCSm`(nxZ{R4M|ZUwz$&SkHlXb#C01sya`yunG#} zz+aR24D|Hu+}x$}CthA&0sJgT2|+o!a@bmE0jgazoJMh4eo@?QU;yYDn&>ZM$zyk$}UvvH9-f6(+OJ$ z`@hSUrT1DHX=*MMN<~8h!I4C7lw+czD#I-;5&J*szP?))!HcY-(5V40t?yoZxwCQ- z6K~2m=2#C74k{`tZeHHKJ(+Q$2eCRMaUb76c8FAAdpCp7QU?ReZ*;9JFUFMOvR_M) zE-EQ2`_hH}!&T&AX9e{392p}>Uy3dNAWVFUeO;4bhL&Kor0dPcVbIJ>x< zU995u*9VM=tgM#zsXR)^FBHMTkcFVW&bM3D|5!EoN+A*WSM{v53M~(^+00D+X#*~7 z!LMJxrt?J7722c1Y8$7&{vsrbc-e!0W?;x8#jltkLihjqlW(HSj0$UFY8qX)VcNUh z`y<~3SNrQDI473=-B+YoS178P4RrVL1l)EBsi`&m>;>~D-e9+lSw%`?fm)AwR$7BP8LXXeeBU}B#;f;I{vX98T;j| zz36aKQ(5-}d>QHJERT;z_wUGlQ#Ug5UvIU)xw%1aQagHDXOOVvcYZTjl}JXBPBX$F z)%2;^v~{*!pbpqUF)M_IJs`GyY-{JR2?lP~{A)|qefV|Cq=+>qZqj&`pNoF$ZSNXF zvgObGE@j!%NHx=Jtzl?rsG>4GF)`sBeR6VQzwx_4tw@@Vj;@!Q;sLw+VL$jXd(p^p z&(^h%{2|N{Sn26H_xZ#3i=@9k3~ty2{o_Z2=8Ld>+TE)q*njf*=|6o5+-S>>Cwp5~ zG7RmJplu`S@mwB_Uarj+X>{s{E>LX0YFMlCW_f#i6AJKrxQ^N>g?DRAmUBB33$9+Y zeHn6BZ`r$tXv>9x?)m+;{mvXq)!SW4AL5(UV89{-hxi!aX<;q`K?gD}H~#O(uK0lb zuRR3=>9#Z~KwyGBm&r$axWAa#v?I~I^GtOX&$t5Klk1f=qZWxc7VxZ zFJ{HrF@JuI$py+Ne=YSk&2PUq_mG(&CR;e;NrP=L9_K4+YzHI;J|(aZMrSNU zS#yUprA(!GF>4+lOEm^<4qILBC;PRb!3Lwtau}rZPE!H4K3uO7^F-QBS=M7=*w~XP z)?B>gV^`#wo5E&1Y%Ptg|09i@JodFkj8d8{{kD0b@Nx^1Hwx$M92^MrBU4k*0JuAI z)~uCwrfr|o-2(&~cV=Si5~E(`NgXNu;lF-^J@e_cM#9~WFDsAuy%a~q+HXHM75A^- zb8*ACSm%^Fl{bGpZ=_bGD_`opX6Y|DTPew03JW!DZo6R9s~P%|!EbY|Y%NVq6%7AE 
zs>@W=m~t3!^?ZGdC_&`?sRE=#>{!8inO=DEt~hs+55Xa z{<)(W!hHtUfr1o4cceJ|`o-_n+Xm5=~h(6Fr-E2_6C5)k!zN`G`aS@g+2 zgD0uznTO%|`U%@?=L`r`6pp3if66gdfi!-r{yv>6gq{Y|`Z92k93xBwUdB;@ieUJj76;J`+lN&ca9em1vgBtX~ZX|9kQYY)xamQW?k21ob) za}T1QwHjPb347CQp^pv!@48?ZGX9AhZ)@mLA=#_QqTDl^>iAH{)RMkjLkh5zebI?r**wJWoc(>yo=Pet1eEAcP7?e7zEW&-5{%>c{_eE=nEgj6Nt=m$ zlfAL7n3MQ5do;6D^b^mFV7h1?Kj7bSCWgwDIM0OuhPL2@RI`QYBcl!ys#p5}}=LENW~{^OKVSPpH1HsD03669<4o}bhe zt}s4oHzP<2bo!U@8R>8P9wO?<^)R8XE99(l^X^kUHO@TgrkeV* zCVinbMlVP3eZ_kbqJR1Irg93byIo1(vdGIHj~rf{c~g$hVLsF8$jg3+M&XQGEBet3 zbsD4S+Gjcm`s8dT;@zDadS_YC6rN7D%a6;dva@9g%z?stAY;!1BbrwBu^oqf+fX4g zZ z;EMva8&?{`^Yim{b?aE&cKcEV}3#mS=XgROz{H+MC z+RG0^@!HJ^iZdk#FGV3N3EcaPHeVYLQ+M}~mzL-!sC|6z>~+%&^g8R3ml2n1j`S)H zYaWwW7&rT&=XcM}#BS2*l4)t|+1$6t5ibvytJb?6Ldi^Lr;lgONL7Vd2Jy2uJ-2+Sfx2N=JKRBhwM3J#Jexb%s>r<4(pjqZt@5$M#dgm|e`wvi?C%t`UVQ zYu(DJB^!W|L-yW!nsgah&MLsv8OdgVA6PqIisuWjFd`#L&&9D!ALE;)kKc->KE@B- zaq5i7#zV1V>WfX1$jBuGL8OJpua$Dg$IZU} zC0=0)TtAQIP=+cY4?I7Tc{;w(-y?k8Rhz?vYStuB9m(xE}h%C1kAmXVhu;mORt|b zId5;RwLQ36Ib<&G za@`%s`dF(rX=O#=VwDfqnYLKH>n+QHdOD$NYn%W8-8`J_29~4G!-=r7LDC2LbX=q6 z+8u>~GSrb~FI}Mun3z=S9*u`%&0q3Bc|UKf;hM4BH!1=K%iMc>KIca)|F)Mq{(*}} zTyC2ajn$tNgeh>Kx7nho?<{&4G4B=_H8@e?=T&AAOSt1EHDj*mL4Lwlf~fDYW{{d?#`mA8y7 zQK9vfOP%3Z#4grvKC8YSa#7Uzl$ig{q&{ZR_x_l-gyMY8eQD(%AL?MCwTZYZ zq{n6Tm^z(TH$mL|P9{Sb;5dC4ow$fYTv1$@kgnxht-+_qH#auc;%ijEdba+v zlZj2}?&eGw5XxuQ7)>pQA7jL!aJRKkOOzu>Y~ww3wq2|!Wo9t7!Y_BS6mv-MlqNg)m-M$gw9HovfYN9W~Tc?;jtaQ0{DJM-6=_>(?(&PtR;V7iv<{ z{3sG?8k#n{)kZrD{Z_k-?CfuYDwIGODz^(eE%6Ux5d>NLvl(SC?NiRXxp(+{b7!uVG4L-RZpNZO9Wd8H>?wEsn?w?LT2=SkY>Qim zyAU!83m}cR%1N0DdD!5HHMWz$5DFpt9^$go>$Ntgdf8Xk=xagN5DHQ6j#5)(vH$}F zxF+`)LmY*<0WM~4o`B7M(A&LHzd#jOIsf!^qdfmHo87%S$xGt=8)qWtpANJ2^)^S5 zs-VY<^{?*{?ZQr#!2X6?(tWe zlVAd5sfNn>n>L4kJI1hP3a-k9l4^D+=2)OVT(I69JBmU8zT>uj-vl1=`>w5iTUQf8 zvQjlJVL}Ht!p_jkUcv>Y~54VO%7IK7(G7f#zz1X zW~^(N_=CFicI&x&o`}^=))6lf3(Mbn4GPIr&2LlA8a_R~6aQw%eHW!w zjw``Px2@kHH`5u05DD9_f8LA6_jH(hL#whA!^&bQ-J9gR)!!IU+;61Er*?LG-q(-H z2xoOZ+KNeV?(N*Y_8;(3z{GE(n+wji6cLgZ%H`oX@H@6D6Y?3GqH4&biU#5`(N7!X zJW5nXH=o#B7(F7uUI*;B_$%RDt{#LN-3{KPV=HezQYy(h^+AxooW^Rwre)Y?g)j^g zQ#m8_bakEODS`ot+&pOR{|(fvi2F;=48GwVUc}<{EzBGxW~64D^sG1b)l1xIZ)51H ztoq*$w6^zGbK-7u|2{9J2muT#ICb)z(gLGX^ORJ21+U zCuMPipaE{=kJ%OA`)04KCbnBuuSw;m1%6XB#cbqG&M8AV=KFAppqa;~Sw?eVf<890^8u4HAfSY?(?Z%H!z8csFwsXmvA zyEh@4%HKqYfWQ!NGKrGNID86*@9QgKXMPCQVVgcyUA*qp?D2U7?w2;r6*XQZ8t(|; z-729L*ay8&D3hAD)nbLs>k6A`Mv+Y7UAVpfT-}&r=Lp%>J9zKvu6hY4(Xf3xIgh%}5*mw9TF|fYtKorj4dI68d&*sS$(eHSP?5-$&|gLP$i8 zbcY{DBP}XW@2+1>Z5(T>nXG{Fm_Gt48124jP=LY}fie>6AJKE~5GJ~@_zfB80qSs0 z=k0nJ+=7Z&>Rtss#_!Vl^pNX)J@y-+!6>_T?Q~!~c{Gt=^rK!zN_{MrfG(d!#Y6FlA1l=LoWBAZ*nvH{b0oF-8^?BIO0tsk4>Mdl7~J+o z2Y!a(K&;;fw~U-&M(JFSOZ3z7RJhp5W(Og57UOMV+yt zu&-}!QN62=oLV7D)84WO%l$&1Q%3jxk3~e57Sl;&ldtQ^xEfXU z<}lR)jVweo`|(i;okVS;ZJA_%z~GmxoZtjNau%hwOZf=P9~n+XO*tKR;KJW5}8GOES0L+ENTVQ5(C|=fxMMRAi_)JQ<{{ zqM9f`+}yEF)g)w~Z?~yIR}l@8l#&*Zqh>U}#$^mKVl^Yu`~FF@I}rGX5$8<%)DGdI z+_C^l^Fxpf`^txaAS*C1(0N!avJx3eZ6J4+c&ozht;CZ0fYOf{ys#n@zOz553grkK zfXd0vrlyVB}amS$)de@8RDR#=%it4MK|Ux7%`7F;r6YfM{Yo?r2Acvm(i; zd1hszdaB|i*1UwKoX#<7SO2gILU9;CM1ZEzXWeeo$9`(l7wz%mqU@n9PwMW-*DYsz7D&EzhQB4VWroo zbdUi6LCB7|BbrVj0T+3ZgOEf3EbP&ztcKx|7J4E!trn}N@pn~uz0g$R$CBUd1u-mm zj2c?pU_p*}#dB9aZ<8zYk3-_yb<4~n)Kkh6!&N2>oeJdU9Q8cC|A>O~0f1u7#T3^! 
zLNB)J-@M1i8NQ)ac-tK@m{MS1lughgbYfT$TwzT5|IEroNn67NpE$+FXh$&cPPW@n zQq78r)i9z5MR?T22O&Obsv-xF*~E{}SDQc(hnA`;06;}m(quWUo)TiH*4ix%dvJK@ z?c;-j#|{Yq`9@V=kM1-ghp>j=8oT4a5#*vk@DGvrW4ogFV~5t5EfdmZT`su08djX)y=AHe$s=V=5qX`ODOc}eZqBP1?-kFRZPZ}{0 zQXDN-3WAbe4px*F6P*+P<>EaURYaglR7NgwG(COKd>ALy5C`1&wNr=0<>%%=b1Gx6 z>NVFx(w)vpMfGz#2XydXRa=LOSFZ3sqtu~5VX1ky=GZmP60C#bf#H&Csx)Q$V9t=4 zGL^g#abpXMQdL?;dU|(*w{9F*IrMkr7U7E3FS(w>Fv8lCI> zLgzI&oKDMkEzORasjYPsoIFluexMtYzia&10$QGgU*eEr06nJ?z(nBkz9vHvwbx;z z<5U-e-z(te1-!Iba}Yx9wgVt)^2*MywcmSN3;@7FQLSe0Pt=V%u{TM|kKGQ`@tT|9 zhJ8Rwje;FcY04hQ-DWz>hG-FrLP6x8=LZ)5)n>x_Ge$e) zs3P3|O;4<99QtZ&oBXfn>f9E^?u~h8M~$mpt)c`X71AvfSj3&{75VV!3r(*4@%-6-s1N#K?;nh`M%71ddCr4WQ^oAhVKV_?VKMp8odz zDWA)qYMAQp(hLjy2G7-!vb9p@iGhw{G(qz&gj8a4M@DleZfZtuQE{!jI6PEwR3M}7gw|;o7nVCYHnmI;7cwlXHWx^d< zI5pHm`e26q#S>n z-M?1$Iv+Ajo7NomN0-*$#nTz=l$`fvCtcuL zd|6%XVt=7%d5gXHg_XK(8gqB@^3;f$;BA-v!Wbf>Y5+Pd%lQl|HJ!>*E@BBx%hwb-oj-RntfGBYHa6c+lD^-q` zL;MS-Q4QuaKT}hcP)9*Q0Y~Ih|89hah2^R*6hXB~>>Pbp+Cm{b4qx7eq1N@KBSLgR z?c&l+nHgquvzaciw|fe>sGjayC~WeVbZF;g z9=vG)|AYIAOB%LVODj6)%fRU?KSla2vg)Xv<~MM1T%o(cek}2ry+yR-9^1m&?Bzn; z^(K7R7n$bj1g)(`EL9}O@Nx~(w^LjFF@9|*10Y(jwP<#1Cs7p_2AogOHV;r{6bOa< zYsNlEuGYS5!LD{9#SZ}ZySx8vmB--X7NOC*)eI?TWeP2-C?QKu)C2^gCZv871OJ4J=2L90)_BGi=j#h9oN3-oY0~@~3A)NP!KaM}8t*)ENg8F!hK**CMMV_p<48zI zZXO=njphog=24NmA_EM3ZJOc7n4k@MnT;(-_Y0GZCY@nkF;@~647E{7krS#RTd+2} zQ1E9G8lWJ5S)S_3kSx@Sv(U9Hz4Ka$wu-*USDDOOu;bnMaTDr(w+JJWp_Y@H`lDu> zVQ8Ni;&!2rS>{%uiCI2INxP9lGwgV`;A&2L{s~#mKV(cL0Jst6$nxoOxo4K5#5JYo#-Ugg4(Ct;R+=}VQEYUJpLsq$ow z-$_#lG5xrAU*A;58MaT)L14kz)*vY3m6o7;yV;Ujc5iaawye$-*e!bdgjJIp>M^h5 z%raj;++|n>$j50|wjvv_duV(kn=~1M(+Nr7WueI{sTz@#h23U$)5&OKFHx)2_0K0! ze`AgO&OpohGoQ_jQ?bL{o zf;Rz5YHICn(Uv#u`Cj~ze# z67DuIMUQ;;Rn^om*Y^AP`JJA=Q8sDldljDB<)&D&7|T2uPFMSzZs~T)=cJj{96vM0 z>!@JIM@KV(O65Z~m)OavbG(~MOqtRJtqfqb+Q4#4A}?jEO*}h|5Xawjk#6mmxoH*$ zl%uavInwR%Lz3=f9iB zOvbcQB@!4SeWp3dAo7B)O8Cwt9oN&YOwHE!qv^clq@<;4JySC?%|Vmnx5o}$2M33n z{2LP8PE7(BTUSf5Jcq%K_*IgctM6rU*L~pRZk>|pd&1GMd|rCsvY9DeXhGf>3C_5yW$l7U#VtfbaV;8e{0?2Nwlhyk^ZY!UY9 zaxE#$UTec0grtdx^pY?%><*f1YT@b4{_Ful^-PRxo)>1|y)KT@tSvBLJlrS{~2sdl=$WU-JDbZ9G_$p&K zK%gvTCa;r;xA*I4GELpLc61yZ9@`~G&C(gJnJP*6S97J1np>3Zx(Z2Wx#IH9>hdzz1CUUT zlR~Fd(wD32E((RtktP>!TJMVG$ZjcCMWYF4V0*w1)zebbrE8~CQWH` zou1F=rzz6}j={sp+bw#i+DcI?R27*TK!I0I7z_@zpVM$%g=1Fv6)RC~u3b%DwcL%R ze$4BG%9uFvqsg>)pN~vr(sxtaojor3Cd$ZAfImnpEzjg!KaL(`o~9XFr%-(IWUSnC z1p}(xs8}56v#kC18>j^OvSPM=oUS2V2_++KU9A0}Q%?EVUkDWpe>h?x&NUS)ajuwi zW@5T-%wjs}PZB{BhlMMuBlM>hup6!@vJJ1K45I;4$bJ5#k-Z-kVO6>&Bjs(%g;`sy z`a3mw&G)6-{-vr=5rV+LEz|tVu~43{(1E6 z-%B1Zu4(J;Sg-=)lCd$6me$wpt!LtQ6HaNCHkUr+f$&BfO+o)5!P9ij@waPg#ym3M;o$dB z7LSR~c%JJ&5+Zs1{m(L%jwtjQ>Dj9FS;C709?KQSUCT0$GON?MR@Al|lfq9sMK&xr zu!SnsykfEXhc=A&oBT(FT<8D#|KR9oul^7(stgFsHMq-DjnwhTJ$w-Ge7d|@t(gbP zz?EQG(e6Elwfvn-HwyT?{=-^wZdZwI9Yub*$$0E{{pj4O-^1E5RV8OWj)R4zlqW1i zO>Hq-D5b@smmIU``L=?cQv*-39slf(1zjT2{Z`r_K|@4-G}_}wvc}BkemM{a2Oabc z9uSE0zReA+sR*CSTRU0|8>#uxj0!Kbv;`u)ZEY&}P@hR%u6hYz!5QU=VAdWv{k#85 zC3H+eF(-!yCEML{H)2^;LQ`ub(on8--(&g-)mJ`m8(-p~`@VfpMU&=HASI*42i}F5 z>gu<5S$;T={H?>?V()v2N+Qrk_)20@mKi9ntQ9)(nq+vZSjT#%fL&2ZEdwA$8^LRC zX>}-4nvRGZr$?@o!Za9OsO)b4skk+)V(ojsN!LTmP!+gqZiyrBdH#4tvfGuCLQcix zMmM0MB8y$w(&*smP#zH_xy?j-NGUmpQ-N#H+-AKjp7Za=v!?%CxS3Otb{f%I)|rC& z5T&+%Nt^bdc+sqKHm}pgS_^|tZL`CszKV+Da2!!;3hC40-;y^Ge5DoVxwnBZwOEN( z77-#QKz>r4J>M^@?(_W?1Ob^w2YX)ZDIDn>y@0f%#+!{rjj1XybLeoYW;~;Va}MkA(+&-1n?*R8s!jE8nTD(UHHfg z%qKl(?3DLwB=zjbqvI^Z0q8=ejHp$kApTnDp%V{ft1RgJGErJEwhHXYf|f8*-#spi zhwrQw5T1`Bdp+*{{E@WSXPVya5vz8+T7RU6WDzvTidYq{I@GT;UWdsyA@Jj7Xwj6x 
zJHoMA=-)I}Pwy*JAbx2kW-cTn6gogvIzIqWWKDz@2TWA#dWUOS?aIxv){uYST6U)VT9Lj$>*0K)B$0^tEvD{w2_JmkI{q)s2 zcGdAY5Ltr}!$DyZqV7zoI?ChmnQqduzdyJaEx_K0P&I0>NzTZ`bVf;*L>VokHaBVE z#e&I9(oWOJeE>=gEgc+kb90SoGAcC8=G6qsZkDIpbq5XmdjmThCBD|HQ-m%t+5jIMs?fi|6=aSOI+Ha z4lH{yl-T3s5My%mpP6X&5B0ug_{&CCk`qS8d*k#Hl?hg~<`VeyY`0F*7`D9qQ*N%0 zZD_fY{`zz90*o^ou~|btop>@#BCmDyswx>9YsL1#s|t|ZSr40dend&}aY@^p+-~>Y ziN;dC(9!JtrP-+`;3HGB{ewlRX88w={z#=tFYiPciQj!g6i{nZlq{sH`#-5tYL9aD z_q;%wyJBj%A{nST?RSLDOexf+u+Ucg^x!aGTVGfB_V?~++BQ2wzx;(8d)jRqgqc-W zGXhBE8jEvSdj3rvI0r}}2jC@D%<%s?QmR-!voACv#RVP36EE6^aHRvAgS*97?8_@Va&Va6bCyK{~E z<-T&A4(YP~4H{-~g=ILv)@pRKS7V#g zpHgTKD74EQ5z*0|zAv}+#{KkhWazMvG|Jf{6Yqn`rs;TIK$r@)YReCv>Ep$p2P!V* zz3=ZEfVYK`%@A9;hH~+ z)%WXH&)b(P^Qp8SgTXma{dwoKvO!1ZV@m7u&}WTH1#f%4wqV=9WcR+!|3D_JAp6&! z#L&K zZRtxsMwJiK{trSI+mWMY*qE50y>4)8|>n;@|DmSfnP>~#45CyyPgD91~vLE zm5aAji;|4kI@!QeOZ@Ui5dEIARKtzxV>xp;kA*E!Rx-(~NIDk($A7Bz^)q!DYjN%F zJOx2KYyO-#0mX;x!oosXN)UEfzHHkN(-~NwyPi_zRclUxR1qa#tqjWz2 zJ9(5+u8}xwbqu!knb2dQS>ktO)wogs+*r|AD*gORe6$o)g@ zeB31;Co$h~wR`^!f&H&5c|S7&k)RBF$E{?9@BxJbzRIWJ={0n_V&Wq%y~lQH zvHqIFu@XdWhFs$GxGO6y9IB-_#XShuNJ=}YqtK-Cnknh;OVdx2>Ez<2Qt<`c)z<_% zKy!5fsr`cA@0__efvwMqUf$@HJhIJ^iCR+yrIW&ia=`WsCUN1J2i`?F!}E`?jl2CM zzbUIz-i-!2v@@4exxQr-xkdMnw_kLrTRoy(Pfyd`vY&5p$zz#Nbg|s-MEcwocN*lr zz>u;V+oZI8n}@U8IOG4UA=_E-9hbK~&f~lP8x4A?ke-P7$!6Xq(wu>kkjD?%-3}uc z2bUR@(oo}v1T~(v8}67Z5e#T}SiGm3v2Pw7=NFtLZo&@ZbxsB?d@w*nCc6KM z?b1zK*LH3^aM4{W%3RQSzLgGJo)gqvbd!8po!N2e^!c<3*vIp*e`&VAQu6MI(-1xj zxX_{opw3Npw#Rb(zBvRfEdi-```-Cx70hu{4>4XINo`)wY5ULQGfx=rsQ1o47X`q& z&IU%AiOdCs(68&g)g;+0RW=gVDW%@~LZVckvy0GP=DFHpy?maN2H#eF8}{YuK#8CkR(m%2H8Z~OT@0CUi0 zZmPl510J9POKz9^i^An{ zf>Jo+b-h^#@#8l?=0aJ^KT$dJhGocd9V+vwaUCvX{}7}qL9<$HJYGD=O4OWxMAnAB zm4Xvf@fA7r%To-sFJow{zp|T^%p|~hUb(_3vrv*oT4(;w)!^L7qm6A@j7@7hKDd4N zs32BNeL-k%P_AmdGo9O0@hMHP>U!N3c_=yZldg(I8m=YV!a;Yz^!2#2=M4}z*3H%S z6-G)ny5yAG>F1Oe`B67H{_?YL@Ke6nzgN!h;g-18b@Mox+Vrr;`Ya;j5(NkiK);c) zi4$5Uf_*pw1CUwH_bVHXr2nu(=yylFyHlqe8(E3IHG?anQhMxUoMe2zN%xzwU9M*?f4n?RN3rkxGl6uEe`8DS3JxdXa0@ z7AB4ze!OPD33t=r6VR$z7sM0Gn8AYu)-H6J(jX9F(+pkRj0^T}K``lV>l}SKgy`r& zCS`^Sqm&6XTxVXCzu&xPm&h0$R=Q`jxs2xH^K) zJzi!URj|KuBQL$)c0K3Q_>?9!Vn_xArP2i9%B8U**-xMGKx*t=0!Wav zR^y#>2rTcLuG^xF^1&S)pT(@PJl|w3ufPN+rA8Jy;<9$M%W4l_da@Unh#>v*FxU|p zz5VOa$qnK&cpZ&Uebf2+nJm&eJ+BxXkU+oGEx?sbCY$D^a`cNPJ!tOpzSVfeQLMh% zpvAi6S~^kDn&-#HisLjT#hAT3v4UQt7$EFQiJAzI;L{tNJFo2j zi~yaT%>S4ZYXxWdn2RWf1L6DIdCF3jz!C$Z$imcTJqkQXnYu#Yw=z+kKh=HD{H@`5 zHrEMk;4jX@t*A%>X9e_G+TM#YKIJMDmdj#sQc2Wg>CJEZJ%8I?-;Ag#?K&SNt>V@i z92+anmv7oN4^yud$msV_&IwIvLs7;4g!uc=s0h4OjETXTCC5F6HSfE?O!bVSg86o} z`5?V)>8g%Urhoxx|EFi~Ztnv)6qOUBi3vw&{ly`Roh7CIlDF4`83h?irZa6c+l-UI z4v9)jEjae4$1;2qES@()weiYYA^_*6!F=2Xol=t#I)=a!#O5;ou8nH?&d*|N4l-%g z*HGl0LoxSd=gy2EffP7c0_0Cm6w~hBW>jEQ3zu*2F3&SPxN~?~2yJwENLhkf%!EYP|F4!zst&8NW;}>wv*+@oVh*0^~EcX4Yfyn8Y7CGF@ z6XQO!DgpuJmiCI{u&^e`d23QHdvfu)IX=n{%9<|j7exxbO2LN4>ldyU-X5Cr?YxWa z8ruiLgp&>*{395thXpa2&4#ZzLvapLK0dDM)65?hKxTjG?AgjP1&-Z=!_SCx@fjoXRU_ zxErS*0+Lr(MsRz0F`FLVM?p^AjK^S|>5Q9Sf(T%MC~qw3wiDwzML@+?dpOj?>7tK% zUxCtmJ^udb;eWzCG#Us7TE{Iz{*j6|3!8g02J-LYBg|8#C?QRDLr5ISY$U?9-FDSa z6I;krwJ?43L`WJ9g{x!W09CyngCSQJ;uaBf>opH{HTPevt5KgePb;-KZ_GVeRXpD{ zlj{Sos+z9WLC=+QQ~fnrrxQgcFCVv&?t8=Y2bzwh<@6mFLKBr{RXiqOH)!SB#OXXI`(q)}eeAGkoJU}Y$Yy4Z7LWIV&9qm9eQnnGdMQ0ctas}$ z=JYNc)Tns&&j+-Vl5KXq@lpVS0yhFLjMgEr!hZ8m-x@_{t_f6KH}ziQg)ud2V5kp1 za_`|xEg*SzJ4~DuHxp`ai{v;VRxJ~4k-_BD?$gf8sm!ytrR$xV&a%@SSUHyctf z)x?+&?6fh<>|^FH7dY*HmI8Jcp3Q{kU7rivD_Me`VhJ$6sG2&4Pfrl}K>#$g_-b?f z`g58q0JWRx_C^kZRCltolS?K7{jf}0WTnO9s7SB^Ohonf%C_9|N%r#Zfxo6Q_ip%Y 
z<>3h-w@p3tvHhdMj%+7c!L5V$gC5D8@97DI#A2!y zdC;f@T1ev{s>$Abq?qZPv^5=r;auA?9yqt=aWOX*&cMoba#+^69Traprt!ZCxt#XcTeYo(t z^nzOzGMNS;EA#(#!mw2AM=NyA{ATOskG(&=(7%3s&c+MxuZp2A_f-}~jT(!hO!Zvb zyyUcIy?l0Ra|B|GFdI>WsVk}P_zA(*he^q6>By+PKW^wxYh-B|xaBokJ^mZaEhzc` z!1C!i{OW<^ssv>@l+k|d_7*o z%T;@?u943tSnx1YJsld8K{$zD*;dgIqG9rJA^tVa+T_SGjpUD}pOBczOx<$dd;<8x>bBVtFn){;Btgc~*?80>;TANW|1?fiq4thucD&!c#HQYWI37ba(@$mAI78t5zEOE@Ud_rPcF#HG4 z{h67gqvI8)*@V@!X+Z@Zd1l;CVhqswM%zZ|Qi3O#$YK0nw7qk5q+Pc?TCr_(Y}-bs zlXTLtosMl*l8$Y3(y?u`W81cEC%4}3ob#RY!?d0L}l{OdDLXZI> zG)v7)zt5*7+n6kW@letx#oL;eOThrrY;R3x=2G&1C*F)t*gxOl*K-Ygp=QKeyxAps z^bv%v^i;`SG!u~uS~;Tx;VG@by`|8hG9A7jHK&D4Aib4A6hm;0KZb;3=jTxNJO zA8Q7~ntj6zO%$$$ntG%roHrX4-@CoiN@R)iia7a_uONmB&9WF+wR~74A?QB2p z$O~3$qzK3^?#30sV8yba96TQsXuR*=Tu66b))9WUJ8#XnaAXAz=t4K9c=*rFWf995JMCj)gWqLG1I7&b@#=BHqjSUbAIn z-NQ4Tmb%B><(cFYYlPaO*@6UWoLVV?A2%p5)x<31=?%$;vkN z2gzVfnRjE%q!p3IJd=&$-ui0ROUuMcP3iIM&;Eywwi3ZC_8Z}>l(l{!j$QtRk3yqc zA~nG8<&Cv6)|A`&OHeV_9Y5i+{t?I`uUgJ_*5 z(E)1d(V8wWVe7#Fmd!*z=}m6_DxwW|Vm)1}giP`x<5{ybGCBHK<0Ty;9y~lr_a`_I zwu9bTv`*px(NX1_+d#yjvFg^lmLZh7vMv>KT=KYolIF6wv9Vfz6e`>$>AN_ckz`a= zzG;8YE_}C_jIf<3on_de&Ds*Q=|)=82Xjb7HBx_mVL%fTgN;ZZBGcOLy&Xs56VYib z#YgAM(_-|xc@@hA5K-LA;l2GtU|>7p^=^&`+|bi$*wKOPbv$Y|OupkTQn!69=^xt#s5a_K z!=)^icf9Trx zso{*<@u!_K*_zk>8HAlohy@O$Fyat~nSdAO3j{=bkm4W?lOWEpHFuv&G1mKhrwL2| z0{f+np2q>weC_9ws0~)25dhE!bhTg2_)+c(5#lmW&f^V! zOouEqV3dbX2sT4kIz#2s*&fA0cEhl@oAvdJRm});-Wqfo<};Cw>(;*h&4MIiFFsw* z%*@S?!+&77GUlPL0gvuKXjj1`~(;9Bn0gYLI16jgRAnz0>W>jra3Ji{c@q6Dk6@J58GAk>&cjZ zVC zKmr`~PF^AAm;xGeW(`$WKMo6VOE{e;X(tzcK27AzG8epGvxCfyv~o!x$n-Ak_4wu6 z{3;CKjrVXP!s=3!ZAK(ME>|THi&0K4+?7-*Uqxj_$>N*2jQ9|!rjRUYDqA!zu4CP6 z(&MvweVeZ0@zMLkUB7FUchY0xR$%i`2=vUWZqYq`st$CuS*~`?8~s37l&@4E{S4nl zRpkyW6yjmaAb!Rw5w{laXZ$g6nfm?2ihl-k(p_J+2Fy0>jHu%ij(!Dl!*z!6DR8-8 zaOC1j9%EP^w*|KAW#wJoQ;dQ9XwfXdhHwV^LJtA+LnrCYM{wyck|SYc z(d*n(tFg~C3oZbF9$wR|oxB9^9fnWZKi=j_Zjcqo5H5;(_xPgw1&L1Bca_g(CgS$g zOn#bM@8TAdpuwm&QWU%tn~MuO?+Fc{;E?gmweVHur$vV(*MizD4uf{X$j`ODiLNM2 zH~=ZiOlGG+*gT>xuO1{I&@4YSYMcQ=a29GbSu@p2+R2edPFqMkia0yqK4ok%?g^%K z`d2{s(b5-$FzHTW-_yi*ObarQ=hdEvec9#6%u4%F+4ZLdoEiYwR?R#_v*#9~r9B9q z3=V@(`I;{Cbu==O2Zo~JL-eWe0%;?6_!+TL6 zSt%Pv;S>B0bhbGkB7!kf2-XodyJ*KH%1T7_7`DoRzx-S`>DVMxE3QtCdfXoh-0Lf* z-=ODM!6#B2_`7{~{`5M`x)`$*jYZVFL5PLdbnq1QJ3oI(yXc%+met1G=9{rD*(J05*#4u_vIrABjdS)rOa9e zfO$2H?OZ*82dM*nQhV4r6RqfBp*lYy8Ij*OP~W9{;JHs$P{NS^ojRs-)bgYjm=}oM}UQwuM3V&)e^dE#d z{e!YUwYzBJ6*b>!dtspri~QL~(`Xwm?et}7a)!%d;#L0C^mqnqt$x?`+K{o@HT-Fz zTbATX@a1fAg@I7Gc6F0Yj1T!E#e0pbYQ6g0erp*Cznvzgb{Sq9r>d9#e6e!<=)h@W z+&;guKR-tRmafE*%O>&abNwj=>gI`)zfdh*)0V?#dOK!?>0veyO@y$;`BmU7JmYvS zb%J)rD#LUnTPMxxkx9h1yy>@P>r9tr`02|q1S`HcPY50}JJG=ND*_?f6no-$=%nTQ zvhzn{V^8%rv~60o1mU1SU@o|dY2%*=t%**_ud@tIMKVxm1<+ykEG4MHtMz*u;pXWUoY5h}ytT=@W}K2;>xoIC9$7?dlc%jt znTpTZ0cQNm;RC#5zxjYRG!E0Q^am+WA94^_YSu(@Q}&}w0c^qq%Hk1GhJ7CV)&sEs zGV7n0 z$lexhF33LU>h^ZGTfq*i<>H zh-g%ES++^8;=~6jYdoWX%t{|Bbb=S(`;qMe|9I}Gg%d1g6Vl_q9Zz9B*Zw(glHtGL*! 
z@8oSTXJ*TUdE7?N?RYrfo0BR&{|FIry!-mYqYy=Q%KNCu zdUhkUY6jY`in(QsrEVi+P@Tj;`~jN*t?>9=eK1;D(ghm;+FZ4eGb0}_<;Q~sOh99S zWA5EvR44at{BG|XAyPhXh-OxYQI*_6W0$9o>SVtNNq)X#Mf6zcA1^a7&gH74bKjUa7qG2tzwvJF)SW5w#Wxo+v^Acw|3qP!M zGkW=7@L0v+=1VKoBDHpUjbnFJ=O+eE^{SB%Lk>uWU!8te^LgFUWJCstie>XY+(eaI z;#W{{Ux@zt9ZB)N$o6pk0xe4~m6dOFkn`95Px4tvs?p0!fP#dAubD=10yZ}w->?ik z&--8q^<@m2j)qP}Tji;v^xW0-5nUZa$EnZj5q?btxjAPi-C8Y184P&j*&@PTrqDG zEivN~A<*TH3J(x@Dk7Ht$IAlCCoNo^*p&Mw9p1XvP;4U#y{)QQO-;i_OZzh7Q})1# zUz;0r`pkbl9EJvFiyf{5V1b917q5CsK<3g@H@cwSM6l1CX%y-uwYwm~OiCxj+OuUE zTu{QGvpptou?<(J}D6Uo2%iG44ckM zKz6_3Gp@~!PP!U1{CPRMJuY-tSmrtNT8%s+OMn2=5S>Unq{xupLg%2 zmAuemmJ@J-2gI)hczKCJ@}n5>3Hy?`3kZi>q-2-M=m>wzIeD*9BJ%g4(KMup$wR9@ zULY3IKGhW7Xn}oS2-bOK6Y+w010d0aq%Uf#M-4rYaPPgoo+Ze@;jt$2IF@@#&o#`& z9}N6>WRjRWl~W^SwS$xBhazyPs~T|mUW{hQ+C7rnGxETkAFTFOQ>p;_Y5mq=`2;8V zcf9@SQ~0&k?W0b3YZkiK{HP-hHK4dWb#~&JJj^d&Qdot=Qj1uFTN<~(XUKDu-6)?2 z9`I<+*it^%)LXB@3m_RPL$n$jehvmx(O5m0nm@|V4~@9MK7Yh}u$HG_d@VDpkh&sp zU*qFahlT>6Qc^9y22nPmpa8N_lvLJke(L-cy*#~U<)^-@(j$R=>GxS&r-cLu>n`DN zt@2Y6YWKeGC@BeAcy^08GMBT+FZ8FY5|)c-tIZF}=THKbe@}&Htxr~3(g7c9(5bI) zK7N_ZcY&)tyr?FXc2nT76*V{JK;OKWkOsrUAXmbzv*n4Ff!npL7<>D4HN1My z>+3rzXmxI$;1N#9ikjb@Lg~GZ1z{B7Bh1eF%&A)(Rx5~uKjGTOvDZOmT2T#p^ial) z6Rc^5#PXoLYKPil|6QT_ucA!4Djhy8G2eNm>E9TOH=z8mU-&UWcs^CT@k%AnPczT6 znNi*?qMMu~vp3iZR@z-=hb~cB`l16p)+nY#lC+DyCTbNzCo6tu5X% z-YJmW2v@Vq1hSPXItgvnQeFIK=5AyO`qjveNG;*2 zxESgAX~#uSp58lEZjNgXb)NzyjZLy;$3$)UZ+ilxz=zvl^fVuN5H1<%(}CfqhKTg0LZfQs@{aQg>Ek}t32YaHEriY_0$7UeRT6` z`ccr0;xpk110tL@=3KukS?DXZz8jMJh2 zfWpMHzISOVZaed;8ni093g)gMML}Q%DCkkvmB4#U>Miado)*wPN_M{OS=_V}`qKrs z_VA`Y*ZWB6`@Vk>y)LSb=5K6uFr{UOcY=o7nMd(XXKFWx0svHG*$rIT~lo*Rl!vDPRQY6z5gQNW^-j zV7t6Bq$DW|+gg9n@HEc#^^nTg&p$jn(&#$v`1q_@-uE{(NE0O^b4RcM6t=f_o~2Uc z?1+i3SmBhAw-=;royX09EDEEz=jL~5nDqz6>54$i0GthLbtkgci*(gMZ4|Oo_0q`TN*0n zm$`d4%c6h$!LRjOltJ31bcTuS71K?yr8=kO z?6}UmHjoDe1~mBO0XtiOv(tM@@M?dOlGn+Y82w}CsUw-1fvh#vojH4&ce_=%fbX@x zl7tbX#>#9eeUd(s$*6|+T40b_9e$;Sk7Pb(q5Ukh)1@VJWqNq^>+~Y1swmD53y;VK z&__LVd`)e-{2e6`IT?kb!d3fgv0XYWqQ%=FqBnDjZrz5T0VWSp(Sen;Ci&^hZ4vWY z25Fb!>)J0MEK=E<9wXVcGV_A?{e+eJW;mGu3inZC7LL*oM>+h@pDwD6@1ph2dpwx;)UokZUwtfd=MRH2cnDK}?Egf^ zoI49W{UQ4Yze{NJjgE{oIM>MdKDZsRB@r}{bzb>==UZhCy!?^_xI?6eu1FhC^o68j zbGW5Sk}~V*Gyrf2AR#hv3SnoayZ7qew_NTEdD1m{K|MFYK&iv;|_SsY6r;zkCfG6Tf&C3C-T%7|(@?>E!H?@t`s{ z=V%`sp%05du(7p-KDT@M4l1XWMOx36`8flZCyt$Eyk%8W~_hzD9L7!Xr4U^ zyPW6XL0OMCPQ8g70oQr@$e@f}8pSAJA-pf6Q`?WV5~rNy?0fCq!KlMy-ULIwp5Pbt z=zHOpm(xr5@XaWbHKcFc8-in+R6`{{8-Vi0SN-+SBEXb5N}EP=e~MHF_G!;bzOZ0c z3%KIq>6hmU^+}%FZMgN4W>Vxt3qXraqQnG} zPNdy>-F!ZZ_kY*n8XZ`fL>$XS(r&g&Sv?~RY2(00I!@QggO$l^L~;LC%P#0@E^xo* z+_mZrmz3ruV4gvTrobMG;L0U2Ij7ekoDZ!J1{~fkxoG-`TeYsyeO8XMWF26mHEx< z*sV4kmK)rXuj|_=hMlmcm%DSKl-4Uc3Z0+{+i~}=$+1oip1}zY3kz>qF?#@D3l$A5 zo>LIdIIL-(Kww8wq-qdnrK`b#uovJ@g6LLa<~{T?4Qv8j4uWl>79h3 z8|iu+BJSnjk3j?=<*|BA!IPrG`NriSyL?bPF9-L^&HvQAiIJ@(}*3%Y4Kt%5iq z++6%X<7~W7q44g^Z)5q$^~~k9$3l>iCHd*#B;zLj8i7|y^VI83x%Owu%J1#W#U^xe z(>+TB4VRV$7^{OObTJZ@kXv-fz$eSj`XBKnK`3_E;CjkmMW-RJqM-u}as9~v+cHrf_V6f9{$(JQ}3p8Mes9Epa<@7Wy;X0J60|HXm~-V7IqfOU<@@ClTf zo5a2t*?Z3BETJhmagEA=LEu>0G+&c*12 zMj;31Y;*H4000uz_3)$)!`Bwm1psLBflw4v|Dk${udV@JR01OBUb@o%LDt9O^);CDmcAuA2*kR zs;8puT)v$h)PBhN>R22r>!UK>dwe1m#Lv!=FaFuW``qL8*R3Iow!Xis`??>eu3n#) zPz7hFhl)ngQYm#jlZMh$AGC+E`l!CAh;&O8(y9nVx|<)?*l#LkdNFGStd@gIyq%81GY30uQVPV2g?#`fs~;( zU#hf$1*_-L6pYEsCiMuVL+;uVbIxU_zrTmLwA~WG@~zrdfC0*?DhC230u`pt7o?#r z&=V6iO!FW%lr}{uot*Nr-ybZH`=7mZf~i~izhWW1(Zea&+C~qwf{-*4+VKqGc9JTW z1+u8CUOXhADM+;`Eoa?P_{D`2*HA>j;Z)lz69Ma^6Dl}AGulM zd2z^iThIrmi`;CvJxy4&;kQ;_Z$gXz`3^cZNp$8nhq0=)R(PCi0?1cDV 
zjoet0vZ7&ngJu;5x+N9W<>^U<%rBWX?!TX-hl?iu=q$DGK!Ed6t!nfK1in$eU{E>I z9a0gN8u3nJFW}SbKb-*C0U2%s+2{#T16RdIf%%|*QZ?otxjAaac>XCr?)z#H8{K+T zFe!*LTBG$M|C=M3saF+i-kUC$(5Av)S;xQY!3GJM?+=Qay0R=P=FUGC0&4BuH_7@f z#(+dT^YKoXvZc)tj;rGeyGVWml@ae-rMRUGE2Dbv=%q>NB}}EI`&@|bIOPGypOoz{ z(VRdaZbIjVv}*8Gq-aOB_@G6Dz*?r(oCJi2B3hMN1wK6%?gZP5Pna+!rmqOG!(-jL z`z4E6GD%OFX`3D6W&`7^GQa7w#xX>e3yL=gQT>hrDI+*sw8Nt-Ta3-7!RazZvpYI` z3R359~Qybhgh{6VU$ISr4l|lG(hk=?Sub6Albb zBe3Pv3r(QgZ0;v53ff_XyvY%iGi#>AlJYv^<{v5b;_Uw}Q*XZ-BovH}?a5rYUm(z% z)(fiPx-oXfnne!(r2P1tvZI*l^kIts!=f67BH6N!NR3qWqO&AIFYl{X!9M#v8Dn29 zjTP9}W>$r7e-m1u_otv|4Vj|mc8OeUL2LK4LrFYT#HP5^q6ZE#G5K+e)Fy3Q085l? zgpu239A8`-v#7Cg2b_$I5HeXZ+8!1*Rxwe8ECP9+dpeE+4OVy4J+b@=lPE$J8-5M4 zvU`Aj%;RFP^T<;O8;f(LT8Cw3^!h5m(CTd9nhc=IGG`|vW6CxG%EN0}Odp1(NrbON|-ID(Ou<2{)(&PDxW&OP3nDLhT6tP-2%Y=>Lbsw{>c#>^NoT;u1ZON zcwDQHT9isQlJgG>?7380WO(9BL$-u>waTDVfU?emJhMQ%;FFv(@!>VOf`i3yA+vrd zi%4vSv2fY;jl5YGHwG7@^1&HD0gvQ2uc%#gIYTm05tof}#USQds}he#;(FYbu`k_& zw8f1E2Se?5>g5Z)FHtm<#vFu|yQB&lpVc}9+{??5*b;DsBtcw0ghu!f;MUyT328DYrf$`(5 zH)$1v^U?x(a8DDXPr5k!Wy~U8b8@5F%LAM~>IRNZd7fy__xDUxJ7HBE` zBn$hYCiNqzsig+acn*$z$ac6l@Qqd0dLTSL)c-^lJ8C#a_V~J|aUe2LUB;<=02Uoq zEHCBqAze6fNN)sS(62~pA$~BfYWSATL}pa&leEL$g@A{}o8p~7k!;6t{Lw4(L?ZrL zK|dgCPXbALFO8p0d>g^~_1+)&szXu8t}6FC)c)IvJRTCfu=_;J8 zNkPj$QkNe1(V>th*sU zg}r2<0QEmjya_o)@0}L@U<0b{N@)A}M#~%ezR?ubXO0Ks-B1LNmlOnvVPc|@uzJ@9 zt&;?i993QYS*Qnj0hW^0*}4qnh)x4wdBy_}Qm@k}tbtMH@^#LCQGknM#4fyar=SiQa)*SRMqhHT9oF+Y2v4%BxWKiRf`>DXB0N__1? zSTU%0;PXk>le<|E@~sGncXRoizoynr7SP0T@3DzZNSM^6c2rjMJVLwdw9~Epk+&%s zcwM-Z@~sQi#Vl)elwh-eu*02tayVwkqh=5}X07ht6Vc4V(^7hM1R;QmlZn4x`Bh( zlAEf$mFGuZ*OjdVVmPgxx}trZafa7aKQJd4y90#^F5zS>uAgwGAAZ%Pqh=I4D4UcH?jslD*@GF4(>81QP!kLZ1-K~Y*S3h9RF{R2kH zZ=k-FY(q^>IE>^0XXd6qQPerncqn~b{Pmn;>Wh$>&idHXs6=5uIM$l?7;}`B9okX{N~-BFr$!} zMFjk*Z16bf0^sYjKiL{1fXirV=jeissSvaji00;O0e<+aQ+e^nt5q}!cj+X@s6ErN zcY7Vqqu>IT?=q?xfVL4AXY0(3z|nmT#r8YZrH_uLE~D(#GOFrJR+tXxwpN`894HW( zpdFcFamyVEI1;&aA5$e&&S%THUu$K3+29>$*S^j-{`j%*SY{76$$HKjM_jA^D(o`( z-k3XJg&uPm=d_H8yE{CslI}4$`|9%r8361sAi3GfAzf|Mk4o4w|48$3YI3uL_=~S0 zu~~Hm8iju3wCBh0tLQ9xp3cq9|FOeTK5lbxRW>C&AVuuN8DQ2eqnBF{Kr@_gGy|5G z6+B|3oz2wYq`>-mV!I5nK6#KIqjQ=Dq+P_WaArFsN`~?6sx{+eUdsZ9uqOqVy(clzI>&BkFieE(cT|H;7PZ@f z>p)J__IS)z)+`mN(ZNbyVW|DE*HJPq!v-L|G2_`M-c@Ix_Fm^1j0GvmLgRxTP0EHa zDFNR$vBZyhsfi0)Jm#N}vJfLyr}ufBE`oaZSrx-+^MK)RwSp=UA~5HjkkHW0BD&Ia zcN&7MxV@k^G3JoDv)Cj_^I+M4 z;8#e>oW|x@?YE_+;BvF3IN<{g=WTtq$N&;pkj?x_lG$zkRS< zAk9|(y&)<`?ACXJ|6dMDcIsNH6kw&YAAWeMm03VmwI|C&dy$r!vcDx==i^Lj*t4_*~_3neNnPjp0BSn5<sE#9RwlSLRApT9JyJZoefzWVfh** z!_Hjx!2rnkR=GDux2j4Pe}$6#Cx^3h=1CRE@GoOeZ5GAEDITVKj3zVk@qx1Ga&qIK zHPFalb4yDt=MyqA*4sU9uQGoZe5Y_&($MzJhAoewn=IMiqaf36;BB;YvPX|g*P3uq z+$W|1=UP~hK%ad~#sD>hyz$sMQgfPg6dc{x05&50bS$Hc{9{s{;}gEP{9^~t61>aw zqKD;L-?Q(-pz0kpSPN#&dEuhW#gPt=kzHx$twq8IEV)1q*Vb)cQA=xOcW&=Up(`Vz zLE_Wj;Kj+LpGiW*X21xjS$~Q0Nwe^4fZyct%VrB(*E1W=E(01Tp)KSx^5y3|-oLt2 zw!Z&5X?$m=&*3JB-f?nr5*8LF3z9Iqqy6>wf=5bI(xv$12qLC)aZ3jP)?M)KB zG$>sY((xevAN(Wnb8_+Y-pCX@l--~4%n#_t->v;D11zwd)v;`=s`#hP<3{hp!`h-N z6vYJ7D*M@q9|f6-|FClx8MQ}``ULmk+AYcM(cC|fO@MG2YH(b90Kut&XOH;(Wh zy+!$~(qeUa`FyPf15oXDxdB=-&CJwv#~HM|%VAn$@WAgwNx2qI8WE6tQ4ZzOoLcw1 z@F!jk9fa7BG(iLP=JIn`3pWoX@Qqr}-_`>Y^NVHmB2+v;5O0cq>S#-K;9m38uB&lS zn|#`!!QnW$)6bs=ge@&z_`(1`8rI%MMW*{BDZ8l0d&4P#N*IVLIkSof^a*1KmtnkWJKU|E|7U00 z3)fh?4F{Jkm@dMU52Y#l3i1FDdDsoSak&*nf~u%bZ`qhN|B);s_MkK`=bBd9uxP;? 
zjs%oQ`=3+@_;k?J4Qs)$>B~zdMEbB$BKLw0%Jkb0{b_@dq@)^JZjdtBd3NQ{(>OhdP)zZo!Lj z_~r&Z680!!ASwVLJz!90XKt;ev{wKeor6LHd~3ChJH$Z|0|UT*-jTye72cAA&c=Jk zw5z%=@o6e=;p9!QNI~Lwm;h)uKsgxl%=^Hj`j-XZh2!3&BccCLtF(Wb%$G8=qwEH; z5Ph*AfVvTv%n#9Hgt2p>XU~~*?$ZiMxw_Tnv(edvVQY;qnQv!sm(*kBN%K*SM+^=S zF-b{}*(h0zBa$BdIMF;sdxIuwigsawd;Ey~Ux{eD7m5td6__k1CBLP>v;_e<>e&uM zsXWi?koL|D9a+eyG62VY~rYytE#L;dHV5wTvdR;AyC zjfDj&Ee;Rw1S#^cGgH6-40vJ_W8EJ+@Ot5FK5ZzZHTex=3psH7r}-L5+g~@xXi0zF zUn+!VPY;a4$7Gn||F#PyVRqHa4s|SD_N+v2Kji8Cc|FbiQh9ps&d+80ZG{~~4$RG6 z)jqfguYv+#5iw=@IxpX)t|}Fjr~KFx2iyX|$Uq}|AqdM>GR`X)?2>n06-dn@5gY}E zY@{U6e~(Hg99eXA$?6@5N)o6QSl*ERZpJroU3Im=wG+3hv&-AYhGmm->0>KaUV2P*B4FmMe29p+L3jM)>$$q^XZKGWr{sAB#Dz zujM0|kvCHY-;dI7k(JFngR8BPuhqVlziP&V_-4-+PRb9G1y_D_j9E)rT~s#|20Jwt zA!%N+==wXXvzI8WyhfDa5dW}~5wL#nR!hhTg?q2i=^Bl~cKmJF17vYIXLFLG=?#>YLC zISz_<_+Qu)Ux^(p%kQ;laqC>7rx9Q6Z*G964Xx`mf3uTj9#zV7{VIq=p1oqGlJ$=| z_1L;C7V7b$htXaCFycPgAl8yv*7i%j?r!%N))CsOBOuH&F4dg?yxh#4crPC;ql#&0 z7rkm)WK~?G7~>Pw&oqa9CmjanQlk~c%GN4^(T}?@HB^xQ!;kPYaFc6uvXtwx9QTB1 zbo@nPZfB(iuTiJCB~JjW`SLSH+ZA(XCL9n11Unw6WE z-1p=<|K!RdCf?+zX=;( z^Xiqp^{@p&C@fI=J<8{&E&~xb@-T_%>%azy z?PKCGkVA%JJZP`JJnlmO#1Hbnkws44DxE}ju3o{??&i5VD#fE~1nph~0;bVV&GfF2Td;RW2npIyEwQKAM zs%(#Q+fZwo+Yi$1h_O7TaaP0rztCUSUCIDJ0Q&Ka6y?C_sOegBoVKjZN1FS=)c#@= zAuIrPG1Me8y4Hce!^msx<@>9*(`#YgZ~N}q)zLh*X-5hu|rJfLX%R<5drRq znqT$*jVjZsd)IxeMH`SQ1bQV;H_aVhd$;#1R{{2-y!k9wPvD4Rs!ci>^m z3ZDVezHPj&|0^%UbWZDZam$t&w9JN>bK9I{A3xWZ)KW;97j?dV`RB)v%> zM@A-ATS#rp-~e)W!nV+E|G@CGfA2Mi^#>pyyyklQztxwHG~Tnc)6w(6qpI9NTFQ>5 zt#X;uGM8atVu^`j{LvgV$Hl6T5objdpCw3 zm+-=A-(7-Vbt$ry<=mIh=kZ}`N$aL1@$ZS0$u^#kW{QIWM{K&R(vzH9Y;itF7IkPS z%v!;29x*))n(x7yyiC7`dzrAMs@~!EdWbzwCTiV|_Wp+W!0S>KU$IupQ$dupde8gE zrMsBbc@?#)vV1>o-&n~+7;$@h-`3U|x;PfrSJF^Zn`f8PWM9P)O*B9Fexm*)9AD2t z2(z9WS4HBx%98u0pg5@fUV$wBer%52HGm0j3M7a1?c|Fe0CnGU|Et39uFl$8#?tB; zGsA0u{RkkwG)Pi*HU;Nf6*9b4e)yz|V|xBj%+^nb&S2Dm<)3qZSzctGQwJ+}g93;5 zHTP}y{d}3<<<-!(zlz9d2On$GCRG;g7^6W6&wyY@M(=}DWCHpKREPLbOx8HX(Sl2s zXLqP4Q4iXLQF*~BH$TKxdtwDJ(#|yId7HsJY@v2dSOCr0eZgN>mH!C)--~@d*G9Iw z0~G762sfst6x7sAZEY(nD%Kyu+S}g}sDG@Ot}y&G?X1o+EoD%M(?tM8%OwrW!yteI z5DO6UI=2C$iBK$ZcsJLh%)bl#xVanchpuA**#H2bXVhgSqV>PH0E4r0Q?RH)v~eTN zy${^T*F&i`{3nLDw!;Z4D@`A#YWmMJpFxoeT7|DhE}RXJqNGHSD!Twu*%%cGgfhC) zXB7!vKsGplr=Rl4YIDILd-wjBCpd?&>#k9F)HAqvdM|w z@__9ongs?>RZ+T|yiE;0)zBHZI6ucJD9EteFxsEh~Kk|G1Zu(_t1ZCMxT| zvwe^yamuX&4XWS?-tJr%C-5dlqxvv(wP!5}v^YV013TBLzo_vR$JvXUgo;>xPGhum z(z`hpDsKK{*60v+;Al zvaBpZtRp#n?4l7J-?8od(?c%N3(7!LUiwn3xTJg^k7;gU4ozrTk&W?E2*km2p6|}< z8!%fPnNy|KchQ3vmyl~}jUIN^XyWhsm0fqffKT<$CC4MV@1-sy5KVg4MNwOnJQ7ZY z=79E(@kxx(jTlu`<-Wj(H-q?54Kp3if>=FfmIDm%OXr|0mzLDw!s?!d&c#8sKpAwT z2F6+cIL#^|-}j4>S$iK*JxG=NPglyXHn_3*8W#ccYKzwh2-yGwK(;@8UcY6nOtUAD zmVxKtX0e?S0yfCLGBps8Jk?|BnQnl0J1&*hDR->XsxNIJudPMh^Ls4JWjnO@Tc^ggpjcWrniG z+3N4O8ySn}d0bI&QS-Z-^~~E4G#MhQy%XwFo7oh+YoQ-r zvpuyVD=tY~J&aZC^1rqj!D&*C?2Wxp=rg_4xICteMv=Sb zIJ;2JdiK!F2L#OM^YqEg5MfzN))7ub#8Xg}uXY)~fWqx;RZQS`m(!I_$*`LCeI+e< z4S3ix5OZLnb1o z_?393c1PKrK~0m!9-|qWpo-Eq{Og{?uPU#QZZ(%0F>0Nmas|pS<@<)w zW=sfGlP-zI>dfiSya+MqW-2Hsv|%Adh2&rTr~m4uo}``v`+&e+S6={x^eND{Qd|3c zZ%;|(KXm}S15^vVYrGg?UD=E zR%fE<<&-$NnC?oM%Ey`Gv#0d!f8b1}q!zts`G}kJr_YPr0(beecG^N-J^_X{RKeNi?9J_AM z-d*aL>~o#oTk|xK?emXW?Jpe>1?N-0AD8#w0*v8Mx9d z1O?pS7$@Bquav>Dfx^6h60ifR^i>70*o?-PI7vMZ<<0-yLd6n_=|5!*q+aa)`PB#< zPu_$VK6{N1xMuJ}@AgZ0y{(;{(IbRyyq)(Mr=w)8|@N8aHVM|mfnrN{o6 z?{vn_o|ib}+zXlf ztBcnc+yt6Ml(ttbUAUm1z>20Y!yf z1h_ty>3@b=b0$Hkeesfs?R-BE3T*jvH!lZLj!Sg|GffE)V-cU5tzlevlV=}y*#G_4 z9*+e>@vjTwjOOD$ZsNzZ&+IKc4BVgWm6tbXd?>=g+`__XNHt{FixXU%+Lb9g 
zmB({7L@}E*=L;}_QG^xr6_$5n5uulGY-dat{VBWRZzZ6Hxw>^t&X4~Z`aezn8Tvq{ ztw-6~k`ggt;liq_sL7_b_I40JbhIv*kiGrw#Dok82#9H?gcM>!eT$ls>NJ(5$9p$6 zfI%opS~(;u_mgR7Zzy4}_96!gT+n-;;FJHzSMlb7+HT6v!0unE{#vL zNWRj$xnch|Jm8b-uQPAj#{G^vz{v*wX-FGxJCE}1MDf3Yl#_3Q%5*j;C~;xoKtDeL zXV${p^UKJ*JOg22FyOP>ng0Va_=W?57igkDh>pvAhRELD9{hAWfjMSBIyzdv*GE+G z#XPwrn|pML=rCFODK(U1TBf?vGeJf)j#@)U`q zz2waQ4KF-Q2e!KW{{$SVT@%#S)_&W(`u#V4bl-Zc82?W^~Q#S-w&q+chfdB7eIF!R)# z?fdXHy@@MZk+TwYaWZEO2A-|pwL@&ECem`$=lniEY!LqnRSfZYE# zxC_K-CIB^FQSrBDG(;SKHYRZW3*X9ziHweZ#TR}qZ+7^NqPR+TqqoqlaBN@1pPjrN z%CPa)iVru{^sShcxN(h_M)#rq5HQml%HiRCb6N}=#K$HqVusgo$+gSB@GgWjND$D^ z)SKsQ>CF=W`AJ;%kuaai7Gv7$v6BaS=0_yqT`;Wkyn^#FZCX^FsQCKF{!wW=l?9w_ zN*_${)~*#FlfXmhb)GYy{ibkcO6(B~$npG7J-OC?O*Dm{;nwPCux0^}ZZ*XV_V)G+ zjY#bD?>6hLyA}xX!d;KfTJaSIzC5gTbgZoxHj%!;GAG1u7=QT4i3SEkMr2)-Mu%BW zWtIPY+dh+RqqvTWI!D3n5PhfoXA9jMamuqPNCogS+iQ{DVlMW{V?hR&Sx;RVkc7b#_@Gb9w5mwIvGWjJFTY2Zh!rGs8&pvktVFc|7yoS zU+M8Tu8XNBKm+U_IwDI`e|&ly)qWd+ar`>Lap{MByHsc&64#})FHCUkdd_g}G+$)2 zh=`1ij)CrWTiH_apYW-u>X`AFW`e}4Gz~$R%Qma%pY|~l8#E)}s8{j7!OX%0vzfQ% zqThw*73lo{aP8d|+TIsT!l{+pAW$4?ZZ@B1eqIH2dxIw0JUneWEg?Q7b9UizXz&+rME$j$zo%jQZoI8F9j)4vC)hf#wX|(rIjVaK zS&+e#_8HWzI(RLHC zI>+WsqIV)p(@CtMQzB(dUEQrp)hL_Dh;yqBpMEc;q)xG|G$x*o2skoBcN(A6#w zUr+UtZrL?je^Wk&QkT(c`wGXdRVSW)oKGK>Nyp*DA3m^rFjy9{hPWPL5`VCg3=QDZS%fb>2 z8#m?!JjkEz5a@(<=Uq!8nppIn z&d&$CJIAUItO3m`N90*Nzs_6Da!d#FOD9D}VW{zKWRoCpwuf_hPxP%%E?Moc(uIe< zgNi!6Z#)aKX0_hYl|vFL^|yKL8BW~MW1y|*95t-?v=S6<&#p#E+1;(D*X5CpAanXi zdMXjgmf@bJ(EEA&9V+h)ZR(G@-dBUZ_tEk6-YmJ4Or0Gk;= zu>Xu^)->w#D8|SBVH2AOYW5*R`L#15~#J3Mh%_VpB6Q*BWR^A*A`9mXnV1PjsK;}H`yDK#vW@n_|1`x#RB?c~!)%9%ky6|#$OV3?piqPi zp;Cv`*~xpq0Cd$jb$7|}aKZeA3$J8(*xMSYg-usjG2gPY+H86)|LG%1CZ02__s)k4?Q#gEFfwCy-Z4rF zQTDTsQz&k$ac9yZoFx=J=X!Z#6lh8HMk|ZKQkz%Nh17wkIS26zaYcrmf)7@ z|IDBXO7m}V+JF4EBwgyZ%0>KTsiKLo2#mMNvds7!woCsRt#asOZ%L_jO}AJRqXq_N%Z(MKel7 z#Uinx_bO&RleP`WCqxM&y6pdOM*KzY`Bas=0No2~F756lVK{HeDGB=0Ma~3-kWDyMq*U5r$Y*1@k2s0w5#)8zct6_Pn?Hp&a z1dy<`NP25@zHE`je)$fK}bnG)|ZIT zgyoHuo)0UydcH%@irn81zE*$E3?b5%!TJsLY}a;MZpIYNmO9)6J(68iD*&4A)G`B# zR8*Dv-7+B~Puvv3XHB-*WY@PK0R&^%XDtozOkx5sAJIYhdb7}nxq z$}<~eZVEa%oi|Vzp|8jz6TdB9`ZVQ_t_=H@v!TB~0P%bf{WcIze#nm?nHDq0=_Nqe zk;MIABx+#(3*HPMGG)mE&#A;@xO6-HsPzmBpt9a+^o>8K+C6Hb6o1l-wwlx5EXLfh z!CPcjH;j$$l`RJb0@2dYc;JD6$+bWcYY&u3-a`Q-SPQXpk1*^XIRgZ=xe5jAk|eL_*=1J|pcJDH+>FXa5W_EWf z5;RSG1tDy&k58I<<6qT{ zPw6X;cW_$he|EPU>ZQ%hon@0=X;Q44%XwOj&kjT}E34E!aUEKdWhzc*Q#5lT=`k{` zUp5O**M8AYFFv0>-Rqzs3(Tdjix+1k>@x*7vxpLRh5&ub6tR)oT&EV=Ls61buQKUx zyv84X!cAdf)29^(gNt0Qu9x(`TXk=EBa~Xu>lD7LK!y(<6c^@t9!avx2p;5=9yI$9Gej~ZlrXMy0&AY zIcjvPu^&#i6XaA9f;}fIui+#SQ7gGSKeE)aB5Gx|*x@ER8s08g^mZRVXV{R!68F9~ z;MdT$F1g9+dWZ7aC$>JhYK4Hdi?U=7Td^G+F8&4}iG-h0V#uyviYw=|cn4ed&%P$| zE`O|_)OkO05QmMl+iBIHc>akQURZ3!Ms1y#-M?CGz7#NTtr+|4tNOD)GLzw>REKxp5vw2Ju6KG9I!VL=R>t*&|WRR8J3@H}2s- z14rRZakL+GXByakVVbp@tl=K+*ZiZ|!g5%wbY!Ztj*}C!q~PIzCW_6^dnq^B;o6Cn zUbBvlwzf96s_sPP%$o;ijIqAz9r!Fc>akZdk+j;Av$50j>(mkX)g}ohgJGHH)Ac~| zBO1qqk)Bo7i;Fftj)2X15QO~xzkU~2v4QA^JCaaZQGC%83c<;qcJ~{T&vS*-w6~g{ z@`iVhi+onA^VR?WSvam7R;5B*#$rE#mH?G$uS2uoZt&rJcWbx7$fyh4vIhkPmpl9W z6q5iZ?w6#YAF-6U-@)|d$Wr7xEQKe94n{P+%NrhS#dKG*%U3xTMJUiDzD?u=`TYoJ zHV0I962TgYGY14pr)%;qrb%n|oIZ7&&a=Ha$Y$73g9xVTDbS4NP}dLt+b&Z9h7*Z% zbIkSq-L`nXXKBL4f@GL^RX_f%W@LnVuzUD#rmv`^MK6C5}QXA0^(<=YZnx*v0~}#;l|uiQOtOL*vKJC>is)O z&MaQUxCneE!Bsr9rXyqQ}u#%ZW{LPyMgnFRbp3Qdh%@lj_5G_OFhQdK`;h_6P@>~Yj0aRiOnoftsz-~ls?e)AaTUXX#kRetj*F(J>MRPkF zBUj+hUvxoKnrZeBo1V$iMl8m>{H}M-Z20Fc&re5`=IdIs+d7`{it=_7kKISojd-QP z0*rs9Je|LdG?k5ht9aFiwNahu2?5f`>E*ZcIcYPif(-C+wiCR($L-V&EwQ4;VhJ1F 
z$jL{rB7RpDggu9=PBsAtmW;>S`myBVWVAcszElvo2uOhmH;_6mn0ht8CSerkCJFBH zLy!T!IvOsF#>TQH3|8J|0@35PzN@|b!bpa$w+#!~*Jn<^;})TBwvWd;C4%$WQP1GBZ4=eew{ zu%q)`=3g3&ldVRe_vuOcv)sE<39Vxi0Ka;oKrBnQ_xP$5QqAgiI88|Lr@dbRmVrrV zmvpU_#Rx(qow=Obex{^NNr^c5a(DzBSb~4H=+M1ypj1X?Bp(k9L?Qg)&dyo#RRgZi z*O!l7j@~L`%kGswb^Rn2WSXX+kHw;_lZty*w8BWQD~hF7rgT6J-vm3B*&wB@H8JM% ziQuo~EuDw2x$w?W7Fz_}%prq3V@6GT2kEQC?#LG2F|!6NC&(g+kuV2#vh5Wv-(FwV z^UMmSqE{|&+i)1UIA&&)!LW=Hk7DBA#Z?HKf-JPMXImRTe%a(NQ*84)bCqG;Je1>h zJ5T551q}q)q8yOex3aO;E1-?I<#-)l1MAs5@h8X-Llec3u7;v8T=wGby`;wx*Xy)h zNYb2nv7!(1P?c?k!1Rmj_)(_Gnq$mHo43pJM?EZ$O-X0uz~Bo!W);&ohv%6$`OXQ8 zJ){Ud-^1blcyD^rM)!SvPH=(EWsIaojJh;Xeppb|-u&xmoBLMEXp($1s~-_wAmQu* zpW1l)(|)iwY4z}FqQIUSr4gwWv6N1nOB$U=xhcj{sd5`ngdiK10?){MbX4A zhIHAzO>231_1h?}UXvSJD?Ghv4hrKk&olh%(!%O4YoPYmD72A(OJTK-hatDth2>=p z)DPQU@*C}bdK0pJ!SkYB`_a3-R$AU$z2z~TQWp}-_ zj#KwEX)uBSu&}qvk!NdfFpUfslEgb4E+QxxN7Yj0*4&7d6=g?@D7(S5fOCx+{|1E- zdu3_0zC>kt8-(gKw5P65=t6A8yN#wPksNqHE?PZD!#f03grSVsdXi;QuwWR#76iF( zkMC?sc6Z;_!t%&)9!aXTqD}!LvCq1fF+va=#GF}r!+WuUk@SW5AGKUQZ_s824Di1f z01fj2dXSN28FLhrc!&B5F~XXz`k9uyw4hxIHucpqNI}~33El0aJ_zJ+5&IijdFQ7|gE|T{RvM z0?X+i3P9C$NnL}@0(NC>l_5+dG@Q~4u9ek#tm zHe)TxlycDX7*zfg8f+6Of(FTw#iW#{gsxl>3I@SIsPx?)_*|^hR`ekz$)sd;BvfF0 zV`UOXF6e7$#I+$4c}at0ziMC9kOS?)AtJO(??u7LfO4*1RP z=Y0F#oi!HDI`1*~k6*JKl6XFZB;=n|u%ssKXYQLWakvz+lHH}Yb2?E4|MBy- zI(Nwx!tuGi@wq2jFt|{w7p_fNZD|ib|N9@2;{phY6a~vyH;d!SMzWvdCQv@w3H+YE ze^B2^K>!;#Vx;jCk)jijmfoZLq>*$*K2t%&zQ?&yA#Wj3R_99_pIs^0hGybOC`W;L z2w-vQQo`v9acW!Y<cH80q=dDf?0{y(>*15g0(^_uUaW6Fqy?n)ck7 zQRj2cJm6|ZVSDDiHjZgbqmXW2k2O11Rv7y1%zG0nVy$+w471mHvoDJx;Px#O&Vw^n zwAu;{1zAV{{auCv!4fhoWPu_Azz>f1L~$ae{L$<h<%#l8%-j=t>5FiMeg^ZRIs_Zy>V`>h9)KZ`@RT_Ml4s%w5&ghyVKlfti|Zp^YR9ghV$IV zz}Y+u-XxCN@ zd|X2QkCllrgdf7g(=N*>WI_%7FA#A5B!!|xON~GaNH)NO;!!>no%(e=P56R{@^(Ai zQtAzqbtDqTMC5sifh3!QiA!!?l`ufze7f;(WIYnl6={fj5%FJNR!#?f6Z zH7ui@&t%b~g2Hduh9O;`pc_aiZ9n%iT3S3l2I*9~;#Lrx(cxxD0 zD-^6PxEGp8#8Fl7JD;Le_ibF{>uap~3r4GLDK|re>FgbC+1@eQ@hgqt2&hgK6Pf;$lPl1E4MEzg`?q&0*-8igTolNWiE1UxKmy z_>7v;;vV7?@wEm9q&nZ=Or=i5e~Zh*+0z9T9MxOoPE+OhpNX~puL0_(C`|Bj?`EyR zp_L;6yEfg_TowaZob~1rnQ3pJUvbO3@TyDj7=-H~bUMfq0`}t)(#e z3^p=@%gBy6vbYjHK`s~0Cf1pQbMjhyQJ+u(O~tikvEcB~S*%LKeI*pH`*cgJKu3hQ z`!vUc^IBL6mm>%Wgf+DkksZI@0$sw-kIsO0lw%cT$Bp`JBbvfVB0DH5&CN^Zw7)(+ zJi>S!uCzMuDt%v5n&t8Kchr0gaN*55v=yVY)DT!?euzqXHDDl=pzu~<9{`OfM3F!- zYfr*vpZ+C?fPxC7)-~?)$mQ7jfkm3DY9{YUC_DS0d_!|{doux}g{=RxeJ5A_bMv1u ztS@E?9Vjs-sfLka*zch9C`#TPtJN~d>mi7L%&CWVXaLAe>`z3~HPeu|xskx))EIH8 zbO@!cqF<@J5X4OB}jFT42U{WQ}* z-ahK3;0<8xz`RG$>7u)BP-`YvbY6dN*Ox5n^nH5^WCivA!kcVAe_-8|SvG==R<p$o-z#O0rEG2s;AjqW)xiC-cwQmdg$KfZxJDCu=CaCbX8Y`ZQaJBk#6*ls<9%>;n+cD z?Y=`{yX9JrdvjdGZnC$naB)tscb{SUKENY3&>sUoVTCLidug8=T{iw0@mGz$?E!q9AJ#UVPY-mLL3Vt+W$E8Sh+ zlHQ&+*(vHO9Oo-KdkF>ZufKZ`>2f-s*Dzp|tX@m=Dm`l`nEikeVE`&~&QUQX_;sIo z8~~v1U#q7nzL{$Vhdz{|>+59eIAheJQsVvXEO5_XEl~bRS#|KbYgB-%rtd)$phnTl z<8q?O7y!_%wwK8O0I0nlP2+FCVcK@rDTTypCgR^3Z>P_Q z$<$tY(FKO1CIy92*LG>Z002>C`;E_ao#BJR`Vx>&3y zYv_qTxWCg!e!+WZC(MluOoM?`GhgT|)we|_mXcEaC{oave7ABHv*Ug+u(Z;1v(^NI z1Q?jqwe!OpH5QnoW>T6Mf>Hlxo<9*viYHI`*Gk0X0BVBlZ6ad6TPYr{Y}!%&s{fnW z59{sfK|f=C`RhKdlZUOLOM8=1E*$29di<{lzy2o9XD~2yq3^d{ADJhtS~y7B(^l{4 zvoJf=dkG=T_4r!igUy2jpUxN0aDIEEUL>Kf1?MTAp%KlyRZ_&LF$E>%Gsg=$VMZVY z1x5Gg(#i@)obl%{>Cd0(nVHfQiSPXAW)gS$i96FZ^Z`UR?|ej3TV^q%U(BBh;Dy_f zA;kQh+27PVz$l(Ra*qExu*XnD&`dwPWvcN3@`7|VH>!2aHw8JlB31VV11pvOYkyxK_88IF{PX1><-xmqfsBL#eZKtzN}C_Y8RjXbKY1GyW?F1y1xJCkxy108@!~W^GTyaNc4FT#uit+ z#qs~F>>cic-JP9}41tJ-#OMsH05(%`en|on{X?=C23&5f2!O_+U@gks?IP#@{l;FZ 
z7xkV$A64kM_955--HXZ$9vuZzNKt2z+u{33&X-OfyL`?<$s(!iL0Wo&FV=+dGt0Xc)V{1Cs%&wB<)B2m>*jW=9|%ihc!RT=2!2o=!ZsY zRwVslJTh_iadT3oRcS=zMDVG3bf1+7h06kR}5H5N=?906fgpImIO64 zm!+hWBDIikl2w;nXb%*z%I!)7elH#bi(Ce5*v&0NahWxJU@OFhkilHT!DRrv>uqi%BRLqeBvXQ{lqVT5fEn zNudc4|A>^#(D--{8w|NF>;|{MRp%{F2ntoSu>mW7sK!si;iD)JD@w|PTzgh_?cTg+UfUKfY@Z1TY?b&pxCJw zIHZD@vaXz^lnUeO+>kpCVmyM~ga?-1_Vdnc z7v%$eC#Bh$;ODk;6rq(A0<5j$XSP7XjRYy}R#n(YL<=;;v`7eQmJo-CfZ)28dcvF* zI$`!l_*)@4M-sCQl{L%fgp`q_Uja1eW&+}M_ge`pB1s4`xFr5~`O}e$$DEMHT+fc( zBco2CDW6U$s*l%mpW9UYgKOaW-&yY(%Y$J|ewEJ z7*cdm*Vc1=8v0kgn+RC1_Cr1XmY#ABj5gtk489%~Y>*DiXBDPqeg68$T&qJ2Q({ZZ z`j?lXF&c-o{e&)DLEgErV*qw9w_U!S@b<1 z^Vsa7E-86k*D@a~a~x!b1;wOx29F$PVbWSB&h9je#EKi38*S+J;;4&rJ3Px|s{WZy z-95838llpAElh;3@Z9(Q!46d&c-PoM zms=|Yisd&mbpKeq=*nc#mW#*I#!5lU%Osi}kdGB)AhneKc}$3bsAJMI3xm*r_?hh2 zU)W-sYeFvv)VJrKMq-|p)V12TQ)y0nf8|J$1|iL4{5dueVJzM4*!oNUat>%O z8g5_7aK3PWKn4Y&ZpT4K`LTu`Ag(pMX&)8Y-hoGdc^E%30_$f+G5|aBGpG`kG27m< zJf1AzHxlS`)4s2c&n?*2ol}0UOs6B^eJ+6Hie*-zi^O<-i_#)Nj?!htV|}c*TiY+Q z{5M!pntay5FU2&)tzSg5+oL){q|uK~Lk7CGmIm)u*ablyTUe~bYVIBa)-YWC))m1t zFn=mFR@&zR^j%;mfFjT2>NFPmZib(9WVs=r0HWBenw(&gV7+*BJY|J>Sqq}tqKri< zPGnQr=UqvvGTW;XVY(MAEQJY@ScaSedKCh~i`?~q+p<+01&8f_W{3rMKLX!OZ#21I z{HjVpI91>y&Z2Viq+;)4rK}E>*=KOcM;GI^)f}ogn0aN(I7(r_0C8js!?-`83!{ZV zb(P@?B7&$@u3&PEUZdJU$C~%0Bu{b%SkRynWY>Z)lgq>?~Rm^Bd&~RQ_@VXy#CTaLf!j-JW-`jp%_k9+PiD3W~7W%xK zl)N8uz@CNDv~XRgn;qE{BWNG8szwuU9Y({EI$kgK`CRTNNn0t+r|Xifqq^!3yiitc zZ(p(Zd7b+aC}%(bU1td6;&8I>g3>4Mq=8sf_B&MgvMKc$i#7_{*LWDuhP8JVKzu@)vR1) zj#DyEje8D&UnjSRK8d?xvKfIz$Yz!-pxjNT?-L(_LqtYf zZIn#Z-K{@TKtaUGuWg73$BRhE%7#^1cTzGxY{cOI)`jw1;esZyyfUbQ%*+fomQwAH zzHw-%3R#rE{#v>#D_h7CLj%OPUKL#;J0*m@1w2NUw2fHnw)^m-A}wyz*bpi^IZrSB z6$7?TlWcI>m(1ow)`8hj|74t!t;4%Mn2H>$BE&s2Dw!laP8^xCDvo?C3yYV{!~YVI zyHs<7RClQu>}3yVZ!=w}Rq5JsyG-0yu}8ZRac~ImZEB*TY2H2`=Y*H3X1EzvHZQvs zm$ElNaXr0?%Ke$4|4tTEvB9?yr6$1R^5hFVXmZ%}sH|U-pUau&T+wd03fiXlsW5PD z(@A(=o^osA4VJn?4=xtD)lSy&Nho6Y79)EosMoH54RX zjA^NVxL;YitJ0Y&sYtO#YN94)aoPk`Bv2(NM4%jYnMS8jO-LXq0?j^v4C5O(QULT% zjZCQ}bT8QHB1Vw)411A<{?%F4y6#Pe6mJ_;%``%PKwjcQ%vIZmn$GW}2#0irs3g{G zUeV?OOQU(GT6&Jmg{`lLwgaIBElElLIKf%Pd;tK=T?kIs^xPrqpup{L09QcL`r2XT zI)$3ylgoG32Eho{rm~|Ub#FsDW6~(eDFG%aQqys~Zt6l47Hu zaFKdL#mtgV)~WO7k)O6a^Mvg)OIqR#6gB5|3L?{Li5g!&ag~*unUOsnTUy4MAlexr zNi?SJ?+Ibg82}9d@_GhZqYI5lfZ!i3cA2bt@E`yK0*q9|oc7`Tl`Jla@UZsA!b}Em zfE6tDlkjB>9b+5`@W<63&iMYk2`eJ@*mhs-^)vxzRXrL_Z=FsedPE8O_lvvnWviN@ zlhz1hDqXKC-_MkX8}rS5o-b@cjK>!{8(CilsIj;I32E}`7UqE06w{dy3Jmc$Qf*Uo>2XioQ-?m};OKC=6r!V#!un$a#wT{%*8~YxN;XQ) zJ$q}!khS<=p}<4LmPEj9<&t(ftsZjkQS&)+GVoD&SwS?OOPDIFX_ zmP}ug%>?`bO{%<-IdmtV*rfmQ*UH|u%@u+ca)iuxXwGM4DA5yfnX2kNZ~ zf>MTw=d_e`GoZ7rF|s;MuIuYkKdqiM&+zRj51O-#DAx+U z_(a_~gR~bd`;}@Xtx{@x>c&)XsJndc9+pZuWdLDJ6fu@{9DJOaOELp_bqxF<0096N zT*Au;837ZawT-nY2%rwAI)hhwaYOPOS?%x~RpskK*ISC0B?Sc)ipw*+>~WcD+IF?( z+|3`EmnKdR202%^Nx$o@Lm3&?|NMO!L8~l3yl`g`wRp*Qk7Xt4OE|oz9V~Ww z?&wNTT4J&Y_zCd?1Y6j~|0wNmYO;u~jc7zdz^}Qa6is6D-3|#%5k3Bkj)#>)8C&J& z`oaT`{QfZh$^)y zNrt@_yjT|AIh<7x(Ho2!rUT<`PIp`aDFH1+mi!PM3vIdi>Gsbbl|#z=Kla3A%!sbI zUmum8E^XF5gndd`R~6&HKmD+*-IrBcgvL$0u52gcHqT3iN4{Q}8Yld;HJ_Wf*V7koz;sONRXLDP42JQF1MlPkr<0d4adTP?mW?i|`b?Hsa zn4^_H^i{{z*h$uv^tbj93i(gTATCE&W=9KX{v9?bq&Edt|4gwLG+n5OIBqm82sFS*eUx4w+9ryj6W*<9M&5xfo0O}_` zJ1Ufsocd7@xo=iQwJB|ngWj>L?f z_ya&*!`|7abik4-S~C%;bxBQbm9VF?$5YF(X2sc=h=+4zUV3qhh_VeYNq+uHrt9Dd z=V;#{?IXm{5Y@r{e(<6q)Kk_Mcl5MFR1Rpozo4F!ctwCjc{IwUoLB#o*V? 
ztW~l56JyHZWpi^R>d&aosJ7n6ZQl(OAgSw^WRQZ_~?IWW@Vqcg23 z$SYynkqFn;41*u2)Sqmz99x-BzCUk=F@X#eCoXb>Ay+XzdOK9s_0+Jnh6U&2QNm*2FNDP`(fUuyNMLfh|PJ$T6 zjVKj&;~&jbSj`|6C+lVYW0p}>RP?bx<$6HWzT_Hb%s#rz?yT(8b(4(#(}+8md5fX1@q-V(Qj8l_Sx= z?{b4ua&ufzrmE2N=ejhi&XYqAZweNIq8Ho)p_kpS?hbygk2ev30}LgZEL#ppma4VwfK_Vj+!^6-8%dJ32(;q5#^-Cbr6% zwK@1aD*AHXYkv_GOoO?M9>b5X&=*=N9?LtzEu6_+_AfW(f7YH9l{73Yr71Csq2!pW zvFb?_0}qf<+NNi)goyD0)S(38ECv7-{!TY(%RZ6P`jW?2a)@fL4OW$;J_A6}CI@nu z4iW>lPGNV}q#Ldwc0?lOXcUov4{Ia#(ZoCQ=yDD40s*KxuFNiaMC3$ae_qs9UE&(U z8)rt**@#b1jtl*erO<#4*n=x~9-ZW@R(sPs0@bka(jqO8Aps3Yx>R{?;f$iH3?8I4 z=d8w@e8pkVI;&*fKM8OmV!6l25& z5I{)>M~%Mf!Wis?Ta}FcSX_Am&vj4TzIt8v{7yo%dhY?%a z8h1KAyTex1!Ew8YK+A2{Tf{R2hptv45%+G9xU!;jC(~JXt&sD42?{=!A=2LQ?w&+K zmvP~(SoWIths}_=R2xagIA-Q+e7_x}_o66%T1Y=}%jKYWe2D1}U~}lh3-%kITZ%89 z?bxIiQr{36p8(v&}ENZvy z*tTukwrx%@nRsGjVw)4&wllG9+qU!izxUmG_kC5nyQ-_}oYURA_xbj>*0&auVTVg) z(UQ;tW~JauO<*XRE(rOk^u1<^^1o7$%^1C5s}M+3SBLAZI8T`4#pO=Hp8mn|3R#`7 z`~-M6V@bLLEe&MwM@w@9OxyS*TP!`;8C<PC{P z>Q!m%&M9S(3D!H8+VSf(7Ng>|KW%OgwMyUm>FRlWWdq$kbjJ4 zLv01nW5NLp3GIM)Y)6Et)LIXz6(GH0===2ajSeOmm~2**mD)+NFCNI`<_C{VPb2BP zPCCP*;Bwi2nLG|1Q`KyZ*>0ak=p?%zDenPk_vs!v2C|WgbhcI1j8Yy4g%CeX5JXf} zAw|Z^M?(r|0K#HkQer>{9pTN5jy7VADX$)d*WJlwp&0-X5fexOm)~f2ixIeHPL3)f zyjid|1PaUIFEjxGghe(?2##=%VCux4Vi0Q@p7z%}JYYXqUj(UwdA-^d&>q)F4^(%h z>zKrIYe}}&$=uOY)y6&X>&0^aMdV*_>DZQAxg2*4=>13h+jF*hF3NF}U#wLP1RoEhEiF zJDw!q#gBSf6!^f9Lncsc>W>V8r|_)_`AyHiQUEQ{s@)#y(jx z1Fsc+5f44*`>-D&^znNj2N%G@C$i`xOx~oSoD_#$RSp$xOXIwQQ5P0Wa4{m*nL{M| zLcq1FEj=W@aabWO1Y%2W2fgz@^HNVY10?oI``FydyFeZTCO31TEm&P_F7j8&_V6S( zbiw?nE?SfE)qLr}6m>fB;jV4f0(Q%_cFMly^eUTt;1@--mZ^l%*=1r!SX9$<_N~J5 zSB$DY_ayiJ?kR6Y)zYKZ*E4rz&9wnS$y@VJa><1ZCFD}LSVm*erIMYns9Q$3<#*|U z*#)VLNG@`Nu(IJbw#&1fuudMfA^WW8uA0Og?t~d1J<6!1g(yt+5dM>$rZ)lAw31e@j{Y1|Xh_bl z0GIl^4o7v_I#kVLR5F+I?`&^MfSo0+VszW1lww~DA4m=C`*`3Cn+LkF3-v8-dSvGI zVLKtDzYPc(9kTKfpsCC*sl%hcGHkulCIYKS3&l44d@9?Tj~cqXz>Ps|2-0*wiA|G1 zmruH@8DX;2TAx00gc4q3@d^GYRd~yaU_4% zo5x=V%Juvd2Sb_v%X)OqGz?6A01%J-Pt!}NxyA1-)C$g;paAQUaG%tq+4(KZBrPcO z>6^Okh6vQHe0~vxai0Wr>dM1c1kzp#?#w?4DVmzzO-$P*jTvIpzZ#7H!IS?U9>hjO zg#8ctM2<+Judk@e!fVlateK`iR&&Y=n(}&>^tlql7sO^O9P8)4pXq<>OovC{>9Wz_ zr{puxgZgIOH*x#@$$E|J7I=C!FtnF6uH{`&7A-S;4Iq-z!{SrcyV+gq=X<56B-t|z zxH1t?YiPWAhw17jGxxpYyVG}$&blQZ40))`1eJl6C}n#r4A$>>z_LE0A9q-bDt}2` z{`S;dK>-t^v2M|!)$Z%50lT+`Gt|k>y67A#Fs$IXj18g$+USPq@kXnjvNoU>VH7&R z$%Z-JCb|bn1XBdNkIIw4s7RmIXd%SDHH%EcDB3vB$-qqg1O=o8U%TayR^{03?#h>r z#|rqIk3!KFZ_g4A+S`BAusF3X2AOO7m^;NTd;b#A?)8#W%9!=cA-#LEBN{4c@p~Pu z#Id+4n^TvumUso)vt-li|NEJ`t{+gtZi_RQUk;D)nI?GMe+C4yZU`C!M7g9DN_c-)jsy6?Fy ztQ#q>M~ga`JawYvrrn#4fkKd6^E!XEUF;5St!ol-j1T-}KQWl+8zRy)>m5cVi?=Byg_wle)b1PjM`O&MLptXbGE71cF6IP-M=#g{QA zRx)TX7MHXH^D*euG~*XA{N-Ppv%^O!5afE??DqO%aYpa#sEgz`PtOW&QM4ABTmSqX zS{^O0$4D`SQaN`N{amNh8a#5BClK25*F<%{KiZx}h>p9Q!VjLNcR$Oy6kOk!C*FxI zy|SMTW6f@;M8|GBwv*B@2B3py``Uj9K^;>Rk9v{eF#`Z(?{}l`g{!Xn`dO{N%x9Ip zd2aJC#jIBw5zRG))?A?I-Pp5=5Y8j#YqC6V$|2QWG)eXv!936vI9`yU7lyc-Dj?XJ z>gt!qH>smHd zcmfcrYTKKFPx;sy4M0eZD46)zszQ|7^PV(o)*)jflkNy-5l-gYlYaiFH~b{rtUa4QvnX_}kF9LpOnb0Fe42gv>t= z)E;`}5RuzvFuJ+2NX>_0_@Zil`Mwa{+1c@?(EiM52rc;GQ$u9`UY)w{&DRPJ|B1B_O5%h2Fm^ZUjwE|7H*U z-^ZyMai9UBs3>UA4c>L&by&#%{ZRk?1l;`re-(7tVCf4wH9M~o0|SH2VICmm2KY8G zKx$zp`SUd608(Mg8>z`%{eeBf=DP!Ngwq$WF$RfWP{#cP{qLSPH#0ry#So<;?4y<$lFv^Z0c#m>Km6+W$L)$uEmF=$wGl zgJh_h%=FlvZ=7uSa&VMj2f?ei03l8>kwQw_bh#ey1E7B(4e$Ju!Sg3F{>Rnq>Ee>= zcE>$@*IP5W{s-N>M@Ng*avMANY7Q$g>XD)05o+LaA3qtT zO~=cHkMd2ZMQU>!wM!HRa*-4h@6B|>^Yu4ZT~!te%6zcYbteow%w30;7KK`k_^d{)KC2fIMCiL6W>U6Q)y8E8ML9@i1_a7u;W3p$D0e zfPmnU<>K@H+>fy?#Wodw^hsK-n@r{MZD?iwy5W2 
z0suZvX0pV}-o8ADGs=CKa>K@Rkf!K4G~(j1D0bTg;BjnVP%&(Oc6{zkB$giXh%J0e z{uY3Am)9iBgt?}7AhbSq${Vsfw!-_m{I`G7borY z7_Qqe!gLunKzYUP-moJ=t>LmND@ADUZ?{(-374?C#b>wphTp#U`luJ0l--t~4kU=b zO}F>^OTxDTp3v0(vspib^(EhN2^lJ*bti`qT(q2e64yg)xa)rfB3$*)2Q4?mnf&in zr!z5}V@f;wxs=8Mco_m)5oiFhRXX+!93E~35bFr5jG<)VV0ov@euogQ|Cj%W)-DG> z>2;n{!Zp~3gEMQ0ci5m*JE zAxk`G_a39T?^)*(5uMzhNfvH(9z#EW?XmNv?!cFF?t}MUwy5RY-7J-s7VVaR>^&&sUl+|uu&CcaMekf)p1-}`u7 zE7M&cT9ENs_0}1_bB0P84vEd+& zxSUUpCn9QZU<=A()g*H4b(;GiXhFq=!W`I#XMtoCkzRUQ^E+2i)}E<2@7PoaLfRuW ze5T)%Tq6Izo)Nub3z5x=@%`$T6_u@b51}$9+u9A~ad+F_-!kHApP!#PXdHk!n#h{x z<2KSKDCh^#SJOzV*Fj8}9{tz0z;|w(8_KHgi^1&oKrtOknaU=lns#CA#R$5?xo;Pnp z%!VHk=7E=agF*{@^t9deJeu0!L2aFYbq zWY$1cmDGgF6^hoC&Lr}&NsmPY1p=&)=j>OZ_<(3HX>4`J<~9*=9u#S*f6wPf(Z%IN zr-%K~WQOiF9#TUE^Uoihe)45OmE5qUE2SPsHA*%;__y7ic`SV*gA#S6>ux6ykJ^uV z-*Ks+taJM0Fa$nzFJ8|^Da)9{Mji2`+$OEmvV>;2Iyh$pY6Amfs@|Z2W#@EUXV56@ z1}vfD$A(t@EpJ$;7_{$cdLzcn*zl4%5)2*1!YnIS$K8JW@m~LCXaBfmUN$l;>AT}A z#@=*w{%nPY6`s{`cTR4Dj)+haxc1wy9Hgzr2vr40-?jhM zV5ti$P63DQ<&$sQK5v}x1TaFp+b(c#BzF-JjZCb5?-Dj1x3zY1!SBELHQkrr~M6~ zn;&G=(**viX7tNh&xp}c@a^Zka?w2vH2aTHHa%3g6Ce>nQ$YXv9Sjz%sls3MMwN7x zyl#UxEvZvM$&C}%WJzoA(*;-`PS$lHH7XA_KVHAogQ^xC#t@Jv5#hd+`)S9aqb5@0 zQz@1veW^C_Lmw{Azgf*>{;XMWT@|xxSJuP(twFKi$gsYDrHDBV(SuwQy!*_-TKRTi z!XSKk{kEpp&%<`)D?fTq7EibUoOYJRX<<~JwfxDe8Gw8^{<)6vEL{fUizvz%-Q&Fw zZ8+nzl|$zvt7svgSN-bn>NmJ}e#g zqOe4isqN*`9sLN}@;2ImNrNSz&9}wVdU2Q>oQO2c+31;0&KD$PH7s5M6KYt-lulswZ1s-s@Pbov-3xOlf7Wk*FfDff1EL~6Rn|&p}4Jk)@@6m%{-8ndD}hn1|z*| zKwZ^o3pF`NEj7_9D1-*4$M5kE1VH;M(Py0>SBsnnvJB}*A^-P=#Qnwmok{TO>Z4fF zJJ-*?Jgb5nJanB1t-(;rF>Ji>Jix}^AzYruksAFqIs+`Y*D4}DlLc9#?AL0=c76sE zWp&K!C|%>TiS&tKX29FKcq6kI@|Y#qx@B zzQf>*8kc#8_{GD7p%Ov-+)d(2*dR-E&$jQ`z5@Wju7=9>Hz!XOs+2Og$X`W`WeiMB z*vN3=(zse<9ZMXb#C9?!u&XY(4F#~Xs8e6L-H<5A@Q<97oN}U~<8H{*PpHP}yy{(L zB?$@OPfurXSZC7s*r?VPG9^`_Wt7G>=eczW#2;YpY0F*xFu!$3oGV@ald*d&G_9GFjrlXR7(E=1~MFX-3y)`jSuElq9k= zk&VBa)0#}@$f_DO^_4zuKtHjtzht$y5n=$4MZ}znDsyLCQ;t9Y7UWb1)TM~~R__); zeyvqsRJuD2aoQ+!c!SO6C;DT;dqiejl1St7)7DbP zo!l=A$+0PVOc%I%4)+6zoR0xo=7LVKBZC0t?tf35U=618V-C%lTKA-)ZK`x>iY6uZM1+ zWPP7@+_CZBgIF( zXoJ5{=;7~_eOK4Zz&el-b4Gw{!~(C~dhG~3&|JhE80y-MkJ}Ncy*t^LOlQ_%t9FaOO2SS)VKidxoon_A_4 zzFUQcjERs5OTtQRCTJ)1)*WxwHOuMAngtjXDvd{CV5LMzvkwhr{^eY=56mBT@GpOv zIrth15rqyduUnGR9l)v&$ui$ejNGaj!zU6Ln$+5u^pXn8Yqg;|)=!;&%0nN~p2hLfA>gZJpoQ%9 ztdrL~%dGmIG7oW+P&Y_TW>jS1u~)G z6{j3zSX_9Oq9DVa+T(q#wutq5p6yZ`jlO>2+8Hi@xT5MQMoC^?KI+-9N|F+x5r+oF zXMA76Lx5~^o=VT96%q>&2RYG(D^oIm>g+jCgu{C9gbV#dvZrGr;v&E(xKZgB5IP*` z_|w*KrcI1Xt)dNMwRl5b4U^?zNAF&Siwi%p@|T-_h>xeMP>Q|(uZZP+e9l`ITCQ_h zGx?FMWYVt^A4ldV%}}2kK|K1dD?4rH9So?WOSZ2p{$UZ}8++jdoCcS=nSQu03GVf; z6mnWdMtV)WR+s!q@?-bl#Q~7WZ)<7C#AV$c0Y8qS6osBBEC|Ea*r_&5e1fBH*@sn_ zn#`fChtTS@Tr4q71D6~{q)z^-fBT}!bbc>*Ay)>6R04=0D4Y^E!~X;YR+{K!ov*^7 zzJQq;Gv+@B5S1Tx8dA^7xY3euf&wa;W6NJ5S;ZaTzv24&%WFg5ix|EPSg2@rn~*nmD^P!2<~E#b{A;iQqLhi+6P#>0NndDB z7jxmOUagOY)89-Uc(}MD=&%4lOH8z+rn1fckL9tK+N3{2Z=_>Gyb!zbIV_4KKd$g$Jvei5jPQ+SlL7AOU8uWu$jgRfT7 zM#^ZWH7|V};qr6~>PTk)fKvcJT>I1>lma6zR)r0k>9-9;lqz$dIU zz3L|+13yvV+VPLNPRwSjs^vmlE2}j9PJTeIJRgsU(y(#au@=yWQ-wukceeTuJ8m@Q zQG-av#N==!pw`ijk)4J0zrtYe3*}n}Oawd_f>u@*?-KnT^_ls`ya{?eDZ_HC&nA3W zN_7xuw7qWwNcb37rCe+)T4O6lFlkr?8Sm;E8JO|Tmuo)!dh9RjojOAnENGg7q*Ej?~HQQc_01wh?&l=!1S}Nlk({%s7Yx| zw<%oF?v1p9QII)wf8dS>1%lo(G}297k8qbl0$vb91rb0A_#Gh!&0LRNMU@ByiglV_ zDo6u$_TL6QkA7$*FY52{Qv)ha>uw>nLhsb*ri5xPBuy{(?;P)cYiX(P0y^I~Wluq* z$}_$Ufou7ZnSf~_KAFk96#NLq;36q%J~Z=hfVjSDWp!a8Pe>nj(IL=|h`QJ4@3>g$ z**IcN?v_piSAC)7yR8b1gj7<5FbtwUm@#(<7(ixi$|J}8NKoo|*(JJWZ!z7ig?9r2 z5HUBV$IHYT!BW{;t1hh$9;}3&+m|oOK`|)5D5I8iC>{&B0tS#R&VBoOhGNX|h#fbq 
zVO-#f{sU%cq!sISnoy+};4|i~V$boDog>sWANJ;KKzt|&CWSv|o-uh}n30Zdy0g)2 zb~5FCWW>sPy&ZBWnuCkuP}LX3xwL<}Dvz`s5=@BS zPXlfSpCLbi45(p98-lc-^>>c0c9B03f4LCPxZC&Da7kJ1;qATIx=kA=Fy7B?`a$iK zp@9ee^V-R#O^=_Op;d4#%7B#i!DREq*d# zhii7&ce8}Ar-f2=4*t$b{Ca0?G&HRa?(ag#Vsw*54-{Vq#f(KW83!R<0|wp^=^9QG zQ!`%#eos$G2h)i=E>-+;DbRb zzxap#>}w+#KFG}!><@xzvagr6=nUo+wCfnw~Z!-c0b#NUGWz8jYZW4qLYLOyU zQlUEUGi?TyFXR*jQyb;y;kln{Jwlgc1U5Q_3eA;cBd9a1WYmxmactEnexPG$a4g1s ztmdl{rCuVPuJ{2v7qwu}WAWh3CpS$qOXwt*Z7mq!GCQc`|1bdZR~l(Gouh#P8b zY??a~{5joX*}k3?2bF`N>(wH$?WXy6vHR}4v6b8b5yVIZ^0wB)$`T#r?BO1Bl8<)9 zP^?-;J8m4KrD7pT>CObJ{bt^EjIH}A6A(5&%0K-k`DNS?7z>%qsO%<1I0ytV;np~t7@e$@pQ-LhU3)}?B=IU~Iw zwMyaBJn-;sxJt0gCBfp63fjD$#26GXK$uh4m{{7xf@kCNF$(gYENn;FRDg#uNXl^g z*9;Rq2+Q`Sz?sC{tko+ouOTi@Qq_K|Is+MXxxp$5q6kET`MV~3RQGMGOIp~N_8OuL zAFE`|Y5Q+eDf^VND#LL>zl(zHtP>Eq%;Qs;`#72p#}G=|g}-N3QICHy|Ew}*7n4d$ zdK^gc^6JXf)urdKBLgnvoW;AZ9?HbZs6&HB$H+Q{MqC-q0T+oid_KL6WwaL-6UZnP z8gdw>)B1|kQ?0(N+Kyki9ZN=I{pMB6wg$}l0bnFB*3mOZp3iS+Lqlh{wXxUtgYcD$ z!K2WjNp-=H3Gdy|=hdh%(4Io2t%J>APeh3JuHzq^t8m;IUn%JXk`)cQ>@L_sC2{Tj z;6)cNNrX5yXb=1=>lF=AxLu9$uR|6^mL(-CSK;NyakHE%sGRz<#bZ|k zN?=M2GiIrRoI6cTULORY*S^lZt~hF{9FHXazhaj#X3R|F^Hq<4p^)ZNlGiq?f{NON z-EaL4Md+PZV>>w&Tv%0Ii&kEqwihC*>JTkJ^;}OjKx-gGpvG4T&VHhhclY7woC8f! z2=1PbtLW7e&F@ufK7K_Cv+;k78J zye1=zJ~G)8c75k9dNNIg!A|8nmO##~Uab_QT}DfX6?o+6auY8uI{5uMuQE6RSmU(~ z42w**1TRv)^V8wkSO91MT%*rGkV6dm4@Q{#~|g87t@bdW^3!ks*49 z&Jq$dQJEL|m}}hN~5W?ntDA!L0w)?f1NQvg}*O1qXc^?;>94$e~Uvaq*jXBZtK(h@!DZ=HuoI#x zc7;u(0~z;%b%X5bRbKkH_GhUz)K~kfCtDC?2YjALz&2>!zwVgaTw@Ut5icbfa@1s5 zibOtM3*eR%fR2TwR(6@aLq&NA4b2O%2r}%Ha?vil=j%S-fO-kS-ue*bq<@zCo+)D@`2sYDe~q ze>$v+;q|*=!V2;C_s`Y(S6+TG)fQ{ltvKTYjev@b{J{}+ZHb2bzvDyEx85sr&;-Q4 zvGKu(9u?>Y^#9^tPX4i5thJ>j1r?Q>a!j_UzJ4dLvXFcK(PMe@t+16HP6BR=pP#`!wJgn#6>vao- z5lauwbNsCEmhJdFoZpr#y5^l9bn5|q^D`CypQC)IJ{(Eo<&{rxpP--2I5mt#d>9M{Q%a^9^rO=`|n9ZDSj)X$^JthetB zO{i?(V5X=FKi;AXb;%=9u0nh@a*Z?J21{_^5_)vTAF@3_-D}D(OLs`WEP$|EE&xKT zKuzJ?mc%sYJ9!SPEDIqq)Ru4rGj2zGKt8V3NHG8R@ zf+kvW;i8}I6hJOUQ8BsrYX^}Fd#Y_c#8mvSiQVMoGCfPlznI=uy7^UJ5OHCjqc6R$ zSs0mH_8hnjDY`}r8g#ManV%YkML=Jtljqq?2`gDtftCB7!~VgxiKltB`L#G%A>)FE zDZaZg_%_Ku%4Ollq~31W%6i2{$byUM)z6|Q4iCyT`}Z6WxRZ&|izGn%vRIGcLCTrH zs|!M*o9_~fb58vHG0uB8!<#j}gm+-i{I4qgA;vtI@JaF#^361gduy?HJKleWCt_YO zW|omISlL0t1G8dRgqhU?k$}eKZBlR9#AbsV#PBDblCSf`)*#4tz=b|PAKZ`gy$q3; z#bf+7B3wM8*M~k@ve_jDf;`--8#(Vz-C$*t8i-vYr-1C0_MR9ap1(-`^wtfpfBl?A zy$1>!VnRs$gi=0aVxpH*pW*dAtz#xcujm`u*|Qv~2Rxs3<+=&!;6${9gDztQ+dKPm zd!jO_E$nsviM57A88Ul9YuVFG=NvJOM~-RhIRry={%|>4uFNb6&9pD!a{j2eiWJ6e z-`S#P(lhllMoZFEdM9PUqyiXOU{nAvLnVb;(uNpZ0} zsbY1iO+%W$5zh$p5B*P&9qD^4wv}6oY4m9vqy5WaX-3jvy5hZl_Rz{mPoO9(Cl3Pv zbYzSom9+Jg;=uu>mF1|f5>`|ibLQ?%skZp*oTx)vP>`KkE!d1R>Re%A;lX$7y}3xX z;AKebR;AUJ+*YIM^ozDwE$J}zvurI})J|7#A_bLx@Q%u-Fi?t9x%_L)OWNDsL;n^; z@V@oRR_$8DyL$F=%a($w=jX>_f<@bnouYM0t#Q|?I@g)6c2ZEv_@WjT7Zw_iHYO&! 
zIJD#x6cjdp=^4)*n*L*j(^?H9_J>a6W+adoa$u~kU)^@Js^MnGYZKV+o3R;6UbFPV zvW{ETBhWG5&0am|fRn^tv5nHy?im+}NlUnbC)c2Q_Ijs;Pl>-AUe7-y(zUitiVUt+ zL>r1RY_t5gd&w@|>^u+iBuli$Cv0AFwOST`#6Pt%VrS4gA)C4yL0x-oKkKU&llga) z;Doi`YVTJ4sOC`YS1nWT)L=scax-H$gyQAlJHd?}9K&RB&WlZ(TZ2m^*PgmT>y&2L zs+a39$_KRJms#{|%a!i3mDJ7f$xfQA_<-F#;w9FUDZIi@{YvbPX-n?s>BP6L2g8!6 z2$_WWYkj{Iy~xEwsCTNsq$uTU1_ltko4YW9eaXo!N4lat3o$hHiILkzE(T7)yU{j` zXtwauwAs7)t=+m?l-Oy$blEt7w5%dhcJ_vEFBv%_$=q>TVE66Zd~5;gUjJ2lR00Ku zoy!w<`|%jqHu72noptn}Z!Ol`3tUY+*L$0bJl`#%wN_=z9Q)#^>Z8KG6GN% zyv(DSfYsD+p-6yGx63J3HX)0sgr91ODpoo1ekWxkSTQTA9yAGFIObIy><~+W3OOHtP$sFi#dCYmkv)OKOM*bLb`l< zD`^xG_ENp`k;C?(0~X`grQS!M@uFMMx+JxK^TzAD9xI~i$;d@bmYt> zA%oKQNYjZLabzj_ykJjHN2DDvF;bxSdI};gE4nB@#!o5jJG1K>wq0K#s9vB9zf(!% zU>4rm+e(%a6%WC>);(>n@xM-AShr5zmlc#!Q-w*o&xOwH0>~IZ0ZCa1+U8| z_S9>6lyWH1z-jW?rXHPmRDP>v`J%t>^yg6GlLg+zmQWQDETFU%AH2GSi=a!uVJ3u4 zF(6y6Qhu0doS@sYnGy%!PaZMItAuAOu*$LR_>4P${TxzX&p4TNf~!Sv%==^M6^r?I z^?6DjONLa&ifxSATD?mYHY|V~rwXLTpycm*Wpt83{cms~Q5Azud%@UgR_lgQC{~^U zzkcrK>G4RmUnNYPEx#SHH|#6t(T^?*knp5(O8cZE-*$4nkKUPebNR#(5OV(3mRzU5}L8^+RlG%BMHp`f~FYiCM80i?go@)8b-=6tzg z*b?@(5!AK5?D}v}n48%qkPzH)vB`Mno;m_y(1ZUuzTz;A^+1$WZiICx&0%xLt?0f# z=U%2!$OK*YpW>w4S06t*t+bd+UtqrP{%ojOP~h1q?O$z6`ZFNpKH=_jA)#x>2f8ou z=i6YSWy`_*ItaGKGrl=K} zg7_S2{rAmLupfBm-+!g=U$F#DnvLHpmVrPKKEGRF0f6_P(EXc`yieb2vQ$4dH-Na$ z3fJT`oC$Bm3ORXxDOa7Q@~&`jCU6tD$9Q&@;REj9K5JcWFX)4+BJE|MB&4@91q`2> z09Y7k;vj%aF3Zo84<{@%+Hi`?ZqU%NBpL6o=Y4B{HjhjhdSHCJN#=&5=2E%D=ycE5 zCFZIZ+2v1U!AlIjDx<8n+XjVVa375F?4l>@>2n7Ogh!(SCIW684f*&wd;$8yx8Txg zm&ag-)EWwSTpm$Kh~CJ3(0swg1&jglLd{Nur-(0Ay?Y+oBaWUVa-bOQxsT|F1*Ne> zJfSZI?~v@}AO`?YDAHU&PP%zzmbA#CD$v=m5Y^NQ@YldTy2{ef)O+JUk_Ze+g7(NQ zW3gXBh=rFVAUmIJSlBiW{`T)&NgW=x=j6s0>XE-YjbE?9<2s0kjAgHYhJ6hHc@j+0 z{ApY)FG&%KgcXUPXczaB$(XE@_vhd1E2BK0^6D@2dvgBg*g@D#v7>hRigrhqdQ@c* z2NpsZtEu$W6wUh@jp+4`wX~L3x6;zrVg#$*k}g5xmDIu@)P9`#WC-qH^%rzOxMZS^%5#-ZNBqy7R?Sz5?`OG7kt{PI+kd{s5zOA94MsFEgG7T^#CS{^whak zOppAOj~@C*!U8UYmKYSWKhL}{B}^ZrZ#wqwo6D>Hudeh;O)JG|WBKci1J@&ED+O~1 z6Ui9q7PBnLJGl=A(rxAu#v46!O>8fmqIQKv)NyjGeW9 zKiwKZ)(@hbt%CdUNXqSdkLjT!m9{pny#!K$Eb&i8e)sK|#UDI|RDoukf?+Ww204qr z?+cAUSSqhs2^uvOqb34_y3z4=vav3XQn&3Tg{XdOh{{2Gx?XGN5OrCPdpQZJ zpBbP&VRt$4hF4yUW34OCUaf8Sk*wgl zlVBoYS{!2OqXun$!QU`r!o^)Hs_zvInlFHS5ZH!FNDK^!OA^81g@XZz&7@)WB>+3? zo8a7$gP`Cd|AAt2dcKA%v;D*jHK)jG9?TgAs3SeP{|E1;&ZLH>^a!yGN9knug#>Il<0%2VKkG033K2K_I zPrl#(3=Je4&1lg8?&_F?>*SBdk7Gfy`2NW*KFb=Zn-{pem=!t@ z&;fwNK~kj5x>?fugUkip9s5!AaX4O&!rPo{L<2TItD;H2cQvV^0cQL(gy_ZCI%e=s z2lbU-+}M%9ji)`gV-HFCR2&Kdii1&M3+=Skt@u(t3>*$^S}N%Xy)?9x3w{^LBd5{k zl>56MzYGvkm&OX-UCg*aFp?-J-0rbSo*qx*BMa>9b&=i6c_=E0*>@%bPoBnAKv} zt?{YO?R?JQFSWgRjITG3WQuGTK(eQXi7f0@%#&!&9ikLLsd6d5yb)+&3v|xYB%tl9_?>ObZq!f+i83vE^@Sd>+w{3*@+Bm}w^-&RCy=EJC#P$t&jxs05qWoP z&kU3X8Rov%2&qqZWb4hV6Pu_vZeU~EcMMrVkfb*4uPYOG-znKLl&-cP9mW_@tO;?< zOrVe4#Qdyd5rrEGq>wq~)Kma8Kr1;p1WtNU4{8|6o@##b;UR$zg60tQdlYcfr3^|Lu03$ z!>Ul!vocy+2Q=xXDg99zyO&>frEiLG7MN4W{JZ!3{LD9v^fk+vXnyXzYB+k=i^#JR z2<--e4;{!X@#m;crvjl})1)45F2AeCl*tLRXauPv2SS4bVLD@(=}uRpu6K$Nzuzw* zn@y8!+sZo<^EnZHp!HbVY3WX7g-{pgz((${?%CeVgGyZgqe-07q~t74jl~n0v9;od z?lsFC>4?|n+ZGbOds`tg+dAv(HO7>N2G@w)XP#!$lmrGL=MiEm>&uJg_BPu;cln@%gtPR|eK(b-w$qrLIW>}-wAE%! 
zb=L{@X!Ka00g&b`h~wP!YQ9hiJKfvX(?$V4g!%|L+2BEOs7w;@{13K zBmI<0@vwa2)(a@4g|_?;$t|XmmIT6;7Q%HMWv~>)A2rq1&qoIA8?tb*b_ut-Jya(( z_T^MB73zb*{6wtg)iKE7T`l!k4SOr^TUHd zk;d45Z-DZ32aQY+ExZ2yX+QuS5$+gwbD@mOX|)QkA;LOV!8|r-qp zhgtd?^taX8*)h0e5(iP54;rT8-<{kf)W6zb{w~zUE+*RbdqJnP>fo&y(TzQHT9MEv zHH%JgK%x)Q)nfEo_oIlS6NbhRI5~WF(}sLRq@LXYJY7s-D;K@Eu>Iw&M5U8;%y6?y zA#^xxk)Xry6-zK7601(Hw?yo5W2OnChzLXz)v9dK44V}y(yEoU<@pIPbgyEnBYrny z`;k`V8LsqNj)NMYu*xXg4w*e>7B18a9p|Ojs5ZPIEYpR{Q4fOD#@>$NOG@f9sXDRv zza{NR*NxH*QLCyX)(t!v(d2yRGR&!!F9Z|v#yLjp?G3WFu1yXa%`w zg@4!VTsCSVCetJi1PJth8_!d`m@7L}WQ7j zGc&{(Gh@um3^B)S$IQ$WGsw)$%*@Qp%*-&#zt6sN_L)0Vw`#h|C6_Igq$TxQ-Tn0Q zegy*iC4VNDqb2d;9ytP-jb#+ZLp|1~L{NNrVYx6S#swFfon_RunfCT4f(TJPIo{OW z*l==Juf|N!)^+n24i_B0ps&7OzCUZBP!Nxv&&dVJHds6V=6hOE5EmZj3XI7+1K^{g zVX{uwehfIdbUcFjTFr!!SbCQ1N!A(J&o_$pf~wQ^^-Z;<1|u%L0e=R_K5!0vS%0GB z9CjP%v@u!u2}M0&7bJO_X^BWo?BR$IX1C;7(dHJgoUau4>+~`-cTJt@@ZvWkr~<5e zF^UI-Ge)BuXOW&M25aiGg-b76%DIz1Zc55?fF(LSefjxqCXFp3!tMJ+dWQT+ebk=ORnJ(oE0_b)se_ z&_Z<@$|h|H@FOL`pJfW60G*s)k?61Oc;fVIM(KB#${c#I+jSo;kbPby?^eGEdu^Ga zBhpie3ux#Cs3l`=T8CVKWAL9{zJgYZ4&){&S9hTJW z$^j5++Oh;1v@MQ#RIFPwqQLmISDUnDg;h49!d;qG$&Na!G99(($6@RpvJjV#q@(V* zj1%0ZpXOKiI4srD(yt@qF)%h7Lx){afLi_T$uV=zD&LM5p%5*e`!W{?gWSH1WOUG! zL^S;-mA4I3-Rv=l9i)%t%~C>+%VoE!x%mPwqjR=2jx&I;i^=RcGJaId_ptnCp@W+1cLw`vupgK^#IOC-3x_6YTxbfu5IR_P?@GSY5)@(kUHG7* zv4&6Fw6Ei=HS0(FJBup{GI8g$6yasN&04p(ILyi3z3~N9mDF;8gevPr;`PJd!(13N z@;)`gcOhe=_m;n{e`%ptI6OWq0#<lWC9Im^@!JYfpP8q*ZjKqdi z>RK*5)IB$`k2yVz3GVR8O+F}4zv=52gnX&8z1sDxH%~h_x*-82AV%`xoqHLzfm~M6 zM8fS+vki#&N(apM@sR`V8}h*C+?$nE+n^F6+bYk2(d)>I*0*zO`gLNE-+IPVQd$b) zp&zeLOv$S!tjS?vVHl{WxAxy)n#Tzy2@eJh^kVe~=-NKcXef!<_a9^NM8-&@TCbP# z<>`>u)>x+V3_omLTt@F#t6`Gcs9T>09zLkbPO##2onOp8Zb;Hd+!mUYtM|z>&gO6m z(p;9DWIev}*ysjL{!HAe=icS|=6M|&q|J3UJ+=TOVqP=?fy-BP82Eb!&UwNRp=^aB zyr53@NC)Z;`6%SXr^<$d@@vRP?gLB^<_Kdd> zo2<5+GiAN z9_`?c!4F{TU*5=rSW;chb->&Ff8TZCaVQ>v=Yz)*{J%F}-ac6?A%iM403j{y8%Egw zAg2X~PdHPxwY4#^u~=XFCqkba{>Ik_P=ry1xJQ_9x%%DrO7}x<7EJ{cr8nQ?epCLF!MQ z8x5Y|@lp2w;>fQY!D7HnYr&~CP�@&j%aSsyT?r>KwoS%9+=F<0UMpqdr>|iq=I) zM20N7T=`jdZ)WC+tKVBG@LwvRY5nY9wyKSpD)9LbTJl@W#HP>irWn2zSq2R zD0fy0^N<~vj(-{p*ipc2JGkn@(W z4HLy{wcw0MrfwcHJ|qv z&RUmL9KhxAEdrV>5ky6`?EkzZd{TFti47{5V@E|_&ridoUM+Q2a-VLx1l~Z|99v!8 z{Z=3cqvCi|kkZ%J7v$!-9UA)~DQ%R(++%u%ZEKleQPZ31s_8*OFBNa7`U$2ZrRUr{ zs^^eF-sfSWlz?nvNrvIu^9p#1a&%IX*hhZOhNvAn>=nb7=ecMCW{6$#c@`OIJrR+6 zRD6A`$zg!$8}=tbvVjb^`=#uGm1&X1Wb5`VkZNWUMW7Ibb4!PwIGuUBUw^kHQPQ z7Ck8HbTdfFnHL*OccPc-uZ)pA>yn|@AV52u3DO;zkxb95k&53FL~o&~KXNI^4p~7T z()Lce4{|hf2e5?#Vc+Vm5SryN_R3=CZjT2dXusNn3F(21WUO?*aZbijXE6XjCX+(I z{|vz;O*5Z@VolcnreZwy&r4Ji&p8IMtoWPh%{nR3X2 zQwHpd@7ws-KY^ob4GP4DJ(xGhiik!f)~n1S;Gg)N9TrlTi=*uT>Km^48XV&pK4V zem!z7k$XW=dHWIjfL%o#TgI9N@nC;=lY@nHBM#KEn#9M?+QQk!Hqiq4bgN|M?q1{b z=vo2uC>qs+M}`m)H4ZDgTZfeI8V9ORC`HYamMq3uCDyD~V{K3`JUFA%j>#vWTtQ5l z##rD%`y{`psOZ8d2m8{QQ1Gq0D779WlIKod7^;D+u0)}k6^X?f0apnLXvCE+(B5H~ zd$dT)05N}>M77}bQq>_r0ptT!!;GWf8U2Fu&#^f9_Cdz-DnYkq=gV{O!;PnKl=Q&z z_{c3k{5wGlhl24^WHN;O&y9`jY7|6V;5c5Zb>v#DqUosa2(~_k3l&56}X3_gHO08p{d2^OVl1X!*z*n+CHqkn{ z{6YZ!9t2S3%$T^NbS@{#`mozYrfLHAUKB$-i)DQhMe&jt9lH=7W``qp$YZK!Jm@Ku z83BNxp>x6pj^C68b>@#_KgKc6pC6%l+cynKL%{(yl;I9dD41wy&L$6PPx*PDV&aNs zj-Y**H75OH!yF*Rr(@r5mJGR4IsWwsasfk!3W`wiIO;4jzQ+>-I3%vRd%m3mbtZGR36JAs-sq|Ix;o-}$nqu0y(LCu7t`mIO> z8k}AxR?x?7vJr0~eL5RtRW(Xscl&`(WhPrw%KOBWgssC&QLBh~qL11u%-NbV;4m4e z7pni?{%X~O@c$ow)eQRI{_1hV6gw3umC`*!!=EmYz#3j1s^KW@qhGl;;qlQ=^5C13 z198E=Vyao%bjhBIzvbpKW`kUtCnI-Nbnq6%RmQ$2og@0FV?T?+4-o6gzgV}}AfTNP zdn+vDj05M7S(qyg4H0IpBb5WJ>kYr@6d27z*nC^GZrR7dM?3a#ZEFAioI~r7<7c!_ 
zYK*A0Z}@GM6P|{i`LQjf$#cgM>i=C}ZQf_8WO;Rdv1sfS1A;7Pp_izX!WhS+N+)A+ zyqagb*SR-0?ivPsNY7=Yb}~Za#@$8}IU%J%KSk>BKJiIsg`KUX9;@c)05Ck(lprz%CLGtdK1%h-`({D#q^_V>D)+0YZRny z61cqS2GV?!dpYf2Q-TUpPS}!tAtWaeH{RIz{cAjlY_rXvLsmmztYU#UtBZHc3BM_g zbsbW?iyw$~x|N}*ee>;fy6vGEom1L8+prJIJg{Kno}M^vu|!}n?dFnHEw4SM` zeJ)ijMKHYY9s_1iNoOdf6jnJ5eT9^O%x!~uD^=MQj;U^AA81C0neZ-Mf>OK-T&?@- z;qIXz1DyYj-1suSU*elKPb>uDT#)5kA~f&T@zaehFLwKo(DfHA|2HOVF(^`LqSN@78Na@? zviq;>@TzD`xXv=u(&!&yufut5(+*XCHlC|UM9OZrk}r;PM`mL1g^jXCw9+Z4Mb)_) z+jo&pWYTuNEML_n#+-lwQpOB6IDAsnlt*UU|3$ zZM}u52zC8v7&47#_a%mzwl8ijudGy1lY$fo@c%R-G4c;cP3j_}8I1q{D#l}s-qO=< zO<~EAi5T>B*5cXlUtU0$RqM5iwzL96^&_e%w4Abnv|$!|Ue31X?$+q}8>@vSE}1{p ziYqW3%ah0EVHTovH(65Jh7@zxNF$P7UAVXhzh;-wU5Zxgz*Bp(~Owx z1<%z#Bq1|Ov5h$1#L|~!C=8Af>u2NB&Nd2zdAz8^J?7wY<_Fv>qd=1Q?ATo)Xxf9F zLT&UZpuR&J?Egq-Wvjf-=6;*TVt=hA0u9Khc&3aJ#n(fY=Fw-gHV(Ni(pT9r{6iJ& zCMK%1-KljQi$?;xuDJT6DnTDPo9ToCZRB{~7t(hDM}ynS(c4x>$o?=tS?#A^O?Tbp zRcLIo+@^~3|MaU8m*`uKu5J;ZVWM1p9xZmBqp=K7WwwgRrG3Q0%0d{^!_~0$iPyFY zPFc*+o0;5|Nok73v4s!4<5;?~C;HYe&;TJZWo;S~juIbBudrd$BKhLI#0}VRn1_}v zZjtsiJo;16Tq)>joutNkXDk!u2v{*N0vBq7%piaYkXwQ^7unNoO;r=_C2YefC4HLm zdjv^z>S7MJFjc#YH01mf=9Ld3pmaj-;j!kgSSY_u@`681GjOw8oAXwDJuHpOLm)$XpmY6pq^G1-wyw@aAe~ug;U@qVkc1U=-s@geG#(~7 zj1qmSGP^*rw>QN)<^0h}yRGPOcfh_e=zeeTw9MQPdxd>6&Y^jr!;90^2?js{Bf$yC z?Tq96KXqJTi2sM<`ljk=QIB}Asm_(Jq}%aT`%__NBV_~oMUdTRkq8aAAYkj%f+~^`t2Ze zs%cje_Rt)wprsj&2UMV;n&TWaXqOmCP?GO@a%Wgl#S53dq&ob zE%A^UtLBCZgguKr@zJS&S#s5mVKWAVg(+!jLjVlx=GL!5K<#S1WNz?`8FJd2LzU#n zo-<7*s*(9eAPtND?BuLw;*sw0aoBc+j;vBGL#zlG-~~j8t1>|ZmB4DAud8QAn4^Z3 z{%|!5STwg9ipN%!Posa>)vzCHWyOxZ^j-yfI#csf!=$X{j=yK-+8_pu$5Ao9{|^Z z{!z-*IbuKnIUj)!v|-*f-?2s}GC1+HWt#zA0H}6)6y{fs!`v~86_Gq&nEH!|Mzxcc zQ870Ibi*2zg@JxFO8kE`Zu2%VCT1OAsI2PVGCjtpjMfPl0C7@8%0lan@k(ll0U+_! z&gqCUCTAuI(3!(L;n0@*@LVUZ%nmr28*8)Ta!PluuDsu?NqRtACHsP#tFBf; zgC1M@-LmXygz58)xlaX|A`Rc}=vBX0JD=-J?~0(`#?HTpsR=7{}q7OZbZ5H`WVa6=L8NQ`LE0RACoNP_7q z+Be33%-5XN`3ZR5mFosUjmwoL<7Ms;E#u*20q8K!6a|rgnXl3(8-&@$^(ij3C{LI_ zCwlZ<*cOYmou?W>}tq<{zh$t3CAm3@NL6DCWbyx$W}?1Q!LVk>wPG zL$axs3Tc|fJO;~|e@2!z5TQGqykHljK$B^#G6z<%C00GKHZ8~7`yX!T`82I@cW(Q6 z@1}ph-_)T^jxR>CRlc6s)NOF8cz6jlxV!kOjyj zAtGlx;{;1N>AUI2O>0tUzHx>vVm;9N8n6#fc>=M? 
z`EcM;Bus=*{uJlZYRy@`S1t|j8tiB>c`mvg@3n9y%lR+GR^v%e=0rCxALZ$m)};zg z1ncs%0y5>1jk7O|=6H3VE*jP-_5%Gp6C#9A@2m##^K@9gkC(ERT1xC)c=Ye^U}*vd z!bDWnFtK3b&Q35STk)#qn3#RS&jxV`H?Y?^fDRL>FuA!zJRO0y8e?li@tOOTY=zLs zU$JuRfItIwz@OZ+)CYCrb31!4?S)4woirDxk{)e9CS_S`-VJdMW>4s6$p9f?xkrC6 zDjF&!MTJqNG-hFSncH99JfOY|SA5~+wRxTV(0F=xEa)+V$-a5#aOa#zw{%RpSP+hJ zr(z(YX!nbv7*HW2Whj;ObLfWSOo0Pox4QazR8kUs4HVl2(qy@+^UCA?LzDfe`kO1n zEDy+7Yaxj`M}mkwxB6C9`Kbl@Nsd7PQt9;EN?7=lP;-+X3BosIWC(eSE(p`cwTq}9 zf`|fgf(H*6r)TG*z=!>@)*{R1TMnY{4e+CMlls#?OO%;l_o7mG@slTk5K0&6R39~g>Aun?jl(}16;@8)+5 z4iF?ozPz#hQ7@~80v=@cOkCI)Sf(JY@VW!HzhI(=?DD}u6&eas=iu+RSRoSYe%tB( zqU`cP@@E-kY0}}gEa(A0x&U!APgDgarOks~0%Ig1fM|l4*vSMguh|oXK_q8{ly3g0iZ7u{OjN9@NrMfKE6wI z$yuagLM~7%>|C1SZ@+GwwLKuBf?*Jmi?RodB5bs*$VUQVgjkHrek*OiZ-G+E`GrxX znRCTqg2MAde7}c=QUMFa9iMmT-(=QPphMKJQS!gVAa~MM6m-2iGy2PJ47n&x^QvaF zwBBf2_Q`pN0yW{}4bt7i2rh?fAKmYg4NfRS$S?tS9P+TOC(g z#{;({~0D`(%7#WgJp@H`elYVJ;}ReHkr zXu?cxgP7m8^F4fr8#o!eywP(D&oSwsA!X;fv7znZkzJJ|+`>UWd6={Yt4ezKS4dRy z7P?s!(#jgLb01Eqo+o4>o80I~T#RlewS}K*6nvAQ6Lkc=)p^U0$?0EDj~wp-*xH4A ztx{z4oJD`-+%BSgnRe>kSbFFN1Qhw?sd|#%50ML|oOG*$9DhvpOJ=GkG2Bobd0ZwH zDxNr}xrM%Vq_$UmZOU)PK2Z_|EV@a|Dvr5?IrOmy3u50GoAXA5TigA_x#vw*$mX$o z7GoSH(mSnQaBH#OWoHlbm- z@LE;X40S3kACig1?WJ+cBJay=F__^RB_19mi`$4KqW|q{;c25;P=M4ibk8I9iJTF7f%!?^yX-Ue z?5i2B;t}xduY~4Deo;YC0wujgG}o$qpMFqNK;heeB1Y0^vprNnK#GRe9j+FbR>cjCg9&b1}M6z0zUIgj+n%-}BY zsJ%2kf|iY;utbs5mBfk`E=uCZ?&0V=3kve4IE+$e5662!k*68Wfp0w^A^2+?Hctu# zL;f7B^y9+pW9f0WztH@tAj{ifpd?FdWZ*zhT9S&BX%Dii!^jKvHN91nu!cPc&8$}x z9zhNQG$2!9e}o79@NYYZ`5>9W(KfGc!%hV-y7r(2b0|F3j@wr-PoYRjA}I(A zmOgB`jZDBeURP^0;8QA?&cHOt-Gv<~Kul!0xs}ZX-HnRny>j|S8;eZ z=K){;3Gn88#@hIfjM1^!3$P9tT?;b6I!!|v4_>+e5?^?7`CzxMH0~~{bYOT@19f5( zUwqYLWW0hvA%8auKDlx%kOQL9ef zIJ*BHHKNFZsrVtT3{rJpUykhWD)KTe;kLDZlaYVh-ra3BUpKKk5!b-+rKiVSom*FA zBM1VmQ4ofwROydq3ZOVb*Q9c?SViIS{XDS7xuJJXf46Y^&dfWg`ZDCX!9|8SyU7Q) z!YPCM^X!;G6o?20AoN8+=!eYE$h+I$mDpI!px^ep<)~}`TY2DCxN45&k7#Dk*ty?V zq^)6VgUv20TU)n|kB{Hp-o99G+u3^R+hP6S_VxAs0>zz?NiY))#Zl%cDkA}`CeFNl zc;?KW4m}w#sdslSxVk!Wky&_ut#9d&T9;`>j|aNBGA-7hN<^KTT4;|SWBe4==oXB z?+s5tpeu@iul|ABho3NZNN9WFUVVoh0YFJhOBOqHwl?GRVETj5%eTLEhe;~R$$ev^ zn;@nErJ_$q8?>djf#OwilYQKzfnMZSL7V2KtP{)w;Vy%lARF*cDE)o(D?i~MU0=}^ zHQt?fpK{kYD_(>nUO7J`nA`*JXjVw@C$6qg`U$qi)s?2l$l=So{H?Y$SQyA@<>J@} zdhv?$Yn_e%d`Paf0!CXGO>+a&l;5lQwICsqSckv5@=}@y!J0env_~cs5t3NcR97@Y z+Z&H13?w+{T_OE*^NfX;8r*Nl2nLv*p5{p+`|tmIM{V;WP6-SYF*e?py`Itx4hwte z(h}gD=*aK#l3rg2Pq3Q+ZENU&U|HPo!9KmuLVp zTua&`fezZgp7)=dC$gWp@IL(SGLr(ID`uDoCL@*V| zuSfjhgAT7Hdc#2&X{)tk>itM1inZq^v8KaYRyC90i%Xm_<~ti@map%k)AO(KS`;bc zyR8#|6lf1DWqhMQl-vM;V536dPz;a&uoi7m*K)=)^712(m)SSCRh> zF^;By&0%l7#3bQ%{|i3nr2f01L)Qk%GLi_*0~gDWYWjuL_1MJ1Hlp4Q7bVeV5zvej z_Y0|O<(H=20qb^D8EI+B$i`Ne*PrA0BSNMI1w0EqMdG65pdEW?Q`(M)R=Bi0WJH^j zUoGdg*cT>;G|NS&4r>P`amtL;bHED=vrdYTB+RH-Ub~?rkLJJT{Ej^MbxX3Nb|Ck) zZZ)Uc4mb#Be^hC+$X~yTW;-vC;Nij&&(Yu`dzywOJr_PQ%SL1tivSYw0pbfxRFleA zh${S6D-gGFH#Jb6lD>s!__iC;@jpC|XqOq>wv2xW^%Z1fJ<(#17VTg;Sl4|8%T%bL z;~g9u8ym`}5Pm$fV~ySWxi(~4!?$zYh0B}9kJ)4rD%ZXsKuMtu=1I; zi~LOkl5>YBD6`UCcM~|nF27AIdr&7Nh=jG+{q$)DjBqch;dNJVT7RTHDsj?2Kcl-n z-TMcfMxQ4 zVDLAoD|R$YAntDd^GQ!LsFR~3$P1r*0?Mu}8TYK&2Uy}+L!nJMaDGktlOhs!zrt$? 
zd6ISR3J^n_j`VCf0Bg=D$@m!MY;RTFQqtKqrk-6qdGdZ`48Cmico;9A#Gxdi7+wN0 zIrj?WU`VN;JkqKYhSxICMufghCh{?FxLq?6tlci--})|fSr+D|`}ye%OmX?X=1=)X ze|9R13D;ri@0|41^m!I}2#KKtz^I#y!OGrcGg0>^SF{dvg7N#8t~sfBai9LoH)+~M z7_wbqq($&X0G^!CAU6%qJNv5EhqgQ@VW9%>AuukctnZ~57%@9IWM>S@r0x77B*N{0 zC6mP1AleB$HZTXFeNf|lSdEQ@3xZd^z0nEMU$0saP)uIfH^C62qJ|*`CE)v(LuFLv zkIR8-zr-u65b_SY%m>(eebSLGa6nlW=~k0hGK?+`*kY3u5(=k9%@7sFXh6>8UDwuC z*7h2mHuyA*U%(@^6oF)W63(zwPQcnPu@4Ot$fG!8KNV|kaP*BXPu>|%)jL6GE^NTD z(ic9!omEbmbc6>LikH)_;#2ureEjT4$A&|GzWOVD$27AzJ3E`N^EDX0No)!Yw6dgHOvM3u}L! z;BJov?7KRi!~~UIIsQ2r@$>cwmQnEnz!2-dA|lE%F){pYH5-!FkMT;)3o!S|thEa+{y+0i~}n%UuD6RBC0KS6RW*vvhCav$E_&M%H9ewm$$JkSZznuxOHdF?ul zSxdwbaM8dT?15(uj*Qvn88kNF#}s}?n*jBh?jfB8^3&Qa;7@uk-kakH#4EzHYmO8y zt4W0I_}-G-mpY`bR^p2 zsP>tXrtRyg6+d`kaQe&?zl4U_sr1Is4-PPXGUN0xp6z{CMt>362v7ntHAuQ<-=lbX zrpPUzc(l%7Zk&T57hmc>^ib+Pu!?-W<#KTmy$EgNPGlJnt6xX)T9?QEwAw@Qq7{;x zH5rVvLE_#$O?wJA!icSe3eryAB%qYD3_k~+-)q$L0vdNIoV7!;^C$vRe@|fuEG_G)>Y-0%e9^92 zmdL&1s#Il=r=JCd6N9oH`2FE&Q9qMpOD>{7f zC}D}oeL64&8en&JkQ4Bw=51}~QfF8H%)6>V2{IVbXoV9jc5yaXxSCY0Q|oavQ}DU` zBVhP9su&)ZMg9Ow`T>V1#^1Z3p%2YTIvDl;45G5Wh1&n)Cu zh^UV-q&!HoAN0>gOi5DxJ(Z7%+tms}_wNaccK2F7TsRGoyY7WgcMAU4rwu)K#31)c2{Z;HX8Vf*7qcZ93?w2$FJX-B#SosRZB{RW0sTxiH>`Ivx~ z0-$C^{xF@k;dtmUyYR`^g~+<=>{+LhT=mWH#h9)-0r_!l?tR{q5MUKRO-CU$Eq_&~cwHfj(Y6{Ki``{SCy_t#Ma}J_ z!}ixc`Z=^g#>xa4MtZy_zt4I<^1qm#30u z54kB_m5DIw%3G~Z7;SnJ@>xIhM}NcV$6m1%Q(Qdc)d|-&2pX2UTQk;Z^~x`riaQH% z)vTZR+4lsA+`gR?nRB4)%G259v=clVvGYaxAhiTo-=iK@1Dg0jzHjZEIaG-@RV*W8 ze&9M>+K0p+G+D;B>#TZJNH=Z9M+sJtp&9@(-ATw75opDR$~%`dv>kLrvRd=XxRVvd z`uf#K6LlVgFz8>&?CYN)ltADeukKP%%)(-Dk5*CS(*E(7)xIA2l_~0@Z+1qu`83RA z6LbL7gr!gQ`gfq}-_he|lt`4}pCCFjPzStB$vU=Dm#2tnJ^tJ>8mPsHaaB; zARi0b2LOa{MKYMQ3DGedFwG~RZU&?I=C}ay3ibRuHzIVYEM3{_>6ah@=QzwYMJH9E z8%BBCW8v2l`mDTG)WmqukONQ?oc>;p{4=A7t$VmO990Ye*k2i~&F1v>8oT=JUaVg@ zes_kad{kvmM6%NbiGcQFOo3{Ttfe0wjg6q>`Eq$_R@Nat!0Bx!58l34?(fn|S&4dv zc?WNnkq3Hg0-?kq@QbI-nc#Cl%5Y-UgW^bgTbSaE%8NQ_T`Lf3iD0ixDq;xJP@gc| zrWo5ip_i+)XD6Sp2>La|%T!>@~Sa>s6xlF2gbAr-yDk&-&%i zr*G@@m6@&gk}AVFP_W$RkgaMpyAnYDP+i0{pD3I!D%fpAT3-;ojqlOf8(`fQnJOAZB)AOuVrLV-3W zgNK557S(o2^Bi^76_oY%jofT^3({F?SGmtb(D$#e7}wV{CaO!pPP{6qzH$1b=mIfZ zxms%Lt0|M;HH0LxEq^_woolp}RS~{~D8Ja-Ct!Lf zIOF_BCsl_E*xyiNyAx2TaoYXKqd(hRauqWXkRfMzlqj~baq@4dF+hHfmXwPP)VO-3 z<6=isay}`(p_~;Rv6faH z*Oht8Ik+3j@34M8HRc-?Ib`4?W<1b|;|y$WLz($>axL4j6v55qz@~Pzxn<}GHqn?( ziv#x_+3|!+z@&r^ka_=&jC9m`qu9uNf?H9o#=J^_-r~lu7E!=4yCC*5ZAUM|rv5#F zkerB)MhB`?!6 z*&Bfds>ii#P6RUiLzBkFZSL&>lXyxq+57g31B(_m9Vmlz$Oc?VF5%D98kNecq$!6B z5+68~M!8-%Bb1M>H^w*fX1REDV3kYxRi-^rS_&!ReTxDzGK0L1L4{Zf1*ngKUtg1L zG%$mH8%UD!2hHM$+Cv`*%-X$DH3vlAY= zXI%{-LCkBbZ^7+ngxuuO{SYfi0e+HL@i-?flV~qzD;e5NYjIRJJP&93lL?F#x}*R# zvuj0f0n^NG|1#vQ+vSfvrw>Ls{tW7XyC*WdII3Ehrw`|xekDIO(AupD*w<)1(%K)6 zB6jy`z5w6bq3}B{PMLoo%-%O((H%IW?*iNtMvAO|SGjXu_uJ1yfM;Qmvcks)@EeT# z*qT8HFP{_XfUciHQc{Me@6I@>WORd(2f|bU6?@zBS(2#sIdtFZ;Gcahh;^kOW_<1R z80aWg+MB_Df;$(c8RvfsoN5oJIe8nhzYVbY9q)6qkqId2y17w2osIlWJ%;&_KgL!@ z_nyr&peJ-&~J!n z53OtG=mk$55GPFrjFIHcp*1;JTDVN9!T5to!9A0yI{K`PzT5xtA`nt8XuIKfpmh{` z9E?kM*s4Ev>LB^6zM`R7GDn4qhw7p2DUV@X4bL`019A6d_n?N_Kb@+|ZKLgl({Dka zvQpHSeiJ0C^=yogSLq_5=7-E23&gE%FH^8uEH9=LTTFQWX00q?5eXc_ZSm3FR!#ir zt)`(tLPZn^01Z!w>Bw|cRpxPt_4ahB9{nHX0nZX{{(t6Fac%ZGwzm`1mTO<{?XbjE ziWLx`7$a)yWA4$HaJ>vZPvox{ z!d;qQGKuxOS&Z`Fi)V}};ZeD8K|2Gyzls-nYE4e|&Ac^|AI=S|P8e?jDk@4|EQ4CB zAlzbKD%Z`no!he;A(R0X`7>CG1!`BVV;t%}5c(WB+fyN*Zs_W0jLD13-iq)XiJ-}x z9<=IPD)r`>zB~Eis*ZW>b!~mxr8JipP@)Kt$xNS{}F|)Du(Lj6O*n#hXu0LWYHb$m$5AzL~#9R zxl8aMa9mI{M6m(SOOiylJBQ$T1!l0qhCz^>Mh|&kd=YU&cKGv%Zelq5>IP=1) zr@he%*&p5bi;{mcu3nAX 
z&8&i?vUOB>K`WZ9&?P~H9t51L5Xd%983>fJCrx&1sg9Dcj)tw}BHs3>nq zf!4|ESJN>R9Ja(s>kWJIGwn{)Ow}cU6wiVdz9cm~v7krR17CThwJ1%U4>6I{$!YP< z+6gl+%;mQ0B4mt6s5JRLqvmGk8-@FL?j96qSMGP4@A1b8Xb@d(L#_T8R|@d&E{x|B zvI#%gXtnI7*eq|Ghof%sC-a4nt*^PzQu<9~KdY3x(;ivTS(bZ#U#VNH886a2n8(L| z0@sx~+<9lP4DD;_wJ~^-m9yCJho?!P3*Hl@cf(d6KjHyz9pXj^yoncW2b$3mVET4! zCSH2`8MWo9V_9Kt&7SN zu&?JxtNWY-o4_Huzt;+KcXyYYWpRDF0X|=D^M`SX>6vZ+xzFXh=loJfrbN(EUQ2n? zMcC6i(XI#N+3MNRk6mk*y^A(-6=t0M^qQ%?bPD**W?l$oUM(S++CZxbwRa0R?u%f3 zQ&z2(ju_u`;2dp;Tlbv7f7`^MXU5^3oCgRdf1jTBk}U!QMnw%Lq|ewO-lZiwM=+Ro zK+_%G&VM`9zz@CzmI%6ceH%E+ zPu`%3HeYJP$hIaY2>5d$j(a*|7L-0v>)~_MASoy91E)A-)FA0bXsy`mE=*xa?efG(?3I<}=q%fS;idy*kZ(zz(cG?STEi z9zx7}I6+XL;qgQ?`&}z5H K2; + K2 -> K3 [ltail=cluster_0,lhead=cluster_1,minlen=2]; + K2 -> K4 [ltail=cluster_0,lhead=cluster_2,minlen=2]; + K3 -> cb [ltail=cluster_1,lhead=cluster_3,minlen=2]; + K4 -> cb [ltail=cluster_2,lhead=cluster_3,minlen=2]; +} diff --git a/docs/cudax/stf/images/task-sequence-user.png b/docs/cudax/stf/images/task-sequence-user.png new file mode 100644 index 0000000000000000000000000000000000000000..2ae5648042bf64a9ed532382716178df4f7f7f0a GIT binary patch literal 19913 zcmc(Hc{tVkyY_05qzp;O5UDIFL&!X33Jr#&l3A%_p64M`M9Nf|he|3#0}7dCNGKsP zi%{k%gr_?>Xn6ckg{&?Q6BxcYVK~&+~bn`?>G?Ss|xSs%)cTp(2q; z+m5R$YT?fz5@|gRB?bO3EQGlee^8jHt0^bPy_eNWPvYnBkW=F4K_3-@A+iwg^w_G?x{}}QYGKuRNHTs zY22i!bLZKa<>7^(?2=S5+eXKW!?Rv8?8CM_jq|Q-Nv?&BOZshv;lYB{a+Gw`f<5af zIUP00Tj-d0_i?U2mcgxP^f>AUC3W;2THh2cdM*JSHHD3VYR7|1PyfenikxJV(_CC$ zo?%NlZs*`o882?X7+yx^&4`-+*|{12ytMjue}dkjNzi2}gwG3nU4Go5;a&vP>+H?1E)1M#S!Nr^5cQY((qv_Lm zdIpA#q~OrdEc?~RnrT-#?nXwgv$nS8)5{^by1IHO9hi{IuvB0?bpA$(=W>64RcKRd z>yYI3E2Ls?O}SWp!PF=IrWhF4ut?6>i^f1|Fb-sb}=$i zM5^AnwfpFLl9aSGCw<6ief^bcPg4_`;Of9?NE8|dUbGUh?Ae6=3}DtF-J#{n>TNs zK67T%p+korXJsAJ*52{qzKDNjCi(tR>qciYEgIg8^=^Mw#I>}vZ01JKq-SJsN=vgb zGc&s^PI5lU%F6I!-?nWVww5J9!qM;9vqP4amQ86YG!8#nHgQF&hG+W}ZHv}g_xbbZ z-j$`9;9Iw(@CiUic>^acG=4(fs7hL8jfi(_Cl1(}q@I zhXNG2gom_IF-2Pz=6$$g@pyq6YPKfEL*dR+igrqNqPMEv0Q!3p}z5P z>Z3=Gy12Wiop>PXv9?o~tfRJ0T|=XC;@w*K>mHp-sxA-MDro;*V|Kh_$6n>I(i>ZL zG~q7q70a}cl+)My1Ox@ud&Z}xa=o_tCUNQVrAOD0LDai#+U%S%gwva+%#tLf>_d+rSG?>AXlnfZuN*x={y zZ+!jwt;Izb{X)AAyC22=_4f7oT77%NDI|2~ zI%iKi{*d8ul$24Jan;S8Ha0ds4zk8&WHaopPL?pBHnT z=6-G66Yjs6CKZQGRZXp5GNwFD%WnL~r$Fev~;9@1FWZ|#-%2RNKvQM9O1Ox=AYG^1Z`{gNGc7sSoHum=ZzP{@k6C^huIdY_AZ*9CPcR7-kk-2$* zplS6USy>ME`Eg%Gg|&Mk6vduHa#mJ{rKF^YUFYWJ<}v3q92y%7O;d^B5*1~XmzRIp znI8E#%1-zEd8$*VPCd@d?3?VXu=w(nxhGN#5gGRWz3O;Z;STlK!$Xqc3MpE3=WJ{Q z^78U*2H(>$v9L_D1!-eJs_%Ss+9eYDm|US3`S_j$`O<|7mc&f7oi$3{lvxanI9pPqm9WZvo8 zwK0i&adGkDDb|0|=7CUw2cp*%TwTRWmVZeWMr1I!$js{+Cr55h4+;zvI&tDeOmy_V zSgFj+Ols21)RgPM>-c!qFU9W8jI^|+yax}CbvZ51U;jQdRQ1N`XWi`VY#ahmf4cZr ztf7}N3k%CQ)>TV3aJ%XOA#H7K|E8wX9)FgG1&$sab{NOgBwP2CFiAM@yuRK!Umau9 zoT@a^na@uI)5VJy8|^l2+En)bJ>}S!X9JcRN=g|~{sT1;dk-Gms{K%-&!MZbQn8?* zz^Ke`6XljInF}(=)n&*k;yuofA3h8`es{ldO*q@*r%wkRER~Mo!^Q3k<;~4>U%q^a zzg6hH=C!!EXoNfwC*gS2f8o>T&x3rQ#`d4MeS_}l%J^HIdW?*W7Cj}>jfv9r&J6Nt14(A%&1ov{-@o@sOWS*3 zAVJJ_y{4un<@W8>Djc4bwQ(G&cw#ZIw zCO07=!FF-d7|GK3d&51yF3qH*q>mjJ+=)HP-OeTH7zYA zGcyF~;21Z(@zV4V$tNx@?(2ru7cX9D;_;VbrXxeGN&y5=!rCg^s{`&Q+C@h=DLY=(%?*0=B zPEMlz0|QIpTs(*(d87~HYuEnTQms1Q=;)ol7bYSQyd`Or)vL=3xz;^08!0HJ*_0-6 z)blJmgjgk=s1a-fk~jD8oqtJLU0od~X3N#1xiLbGd;p&aijCccmGaNa6Dk>)?RkGa;f9x8n&Y;6ViYsC5ZZrI$9Z6lHef-$nN zP~2;E?(EZxZyEdg`aEj-^iQ1Fw*O>&FlvrvPf3DT-bJ;S>WYoZ5$v~ShCiD2SB3}( z3s?HA+prj3d5`(Xr9IeT+^K(Vu3(QX1I2Z+c#hlL)y1VHGCKZvAuYGx17S1NwJ2qC2 zcOT0b41V&2*K2LnStCh?&9pjU251D-YwCuKQ3sjXjDOVvtkja%L`}U3w 
literal 0
HcmV?d00001

diff --git a/docs/cudax/stf/images/task-sequence.dot b/docs/cudax/stf/images/task-sequence.dot
new file mode 100644
index 00000000000..b86a37eab5f
--- /dev/null
+++ b/docs/cudax/stf/images/task-sequence.dot
@@ -0,0 +1,68 @@
+digraph {
+  compound=true;
+  subgraph cluster_00 {
+    label="";
+    AA [label="Allocate A"];
+  }
+
+  subgraph cluster_01 {
+    label="";
+    CA [label="Copy A H->D"];
+  }
+
+  subgraph cluster_10 {
+    label="";
+    AB [label="Allocate B"];
+  }
+
+  subgraph cluster_11 {
+    label="";
+    CB [label="Copy B H->D"];
+  }
+
+  subgraph cluster_0 {
+    label="T1";
+    K1 [label="K1"];
+    K2 [label="K2"];
+  }
+
+  CA -> K1 [ltail=cluster_01,lhead=cluster_0,minlen=2];
+  AA -> CA [ltail=cluster_00,lhead=cluster_01,minlen=2];
+
+  CB -> K1 [ltail=cluster_11,lhead=cluster_0,minlen=2];
+  AB -> CB [ltail=cluster_10,lhead=cluster_11,minlen=2];
+
+  subgraph cluster_1 {
+    label="T2";
+    K3 [label="K3"];
+  }
+  subgraph cluster_2 {
+    label="T3";
+    K4 [label="K4"];
+  }
+
+  K1 -> K2;
+  K2 -> K3 [ltail=cluster_0,lhead=cluster_1,minlen=2];
+  K2 -> K4 [ltail=cluster_0,lhead=cluster_2,minlen=2];
+
+  subgraph cluster_02 {
+    label="";
+    CA2 [label="Copy A D->H"];
+  }
+
+  subgraph cluster_12 {
+    label="";
+    CB2 [label="Copy B D->H"];
+  }
+
+  subgraph cluster_3 {
+    label="T4";
+    cb [label="callback"];
+  }
+
+  K3 -> CA2 [ltail=cluster_1,lhead=cluster_02,minlen=2];
+  K4 -> CB2 [ltail=cluster_2,lhead=cluster_12,minlen=2];
+
+  CA2 -> cb [ltail=cluster_02,lhead=cluster_3,minlen=2]
+  CB2 -> cb [ltail=cluster_12,lhead=cluster_3,minlen=2]
+}
diff --git a/docs/cudax/stf/images/task-sequence.png b/docs/cudax/stf/images/task-sequence.png
new file mode 100644
index 0000000000000000000000000000000000000000..333eb4c4fe08385631d431042253f0d72b9278d1
GIT binary patch
literal 50433
[binary patch data omitted]
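Note: the task-sequence figures above illustrate how a short CUDASTF task sequence (T1 running K1 and K2 on data A and B, T2 and T3 running K3 and K4, and a host callback T4) is lowered to allocations, H->D/D->H copies, and kernel launches. The snippet below is only a hypothetical sketch of user-level code that could produce a graph of this shape with the stream backend; kernel names, bodies, launch configurations, and data sizes are illustrative and are not taken from this patch.

#include <cuda/experimental/stf.cuh>

using namespace cuda::experimental::stf;

// Illustrative kernels; bodies are placeholders.
__global__ void K1(slice<double> a, slice<double> b) {}
__global__ void K2(slice<double> a, slice<double> b) {}
__global__ void K3(slice<double> a) {}
__global__ void K4(slice<double> b) {}

int main()
{
  context ctx;
  double A[128], B[128];

  // Logical data: the runtime allocates device copies and inserts H->D transfers as needed.
  auto lA = ctx.logical_data(A);
  auto lB = ctx.logical_data(B);

  // T1: two kernels submitted from the same task, both accessing A and B.
  ctx.task(lA.rw(), lB.rw())->*[](cudaStream_t s, auto a, auto b) {
    K1<<<16, 128, 0, s>>>(a, b);
    K2<<<16, 128, 0, s>>>(a, b);
  };

  // T2 and T3: depend on T1 through their accesses to A and B.
  ctx.task(lA.rw())->*[](cudaStream_t s, auto a) {
    K3<<<16, 128, 0, s>>>(a);
  };
  ctx.task(lB.rw())->*[](cudaStream_t s, auto b) {
    K4<<<16, 128, 0, s>>>(b);
  };

  // T4: host callback reading both pieces of data, which triggers the D->H write-backs.
  ctx.host_launch(lA.read(), lB.read())->*[](auto a, auto b) {
    // ...
  };

  ctx.finalize();
}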