diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 8f3bb2f07..871e51a8b 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -35,12 +35,22 @@ steps: julia_args: "--threads=1" - JuliaCI/julia-coverage#v1: codecov: true + - label: Julia 1.9 + timeout_in_minutes: 60 + <<: *test + plugins: + - JuliaCI/julia#v1: + version: "1.9" + - JuliaCI/julia-test#v1: + julia_args: "--threads=1" + - JuliaCI/julia-coverage#v1: + codecov: true - label: Julia nightly timeout_in_minutes: 60 <<: *test plugins: - JuliaCI/julia#v1: - version: "1.9-nightly" + version: "1.10-nightly" - JuliaCI/julia-test#v1: julia_args: "--threads=1" - JuliaCI/julia-coverage#v1: @@ -93,3 +103,16 @@ steps: BENCHMARK_SCALE: "5:5:50" artifacts: - benchmarks/result* + - label: DTables.jl stability test + timeout_in_minutes: 20 + plugins: + - JuliaCI/julia#v1: + version: "1.8" + env: + JULIA_NUM_THREADS: "4" + agents: + queue: "juliaecosystem" + sandbox.jl: "true" + os: linux + arch: x86_64 + command: "git clone https://github.com/JuliaParallel/DTables.jl.git ; julia -t4 -e 'using Pkg; Pkg.activate(\"DTables.jl\"); Pkg.develop(;path=\".\"); Pkg.instantiate(); Pkg.test()'" diff --git a/Project.toml b/Project.toml index f09176147..a890bbc3d 100644 --- a/Project.toml +++ b/Project.toml @@ -1,10 +1,11 @@ name = "Dagger" uuid = "d58978e5-989f-55fb-8d15-ea34adc7bf54" -version = "0.16.1" +version = "0.16.3" [deps] Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" ContextVariablesX = "6add18c4-b38d-439d-96f6-d6bc489c04c5" +DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" diff --git a/README.md b/README.md index 1f3833cca..b472f0e78 100644 --- a/README.md +++ b/README.md @@ -17,10 +17,14 @@ At the core of Dagger.jl is a scheduler heavily inspired by [Dask](https://docs. ## Installation -You can install Dagger by typing +Dagger.jl can be installed using the Julia package manager. Enter the Pkg REPL mode by typing "]" in the Julia REPL and then run: ```julia -julia> ] add Dagger +pkg> add Dagger +``` +Or, equivalently, via the Pkg API: +```julia +julia> import Pkg; Pkg.add("Dagger") ``` ## Usage @@ -37,6 +41,34 @@ b = Dagger.@spawn rand(a, 4) c = Dagger.@spawn sum(b) fetch(c) # some number! ``` +## Contributing Guide +[![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg?style=flat-square)](http://makeapullrequest.com) +[![GitHub issues](https://img.shields.io/github/issues/JuliaParallel/Dagger.jl)](https://github.com/JuliaParallel/Dagger.jl/issues) +[![GitHub contributors](https://img.shields.io/github/contributors/JuliaParallel/Dagger.jl)](https://github.com/JuliaParallel/Dagger.jl/graphs/contributors) + +Contributions are encouraged. + +There are several ways to contribute to our project: + +**Reporting Bugs**: If you find a bug, please open an issue and describe the problem. Make sure to include steps to reproduce the issue and any error messages you receive regarding that issue. + +**Fixing Bugs**: If you'd like to fix a bug, please create a pull request with your changes. Make sure to include a description of the problem and how your changes will address it. + +Additional examples and documentation improvements are also very welcome. 
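As a complement to the single-process snippet in the Usage section above, here is a minimal sketch of the same pattern spread across multiple Distributed workers; the worker count and array sizes are arbitrary illustration choices, not part of this PR:

```julia
# Minimal multi-worker sketch (assumes two extra local workers are wanted)
using Distributed
addprocs(2)
@everywhere using Dagger

a = Dagger.@spawn rand(4, 4)  # may be scheduled on any worker
b = Dagger.@spawn sum(a)      # waits on `a`, then runs wherever it is placed
fetch(b)                      # some number!
```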
+ +## Resources +List of recommended Dagger.jl resources: +- Docs [![][docs-master-img]][docs-master-url] +- Videos + - [Distributed Computing with Dagger.jl](https://youtu.be/capjmjVHfMU) + - [Easy, Featureful Parallelism with Dagger.jl](https://youtu.be/t3S8W6A4Ago) + - [Easier parallel Julia workflow with Dagger.jl](https://youtu.be/VrqzOsav61w) + - [Dagger.jl Development and Roadmap](https://youtu.be/G0Y62ysFbDk) + +## Help and Discussion +For help and discussion, we suggest asking in the following places: + +[Julia Discourse](https://discourse.julialang.org/c/domain/parallel/34) and on the [Julia Slack](https://julialang.org/slack/) in the `#distributed` channel. ## Acknowledgements diff --git a/docs/src/checkpointing.md b/docs/src/checkpointing.md index 9b3f7b618..97dd8e8ed 100644 --- a/docs/src/checkpointing.md +++ b/docs/src/checkpointing.md @@ -54,9 +54,9 @@ Let's see how we'd modify the above example to use checkpointing: ```julia using Serialization + X = compute(randn(Blocks(128,128), 1024, 1024)) -Y = [delayed(sum; options=Dagger.Sch.ThunkOptions(; -checkpoint=(thunk,result)->begin +Y = [delayed(sum; checkpoint=(thunk,result)->begin open("checkpoint-$idx.bin", "w") do io serialize(io, collect(result)) end @@ -64,7 +64,7 @@ end, restore=(thunk)->begin open("checkpoint-$idx.bin", "r") do io Dagger.tochunk(deserialize(io)) end -end))(chunk) for (idx,chunk) in enumerate(X.chunks)] +end)(chunk) for (idx,chunk) in enumerate(X.chunks)] inner(x...) = sqrt(sum(x)) Z = delayed(inner)(Y...) z = collect(Z) diff --git a/docs/src/index.md b/docs/src/index.md index 29bdccfeb..45cc68827 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -2,32 +2,34 @@ ## Usage -The main function for using Dagger is `spawn`: +The main entrypoint to Dagger is `@spawn`: -`Dagger.spawn(f, args...; options...)` +`Dagger.@spawn [option=value]... f(args...; kwargs...)` -or `@spawn` for the more convenient macro form: +or `spawn` if it's more convenient: -`Dagger.@spawn [option=value]... f(args...)` +`Dagger.spawn(f, Dagger.Options(options), args...; kwargs...)` When called, it creates an `EagerThunk` (also known as a "thunk" or "task") -object representing a call to function `f` with the arguments `args`. If it is -called with other thunks as inputs, such as in `Dagger.@spawn f(Dagger.@spawn -g())`, then the function `f` gets passed the results of those input thunks. If -those thunks aren't yet finished executing, then the execution of `f` waits on -all of its input thunks to complete before executing. +object representing a call to function `f` with the arguments `args` and +keyword arguments `kwargs`. If it is called with other thunks as args/kwargs, +such as in `Dagger.@spawn f(Dagger.@spawn g())`, then the function `f` gets +passed the results of those input thunks, once they're available. If those +thunks aren't yet finished executing, then the execution of `f` waits on all of +its input thunks to complete before executing. The key point is that, for each argument to a thunk, if the argument is an `EagerThunk`, it'll be executed before this node and its result will be passed into the function `f`. If the argument is *not* an `EagerThunk` (instead, some other type of Julia object), it'll be passed as-is to the function `f`. -Thunks don't accept regular keyword arguments for the function `f`. 
Instead, -the `options` kwargs are passed to the scheduler to control its behavior: +The `Options` struct in the second argument position is optional; if provided, +it is passed to the scheduler to control its behavior. `Options` contains a +`NamedTuple` of option key-value pairs, which can be any of: - Any field in `Dagger.Sch.ThunkOptions` (see [Scheduler and Thunk options](@ref)) - `meta::Bool` -- Pass the input `Chunk` objects themselves to `f` and not the value contained in them -There are also some extra kwargs that can be passed, although they're considered advanced options to be used only by developers or library authors: +There are also some extra optionss that can be passed, although they're considered advanced options to be used only by developers or library authors: - `get_result::Bool` -- return the actual result to the scheduler instead of `Chunk` objects. Used when `f` explicitly constructs a Chunk or when return value is small (e.g. in case of reduce) - `persist::Bool` -- the result of this Thunk should not be released after it becomes unused in the DAG - `cache::Bool` -- cache the result of this Thunk such that if the thunk is evaluated again, one can just reuse the cached value. If it’s been removed from cache, recompute the value. @@ -133,10 +135,10 @@ via `@par` or `delayed`. The above computation can be executed with the lazy API by substituting `@spawn` with `@par` and `fetch` with `collect`: ```julia -p = @par add1(4) -q = @par add2(p) -r = @par add1(3) -s = @par combine(p, q, r) +p = Dagger.@par add1(4) +q = Dagger.@par add2(p) +r = Dagger.@par add1(3) +s = Dagger.@par combine(p, q, r) @assert collect(s) == 16 ``` @@ -144,7 +146,7 @@ s = @par combine(p, q, r) or similarly, in block form: ```julia -s = @par begin +s = Dagger.@par begin p = add1(4) q = add2(p) r = add1(3) @@ -159,7 +161,7 @@ operation, you can call `compute` on the thunk. This will return a `Chunk` object which references the result (see [Chunks](@ref) for more details): ```julia -x = @par 1+2 +x = Dagger.@par 1+2 cx = compute(x) cx::Chunk @assert collect(cx) == 3 @@ -198,15 +200,17 @@ While Dagger generally "just works", sometimes one needs to exert some more fine-grained control over how the scheduler allocates work. There are two parallel mechanisms to achieve this: Scheduler options (from `Dagger.Sch.SchedulerOptions`) and Thunk options (from -`Dagger.Sch.ThunkOptions`). These two options structs generally contain the -same options, with the difference being that Scheduler options operate +`Dagger.Sch.ThunkOptions`). These two options structs contain many shared +options, with the difference being that Scheduler options operate globally across an entire DAG, and Thunk options operate on a thunk-by-thunk -basis. Scheduler options can be constructed and passed to `collect()` or -`compute()` as the keyword argument `options` for lazy API usage: +basis. 
+ +Scheduler options can be constructed and passed to `collect()` or `compute()` +as the keyword argument `options` for lazy API usage: ```julia -t = @par 1+2 -opts = Dagger.Sch.ThunkOptions(;single=1) # Execute on worker 1 +t = Dagger.@par 1+2 +opts = Dagger.Sch.SchedulerOptions(;single=1) # Execute on worker 1 compute(t; options=opts) @@ -219,12 +223,46 @@ Thunk options can be passed to `@spawn/spawn`, `@par`, and `delayed` similarly: # Execute on worker 1 Dagger.@spawn single=1 1+2 +Dagger.spawn(+, Dagger.Options(;single=1), 1, 2) -Dagger.spawn(+, 1, 2; single=1) - -opts = Dagger.Sch.ThunkOptions(;single=1) -delayed(+)(1, 2; options=opts) +delayed(+; single=1)(1, 2) ``` +### Core vs. Worker Schedulers + +Dagger's scheduler is really two kinds of entities: the "core" scheduler, and +"worker" schedulers: + +The core scheduler runs on worker 1, thread 1, and is the entrypoint to tasks +which have been submitted. The core scheduler manages all task dependencies, +notifies calls to `wait` and `fetch` of task completion, and generally performs +initial task placement. The core scheduler has cached information about each +worker and their processors, and uses that information (together with metrics +about previous tasks and other aspects of the Dagger runtime) to generate a +near-optimal just-in-time task schedule. + +The worker schedulers each run as a set of tasks across all workers and all +processors, and handles data movement and task execution. Once the core +scheduler has scheduled and launched a task, it arrives at the worker scheduler +for handling. The worker scheduler will pass the task to a queue for the +assigned processor, where it will wait until the processor has a sufficient +amount of "occupancy" for the task. Once the processor is ready for the task, +it will first fetch all arguments to the task from other workers, and then it +will execute the task, package the result into a `Chunk`, and pass that back to +the core scheduler. + +### Workload Balancing + +In general, Dagger's core scheduler tries to balance workloads as much as +possible across all the available processors, but it can fail to do so +effectively when either the cached per-processor information is outdated, or +when the estimates about the task's behavior are inaccurate. To minimize the +impact of this potential workload imbalance, the worker schedulers' processors +will attempt to steal tasks from each other when they are under-occupied. Tasks +will only be stolen if their [scope](`Scopes`) matches the processor attempting +the steal, so tasks with wider scopes have better balancing potential. + +### Scheduler/Thunk Options + [`Dagger.Sch.SchedulerOptions`](@ref) [`Dagger.Sch.ThunkOptions`](@ref) diff --git a/docs/src/logging.md b/docs/src/logging.md index 951af63b1..81f97490b 100644 --- a/docs/src/logging.md +++ b/docs/src/logging.md @@ -32,7 +32,7 @@ called. Let's construct one: ```julia ctx = Context() -ml = TimspanLogging.MultiEventLog() +ml = TimespanLogging.MultiEventLog() # Add the BytesAllocd consumer to the log as `:bytes` ml[:bytes] = Dagger.Events.BytesAllocd() diff --git a/docs/src/processors.md b/docs/src/processors.md index e55fd2119..6e74ecc81 100644 --- a/docs/src/processors.md +++ b/docs/src/processors.md @@ -76,42 +76,13 @@ processor B. This mechanism uses Julia's Serialization library to serialize and deserialize data, so data must be serializable for this mechanism to work properly. 
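To make the `move` mechanism described above concrete, here is a minimal sketch (not part of this PR) of a custom processor hooking into it; the `FPGAProc` name and the device-upload step are hypothetical placeholders:

```julia
using Dagger

# Hypothetical accelerator processor; only the names are invented here, while
# the extension points (`Dagger.get_parent`, `Dagger.move`) are the real ones.
struct FPGAProc <: Dagger.Processor
    owner::Int  # ID of the worker that owns the device
end
Dagger.get_parent(proc::FPGAProc) = Dagger.OSProc(proc.owner)

# Called when an argument produced on a CPU thread must be made usable by an
# `FPGAProc`; a custom processor overloads `move` whenever the data
# representation must change on the way in.
function Dagger.move(from::Dagger.ThreadProc, to::FPGAProc, x::Array)
    # e.g. upload `x` to device memory here; returning it unchanged keeps
    # this sketch runnable
    return x
end
```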
-### Future: Hierarchy Generic Path Move - -NOTE: This used to be the default move behavior, but was removed because it -wasn't considered helpful, and there were not any processor implementations -that made use of it. - -Movement of data between any two processors is decomposable into a sequence of -"moves" between a child and its parent, termed a "generic path move". Movement -of data may also take "shortcuts" between nodes in the tree which are not -directly connected if enabled by libraries or the user, which may make use of -IPC mechanisms to transfer data more directly and efficiently (such as -Infiniband, GPU RDMA, NVLINK, etc.). All data is considered local to some -processor, and may only be operated on by another processor by first doing an -explicit move operation to that processor. - ## Processor Selection By default, Dagger uses the CPU to process work, typically single-threaded per cluster node. However, Dagger allows access to a wider range of hardware and software acceleration techniques, such as multithreading and GPUs. These more advanced (but performant) accelerators are disabled by default, but can easily -be enabled by using Scheduler/Thunk options in the `proclist` field. If -`nothing`, all default processors will be used. If a vector of types, only the -processor types contained in `options.proclist` will be used to compute all or -a given thunk. If a function, it will be called for each processor (with the -processor as the argument) until it returns `true`. - -```julia -opts = Dagger.Sch.ThunkOptions(;proclist=nothing) # default behavior -# OR -opts = Dagger.Sch.ThunkOptions(;proclist=[DaggerGPU.CuArrayProc]) # only execute on CuArrayProc -# OR -opts = Dagger.Sch.ThunkOptions(;proclist=(proc)->(proc isa Dagger.ThreadProc && proc.tid == 3)) # only run on ThreadProc with thread ID 3 - -t = Dagger.@par options=opts sum(X) # do sum(X) on the specified processor -``` +be enabled by using scopes (see [Scopes](@ref) for details). ## Resource Control @@ -137,7 +108,7 @@ sufficient resources become available by thunks completing execution. The [DaggerGPU.jl](https://github.com/JuliaGPU/DaggerGPU.jl) package can be imported to enable GPU acceleration for NVIDIA and AMD GPUs, when available. The processors provided by that package are not enabled by default, but may be -enabled via `options.proclist` as usual. +enabled via custom scopes ([Scopes](@ref)). ### Future: Network Devices and Topology diff --git a/docs/src/propagation.md b/docs/src/propagation.md index c47ba1353..ce00af4f2 100644 --- a/docs/src/propagation.md +++ b/docs/src/propagation.md @@ -1,6 +1,6 @@ # Option Propagation -Most options passed to Dagger are passed via `delayed` or `Dagger.@spawn` +Most options passed to Dagger are passed via `@spawn/spawn` or `delayed` directly. This works well when an option only needs to be set for a single thunk, but is cumbersome when the same option needs to be set on multiple thunks, or set recursively on thunks spawned within other thunks. Thankfully, diff --git a/docs/src/scheduler-visualization.md b/docs/src/scheduler-visualization.md index 1696aac5d..234f54c92 100644 --- a/docs/src/scheduler-visualization.md +++ b/docs/src/scheduler-visualization.md @@ -16,14 +16,16 @@ easiest way to get started with the web dashboard for new users. 
For manual usage, the following snippet of code will suffice: ```julia +using Dagger, DaggerWebDash, TimespanLogging + ctx = Context() # or `ctx = Dagger.Sch.eager_context()` for eager API usage -ml = Dagger.MultiEventLog() +ml = TimespanLogging.MultiEventLog() ## Add some logging events of interest -ml[:core] = Dagger.Events.CoreMetrics() -ml[:id] = Dagger.Events.IDMetrics() -ml[:timeline] = Dagger.Events.TimelineMetrics() +ml[:core] = TimespanLogging.Events.CoreMetrics() +ml[:id] = TimespanLogging.Events.IDMetrics() +ml[:timeline] = TimespanLogging.Events.TimelineMetrics() # ... # (Optional) Enable profile flamegraph generation with ProfileSVG @@ -31,7 +33,7 @@ ml[:profile] = DaggerWebDash.ProfileMetrics() ctx.profile = true # Create a LogWindow; necessary for real-time event updates -lw = Dagger.Events.LogWindow(20*10^9, :core) +lw = TimespanLogging.Events.LogWindow(20*10^9, :core) ml.aggregators[:logwindow] = lw # Create the D3Renderer server on port 8080 @@ -40,20 +42,20 @@ d3r = DaggerWebDash.D3Renderer(8080) ## Add some plots! Rendered top-down in order # Show an overview of all generated events as a Gantt chart -push!(d3r, GanttPlot(:core, :id, :esat, :psat; title="Overview")) +push!(d3r, DaggerWebDash.GanttPlot(:core, :id, :esat, :psat; title="Overview")) # Show various numerical events as line plots over time -push!(d3r, LinePlot(:core, :wsat, "Worker Saturation", "Running Tasks")) -push!(d3r, LinePlot(:core, :loadavg, "CPU Load Average", "Average Running Threads")) -push!(d3r, LinePlot(:core, :bytes, "Allocated Bytes", "Bytes")) -push!(d3r, LinePlot(:core, :mem, "Available Memory", "% Free")) +push!(d3r, DaggerWebDash.LinePlot(:core, :wsat, "Worker Saturation", "Running Tasks")) +push!(d3r, DaggerWebDash.LinePlot(:core, :loadavg, "CPU Load Average", "Average Running Threads")) +push!(d3r, DaggerWebDash.LinePlot(:core, :bytes, "Allocated Bytes", "Bytes")) +push!(d3r, DaggerWebDash.LinePlot(:core, :mem, "Available Memory", "% Free")) # Show a graph rendering of compute tasks and data movement between them # Note: Profile events are ignored if absent from the log -push!(d3r, GraphPlot(:core, :id, :timeline, :profile, "DAG")) +push!(d3r, DaggerWebDash.GraphPlot(:core, :id, :timeline, :profile, "DAG")) # TODO: Not yet functional -#push!(d3r, ProfileViewer(:core, :profile, "Profile Viewer")) +#push!(d3r, DaggerWebDash.ProfileViewer(:core, :profile, "Profile Viewer")) # Add the D3Renderer as a consumer of special events generated by LogWindow push!(lw.creation_handlers, d3r) diff --git a/docs/src/scopes.md b/docs/src/scopes.md index 01c1a3414..bed60b506 100644 --- a/docs/src/scopes.md +++ b/docs/src/scopes.md @@ -15,16 +15,16 @@ considered valid. Let's take the example of a webcam handle generated by VideoIO.jl. This handle is a C pointer, and thus has process scope. 
We can open the handle on a given -process, and set the scope of the resulting data to a `ProcessScope()`, which -defaults to the current Julia process: +process, and set the scope of the resulting data to be locked to the current +process with `Dagger.scope` to construct a `ProcessScope`: ```julia -using VideoIO +using VideoIO, Distributed function get_handle() handle = VideoIO.opencamera() proc = Dagger.thunk_processor() - scope = ProcessScope() + scope = Dagger.scope(worker=myid()) # constructs a `ProcessScope` return Dagger.tochunk(handle, proc, scope) end @@ -41,7 +41,7 @@ cam_frame = Dagger.@spawn read(cam_handle) The `cam_frame` task is executed within any processor on the same process that the `cam_handle` task was executed on. Of course, the resulting camera frame is -*not* scoped to anywhere specific (denoted as `AnyScope()`), and thus +*not* scoped to anywhere specific (denoted as `AnyScope`), and thus computations on it may execute anywhere. You may also encounter situations where you want to use a callable struct (such @@ -53,13 +53,15 @@ using Flux m = Chain(...) # If `m` is only safe to transfer to and execute on this process, # we can set a `ProcessScope` on it: -result = Dagger.@spawn scope=ProcessScope() m(rand(8,8)) +result = Dagger.@spawn scope=Dagger.scope(worker=myid()) m(rand(8,8)) ``` Setting a scope on the function treats it as a regular piece of data (like the arguments to the function), so it participates in the scoping rules described in the following sections all the same. +[`Dagger.scope`](@ref) + Now, let's try out some other kinds of scopes, starting with `NodeScope`. This scope encompasses the server that one or more Julia processes may be running on. Say we want to use memory mapping (mmap) to more efficiently send arrays @@ -75,6 +77,7 @@ function generate() arr = Mmap.mmap(path, Matrix{Int}, (64,64)) fill!(arr, 1) Mmap.sync!(arr) + # Note: Dagger.scope() does not yet support node scopes Dagger.tochunk(path, Dagger.thunk_processor(), NodeScope()) end @@ -87,14 +90,14 @@ a = Dagger.@spawn generate() @assert fetch(Dagger.@spawn consume(a)) == 64*64 ``` -Whatever server `a` executed on, `b` will also execute on! +Whatever server `a` executed on, `b` will also execute on it! Finally, we come to the "lowest" scope on the scope hierarchy, the `ExactScope`. This scope specifies one exact processor as the bounding scope, -and is typically useful in certain limited cases. We won't provide an example -here, because you don't usually need to ever use this scope, but if you already -understand the `NodeScope` and `ProcessScope`, the `ExactScope` should be easy -to figure out. +and is typically useful in certain limited cases (such as data existing only on +a specific GPU). We won't provide an example here, because you don't usually +need to ever use this scope, but if you already understand the `NodeScope` and +`ProcessScope`, the `ExactScope` should be easy to figure out. 
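That said, for readers who want to see its shape, a minimal illustrative sketch follows; the worker and thread IDs are arbitrary and assume such a processor exists in your session:

```julia
using Dagger

# Pin a task to thread 1 of worker 2 (assumes worker 2 exists)
proc = Dagger.ThreadProc(2, 1)
scope = Dagger.ExactScope(proc)

result = Dagger.@spawn scope=scope sum(rand(8, 8))
fetch(result)
```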
## Union Scopes diff --git a/lib/DaggerMPI/src/DaggerMPI.jl b/lib/DaggerMPI/src/DaggerMPI.jl index 7f185ed2f..adffa583e 100644 --- a/lib/DaggerMPI/src/DaggerMPI.jl +++ b/lib/DaggerMPI/src/DaggerMPI.jl @@ -1,6 +1,6 @@ module DaggerMPI - using Dagger +import Base: reduce, fetch, cat using MPI struct MPIProcessor{P,C} <: Dagger.Processor @@ -16,6 +16,8 @@ end const MPI_PROCESSORS = Ref{Int}(-1) +#const BCAST_VALUES = Dict{Any, Any}() + const PREVIOUS_PROCESSORS = Set() @@ -28,14 +30,14 @@ function initialize(comm::MPI.Comm=MPI.COMM_WORLD; color_algo=SimpleColoring()) MPI.Init(; finalize_atexit=false) procs = Dagger.get_processors(OSProc()) i = 0 - empty!(Dagger.PROCESSOR_CALLBACKS) + #=empty!(Dagger.PROCESSOR_CALLBACKS) empty!(Dagger.OSPROC_PROCESSOR_CACHE) for proc in procs Dagger.add_processor_callback!("mpiprocessor_$i") do return MPIProcessor(proc, comm, color_algo) end i += 1 - end + end=# MPI_PROCESSORS[] = i # FIXME: Hack to populate new processors @@ -52,7 +54,7 @@ function finalize() empty!(Dagger.PROCESSOR_CALLBACKS) empty!(Dagger.OSPROC_PROCESSOR_CACHE) i = 1 - for proc in PREVIOUS_PROCESSORS + for proc in PREVIOUS_PROCESSORS Dagger.add_processor_callback!("old_processor_$i") do return proc end @@ -73,6 +75,113 @@ end Dagger.get_parent(proc::MPIProcessor) = Dagger.OSProc() Dagger.default_enabled(proc::MPIProcessor) = true +export MPIBlocks + +struct MPIBlocks{N} <: Dagger.AbstractSingleBlocks{N} + blocksize::NTuple{N, Int} +end +MPIBlocks(xs::Int...) = MPIBlocks(xs) + +function Dagger.distribute(::Type{A}, + x::Union{AbstractArray, Nothing}, + dist::MPIBlocks, + comm::MPI.Comm=MPI.COMM_WORLD, + root::Integer=0) where {A<:AbstractArray{T, N}} where {T, N} + isroot = MPI.Comm_rank(comm) == root + + #TODO: Make better load balancing + + data = Array{T, N}(undef, dist.blocksize) + if isroot + cs = Array{T, N}(undef, size(x)) + parts = partition(dist, domain(x)) + idx = 1 + for part in parts + cs[idx:(idx - 1 + prod(dist.blocksize))] = x[part] + idx += prod(dist.blocksize) + end + MPI.Scatter!(MPI.UBuffer(cs, div(length(cs), MPI.Comm_size(comm))), data, comm, root=root) + else + MPI.Scatter!(nothing, data, comm, root=root) + end + + data = Dagger.tochunk(data) + + Dagger.DArray( + T, + domain(data), + domain(data), + data, + dist + ) +end + +function Dagger.distribute(::Type{A}, + dist::MPIBlocks, + comm::MPI.Comm=MPI.COMM_WORLD, + root::Integer=0) where {A<:AbstractArray{T, N}} where {T, N} + distribute(A, nothing, dist, comm, root) +end + +function Dagger.distribute(x::AbstractArray, + dist::MPIBlocks, + comm::MPI.Comm=MPI.COMM_WORLD, + root::Integer=0) + distribute(typeof(x), x, dist, comm, root) +end + +export MPIBlocks, distribute + +function Base.reduce(f::Function, x::Dagger.DArray{T,N,MPIBlocks{N},F}; comm=MPI.COMM_WORLD, root=nothing, dims=nothing, acrossranks::Bool=true) where {T,N,F} + if dims === nothing + if !acrossranks + return fetch(reduce_async(f,x)) + elseif root === nothing + return MPI.Allreduce(fetch(reduce_async(f,x)), f, comm) + else + return MPI.Reduce(fetch(reduce_async(f,x)), f, comm; root) + end + else + if dims isa Int + dims = (dims,) + end + d = reduce(x.domain, dims=dims) + ds = reduce(x.subdomains[1], dims=dims) + if !acrossranks + thunks = Dagger.spawn(b->reduce(f, b, dims=dims), x.chunks[1]) + return Dagger.DArray(T, + d, + ds, + thunks, + x.partitioning) + else + tmp = collect(reduce(f, x, comm=comm, root=root, dims=dims, acrossranks=false)) + if root === nothing + h = UInt(0) + for dim in 1:N + if dim in dims + continue + end + h = 
hash(x.subdomains[1].indexes[dim], h) + end + h = abs(Base.unsafe_trunc(Int32, h)) + newc = MPI.Comm_split(comm, h, MPI.Comm_rank(comm)) + chunks = Dagger.tochunk(reshape(MPI.Allreduce(tmp, f, newc), size(tmp))) + else + rcvbuf = MPI.Reduce(tmp, f, comm; root) + if root === MPI.Comm_rank(comm) + chunks = Dagger.tochunk(reshape(rcvbuf, size(tmp))) + return Dagger.DArray(T, + d, + ds, + chunks, + x.partitioning) + end + return nothing + end + end + end +end "Busy-loop Irecv that yields to other tasks." function recv_yield(src, tag, comm) @@ -142,15 +251,15 @@ function Dagger.move(from_proc::MPIProcessor, to_proc::Dagger.Processor, x::Dagg @assert Dagger.chunktype(x) <: MPIColoredValue x_value = fetch(x) rank = MPI.Comm_rank(from_proc.comm) - tag = abs(Base.unsafe_trunc(Int32, Dagger.get_task_hash(:input) >> 32)) + tag = abs(Base.unsafe_trunc(Int32, Dagger.get_task_hash(:input) >> 32)) if rank == x_value.color - # FIXME: Broadcast send - @sync for other in 0:(MPI.Comm_size(from_proc.comm)-1) + @debug "[$rank] Starting bcast send on $tag" + @sync for other in 0:(MPI_Comm_size(from_proc.comm)-1) other == rank && continue @async begin - @debug "[$rank] Starting bcast send to [$other] on $tag" + @debug "[$rank] Starting bcast send on $tag" MPI.isend(x_value.value, other, tag, from_proc.comm) - @debug "[$rank] Finished bcast send to [$other] on $tag" + @debug "[$rank] Finished bcast send on $tag" end end return Dagger.move(from_proc.proc, to_proc, x_value.value) diff --git a/lib/DaggerWebDash/Project.toml b/lib/DaggerWebDash/Project.toml index fb1d99d1c..1dccdec2f 100644 --- a/lib/DaggerWebDash/Project.toml +++ b/lib/DaggerWebDash/Project.toml @@ -6,6 +6,7 @@ version = "0.1.2" [deps] Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" Mux = "a975b10e-0019-58db-a62f-e48ff68538c9" @@ -17,9 +18,10 @@ TimespanLogging = "a526e669-04d3-4846-9525-c66122c55f63" [compat] Dagger = "0.16" +HTTP = "1.7" JSON3 = "1" MemPool = "0.3, 0.4" -Mux = "0.7" +Mux = "0.7, 1" ProfileSVG = "0.2" StructTypes = "1" Tables = "1" diff --git a/lib/DaggerWebDash/src/d3.jl b/lib/DaggerWebDash/src/d3.jl index bc77991cd..00d2ffb33 100644 --- a/lib/DaggerWebDash/src/d3.jl +++ b/lib/DaggerWebDash/src/d3.jl @@ -1,4 +1,7 @@ -using Mux, MemPool, Distributed, Sockets +using Mux, Sockets +import HTTP +import HTTP: WebSockets +using MemPool, Distributed struct LinePlot core_key::Symbol @@ -172,22 +175,22 @@ function client_handler(sock, id, port, port_range, config_updated, config, seek if id != myid() fsock_ready = Base.Event() worker_host, worker_port = worker_host_port(id, port, port_range) - Mux.HTTP.WebSockets.open("ws://$worker_host:$worker_port/data_feed") do _fsock + WebSockets.open("ws://$worker_host:$worker_port/data_feed") do _fsock fsock = _fsock @debug "D3R forwarder for $id ready" notify(fsock_ready) - while !eof(fsock) && isopen(fsock) + while true try - bytes = readavailable(fsock) + bytes = WebSockets.receive(fsock) if length(bytes) == 0 sleep(0.1) continue end data = String(bytes) #@info "D3R forwarder for $id received data" - write(sock, data) + WebSockets.send(sock, data) catch err - if err isa Mux.WebSockets.WebSocketClosedError || err isa Base.IOError + if err isa WebSockets.WebSocketError && err.message isa WebSockets.CloseFrameBody # Force-close client and forwarder @async close(sock) @async close(fsock) @@ -203,25 +206,25 @@ 
function client_handler(sock, id, port, port_range, config_updated, config, seek end if id == myid() @debug "D3R client for $id sending initial config" - write(sock, JSON3.write((;cmd="data", payload=sanitize(D3R_LOGS[port])))) + WebSockets.send(sock, JSON3.write((;cmd="data", payload=sanitize(D3R_LOGS[port])))) _workers = workers() if !(myid() in _workers) # FIXME: Get this from the Context _workers = vcat(myid(), _workers) end - write(sock, JSON3.write((;cmd="config", payload=sanitize((;myid=myid(),workers=_workers,ctxs=config))))) + WebSockets.send(sock, JSON3.write((;cmd="config", payload=sanitize((;myid=myid(),workers=_workers,ctxs=config))))) end push!(get!(()->[], D3R_CLIENT_SOCKETS[port], id), sock) @debug "D3R client for $id ready" - while !eof(sock) && isopen(sock) + while true try - data = String(read(sock)) + data = String(WebSockets.receive(sock)) @debug "D3R client for $id received: $data" if id == myid() #= FIXME if config_updated[] config_updated[] = false - write(sock, JSON3.write((;cmd="config", payload=sanitize(config)))) + WebSockets.send(sock, JSON3.write((;cmd="config", payload=sanitize(config)))) end =# if seek_store !== nothing @@ -231,7 +234,7 @@ function client_handler(sock, id, port, port_range, config_updated, config, seek for (idx,key) in enumerate(Tables.columnnames(raw_logs)) logs[key] = Tables.columns(raw_logs)[idx] end - write(sock, JSON3.write((;cmd="data", payload=sanitize(logs)))) + WebSockets.send(sock, JSON3.write((;cmd="data", payload=sanitize(logs)))) continue end m = match(r"seek\(([0-9]*),([0-9]*)\)", data) @@ -242,19 +245,19 @@ function client_handler(sock, id, port, port_range, config_updated, config, seek for (idx,key) in enumerate(Tables.columnnames(raw_logs)) logs[key] = Tables.columns(raw_logs)[idx] end - write(sock, JSON3.write((;cmd="data", payload=sanitize(logs)))) + WebSockets.send(sock, JSON3.write((;cmd="data", payload=sanitize(logs)))) continue end end if data == "data" - write(sock, JSON3.write((;cmd="data", payload=sanitize(D3R_LOGS[port])))) + WebSockets.send(sock, JSON3.write((;cmd="data", payload=sanitize(D3R_LOGS[port])))) end else @debug "D3R client sending to forwarder: $data" - write(fsock, data) + WebSockets.send(fsock, data) end catch err - if err isa Mux.WebSockets.WebSocketClosedError || err isa Base.IOError + if err isa WebSockets.WebSocketError && err.message isa WebSockets.CloseFrameBody idx = findfirst(x->x==sock, D3R_CLIENT_SOCKETS[port][id]) if idx !== nothing deleteat!(D3R_CLIENT_SOCKETS[port][id], idx) @@ -273,11 +276,9 @@ end function TimespanLogging.Events.creation_hook(d3r::D3Renderer, log) for sock in get!(()->[], get!(()->Dict{Int,Vector{Any}}(), D3R_CLIENT_SOCKETS, d3r.port), myid()) try - if isopen(sock) - write(sock, JSON3.write((;cmd="add", payload=sanitize(log)))) - end + WebSockets.send(sock, JSON3.write((;cmd="add", payload=sanitize(log)))) catch err - if err isa Mux.WebSockets.WebSocketClosedError + if err isa WebSockets.WebSocketError && err.message isa WebSockets.CloseFrameBody continue end rethrow(err) @@ -287,11 +288,9 @@ end function TimespanLogging.Events.deletion_hook(d3r::D3Renderer, idx) for sock in get!(()->[], get!(()->Dict{Int,Vector{Any}}(), D3R_CLIENT_SOCKETS, d3r.port), myid()) try - if isopen(sock) - write(sock, JSON3.write((;cmd="delete", payload=idx))) - end + WebSockets.send(sock, JSON3.write((;cmd="delete", payload=idx))) catch err - if err isa Mux.WebSockets.WebSocketClosedError + if err isa WebSockets.WebSocketError && err.message isa WebSockets.CloseFrameBody continue end 
rethrow(err) diff --git a/src/Dagger.jl b/src/Dagger.jl index a9b947921..1be27cacf 100644 --- a/src/Dagger.jl +++ b/src/Dagger.jl @@ -37,6 +37,7 @@ include("compute.jl") include("utils/clock.jl") include("utils/system_uuid.jl") include("utils/uhash.jl") +include("utils/locked-object.jl") include("sch/Sch.jl"); using .Sch # Array computations diff --git a/src/array/alloc.jl b/src/array/alloc.jl index b47af6f56..39ca60396 100644 --- a/src/array/alloc.jl +++ b/src/array/alloc.jl @@ -25,7 +25,7 @@ end function stage(ctx, a::AllocateArray) alloc(idx, sz) = a.f(idx, a.eltype, sz) - thunks = [delayed(alloc)(i, size(x)) for (i, x) in enumerate(a.domainchunks)] + thunks = [Dagger.@spawn alloc(i, size(x)) for (i, x) in enumerate(a.domainchunks)] DArray(a.eltype,a.domain, a.domainchunks, thunks) end diff --git a/src/array/darray.jl b/src/array/darray.jl index 7f1b8c6bd..221600922 100644 --- a/src/array/darray.jl +++ b/src/array/darray.jl @@ -1,4 +1,4 @@ -import Base: == +import Base: ==, fetch using Serialization import Serialization: serialize, deserialize @@ -12,7 +12,6 @@ An `N`-dimensional domain over an array. struct ArrayDomain{N} indexes::NTuple{N, Any} end - include("../lib/domain-blocks.jl") @@ -78,13 +77,14 @@ domain(x::AbstractArray) = ArrayDomain([1:l for l in size(x)]) abstract type ArrayOp{T, N} <: AbstractArray{T, N} end Base.IndexStyle(::Type{<:ArrayOp}) = IndexCartesian() -compute(ctx, x::ArrayOp; options=nothing) = - compute(ctx, cached_stage(ctx, x)::DArray; options=options) -collect(ctx::Context, x::ArrayOp; options=nothing) = - collect(ctx, compute(ctx, x; options=options); options=options) +collect(x::ArrayOp) = collect(fetch(x)) + +Base.fetch(x::ArrayOp) = fetch(cached_stage(Context(global_context()), x)::DArray) -collect(x::ArrayOp; options=nothing) = collect(Context(global_context()), x; options=options) +collect(x::Computation) = collect(fetch(x)) + +Base.fetch(x::Computation) = fetch(cached_stage(Context(global_context()), x)) function Base.show(io::IO, ::MIME"text/plain", x::ArrayOp) write(io, string(typeof(x))) @@ -113,7 +113,7 @@ An N-dimensional distributed array of element type T, with a concatenation funct mutable struct DArray{T,N,F} <: ArrayOp{T, N} domain::ArrayDomain{N} subdomains::AbstractArray{ArrayDomain{N}, N} - chunks::AbstractArray{Union{Chunk,Thunk}, N} + chunks::AbstractArray{Any, N} concat::F function DArray{T,N,F}(domain, subdomains, chunks, concat::Function) where {T, N,F} new(domain, subdomains, chunks, concat) @@ -135,8 +135,8 @@ domainchunks(d::DArray) = d.subdomains size(x::DArray) = size(domain(x)) stage(ctx, c::DArray) = c -function collect(ctx::Context, d::DArray; tree=false, options=nothing) - a = compute(ctx, d; options=options) +function collect(d::DArray; tree=false) + a = fetch(d) if isempty(d.chunks) return Array{eltype(d)}(undef, size(d)...) @@ -144,9 +144,9 @@ function collect(ctx::Context, d::DArray; tree=false, options=nothing) dimcatfuncs = [(x...) -> d.concat(x..., dims=i) for i in 1:ndims(d)] if tree - collect(treereduce_nd(delayed.(dimcatfuncs), a.chunks)) + collect(fetch(treereduce_nd(map(x -> ((args...,) -> Dagger.@spawn x(args...)) , dimcatfuncs), a.chunks))) else - treereduce_nd(dimcatfuncs, asyncmap(collect, a.chunks)) + treereduce_nd(dimcatfuncs, asyncmap(fetch, a.chunks)) end end @@ -209,53 +209,33 @@ _cumsum(x::AbstractArray) = length(x) == 0 ? 
Int[] : cumsum(x) function lookup_parts(ps::AbstractArray, subdmns::DomainBlocks{N}, d::ArrayDomain{N}) where N groups = map(group_indices, subdmns.cumlength, indexes(d)) sz = map(length, groups) - pieces = Array{Union{Chunk,Thunk}}(undef, sz) + pieces = Array{Any}(undef, sz) for i = CartesianIndices(sz) idx_and_dmn = map(getindex, groups, i.I) idx = map(x->x[1], idx_and_dmn) dmn = ArrayDomain(map(x->x[2], idx_and_dmn)) - pieces[i] = delayed(getindex)(ps[idx...], project(subdmns[idx...], dmn)) + pieces[i] = Dagger.@spawn getindex(ps[idx...], project(subdmns[idx...], dmn)) end out_cumlength = map(g->_cumsum(map(x->length(x[2]), g)), groups) out_dmn = DomainBlocks(ntuple(x->1,Val(N)), out_cumlength) pieces, out_dmn end - """ - compute(ctx::Context, x::DArray; persist=true, options=nothing) - -A `DArray` object may contain a thunk in it, in which case -we first turn it into a `Thunk` and then compute it. -""" -function compute(ctx::Context, x::DArray; persist=true, options=nothing) - thunk = thunkize(ctx, x, persist=persist) - if isa(thunk, Thunk) - compute(ctx, thunk; options=options) - else - x - end -end - -""" - thunkize(ctx::Context, c::DArray; persist=true) + Base.fetch(c::DArray) If a `DArray` tree has a `Thunk` in it, make the whole thing a big thunk. """ -function thunkize(ctx::Context, c::DArray; persist=true) +function Base.fetch(c::DArray) if any(istask, chunks(c)) thunks = chunks(c) sz = size(thunks) dmn = domain(c) dmnchunks = domainchunks(c) - if persist - foreach(persist!, thunks) - end - Thunk(thunks...; meta=true) do results... + fetch(Dagger.spawn(Options(meta=true), thunks...) do results... t = eltype(results[1]) - DArray(t, dmn, dmnchunks, - reshape(Union{Chunk,Thunk}[results...], sz)) - end + DArray(t, dmn, dmnchunks, reshape(Any[results...], sz)) + end) else c end @@ -324,7 +304,7 @@ Distribute(p::Blocks, data::AbstractArray) = function stage(ctx::Context, d::Distribute) if isa(d.data, ArrayOp) - # distributing a dsitributed array + # distributing a distributed array x = cached_stage(ctx, d.data) if d.domainchunks == domainchunks(x) return x # already properly distributed @@ -335,19 +315,22 @@ function stage(ctx::Context, d::Distribute) cs = map(d.domainchunks) do idx chunks = cached_stage(ctx, x[idx]).chunks shape = size(chunks) - (delayed() do shape, parts... + hash = Base.hash(idx, Base.hash(stage, Base.hash(d.data))) + Dagger.spawn(shape, chunks...) do shape, parts... if prod(shape) == 0 return Array{T}(undef, shape) end dimcatfuncs = [(x...) -> concat(x..., dims=i) for i in 1:length(shape)] ps = reshape(Any[parts...], shape) collect(treereduce_nd(dimcatfuncs, ps)) - end)(shape, chunks...) 
+ end end else - cs = map(c -> delayed(identity)(d.data[c]), d.domainchunks) + cs = map(d.domainchunks) do c + hash = Base.hash(c, Base.hash(stage, Base.hash(d.data))) + Dagger.@spawn hash=hash identity(d.data[c]) + end end - DArray( eltype(d.data), domain(d.data), @@ -357,7 +340,7 @@ function stage(ctx::Context, d::Distribute) end function distribute(x::AbstractArray, dist) - compute(Distribute(dist, x)) + fetch(Distribute(dist, x)) end function distribute(x::AbstractArray{T,N}, n::NTuple{N}) where {T,N} diff --git a/src/array/getindex.jl b/src/array/getindex.jl index d04ae024f..89d38bb97 100644 --- a/src/array/getindex.jl +++ b/src/array/getindex.jl @@ -34,9 +34,9 @@ end function stage(ctx::Context, gidx::GetIndexScalar) inp = cached_stage(ctx, gidx.input) s = view(inp, ArrayDomain(gidx.idx)) - delayed(identity)(collect(s)[1]) + Dagger.@spawn identity(collect(s)[1]) end Base.getindex(c::ArrayOp, idx::ArrayDomain) = GetIndex(c, indexes(idx)) Base.getindex(c::ArrayOp, idx...) = GetIndex(c, idx) -Base.getindex(c::ArrayOp, idx::Integer...) = compute(GetIndexScalar(c, idx)) +Base.getindex(c::ArrayOp, idx::Integer...) = fetch(GetIndexScalar(c, idx)) diff --git a/src/array/map-reduce.jl b/src/array/map-reduce.jl index e39019ed0..76bdcd727 100644 --- a/src/array/map-reduce.jl +++ b/src/array/map-reduce.jl @@ -19,7 +19,7 @@ function stage(ctx::Context, node::Map) f = node.f for i=eachindex(domains) inps = map(x->chunks(x)[i], inputs) - thunks[i] = Thunk((args...) -> map(f, args...), inps...) + thunks[i] = Dagger.@spawn map(f, inps...) end DArray(Any, domain(primary), domainchunks(primary), thunks) end @@ -40,16 +40,16 @@ end function stage(ctx::Context, r::ReduceBlock) inp = stage(ctx, r.input) - reduced_parts = map(x -> Thunk(r.op, x; get_result=r.get_result), chunks(inp)) - Thunk((xs...) -> r.op_master(xs), reduced_parts...; meta=true) + reduced_parts = map(x -> (Dagger.@spawn get_result=r.get_result r.op(x)), chunks(inp)) + r_op_master(args...,) = r.op_master(args) + Dagger.@spawn meta=true r_op_master(reduced_parts...) 
end reduceblock_async(f, x::ArrayOp; get_result=true) = ReduceBlock(f, f, x, get_result) reduceblock_async(f, g::Function, x::ArrayOp; get_result=true) = ReduceBlock(f, g, x, get_result) -reduceblock(f, x::ArrayOp) = compute(reduceblock_async(f, x)) -reduceblock(f, g::Function, x::ArrayOp) = - compute(reduceblock_async(f, g, x)) +reduceblock(f, x::ArrayOp) = fetch(reduceblock_async(f, x)) +reduceblock(f, g::Function, x::ArrayOp) = fetch(reduceblock_async(f, g, x)) reduce_async(f::Function, x::ArrayOp) = reduceblock_async(xs->reduce(f,xs), xs->reduce(f,xs), x) @@ -62,7 +62,7 @@ prod(f::Function, x::ArrayOp) = reduceblock(a->prod(f, a), prod, x) mean(x::ArrayOp) = reduceblock(mean, mean, x) -mapreduce(f::Function, g::Function, x::ArrayOp) = reduce(g, map(f, x)) +mapreduce(f::Function, g::Function, x::ArrayOp) = reduce(g, map(f, x)) #think about fetching function mapreducebykey_seq(f, op, itr, dict=Dict()) for x in itr @@ -115,7 +115,7 @@ end function reduce(f::Function, x::ArrayOp; dims = nothing) if dims === nothing - return compute(reduce_async(f,x)) + return fetch(reduce_async(f,x)) elseif dims isa Int dims = (dims,) end @@ -126,10 +126,10 @@ function stage(ctx::Context, r::Reducedim) inp = cached_stage(ctx, r.input) thunks = let op = r.op, dims=r.dims # do reducedim on each block - tmp = map(p->Thunk(b->reduce(op,b,dims=dims), p), chunks(inp)) + tmp = map(p->Dagger.spawn(b->reduce(op,b,dims=dims), p), chunks(inp)) # combine the results in tree fashion treereducedim(tmp, r.dims) do x,y - Thunk(op, x,y) + Dagger.@spawn op(x,y) end end c = domainchunks(inp) diff --git a/src/array/matrix.jl b/src/array/matrix.jl index 686dfb3b8..48a77190e 100644 --- a/src/array/matrix.jl +++ b/src/array/matrix.jl @@ -17,10 +17,12 @@ function size(x::Transpose) end transpose(x::ArrayOp) = Transpose(transpose, x) -transpose(x::Union{Chunk, Thunk}) = Thunk(transpose, x) + +transpose(x::Union{Chunk, EagerThunk}) = @spawn transpose(x) adjoint(x::ArrayOp) = Transpose(adjoint, x) -adjoint(x::Union{Chunk, Thunk}) = Thunk(adjoint, x) +adjoint(x::Union{Chunk, EagerThunk}) = @spawn adjoint(x) + function adjoint(x::ArrayDomain{2}) d = indexes(x) @@ -32,7 +34,7 @@ function adjoint(x::ArrayDomain{1}) end function _ctranspose(x::AbstractArray) - Any[delayed(adjoint)(x[j,i]) for i=1:size(x,2), j=1:size(x,1)] + Any[Dagger.@spawn adjoint(x[j,i]) for i=1:size(x,2), j=1:size(x,1)] end function stage(ctx::Context, node::Transpose) @@ -91,8 +93,12 @@ function (+)(a::ArrayDomain, b::ArrayDomain) a end -(*)(a::Union{Chunk, Thunk}, b::Union{Chunk, Thunk}) = Thunk(*, a,b) -(+)(a::Union{Chunk, Thunk}, b::Union{Chunk, Thunk}) = Thunk(+, a,b) +struct BinaryComputeOp{F} end +BinaryComputeOp{F}(x::Union{Chunk,EagerThunk}, y::Union{Chunk,EagerThunk}) where F = @spawn F(x, y) +BinaryComputeOp{F}(x, y) where F = F(x, y) + +const AddComputeOp = BinaryComputeOp{+} +const MulComputeOp = BinaryComputeOp{*} # we define our own matmat and matvec multiply # for computing the new domains and thunks. 
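As a reviewer-facing note (not part of the diff): the `BinaryComputeOp` wrapper introduced above replaces the old `(*)`/`(+)` methods on `Chunk`/`Thunk`, applying the wrapped function eagerly to plain values and spawning a task when either operand is a `Chunk` or `EagerThunk`, roughly:

```julia
# Illustrative only; AddComputeOp/MulComputeOp are internal to Dagger's matrix code
MulComputeOp(2, 3)         # plain values: evaluated eagerly, returns 6
t = Dagger.@spawn rand(4, 4)
s = AddComputeOp(t, t)     # EagerThunk operand: returns a spawned task
fetch(s) == 2 .* fetch(t)  # true once both tasks finish
```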
@@ -101,7 +107,7 @@ function _mul(a::Matrix, b::Matrix; T=eltype(a)) n = size(a, 2) for i=1:size(a,1) for j=1:size(b, 2) - c[i,j] = treereduce(+, map(*, reshape(a[i,:], (n,)), b[:, j])) + c[i,j] = treereduce(AddComputeOp, map(MulComputeOp, reshape(a[i,:], (n,)), b[:, j])) end end c @@ -111,14 +117,14 @@ function _mul(a::Matrix, b::Vector; T=eltype(b)) c = Array{T}(undef, size(a,1)) n = size(a,2) for i=1:size(a,1) - c[i] = treereduce(+, map(*, reshape(a[i, :], (n,)), b)) + c[i] = treereduce(AddComputeOp, map(MulComputeOp, reshape(a[i, :], (n,)), b)) end c end function _mul(a::Vector, b::Vector; T=eltype(b)) @assert length(b) == 1 - [x * b[1] for x in a] + [MulComputeOp(x, b[1]) for x in a] end function promote_distribution(ctx::Context, m::MatMul, a,b) @@ -176,7 +182,7 @@ function stage(ctx::Context, mul::MatMul) a, b = stage_operands(ctx, mul, mul.a, mul.b) d = domain(a)*domain(b) DArray(Any, d, domainchunks(a)*domainchunks(b), - _mul(chunks(a), chunks(b); T=Thunk)) + _mul(chunks(a), chunks(b); T=Any)) end Base.power_by_squaring(x::DArray, i::Int) = foldl(*, ntuple(idx->x, i)) @@ -211,7 +217,7 @@ end function _scale(l, r) res = similar(r, Any) for i=1:length(l) - res[i,:] = map(x->Thunk((a,b) -> Diagonal(a)*b, l[i], x), r[i,:]) + res[i,:] = map(x->Dagger.spawn((a,b) -> Diagonal(a)*b, l[i], x), r[i,:]) end res end diff --git a/src/array/operators.jl b/src/array/operators.jl index 6bfe95ae6..640db4ea9 100644 --- a/src/array/operators.jl +++ b/src/array/operators.jl @@ -90,8 +90,7 @@ function stage(ctx::Context, node::BCast) end blcks = DomainBlocks(map(_->1, size(node)), cumlengths) - thunks = broadcast(delayed((args...)->broadcast(bc.f, args...); ), - args2...) + thunks = broadcast((args3...)->Dagger.spawn((args...)->broadcast(bc.f, args...), args3...), args2...) DArray(eltype(node), domain(node), blcks, thunks) end @@ -107,7 +106,7 @@ Base.@deprecate mappart(args...) mapchunk(args...) function stage(ctx::Context, node::MapChunk) inputs = map(x->cached_stage(ctx, x), node.input) thunks = map(map(chunks, inputs)...) do ps... - Thunk(node.f, ps...) + Dagger.spawn(node.f, map(p->nothing=>p, ps)...) end DArray(Any, domain(inputs[1]), domainchunks(inputs[1]), thunks) diff --git a/src/array/setindex.jl b/src/array/setindex.jl index e2c842901..45ae35c64 100644 --- a/src/array/setindex.jl +++ b/src/array/setindex.jl @@ -28,14 +28,14 @@ function stage(ctx::Context, sidx::SetIndex) groups = map(group_indices, subdmns.cumlength, indexes(d)) sz = map(length, groups) - pieces = Array{Union{Chunk, Thunk}}(undef, sz) + pieces = Array{Any}(undef, sz) for i = CartesianIndices(sz) idx_and_dmn = map(getindex, groups, i.I) idx = map(x->x[1], idx_and_dmn) local_dmn = ArrayDomain(map(x->x[2], idx_and_dmn)) s = subdmns[idx...] part_to_set = sidx.val - ps[idx...] = Thunk(ps[idx...]) do p + ps[idx...] = Dagger.spawn(ps[idx...]) do p q = copy(p) q[indexes(project(s, local_dmn))...] .= part_to_set q diff --git a/src/array/sort.jl b/src/array/sort.jl index cbc9b21f1..7c989e05c 100644 --- a/src/array/sort.jl +++ b/src/array/sort.jl @@ -3,6 +3,7 @@ using Distributed using StatsBase + function getmedians(x, n) q,r = divrem(length(x), n+1) @@ -28,6 +29,7 @@ function sortandsample_array(ord, xs, nsamples, presorted=false) chunk = nothing # avoid communicating metadata if already sorted samples = chunk[r] end + (chunk, samples) end @@ -89,7 +91,7 @@ end function collect_merge(merge, group) #delayed((xs...) -> treereduce(merge, Any[xs...]))(group...) - t = treereduce(delayed(merge), group) + t = treereduce((args...) 
-> (Dagger.@spawn merge(args...)), group) end # Given sorted chunks, splits each chunk according to splitters @@ -117,15 +119,15 @@ function splitchunk(c, splitters, by, sub, ord) 1:j end - between = map((hi, lo) -> delayed(x->sub(x, getbetween(by(x), hi, lo, ord)))(c), + between = map((hi, lo) -> Dagger.spawn(x->sub(x, getbetween(by(x), hi, lo, ord)), c), splitters[1:end-1], splitters[2:end]) hi = splitters[1] lo = splitters[end] - a = delayed(x->sub(x, getlt(by(x), hi, ord)))(c) + a = Dagger.spawn(x->sub(x, getlt(by(x), hi, ord)), c) b = between - c = delayed(x->sub(x, getgt(by(x), lo, ord)))(c) - return vcat(a, b, c) + c = Dagger.spawn(x->sub(x, getgt(by(x), lo, ord)), c) + return map(fetch, vcat(a, b, c)) #[delayed(c->sub(c, getlt(by(c), hi, ord)))(c); # between; delayed(c->sub(c, getgt(by(c), lo, ord)))(c)] @@ -263,17 +265,17 @@ function dsort_chunks(cs, nchunks=length(cs), nsamples=2000; sortandsample = (x,ns, presorted)->sortandsample_array(order, x,ns, presorted), affinities=workers(), ) - if splitters !== nothing + if splitters != nothing # this means splitters are given beforehand nsamples = 0 # no samples needed end # first sort each chunk and sample nsamples elements from each chunk - cs1 = map(c->delayed(sortandsample)(c, nsamples, chunks_presorted), cs) - + cs1 = map(c->Dagger.spawn(sortandsample, c, nsamples, chunks_presorted), cs) batchsize = max(2, batchsize) - # collect the samples - xs = collect(treereduce(delayed(vcat), cs1)) + + xs = treereduce((cs...)->Dagger.spawn(vcat, cs...), cs1) + xs = collect(fetch(xs)) if length(cs1) == 1 xs = [xs] end @@ -283,10 +285,9 @@ function dsort_chunks(cs, nchunks=length(cs), nsamples=2000; samples = reduce((a,b)->merge_sorted(order, a, b), map(x->x[2], xs)) splitters = getmedians(samples, nchunks-1) end - - # if first(x) === nothing this means that - # the chunk was already sorted, so just - # use the input chunk as-is + # if first(x) === nothing this means that + # the chunk was already sorted, so just + # use the input chunk as-is cs2 = map((x,c) -> x === nothing ? c : x, map(first, xs), cs) # main sort routine. At this point: @@ -304,7 +305,7 @@ function dsort_chunks(cs, nchunks=length(cs), nsamples=2000; end function propagate_affinity!(c, aff) - if !isa(c, Thunk) + if !isa(c, EagerThunk) return end if c.affinity !== nothing @@ -326,17 +327,13 @@ function Base.sort(v::ArrayOp; batchsize=max(2, nworkers()), nsamples=2000, order::Ordering=default_ord) - v1 = compute(v) + v1 = fetch(v) ord = Base.Sort.ord(lt,by,rev,order) nchunks = nchunks === nothing ? length(v1.chunks) : nchunks cs = dsort_chunks(v1.chunks, nchunks, nsamples, order=ord, merge=(x,y)->merge_sorted(ord, x,y)) - foreach(persist!, cs) - t=delayed((xs...)->[xs...]; meta=true)(cs...) - # `compute(t)` only computes references to materialized version of `cs` - # we don't want the scheduler to think that `cs`s' job is done - # so we call persist! above - chunks = compute(t) + t=Dagger.spawn((xs...)->[xs...], cs...) 
+ chunks = fetch(t) dmn = ArrayDomain((1:sum(length(domain(c)) for c in chunks),)) DArray(eltype(v1), dmn, map(domain, chunks), chunks) end diff --git a/src/chunks.jl b/src/chunks.jl index 5c3d1533f..b8f2a0cd0 100644 --- a/src/chunks.jl +++ b/src/chunks.jl @@ -267,6 +267,16 @@ function savechunk(data, dir, f) Chunk{typeof(data),typeof(fr),typeof(proc),typeof(scope)}(typeof(data), domain(data), fr, proc, scope, true) end +struct WeakChunk + x::WeakRef +end +WeakChunk(c::Chunk) = WeakChunk(WeakRef(c)) +unwrap_weak(c::WeakChunk) = c.x.value +function unwrap_weak_checked(c::WeakChunk) + c = unwrap_weak(c) + @assert c !== nothing + return c +end Base.@deprecate_binding AbstractPart Union{Chunk, Thunk} Base.@deprecate_binding Part Chunk diff --git a/src/compute.jl b/src/compute.jl index d895f79db..af034cb81 100644 --- a/src/compute.jl +++ b/src/compute.jl @@ -12,11 +12,6 @@ collect(d::Union{Chunk,Thunk}; options=nothing) = abstract type Computation end -compute(ctx, c::Computation; options=nothing) = - compute(ctx, stage(ctx, c); options=options) -collect(c::Computation; options=nothing) = - collect(Context(global_context()), c; options=options) - """ compute(ctx::Context, d::Thunk; options=nothing) -> Chunk @@ -97,7 +92,7 @@ function dependents(node::Thunk) if !haskey(deps, next) deps[next] = Set{Thunk}() end - for inp in inputs(next) + for (_, inp) in next.inputs if istask(inp) || (inp isa Chunk) s = get!(()->Set{Thunk}(), deps, inp) push!(s, next) @@ -165,7 +160,7 @@ function order(node::Thunk, ndeps) haskey(output, next) && continue s += 1 output[next] = s - parents = filter(istask, inputs(next)) + parents = filter(istask, map(last, next.inputs)) if !isempty(parents) # If parents is empty, sort! should be a no-op, but raises an ambiguity error # when InlineStrings.jl is loaded (at least, version 1.1.0), because InlineStrings diff --git a/src/lib/util.jl b/src/lib/util.jl index 977007dd4..67cca689b 100644 --- a/src/lib/util.jl +++ b/src/lib/util.jl @@ -152,6 +152,8 @@ function setindex(x::NTuple{N}, idx, v) where N map(ifelse, ntuple(x->idx === x, Val(N)), ntuple(x->v, Val(N)), x) end +export treereducedim + function showloc(f, argcount) args = ntuple(x->Any, argcount) ms = methods(f) diff --git a/src/processor.jl b/src/processor.jl index e99399dc0..24739cd43 100644 --- a/src/processor.jl +++ b/src/processor.jl @@ -1,5 +1,7 @@ export OSProc, Context, addprocs!, rmprocs! +import Base: @invokelatest + """ Processor @@ -29,11 +31,11 @@ function delete_processor_callback!(name::Symbol) end """ - execute!(proc::Processor, f, args...) -> Any + execute!(proc::Processor, f, args...; kwargs...) -> Any -Executes the function `f` with arguments `args` on processor `proc`. This -function can be overloaded by `Processor` subtypes to allow executing function -calls differently than normal Julia. +Executes the function `f` with arguments `args` and keyword arguments `kwargs` +on processor `proc`. This function can be overloaded by `Processor` subtypes to +allow executing function calls differently than normal Julia. """ function execute! end @@ -152,13 +154,14 @@ end iscompatible(proc::ThreadProc, opts, f, args...) 
= true iscompatible_func(proc::ThreadProc, opts, f) = true iscompatible_arg(proc::ThreadProc, opts, x) = true -function execute!(proc::ThreadProc, @nospecialize(f), @nospecialize(args...)) +function execute!(proc::ThreadProc, @nospecialize(f), @nospecialize(args...); @nospecialize(kwargs...)) tls = get_tls() task = Task() do set_tls!(tls) TimespanLogging.prof_task_put!(tls.sch_handle.thunk_id.id) - f(args...) + @invokelatest f(args...; kwargs...) end + task.sticky = true ret = ccall(:jl_set_task_tid, Cint, (Any, Cint), task, proc.tid-1) if ret == 0 error("jl_set_task_tid == 0") @@ -333,8 +336,7 @@ get_tls() = ( task_hash=task_local_storage(:_dagger_task_hash), input_hash=get(task_local_storage(), :_dagger_input_hash, nothing), processor=thunk_processor(), - time_utilization=task_local_storage(:_dagger_time_utilization), - alloc_utilization=task_local_storage(:_dagger_alloc_utilization), + task_spec=task_local_storage(:_dagger_task_spec), ) """ @@ -350,6 +352,7 @@ function set_tls!(tls) task_local_storage(:_dagger_input_hash, tls.input_hash) end task_local_storage(:_dagger_processor, get(tls, :processor, nothing)) + task_local_storage(:_dagger_task_spec, get(tls, :task_spec, nothing)) task_local_storage(:_dagger_time_utilization, get(tls, :time_utilization, nothing)) task_local_storage(:_dagger_alloc_utilization, get(tls, :alloc_utilization, nothing)) end diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl index bee0717ef..587899f3b 100644 --- a/src/sch/Sch.jl +++ b/src/sch/Sch.jl @@ -6,10 +6,14 @@ import MemPool: DRef, StorageResource import MemPool: poolset, storage_available, storage_capacity, storage_utilized, externally_varying import Statistics: mean import Random: randperm +import Base: @invokelatest import ..Dagger -import ..Dagger: Context, Processor, Thunk, WeakThunk, ThunkFuture, ThunkFailedException, Chunk, OSProc, AnyScope +import ..Dagger: Context, Processor, Thunk, WeakThunk, ThunkFuture, ThunkFailedException, Chunk, WeakChunk, OSProc, AnyScope, DefaultScope, LockedObject import ..Dagger: order, dependents, noffspring, istask, inputs, unwrap_weak_checked, affinity, tochunk, timespan_start, timespan_finish, procs, move, chunktype, processor, default_enabled, get_processors, get_parent, execute!, rmprocs!, addprocs!, thunk_processor, constrain, cputhreadtime, uhash +import DataStructures: PriorityQueue, enqueue!, dequeue_pair!, peek + +import ..Dagger const OneToMany = Dict{Thunk, Set{Thunk}} @@ -96,8 +100,10 @@ struct ComputeState chan::RemoteChannel{Channel{Any}} end +const UID_COUNTER = Threads.Atomic{UInt64}(1) + function start_state(deps::Dict, node_order, chan) - state = ComputeState(rand(UInt64), + state = ComputeState(Threads.atomic_add!(UID_COUNTER, UInt64(1)), OneToMany(), deps, Vector{Thunk}(undef, 0), @@ -124,7 +130,7 @@ function start_state(deps::Dict, node_order, chan) for k in sort(collect(keys(deps)), by=node_order) if istask(k) - waiting = Set{Thunk}(Iterators.filter(istask, inputs(k))) + waiting = Set{Thunk}(Iterators.filter(istask, map(last, inputs(k)))) if isempty(waiting) push!(state.ready, k) else @@ -142,13 +148,13 @@ end Stores DAG-global options to be passed to the Dagger.Sch scheduler. # Arguments -- `single::Int=0`: Force all work onto worker with specified id. `0` disables -this option. -- `proclist=nothing`: Force scheduler to use one or more processors that are -instances/subtypes of a contained type. 
Alternatively, a function can be
-supplied, and the function will be called with a processor as the sole
-argument and should return a `Bool` result to indicate whether or not to use
-the given processor. `nothing` enables all default processors.
+- `single::Int=0`: (Deprecated) Force all work onto worker with specified id.
+`0` disables this option.
+- `proclist=nothing`: (Deprecated) Force scheduler to use one or more
+processors that are instances/subtypes of a contained type. Alternatively, a
+function can be supplied, and the function will be called with a processor as
+the sole argument and should return a `Bool` result to indicate whether or not
+to use the given processor. `nothing` enables all default processors.
- `allow_errors::Bool=true`: Allow thunks to error without affecting
non-dependent thunks.
- `checkpoint=nothing`: If not `nothing`, uses the provided function to save
@@ -175,23 +181,27 @@ end
Stores Thunk-local options to be passed to the Dagger.Sch scheduler.
# Arguments
-- `single::Int=0`: Force thunk onto worker with specified id. `0` disables this
-option.
-- `proclist=nothing`: Force thunk to use one or more processors that are
-instances/subtypes of a contained type. Alternatively, a function can be
-supplied, and the function will be called with a processor as the sole
+- `single::Int=0`: (Deprecated) Force thunk onto worker with specified id. `0`
+disables this option.
+- `proclist=nothing`: (Deprecated) Force thunk to use one or more processors
+that are instances/subtypes of a contained type. Alternatively, a function can
+be supplied, and the function will be called with a processor as the sole
argument and should return a `Bool` result to indicate whether or not to use
the given processor. `nothing` enables all default processors.
-- `time_util::Dict{Type,Any}=Dict{Type,Any}()`: Indicates the maximum expected
-time utilization for this thunk. Each keypair maps a processor type to the
-utilization, where the value can be a real (approximately the number of
-nanoseconds taken), or `MaxUtilization()` (utilizes all processors of this
-type). By default, the scheduler assumes that this thunk only uses one
-processor.
-- `alloc_util::Dict{Type,UInt64}=Dict{Type,UInt64}()`: Indicates the maximum
-expected memory utilization for this thunk. Each keypair maps a processor type
-to the utilization, where the value is an integer representing approximately
-the maximum number of bytes allocated at any one time.
+- `time_util::Dict{Type,Any}`: Indicates the maximum expected time utilization
+for this thunk. Each keypair maps a processor type to the utilization, where
+the value can be a real (approximately the number of nanoseconds taken), or
+`MaxUtilization()` (utilizes all processors of this type). By default, the
+scheduler assumes that this thunk only uses one processor.
+- `alloc_util::Dict{Type,UInt64}`: Indicates the maximum expected memory
+utilization for this thunk. Each keypair maps a processor type to the
+utilization, where the value is an integer representing approximately the
+maximum number of bytes allocated at any one time.
+- `occupancy::Dict{Type,Real}`: Indicates the maximum expected processor
+occupancy for this thunk. Each keypair maps a processor type to the
+occupancy, where the value can be a real between 0 and 1 (the occupancy
+ratio, where 1 is full occupancy). By default, the scheduler assumes that this
+thunk has full occupancy.
- `allow_errors::Bool=true`: Allow this thunk to error without affecting
non-dependent thunks.
- `checkpoint=nothing`: If not `nothing`, uses the provided function to save @@ -214,15 +224,13 @@ Base.@kwdef struct ThunkOptions proclist = nothing time_util::Union{Dict{Type,Any},Nothing} = nothing alloc_util::Union{Dict{Type,UInt64},Nothing} = nothing + occupancy::Union{Dict{Type,Real},Nothing} = nothing allow_errors::Union{Bool,Nothing} = nothing checkpoint = nothing restore = nothing storage::Union{Chunk,Nothing} = nothing end -# Eager scheduling -include("eager.jl") - """ Base.merge(sopts::SchedulerOptions, topts::ThunkOptions) -> ThunkOptions @@ -236,6 +244,7 @@ function Base.merge(sopts::SchedulerOptions, topts::ThunkOptions) proclist, topts.time_util, topts.alloc_util, + topts.occupancy, allow_errors, topts.checkpoint, topts.restore, @@ -268,6 +277,7 @@ function populate_defaults(opts::ThunkOptions, Tf, Targs) maybe_default(:proclist), maybe_default(:time_util), maybe_default(:alloc_util), + maybe_default(:occupancy), maybe_default(:allow_errors), maybe_default(:checkpoint), maybe_default(:restore), @@ -278,6 +288,9 @@ end function cleanup(ctx) end +# Eager scheduling +include("eager.jl") + const WORKER_MONITOR_LOCK = Threads.ReentrantLock() const WORKER_MONITOR_TASKS = Dict{Int,Task}() const WORKER_MONITOR_CHANS = Dict{Int,Dict{UInt64,RemoteChannel}}() @@ -464,6 +477,8 @@ function scheduler_init(ctx, state::ComputeState, d::Thunk, options, deps) end function scheduler_run(ctx, state::ComputeState, d::Thunk, options) + @dagdebug nothing :global "Initializing scheduler" uid=state.uid + safepoint(state) # Loop while we still have thunks to execute @@ -477,12 +492,14 @@ function scheduler_run(ctx, state::ComputeState, d::Thunk, options) isempty(state.running) && continue timespan_start(ctx, :take, 0, 0) + @dagdebug nothing :take "Waiting for results" chan_value = take!(state.chan) # get result of completed thunk timespan_finish(ctx, :take, 0, 0) if chan_value isa RescheduleSignal continue end pid, proc, thunk_id, (res, metadata) = chan_value + @dagdebug thunk_id :take "Got finished task" gproc = OSProc(pid) safepoint(state) lock(state.lock) do @@ -527,7 +544,7 @@ function scheduler_run(ctx, state::ComputeState, d::Thunk, options) state.errored[node] = thunk_failed if node.options !== nothing && node.options.checkpoint !== nothing try - node.options.checkpoint(node, res) + @invokelatest node.options.checkpoint(node, res) catch err report_catch_error(err, "Thunk checkpoint failed") end @@ -556,6 +573,8 @@ function scheduler_run(ctx, state::ComputeState, d::Thunk, options) return value, errored end function scheduler_exit(ctx, state::ComputeState, options) + @dagdebug nothing :global "Tearing down scheduler" uid=state.uid + close(state.chan) notify(state.halt) @sync for p in procs_to_use(ctx) @@ -596,7 +615,7 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) populate_processor_cache_list!(state, procs) # Schedule tasks - to_fire = Dict{Tuple{OSProc,<:Processor},Vector{Tuple{Thunk,<:Any,<:Any}}}() + to_fire = Dict{Tuple{OSProc,<:Processor},Vector{Tuple{Thunk,<:Any,<:Any,UInt64,UInt32}}}() failed_scheduling = Thunk[] # Select a new task and get its options @@ -610,7 +629,22 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) end task = pop!(state.ready) timespan_start(ctx, :schedule, task.id, (;thunk_id=task.id)) - @assert !haskey(state.cache, task) + if haskey(state.cache, task) + if haskey(state.errored, task) + # An error was eagerly propagated to this task + finish_failed!(state, task) + else + # This shouldn't have happened + iob = IOBuffer() + println(iob, 
"Scheduling inconsistency: Task being scheduled is already cached!") + println(iob, " Task: $(task.id)") + println(iob, " Cache Entry: $(typeof(state.cache[task]))") + ex = SchedulingException(String(take!(iob))) + state.cache[task] = ex + state.errored[task] = true + end + @goto pop_task + end opts = merge(ctx.options, task.options) sig = signature(task, state) if task.hash == UInt(0) @@ -622,9 +656,14 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) scope = if task.f isa Chunk task.f.scope else - AnyScope() + if task.options.proclist !== nothing + # proclist overrides scope selection + AnyScope() + else + DefaultScope() + end end - for input in task.inputs + for (_,input) in task.inputs input = unwrap_weak_checked(input) chunk = if istask(input) state.cache[input] @@ -653,7 +692,7 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) @goto fallback end - inputs = collect_task_inputs(state, task) + inputs = map(last, collect_task_inputs(state, task)) opts = populate_defaults(opts, chunktype(task.f), map(chunktype, inputs)) local_procs, costs = estimate_task_costs(state, local_procs, task, inputs) scheduled = false @@ -670,18 +709,24 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) for proc in local_procs gproc = get_parent(proc) - if can_use_proc(task, gproc, proc, opts, scope) - has_cap, est_time_util, est_alloc_util = has_capacity(state, proc, gproc.pid, opts.time_util, opts.alloc_util, sig) + can_use, scope = can_use_proc(task, gproc, proc, opts, scope) + if can_use + has_cap, est_time_util, est_alloc_util, est_occupancy = + has_capacity(state, proc, gproc.pid, opts.time_util, opts.alloc_util, opts.occupancy, sig) if has_cap # Schedule task onto proc # FIXME: est_time_util = est_time_util isa MaxUtilization ? cap : est_time_util - push!(get!(()->Vector{Tuple{Thunk,<:Any,<:Any}}(), to_fire, (gproc, proc)), (task, est_time_util, est_alloc_util)) - state.worker_time_pressure[gproc.pid][proc] = get(state.worker_time_pressure[gproc.pid], proc, UInt64(0)) + est_time_util + proc_tasks = get!(to_fire, (gproc, proc)) do + Vector{Tuple{Thunk,<:Any,<:Any,UInt64,UInt32}}() + end + push!(proc_tasks, (task, scope, est_time_util, est_alloc_util, est_occupancy)) + state.worker_time_pressure[gproc.pid][proc] = get(state.worker_time_pressure[gproc.pid], proc, UInt64(0)) + est_time_util + @dagdebug task :schedule "Scheduling to $gproc -> $proc" @goto pop_task end end end - state.cache[task] = SchedulingException("No processors available, try making proclist more liberal") + state.cache[task] = SchedulingException("No processors available, try widening scope") state.errored[task] = true set_failed!(state, task) @goto pop_task @@ -694,8 +739,10 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) cap, extra_util = nothing, nothing procs_found = false # N.B. 
if we only have one processor, we need to select it now - if can_use_proc(task, entry.gproc, entry.proc, opts, scope) - has_cap, est_time_util, est_alloc_util = has_capacity(state, entry.proc, entry.gproc.pid, opts.time_util, opts.alloc_util, sig) + can_use, scope = can_use_proc(task, entry.gproc, entry.proc, opts, scope) + if can_use + has_cap, est_time_util, est_alloc_util, est_occupancy = + has_capacity(state, entry.proc, entry.gproc.pid, opts.time_util, opts.alloc_util, opts.occupancy, sig) if has_cap selected_entry = entry else @@ -711,15 +758,17 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) if procs_found push!(failed_scheduling, task) else - state.cache[task] = SchedulingException("No processors available, try making proclist more liberal") + state.cache[task] = SchedulingException("No processors available, try widening scope") state.errored[task] = true set_failed!(state, task) end @goto pop_task end - if can_use_proc(task, entry.gproc, entry.proc, opts, scope) - has_cap, est_time_util, est_alloc_util = has_capacity(state, entry.proc, entry.gproc.pid, opts.time_util, opts.alloc_util, sig) + can_use, scope = can_use_proc(task, entry.gproc, entry.proc, opts, scope) + if can_use + has_cap, est_time_util, est_alloc_util, est_occupancy = + has_capacity(state, entry.proc, entry.gproc.pid, opts.time_util, opts.alloc_util, opts.occupancy, sig) if has_cap # Select this processor selected_entry = entry @@ -738,7 +787,10 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) # Schedule task onto proc gproc, proc = entry.gproc, entry.proc est_time_util = est_time_util isa MaxUtilization ? cap : est_time_util - push!(get!(()->Vector{Tuple{Thunk,<:Any,<:Any}}(), to_fire, (gproc, proc)), (task, est_time_util, est_alloc_util)) + proc_tasks = get!(to_fire, (gproc, proc)) do + Vector{Tuple{Thunk,<:Any,<:Any,UInt64,UInt32}}() + end + push!(proc_tasks, (task, scope, est_time_util, est_alloc_util, est_occupancy)) # Proceed to next entry to spread work state.procs_cache_list[] = state.procs_cache_list[].next @@ -856,13 +908,13 @@ function evict_chunks!(log_sink, chunks::Set{Chunk}) nothing end -fire_task!(ctx, thunk::Thunk, p, state; time_util=10^9, alloc_util=10^6) = - fire_task!(ctx, (thunk, time_util, alloc_util), p, state) -fire_task!(ctx, (thunk, time_util, alloc_util)::Tuple{Thunk,<:Any}, p, state) = - fire_tasks!(ctx, [(thunk, time_util, alloc_util)], p, state) +fire_task!(ctx, thunk::Thunk, p, state; scope=AnyScope(), time_util=10^9, alloc_util=10^6, occupancy=typemax(UInt32)) = + fire_task!(ctx, (thunk, scope, time_util, alloc_util, occupancy), p, state) +fire_task!(ctx, (thunk, scope, time_util, alloc_util, occupancy)::Tuple{Thunk,<:Any}, p, state) = + fire_tasks!(ctx, [(thunk, scope, time_util, alloc_util, occupancy)], p, state) function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state) to_send = [] - for (thunk, time_util, alloc_util) in thunks + for (thunk, scope, time_util, alloc_util, occupancy) in thunks push!(state.running, thunk) state.running_on[thunk] = gproc if thunk.cache && thunk.cache_ref !== nothing @@ -881,7 +933,7 @@ function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state) end if thunk.options !== nothing && thunk.options.restore !== nothing try - result = thunk.options.restore(thunk) + result = @invokelatest thunk.options.restore(thunk) if result isa Chunk state.cache[thunk] = result state.errored[thunk] = false @@ -898,10 +950,13 @@ function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state) ids = Int[0] data = 
Any[thunk.f] hashes = Union{UInt,Nothing}[uhash(thunk.f, UInt(0))] - for (idx, x) in enumerate(thunk.inputs) + positions = Union{Symbol,Nothing}[] + for (idx, pos_x) in enumerate(thunk.inputs) + pos, x = pos_x x = unwrap_weak_checked(x) push!(ids, istask(x) ? x.id : -idx) push!(data, istask(x) ? state.cache[x] : x) + push!(positions, pos) push!(hashes, uhash(x, UInt(0))) end toptions = thunk.options !== nothing ? thunk.options : ThunkOptions() @@ -912,10 +967,11 @@ function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state) sch_handle = SchedulerHandle(ThunkID(thunk.id, nothing), state.worker_chans[gproc.pid]...) # TODO: De-dup common fields (log_sink, uid, etc.) + push!(to_send, Any[thunk.id, thunk.hash, time_util, alloc_util, chunktype(thunk.f), data, thunk.get_result, thunk.persist, thunk.cache, thunk.meta, options, - propagated, ids, hashes, + propagated, ids, positions, hashes, (log_sink=ctx.log_sink, profile=ctx.profile), sch_handle, state.uid]) end @@ -940,38 +996,333 @@ function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state) end end +@static if VERSION >= v"1.9" +const Doorbell = Base.Event +else +# We need a sticky, resetable signal +mutable struct Doorbell + waiter::Union{Task,Nothing} + @atomic sleeping::Int + Doorbell() = new(nothing, 0) +end +function Base.wait(db::Doorbell) + db.waiter = current_task() + while true + _, succ = @atomicreplace db.sleeping 0 => 1 + if succ + # No messages, wait for someone to wake us + wait() + end + _, succ = @atomicreplace db.sleeping 2 => 0 + if succ + # We had a notification + return + end + end +end +function Base.notify(db::Doorbell) + while true + if (@atomic db.sleeping) == 2 + # Doorbell already rung + return + end + + _, succ = @atomicreplace db.sleeping 0 => 2 + if succ + # Task was definitely busy, we're done + return + end + + _, succ = @atomicreplace db.sleeping 1 => 2 + if succ + # Task was sleeping, wake it and wait for it to awaken + waiter = db.waiter + @assert waiter !== nothing + waiter::Task + schedule(waiter) + while true + sleep_value = @atomic db.sleeping + if sleep_value == 0 || sleep_value == 2 + return + end + #if waiter._state === Base.task_state_runnable && t.queue === nothing + # schedule(waiter) + #else + yield() + #end + end + end + end +end +end + +struct ProcessorInternalState + ctx::Context + proc::Processor + queue::LockedObject{PriorityQueue{Vector{Any}, UInt32, Base.Order.ForwardOrdering}} + reschedule::Doorbell + tasks::Dict{Int,Task} + proc_occupancy::Base.RefValue{UInt32} + time_pressure::Base.RefValue{UInt64} +end +struct ProcessorState + state::ProcessorInternalState + runner::Task +end + +const PROCESSOR_TASK_STATE = LockedObject(Dict{UInt64,Dict{Processor,ProcessorState}}()) + +function proc_states(f::Base.Callable, uid::UInt64) + lock(PROCESSOR_TASK_STATE) do all_states + if !haskey(all_states, uid) + all_states[uid] = Dict{Processor,ProcessorState}() + end + our_states = all_states[uid] + return f(our_states) + end +end +proc_states(f::Base.Callable) = + proc_states(f, task_local_storage(:_dagger_sch_uid)::UInt64) + +task_tid_for_processor(::Processor) = nothing +task_tid_for_processor(proc::Dagger.ThreadProc) = proc.tid + +stealing_permitted(::Processor) = true +stealing_permitted(proc::Dagger.ThreadProc) = proc.owner != 1 || proc.tid != 1 + +proc_has_occupancy(proc_occupancy, task_occupancy) = + UInt64(task_occupancy) + UInt64(proc_occupancy) <= typemax(UInt32) + +function start_processor_runner!(istate::ProcessorInternalState, uid::UInt64, 
return_queue::RemoteChannel) + to_proc = istate.proc + proc_run_task = @task begin + ctx = istate.ctx + tasks = istate.tasks + proc_occupancy = istate.proc_occupancy + time_pressure = istate.time_pressure + + while isopen(return_queue) + # Wait for new tasks + @dagdebug nothing :processor "Waiting for tasks" + timespan_start(ctx, :proc_run_wait, to_proc, nothing) + wait(istate.reschedule) + @static if VERSION >= v"1.9" + reset(istate.reschedule) + end + timespan_finish(ctx, :proc_run_wait, to_proc, nothing) + + # Fetch a new task to execute + @dagdebug nothing :processor "Trying to dequeue" + timespan_start(ctx, :proc_run_fetch, to_proc, nothing) + task_and_occupancy = lock(istate.queue) do queue + # Only steal if there are multiple queued tasks, to prevent + # ping-pong of tasks between empty queues + if length(queue) == 0 + @dagdebug nothing :processor "Nothing to dequeue" + return nothing + end + _, occupancy = peek(queue) + if !proc_has_occupancy(proc_occupancy[], occupancy) + @dagdebug nothing :processor "Insufficient occupancy" proc_occupancy=proc_occupancy[] task_occupancy=occupancy + return nothing + end + return dequeue_pair!(queue) + end + if task_and_occupancy === nothing + timespan_finish(ctx, :proc_run_fetch, to_proc, nothing) + + @dagdebug nothing :processor "Failed to dequeue" + + if !stealing_permitted(to_proc) + continue + end + + if proc_occupancy[] == typemax(UInt32) + continue + end + + @dagdebug nothing :processor "Trying to steal" + + # Try to steal a task + timespan_start(ctx, :steal_local, to_proc, nothing) + + # Try to steal from local queues randomly + # TODO: Prioritize stealing from busiest processors + states = collect(proc_states(values, uid)) + # TODO: Try to pre-allocate this + P = randperm(length(states)) + for state in getindex.(Ref(states), P) + other_istate = state.state + if other_istate.proc === to_proc + continue + end + # FIXME: We need to lock two queues to compare occupancies + proc_occupancy_cached = lock(istate.queue) do _ + proc_occupancy[] + end + task_and_occupancy = lock(other_istate.queue) do queue + if length(queue) == 0 + return nothing + end + task, occupancy = peek(queue) + scope = task[5] + if !isa(constrain(scope, Dagger.ExactScope(to_proc)), + Dagger.InvalidScope) && + typemax(UInt32) - proc_occupancy_cached >= occupancy + # Compatible, steal this task + return dequeue_pair!(queue) + end + return nothing + end + if task_and_occupancy !== nothing + from_proc = other_istate.proc + thunk_id = task[1] + @dagdebug thunk_id :processor "Stolen from $from_proc by $to_proc" + timespan_finish(ctx, :steal_local, to_proc, (;from_proc, thunk_id)) + # TODO: Keep stealing until we hit full occupancy? 
+ @goto execute + end + end + timespan_finish(ctx, :steal_local, to_proc, nothing) + + # TODO: Try to steal from remote queues + + continue + end + + @label execute + task, task_occupancy = task_and_occupancy + thunk_id = task[1] + time_util = task[2] + timespan_finish(ctx, :proc_run_fetch, to_proc, (;thunk_id, proc_occupancy=proc_occupancy[], task_occupancy)) + @dagdebug thunk_id :processor "Dequeued task" + + # Execute the task and return its result + t = @task begin + result = try + do_task(to_proc, task) + catch err + bt = catch_backtrace() + (CapturedException(err, bt), nothing) + finally + lock(istate.queue) do _ + delete!(tasks, thunk_id) + proc_occupancy[] -= task_occupancy + time_pressure[] -= time_util + end + notify(istate.reschedule) + end + try + put!(return_queue, (myid(), to_proc, thunk_id, result)) + catch err + if unwrap_nested_exception(err) isa InvalidStateException || !isopen(return_queue) + @dagdebug thunk_id :execute "Return queue is closed, failing to put result" chan=return_queue exception=(err, catch_backtrace()) + else + rethrow(err) + end + end + end + lock(istate.queue) do _ + tid = task_tid_for_processor(to_proc) + if tid !== nothing + t.sticky = true + ret = ccall(:jl_set_task_tid, Cint, (Any, Cint), t, tid-1) + else + t.sticky = false + end + tasks[thunk_id] = errormonitor(schedule(t)) + proc_occupancy[] += task_occupancy + time_pressure[] += time_util + end + end + end + tid = task_tid_for_processor(to_proc) + if tid !== nothing + proc_run_task.sticky = true + ret = ccall(:jl_set_task_tid, Cint, (Any, Cint), proc_run_task, tid-1) + else + proc_run_task.sticky = false + end + return errormonitor(schedule(proc_run_task)) +end + """ - do_tasks(to_proc, chan, tasks) + do_tasks(to_proc, return_queue, tasks) -Executes a batch of tasks on `to_proc`. +Executes a batch of tasks on `to_proc`, returning their results through +`return_queue`. 
""" -function do_tasks(to_proc, chan, tasks) - for task in tasks - thunk_id = task[1] - should_launch = lock(TASK_SYNC) do - # Already running; don't try to re-launch - if !(thunk_id in TASKS_RUNNING) - push!(TASKS_RUNNING, thunk_id) - true - else - false +function do_tasks(to_proc, return_queue, tasks) + @dagdebug nothing :processor "Enqueuing task batch" batch_size=length(tasks) + + # FIXME: This is terrible + ctx_vars = first(tasks)[16] + ctx = Context(Processor[]; log_sink=ctx_vars.log_sink, profile=ctx_vars.profile) + uid = first(tasks)[18] + state = proc_states(uid) do states + get!(states, to_proc) do + queue = PriorityQueue{Vector{Any}, UInt32}() + queue_locked = LockedObject(queue) + reschedule = Doorbell() + istate = ProcessorInternalState(ctx, to_proc, + queue_locked, reschedule, + Dict{Int,Task}(), + Ref(UInt32(0)), Ref(UInt64(0))) + runner = start_processor_runner!(istate, uid, return_queue) + @static if VERSION < v"1.9" + reschedule.waiter = runner end + return ProcessorState(istate, runner) end - should_launch || continue - @async begin - try - result = do_task(to_proc, task) - put!(chan, (myid(), to_proc, thunk_id, result)) - catch ex - bt = catch_backtrace() - put!(chan, (myid(), to_proc, thunk_id, (CapturedException(ex, bt), nothing))) + end + istate = state.state + lock(istate.queue) do queue + for task in tasks + thunk_id = task[1] + occupancy = task[4] + timespan_start(ctx, :enqueue, (;to_proc, thunk_id), nothing) + should_launch = lock(TASK_SYNC) do + # Already running; don't try to re-launch + if !(thunk_id in TASKS_RUNNING) + push!(TASKS_RUNNING, thunk_id) + true + else + false + end end + should_launch || continue + enqueue!(queue, task, occupancy) + timespan_finish(ctx, :enqueue, (;to_proc, thunk_id), nothing) + @dagdebug thunk_id :processor "Enqueued task" end end + notify(istate.reschedule) + + # Kick other processors to make them steal + # TODO: Alternatively, automatically balance work instead of blindly enqueueing + states = collect(proc_states(values, uid)) + P = randperm(length(states)) + for other_state in getindex.(Ref(states), P) + other_istate = other_state.state + if other_istate.proc === to_proc + continue + end + notify(other_istate.reschedule) + end + @dagdebug nothing :processor "Kicked processors" end -"Executes a single task on `to_proc`." -function do_task(to_proc, comm) - thunk_id, task_hash, est_time_util, est_alloc_util, Tf, data, send_result, persist, cache, meta, options, propagated, ids, hashes, ctx_vars, sch_handle, sch_uid = comm + +""" + do_task(to_proc, task_desc) -> Any + +Executes a single task specified by `task_desc` on `to_proc`. 
+""" +function do_task(to_proc, task_desc) + thunk_id, task_hash, est_time_util, est_alloc_util, + Tf, data, + send_result, persist, cache, meta, + options, propagated, ids, positions, hashes, + ctx_vars, sch_handle, sch_uid = task_desc ctx = Context(Processor[]; log_sink=ctx_vars.log_sink, profile=ctx_vars.profile) from_proc = OSProc() @@ -1005,7 +1356,6 @@ function do_task(to_proc, comm) "[$(myid()), $thunk_id] $f($Tdata) $msg: $est_alloc_util | $real_alloc_util/$storage_cap" end end - lock(TASK_SYNC) do while true # Get current time utilization for the selected processor @@ -1047,6 +1397,8 @@ function do_task(to_proc, comm) end timespan_finish(ctx, :storage_wait, thunk_id, (;f, to_proc, device=typeof(to_storage))) + @dagdebug thunk_id :execute "Moving data" + # Initiate data transfers for function and arguments transfer_time = Threads.Atomic{UInt64}(0) transfer_size = Threads.Atomic{UInt64}(0) @@ -1071,7 +1423,8 @@ function do_task(to_proc, comm) # TODO: Choose "closest" processor of same type first some_proc = first(keys(CHUNK_CACHE[x])) some_x = CHUNK_CACHE[x][some_proc] - move(some_proc, to_proc, some_x) + @dagdebug thunk_id :move "Cache hit for argument $id at $some_proc: $some_x" + @invokelatest move(some_proc, to_proc, some_x) end) else nothing @@ -1083,13 +1436,16 @@ function do_task(to_proc, comm) else # Fetch it time_start = time_ns() - _x = move(to_proc, x) + from_proc = processor(x) + _x = @invokelatest move(from_proc, to_proc, x) time_finish = time_ns() if x.handle.size !== nothing Threads.atomic_add!(transfer_time, time_finish - time_start) Threads.atomic_add!(transfer_size, x.handle.size) end + @dagdebug thunk_id :move "Cache miss for argument $id at $from_proc" + # Update cache lock(TASK_SYNC) do CHUNK_CACHE[x] = Dict{Processor,Any}() @@ -1099,21 +1455,34 @@ function do_task(to_proc, comm) _x end else - move(to_proc, x) + @invokelatest move(to_proc, x) end + @dagdebug thunk_id :move "Moved argument $id to $to_proc: $x" timespan_finish(ctx, :move, (;thunk_id, id), (;f, id, data=x); tasks=[Base.current_task()]) return x end end fetched = Any[] + @debug fetch_tasks for task in fetch_tasks push!(fetched, fetch_report(task)) end if meta append!(fetched, data[2:end]) end + f = popfirst!(fetched) @assert !(f isa Chunk) "Failed to unwrap thunk function" + fetched_args = Any[] + fetched_kwargs = Pair{Symbol,Any}[] + for (idx, x) in enumerate(fetched) + pos = positions[idx] + if pos === nothing + push!(fetched_args, x) + else + push!(fetched_kwargs, pos => x) + end + end #= FIXME: If MaxUtilization, stop processors and wait if (est_time_util isa MaxUtilization) && (real_time_util > 0) @@ -1132,6 +1501,8 @@ function do_task(to_proc, comm) # FIXME #gcnum_start = Base.gc_num() + @dagdebug thunk_id :execute "Executing" + result_meta = try # Set TLS variables Dagger.set_tls!(( @@ -1139,13 +1510,12 @@ function do_task(to_proc, comm) sch_handle=sch_handle, task_hash, processor=to_proc, - time_utilization=est_time_util, - alloc_utilization=est_alloc_util, + task_spec=task_desc, )) res = Dagger.with_options(propagated) do # Execute - execute!(to_proc, f, fetched...) + execute!(to_proc, f, fetched_args...; fetched_kwargs...) end # Check if result is safe to store @@ -1167,6 +1537,7 @@ function do_task(to_proc, comm) bt = catch_backtrace() RemoteException(myid(), CapturedException(ex, bt)) end + threadtime = cputhreadtime() - threadtime_start # FIXME: This is not a realistic measure of max. 
required memory #gc_allocd = min(max(UInt64(Base.gc_num().allocd) - UInt64(gcnum_start.allocd), UInt64(0)), UInt64(1024^4)) @@ -1176,6 +1547,9 @@ function do_task(to_proc, comm) pop!(TASKS_RUNNING, thunk_id) notify(TASK_SYNC) end + + @dagdebug thunk_id :execute "Returning" + # TODO: debug_storage("Releasing $to_storage_name") metadata = ( time_pressure=real_time_util[], @@ -1187,7 +1561,7 @@ function do_task(to_proc, comm) gc_allocd=(isa(result_meta, Chunk) ? result_meta.handle.size : 0), transfer_rate=(transfer_size[] > 0 && transfer_time[] > 0) ? round(UInt64, transfer_size[] / (transfer_time[] / 10^9)) : nothing, ) - (result_meta, metadata) + return (result_meta, metadata) end end # module Sch diff --git a/src/sch/dynamic.jl b/src/sch/dynamic.jl index 80d88091b..679514b83 100644 --- a/src/sch/dynamic.jl +++ b/src/sch/dynamic.jl @@ -144,7 +144,7 @@ function _register_future!(ctx, state, task, tid, (future, id, check)::Tuple{Thu if t == target return true end - for input in t.inputs + for (_, input) in t.inputs # N.B. Skips expired tasks input = Dagger.unwrap_weak(input) istask(input) || continue @@ -195,21 +195,23 @@ function _get_dag_ids(ctx, state, task, tid, _) end "Adds a new Thunk to the DAG." -add_thunk!(f, h::SchedulerHandle, args...; future=nothing, ref=nothing, kwargs...) = - exec!(_add_thunk!, h, f, args, kwargs, future, ref) -function _add_thunk!(ctx, state, task, tid, (f, args, kwargs, future, ref)) +add_thunk!(f, h::SchedulerHandle, args...; future=nothing, ref=nothing, options...) = + exec!(_add_thunk!, h, f, args, options, future, ref) +function _add_thunk!(ctx, state, task, tid, (f, args, options, future, ref)) timespan_start(ctx, :add_thunk, tid, 0) - _args = map(arg->arg isa ThunkID ? state.thunk_dict[arg.id] : arg, args) + _args = map(pos_arg->pos_arg[1] => (pos_arg[2] isa ThunkID ? state.thunk_dict[pos_arg[2].id] : pos_arg[2]), args) GC.@preserve _args begin - thunk = Thunk(f, _args...; kwargs...) + thunk = Thunk(f, _args...; options...) # Create a `DRef` to `thunk` so that the caller can preserve it thunk_ref = poolset(thunk; size=64, device=MemPool.CPURAMDevice()) thunk_id = ThunkID(thunk.id, thunk_ref) state.thunk_dict[thunk.id] = WeakThunk(thunk) reschedule_inputs!(state, thunk) + @dagdebug thunk :submit "Added to scheduler" if future !== nothing # Ensure we attach a future before the thunk is scheduled _register_future!(ctx, state, task, tid, (future, thunk_id, false)) + @dagdebug thunk :submit "Registered future" end if ref !== nothing # Preserve the `EagerThunkFinalizer` through `thunk` diff --git a/src/sch/eager.jl b/src/sch/eager.jl index 0b15543e2..1bafc0874 100644 --- a/src/sch/eager.jl +++ b/src/sch/eager.jl @@ -18,12 +18,15 @@ function init_eager() ctx = eager_context() @async try sopts = SchedulerOptions(;allow_errors=true) - topts = ThunkOptions(;single=1) + scope = Dagger.ExactScope(Dagger.ThreadProc(1, 1)) atexit() do EAGER_FORCE_KILL[] = true close(EAGER_THUNK_CHAN) end - Dagger.compute(ctx, Dagger.delayed(eager_thunk; options=topts)(); options=sopts) + opts = Dagger.Options((;scope, + occupancy=Dict(Dagger.ThreadProc=>0))) + Dagger.compute(ctx, Dagger.delayed(eager_thunk, opts)(); + options=sopts) catch err iob = IOContext(IOBuffer(), :color=>true) println(iob, "Error in eager scheduler:") @@ -37,41 +40,47 @@ function init_eager() end end -"Adjusts the scheduler's cached pressure indicators for the specified worker by -the specified amount, and signals the scheduler to try scheduling again if -pressure decreased." 
-function adjust_pressure!(h::SchedulerHandle, proc::Processor, pressure) - uid = Dagger.get_tls().sch_uid - lock(TASK_SYNC) do - PROCESSOR_TIME_UTILIZATION[uid][proc][] += pressure - notify(TASK_SYNC) - end - exec!(_adjust_pressure!, h, myid(), proc, pressure) -end -function _adjust_pressure!(ctx, state, task, tid, (pid, proc, pressure)) - state.worker_time_pressure[pid][proc] += pressure - if pressure < 0 - put!(state.chan, RescheduleSignal()) - end - nothing -end - -"Allows a thunk to safely wait on another thunk, by temporarily reducing its -effective pressure to 0." +""" +Allows a thunk to safely wait on another thunk by temporarily reducing its +effective occupancy to 0, which allows a newly-spawned task to run. +""" function thunk_yield(f) if Dagger.in_thunk() h = sch_handle() tls = Dagger.get_tls() - proc = tls.processor - util = tls.time_utilization - adjust_pressure!(h, proc, -util) + proc = Dagger.thunk_processor() + proc_istate = proc_states(tls.sch_uid) do states + states[proc].state + end + task_occupancy = tls.task_spec[4] + + # Decrease our occupancy and inform the processor to reschedule + lock(proc_istate.queue) do _ + proc_istate.proc_occupancy[] -= task_occupancy + @assert 0 <= proc_istate.proc_occupancy[] <= typemax(UInt32) + end + notify(proc_istate.reschedule) try - f() + # Run the yielding code + return f() finally - adjust_pressure!(h, proc, util) + # Wait for processor to have occupancy to run this task + while true + ready = lock(proc_istate.queue) do _ + @assert 0 <= proc_istate.proc_occupancy[] <= typemax(UInt32) + if proc_has_occupancy(proc_istate.proc_occupancy[], task_occupancy) + proc_istate.proc_occupancy[] += task_occupancy + @assert 0 <= proc_istate.proc_occupancy[] <= typemax(UInt32) + return true + end + return false + end + ready && break + yield() + end end else - f() + return f() end end @@ -83,14 +92,21 @@ function eager_thunk() nothing end tls = Dagger.get_tls() - # Don't apply pressure from this thunk - adjust_pressure!(h, tls.processor, -tls.time_utilization) while isopen(EAGER_THUNK_CHAN) try added_future, future, uid, ref, f, args, opts = take!(EAGER_THUNK_CHAN) # preserve inputs until they enter the scheduler tid = GC.@preserve args begin - _args = map(x->x isa Dagger.EagerThunk ? ThunkID(EAGER_ID_MAP[x.uid], x.thunk_ref) : x, args) + _args = map(args) do pos_x + pos, x = pos_x + if x isa Dagger.EagerThunk + return pos => ThunkID(EAGER_ID_MAP[x.uid], x.thunk_ref) + elseif x isa Dagger.Chunk + return pos => WeakChunk(x) + else + return pos => x + end + end add_thunk!(f, h, _args...; future=future, ref=ref, opts...) end EAGER_ID_MAP[uid] = tid.id diff --git a/src/sch/fault-handler.jl b/src/sch/fault-handler.jl index 401f47096..0782af3a2 100644 --- a/src/sch/fault-handler.jl +++ b/src/sch/fault-handler.jl @@ -44,7 +44,7 @@ function handle_fault(ctx, state, deadproc) # Remove thunks from state.ready that have inputs on the deadlist for idx in length(state.ready):-1:1 rt = state.ready[idx] - if any((input in deadlist) for input in rt.inputs) + if any((input in deadlist) for input in map(last, rt.inputs)) deleteat!(state.ready, idx) end end diff --git a/src/sch/util.jl b/src/sch/util.jl index 5ac2f0eb6..a95d727f7 100644 --- a/src/sch/util.jl +++ b/src/sch/util.jl @@ -1,3 +1,37 @@ +const DAGDEBUG_CATEGORIES = Symbol[:global, :submit, :schedule, :scope, + :take, :execute, :processor] +macro dagdebug(thunk, category, msg, args...) 
+ cat_sym = category.value + @gensym id + debug_ex_id = :(@debug "[$($id)] ($($(repr(cat_sym)))) $($msg)" _module=Dagger _file=$(string(__source__.file)) _line=$(__source__.line)) + append!(debug_ex_id.args, args) + debug_ex_noid = :(@debug "($($(repr(cat_sym)))) $($msg)" _module=Dagger _file=$(string(__source__.file)) _line=$(__source__.line)) + append!(debug_ex_noid.args, args) + esc(quote + let $id = -1 + if $thunk isa Integer + $id = Int($thunk) + elseif $thunk isa Thunk + $id = $thunk.id + elseif $thunk === nothing + $id = 0 + else + @warn "Unsupported thunk argument to @dagdebug: $(typeof($thunk))" + $id = -1 + end + if $id > 0 + if $(QuoteNode(cat_sym)) in $DAGDEBUG_CATEGORIES + $debug_ex_id + end + elseif $id == 0 + if $(QuoteNode(cat_sym)) in $DAGDEBUG_CATEGORIES + $debug_ex_noid + end + end + end + end) +end + """ unwrap_nested_exception(err::Exception) -> Bool @@ -14,7 +48,7 @@ function get_propagated_options(thunk) nt = NamedTuple() for key in thunk.propagates value = if key == :scope - isa(thunk.f, Chunk) ? thunk.f.scope : AnyScope() + isa(thunk.f, Chunk) ? thunk.f.scope : DefaultScope() elseif key == :processor isa(thunk.f, Chunk) ? thunk.f.processor : OSProc() elseif key in fieldnames(Thunk) @@ -44,7 +78,7 @@ end of all chunks that can now be evicted from workers." function cleanup_inputs!(state, node) to_evict = Set{Chunk}() - for inp in node.inputs + for (_, inp) in node.inputs inp = unwrap_weak_checked(inp) if !istask(inp) && !(inp isa Chunk) continue @@ -108,12 +142,12 @@ function reschedule_inputs!(state, thunk, seen=Set{Thunk}()) continue end w = get!(()->Set{Thunk}(), state.waiting, thunk) - for input in thunk.inputs + for (_, input) in thunk.inputs input = unwrap_weak_checked(input) input in seen && continue # Unseen - if istask(input) || (input isa Chunk) + if istask(input) || isa(input, Chunk) push!(get!(()->Set{Thunk}(), state.waiting_data, input), thunk) end istask(input) || continue @@ -145,6 +179,9 @@ function set_failed!(state, origin, thunk=origin) filter!(x->x!==thunk, state.ready) state.cache[thunk] = ThunkFailedException(thunk, origin, state.cache[origin]) state.errored[thunk] = true + finish_failed!(state, thunk, origin) +end +function finish_failed!(state, thunk, origin=nothing) fill_registered_futures!(state, thunk, true) if haskey(state.waiting_data, thunk) for dep in state.waiting_data[thunk] @@ -152,7 +189,7 @@ function set_failed!(state, origin, thunk=origin) delete!(state.waiting, dep) haskey(state.errored, dep) && continue - set_failed!(state, origin, dep) + origin !== nothing && set_failed!(state, origin, dep) end delete!(state.waiting_data, thunk) end @@ -191,7 +228,7 @@ function print_sch_status(io::IO, state, thunk; offset=0, limit=5, max_inputs=3) print(io, "($(status_string(thunk))) ") end println(io, "$(thunk.id): $(thunk.f)") - for (idx,input) in enumerate(thunk.inputs) + for (idx, (_, input)) in enumerate(thunk.inputs) if input isa WeakThunk input = unwrap_weak(input) if input === nothing @@ -252,50 +289,69 @@ end chunktype(x) = typeof(x) function signature(task::Thunk, state) sig = Any[chunktype(task.f)] - append!(sig, collect_task_inputs(state, task)) - sig + for (pos, input) in collect_task_inputs(state, task) + # N.B. 
Skips kwargs + if pos === nothing + push!(sig, chunktype(input)) + end + end + return sig end function can_use_proc(task, gproc, proc, opts, scope) # Check against proclist - if opts.proclist === nothing - if !default_enabled(proc) - @debug "Rejected $proc: !default_enabled(proc)" - return false - end - elseif opts.proclist isa Function - if !Base.invokelatest(opts.proclist, proc) - @debug "Rejected $proc: proclist(proc) == false" - return false + if opts.proclist !== nothing + @warn "The `proclist` option is deprecated, please use scopes instead\nSee https://juliaparallel.org/Dagger.jl/stable/scopes/ for details" maxlog=1 + if opts.proclist isa Function + if !Base.invokelatest(opts.proclist, proc) + @dagdebug task :scope "Rejected $proc: proclist(proc) == false" + return false, scope + end + scope = constrain(scope, Dagger.ExactScope(proc)) + elseif opts.proclist isa Vector + if !(typeof(proc) in opts.proclist) + @dagdebug task :scope "Rejected $proc: !(typeof(proc) in proclist)" + return false, scope + end + scope = constrain(scope, + Dagger.UnionScope(map(Dagger.ProcessorTypeScope, opts.proclist))) + else + throw(SchedulingException("proclist must be a Function, Vector, or nothing")) end - elseif opts.proclist isa Vector - if !(typeof(proc) in opts.proclist) - @debug "Rejected $proc: !(typeof(proc) in proclist)" - return false + if scope isa Dagger.InvalidScope + @dagdebug task :scope "Rejected $proc: Not contained in task scope ($scope)" + return false, scope end - else - throw(SchedulingException("proclist must be a Function, Vector, or nothing")) end # Check against single if opts.single !== nothing + @warn "The `single` option is deprecated, please use scopes instead\nSee https://juliaparallel.org/Dagger.jl/stable/scopes/ for details" maxlog=1 if gproc.pid != opts.single - @debug "Rejected $proc: gproc.pid != single" - return false + @dagdebug task :scope "Rejected $proc: gproc.pid ($(gproc.pid)) != single ($(opts.single))" + return false, scope + end + scope = constrain(scope, Dagger.ProcessScope(opts.single)) + if scope isa Dagger.InvalidScope + @dagdebug task :scope "Rejected $proc: Not contained in task scope ($scope)" + return false, scope end end - # Check scope + # Check against scope proc_scope = Dagger.ExactScope(proc) if constrain(scope, proc_scope) isa Dagger.InvalidScope - @debug "Rejected $proc: Task scope ($scope) vs. 
processor scope ($proc_scope)" - return false + @dagdebug task :scope "Rejected $proc: Not contained in task scope ($scope)" + return false, scope end - return true + @label accept + + @dagdebug task :scope "Accepted $proc" + return true, scope end -function has_capacity(state, p, gp, time_util, alloc_util, sig) +function has_capacity(state, p, gp, time_util, alloc_util, occupancy, sig) T = typeof(p) # FIXME: MaxUtilization est_time_util = round(UInt64, if time_util !== nothing && haskey(time_util, T) @@ -306,8 +362,14 @@ function has_capacity(state, p, gp, time_util, alloc_util, sig) est_alloc_util = if alloc_util !== nothing && haskey(alloc_util, T) alloc_util[T] else - get(state.signature_alloc_cost, sig, 0) - end + get(state.signature_alloc_cost, sig, UInt64(0)) + end::UInt64 + est_occupancy = if occupancy !== nothing && haskey(occupancy, T) + # Clamp to 0-1, and scale between 0 and `typemax(UInt32)` + Base.unsafe_trunc(UInt32, clamp(occupancy[T], 0, 1) * typemax(UInt32)) + else + typemax(UInt32) + end::UInt32 #= FIXME: Estimate if cached data can be swapped to storage storage = storage_resource(p) real_alloc_util = state.worker_storage_pressure[gp][storage] @@ -316,7 +378,7 @@ function has_capacity(state, p, gp, time_util, alloc_util, sig) return false, est_time_util, est_alloc_util end =# - return true, est_time_util, est_alloc_util + return true, est_time_util, est_alloc_util, est_occupancy end function populate_processor_cache_list!(state, procs) @@ -361,12 +423,12 @@ end "Collects all arguments for `task`, converting Thunk inputs to Chunks." function collect_task_inputs(state, task) - inputs = Any[] - for input in task.inputs + inputs = Pair{Union{Symbol,Nothing},Any}[] + for (pos, input) in task.inputs input = unwrap_weak_checked(input) - push!(inputs, istask(input) ? state.cache[input] : input) + push!(inputs, pos => (istask(input) ? state.cache[input] : input)) end - inputs + return inputs end """ @@ -413,6 +475,11 @@ Walks the data contained in `x` in DFS fashion, and executes `f` at each object that hasn't yet been seen. """ function walk_data(f, @nospecialize(x)) + action = f(x) + if action !== missing + return action + end + seen = IdDict{Any,Nothing}() to_visit = Any[x] diff --git a/src/scopes.jl b/src/scopes.jl index 2b203f566..6a5af1e5f 100644 --- a/src/scopes.jl +++ b/src/scopes.jl @@ -1,18 +1,62 @@ -export AnyScope, UnionScope, NodeScope, ProcessScope, ExactScope +export AnyScope, DefaultScope, UnionScope, NodeScope, ProcessScope, ExactScope, ProcessorTypeScope abstract type AbstractScope end -"Default scope that is unconstrained." +"Widest scope that contains all processors." struct AnyScope <: AbstractScope end +abstract type AbstractScopeTaint end + +"Taints a scope for later evaluation." +struct TaintScope <: AbstractScope + scope::AbstractScope + taints::Set{AbstractScopeTaint} +end +Base.:(==)(ts1::TaintScope, ts2::TaintScope) = + ts1.scope == ts2.scope && + length(ts1.taints) == length(ts2.taints) && + all(collect(ts1.taints) .== collect(ts2.taints)) + +struct DefaultEnabledTaint <: AbstractScopeTaint end + +"Default scope that contains the set of `default_enabled` processors." +DefaultScope() = TaintScope(AnyScope(), + Set{AbstractScopeTaint}([DefaultEnabledTaint()])) + "Union of two or more scopes." 
struct UnionScope <: AbstractScope
    scopes::Tuple
+    function UnionScope(scopes::Tuple)
+        scope_set = Set{AbstractScope}()
+        for scope in scopes
+            if scope isa UnionScope
+                for subscope in scope.scopes
+                    push!(scope_set, subscope)
+                end
+            else
+                push!(scope_set, scope)
+            end
+        end
+        return new((collect(scope_set)...,))
+    end
end
UnionScope(scopes...) = UnionScope((scopes...,))
UnionScope(scopes::Vector{<:AbstractScope}) = UnionScope((scopes...,))
UnionScope(s::AbstractScope) = UnionScope((s,))
-UnionScope() = throw(ArgumentError("Cannot construct empty UnionScope"))
+UnionScope() = UnionScope(())
+
+function Base.:(==)(us1::UnionScope, us2::UnionScope)
+    if length(us1.scopes) != length(us2.scopes)
+        return false
+    end
+    scopes = Set{AbstractScope}(us2.scopes)
+    for scope in us1.scopes
+        if !(scope in scopes)
+            return false
+        end
+    end
+    return true
+end
"Scoped to the same physical node."
struct NodeScope <: AbstractScope
@@ -35,6 +79,13 @@ end
ProcessScope(p::OSProc) = ProcessScope(p.pid)
ProcessScope() = ProcessScope(myid())
+"Scoped to any processor with a given supertype."
+struct ProcessorTypeTaint{T} <: AbstractScopeTaint end
+
+ProcessorTypeScope(T) =
+    TaintScope(AnyScope(),
+               Set{AbstractScopeTaint}([ProcessorTypeTaint{T}()]))
+
"Scoped to a specific processor."
struct ExactScope <: AbstractScope
    parent::ProcessScope
@@ -48,6 +99,47 @@ struct InvalidScope <: AbstractScope
    y::AbstractScope
end
+# Show methods
+
+function Base.show(io::IO, scope::UnionScope)
+    indent = io isa IOContext ? get(io, :indent, 0) : 0
+    println(io, "UnionScope:")
+    indent += 2
+    inner_io = IOContext(io, :indent => indent)
+    if length(scope.scopes) == 0
+        print(io, " "^indent, "empty")
+        return
+    end
+    for (idx, inner_scope) in enumerate(scope.scopes)
+        print(io, " "^indent); print(inner_io, inner_scope)
+        idx < length(scope.scopes) && println(io)
+    end
+end
+function Base.show(io::IO, scope::TaintScope)
+    indent = io isa IOContext ? get(io, :indent, 0) : 0
+    println(io, "TaintScope:")
+    indent += 2
+    inner_io = IOContext(io, :indent => indent)
+    print(io, " "^indent, "scope: "); println(inner_io, scope.scope)
+    if length(scope.taints) == 0
+        print(io, " "^indent, "no taints")
+        return
+    end
+    println(io, " "^indent, "taints: ")
+    indent += 4
+    inner_io = IOContext(io, :indent => indent)
+    for (idx, taint) in enumerate(scope.taints)
+        print(io, " "^indent); print(inner_io, taint)
+        idx < length(scope.taints) && println(io)
+    end
+end
+Base.show(io::IO, scope::NodeScope) =
+    print(io, "NodeScope: node == $(scope.uuid)")
+Base.show(io::IO, scope::ProcessScope) =
+    print(io, "ProcessScope: worker == $(scope.wid)")
+Base.show(io::IO, scope::ExactScope) =
+    print(io, "ExactScope: processor == $(scope.processor)")
+
# Comparisons and constraint checking
constrain(x, y) = x < y ? constrain(y, x) : throw(MethodError(constrain, x, y))
@@ -58,7 +150,46 @@ Base.isless(x, ::AnyScope) = true
constrain(::AnyScope, ::AnyScope) = AnyScope()
constrain(::AnyScope, y) = y
+# N.B.
TaintScope taints constraining (until encountering an `ExactScope`) to +# allow lazy evaluation of the taint matching on the final processor +taint_match(::DefaultEnabledTaint, x::Processor) = default_enabled(x) +taint_match(::ProcessorTypeTaint{T}, x::Processor) where T = x isa T +Base.isless(::TaintScope, ::TaintScope) = false +Base.isless(::TaintScope, ::AnyScope) = true +Base.isless(::TaintScope, x) = false +Base.isless(x, ::TaintScope) = true +function constrain(x::TaintScope, y::TaintScope) + scope = constrain(x.scope, y.scope) + if scope isa InvalidScope + return scope + end + taints = Set{AbstractScopeTaint}() + for tx in x.taints + push!(taints, tx) + end + for ty in y.taints + push!(taints, ty) + end + return TaintScope(scope, taints) +end +function constrain(x::TaintScope, y) + scope = constrain(x.scope, y) + if scope isa InvalidScope + return scope + end + return TaintScope(scope, x.taints) +end +function constrain(x::TaintScope, y::ExactScope) + for taint in x.taints + if !taint_match(taint, y.processor) + return InvalidScope(x, y) + end + end + return constrain(x.scope, y) +end + Base.isless(::UnionScope, ::UnionScope) = false +Base.isless(::UnionScope, ::TaintScope) = true Base.isless(::UnionScope, ::AnyScope) = true function constrain(x::UnionScope, y::UnionScope) zs = Vector{AbstractScope}() @@ -75,12 +206,14 @@ end constrain(x::UnionScope, y) = constrain(x, UnionScope((y,))) Base.isless(::NodeScope, ::NodeScope) = false +Base.isless(::NodeScope, ::TaintScope) = true Base.isless(::NodeScope, ::AnyScope) = true constrain(x::NodeScope, y::NodeScope) = x == y ? y : InvalidScope(x, y) Base.isless(::ProcessScope, ::ProcessScope) = false Base.isless(::ProcessScope, ::NodeScope) = true +Base.isless(::ProcessScope, ::TaintScope) = true Base.isless(::ProcessScope, ::AnyScope) = true constrain(x::ProcessScope, y::ProcessScope) = x == y ? y : InvalidScope(x, y) @@ -90,6 +223,7 @@ constrain(x::NodeScope, y::ProcessScope) = Base.isless(::ExactScope, ::ExactScope) = false Base.isless(::ExactScope, ::ProcessScope) = true Base.isless(::ExactScope, ::NodeScope) = true +Base.isless(::ExactScope, ::TaintScope) = true Base.isless(::ExactScope, ::AnyScope) = true constrain(x::ExactScope, y::ExactScope) = x == y ? y : InvalidScope(x, y) @@ -97,3 +231,137 @@ constrain(x::ProcessScope, y::ExactScope) = x == y.parent ? y : InvalidScope(x, y) constrain(x::NodeScope, y::ExactScope) = x == y.parent.parent ? y : InvalidScope(x, y) + +### Scopes helper + +""" + scope(scs...) -> AbstractScope + scope(;scs...) -> AbstractScope + +Constructs an `AbstractScope` from a set of scope specifiers. Each element in +`scs` is a separate specifier; if `scs` is empty, an empty `UnionScope()` is +produced; if `scs` has one element, then exactly one specifier is constructed; +if `scs` has more than one element, a `UnionScope` of the scopes specified by +`scs` is constructed. A variety of specifiers can be passed to construct a +scope: +- `:any` - Constructs an `AnyScope()` +- `:default` - Constructs a `DefaultScope()` +- `(scs...,)` - Constructs a `UnionScope` of scopes, each specified by `scs` +- `thread=tid` or `threads=[tids...]` - Constructs an `ExactScope` or `UnionScope` containing all `Dagger.ThreadProc`s with thread ID `tid`/`tids` across all workers. +- `worker=wid` or `workers=[wids...]` - Constructs a `ProcessScope` or `UnionScope` containing all `Dagger.ThreadProc`s with worker ID `wid`/`wids` across all threads. 
+- `thread=tid`/`threads=tids` and `worker=wid`/`workers=wids` - Constructs an `ExactScope`, `ProcessScope`, or `UnionScope` containing all `Dagger.ThreadProc`s with worker ID `wid`/`wids` and threads `tid`/`tids`.
+
+Aside from the worker and thread specifiers, it's possible to add custom
+specifiers for scoping to other kinds of processors (like GPUs) or providing
+different ways to specify a scope. Specifier selection is determined by a
+precedence ordering: by default, all specifiers have precedence `0`, which can
+be changed by defining `scope_key_precedence(::Val{spec}) = precedence` (where
+`spec` is the specifier as a `Symbol`). The specifier with the highest
+precedence in a set of specifiers is used to determine the scope by calling
+`to_scope(::Val{spec}, sc::NamedTuple)` (where `sc` is the full set of
+specifiers), which should be overridden for each custom specifier, and which
+returns an `AbstractScope`. For example:
+
+```julia
+# Setup a GPU specifier
+Dagger.scope_key_precedence(::Val{:gpu}) = 1
+Dagger.to_scope(::Val{:gpu}, sc::NamedTuple) = ExactScope(MyGPUDevice(sc.worker, sc.gpu))
+
+# Generate an `ExactScope` for `MyGPUDevice` on worker 2, device 3
+Dagger.scope(gpu=3, worker=2)
+```
+"""
+scope(scs...) = simplified_union_scope(map(to_scope, scs))
+scope(; kwargs...) = to_scope((;kwargs...))
+
+function simplified_union_scope(scopes)
+    if length(scopes) == 1
+        return only(scopes)
+    else
+        return UnionScope(scopes)
+    end
+end
+to_scope(scope::AbstractScope) = scope
+function to_scope(sc::Symbol)
+    if sc == :any
+        return AnyScope()
+    elseif sc == :default
+        return DefaultScope()
+    else
+        throw(ArgumentError("Cannot construct scope from: $(repr(sc))"))
+    end
+end
+function to_scope(sc::NamedTuple)
+    if isempty(sc)
+        return UnionScope()
+    end
+
+    # FIXME: node and nodes
+    known_keys = (:worker, :workers, :thread, :threads)
+    unknown_keys = filter(key->!in(key, known_keys), keys(sc))
+    if length(unknown_keys) > 0
+        # Hand off construction if unknown members encountered
+        precs = map(scope_key_precedence, map(Val, unknown_keys))
+        max_prec = maximum(precs)
+        if length(findall(prec->prec==max_prec, precs)) > 1
+            throw(ArgumentError("Incompatible scope specifiers detected: $unknown_keys"))
+        end
+        max_prec_key = unknown_keys[argmax(precs)]
+        return to_scope(Val(max_prec_key), sc)
+    end
+
+    workers = if haskey(sc, :worker)
+        Int[sc.worker]
+    elseif haskey(sc, :workers)
+        Int[sc.workers...]
+    else
+        nothing
+    end
+    threads = if haskey(sc, :thread)
+        Int[sc.thread]
+    elseif haskey(sc, :threads)
+        Int[sc.threads...]
+ else + nothing + end + + # Simple cases + if workers !== nothing && threads !== nothing + subscopes = AbstractScope[] + for w in workers, t in threads + push!(subscopes, ExactScope(ThreadProc(w, t))) + end + return simplified_union_scope(subscopes) + elseif workers !== nothing && threads === nothing + subscopes = AbstractScope[ProcessScope(w) for w in workers] + return simplified_union_scope(subscopes) + end + + # More complex cases that require querying the cluster + # FIXME: Use per-field scope taint + if workers === nothing + workers = procs() + end + subscopes = AbstractScope[] + for w in workers + if threads === nothing + threads = map(c->c.tid, + filter(c->c isa ThreadProc, + collect(children(OSProc(w))))) + end + for t in threads + push!(subscopes, ExactScope(ThreadProc(w, t))) + end + end + return simplified_union_scope(subscopes) +end +to_scope(scs::Tuple) = + simplified_union_scope(map(to_scope, scs)) +to_scope(sc) = + throw(ArgumentError("Cannot construct scope from: $sc")) + +to_scope(::Val{key}, sc::NamedTuple) where key = + throw(ArgumentError("Scope construction not implemented for key: $key")) + +# Base case for all Dagger-owned keys +scope_key_precedence(::Val) = 0 diff --git a/src/thunk.jl b/src/thunk.jl index 603f53ac3..9e71ae274 100644 --- a/src/thunk.jl +++ b/src/thunk.jl @@ -1,4 +1,4 @@ -export Thunk, delayed, delayedmap +export Thunk, delayed const ID_COUNTER = Threads.Atomic{Int}(1) next_id() = Threads.atomic_add!(ID_COUNTER, 1) @@ -39,7 +39,7 @@ arguments are still passed as-is. - `processor::Processor=OSProc()` - The processor associated with `f`. Useful if `f` is a callable struct that exists on a given processor and should be transferred appropriately. -- `scope::Dagger.AbstractScope=AnyScope()` - The scope associated with `f`. +- `scope::Dagger.AbstractScope=DefaultScope()` - The scope associated with `f`. Useful if `f` is a function or callable struct that may only be transferred to, and executed within, the specified scope. @@ -50,7 +50,7 @@ If omitted, options can also be specified by passing key-value pairs as """ mutable struct Thunk f::Any # usually a Function, but could be any callable - inputs::Vector{Any} # TODO: Use `ImmutableArray` in 1.8 + inputs::Vector{Pair{Union{Symbol,Nothing},Any}} # TODO: Use `ImmutableArray` in 1.8 id::Int hash::UInt get_result::Bool # whether the worker should send the result or only the metadata @@ -82,9 +82,10 @@ mutable struct Thunk if !isa(f, Chunk) && (!isnothing(processor) || !isnothing(scope)) f = tochunk(f, something(processor, OSProc()), - something(scope, AnyScope())) + something(scope, DefaultScope())) end xs = Any[xs...] + @assert all(x->x isa Pair, xs) if options !== nothing @assert isempty(kwargs) new(f, xs, id, hash, get_result, meta, persist, cache, cache_ref, @@ -109,7 +110,7 @@ function affinity(t::Thunk) aff_vec = affinity(t.cache_ref) else aff = Dict{OSProc,Int}() - for inp in inputs(t) + for (_, inp) in t.inputs #if haskey(state.cache, inp) # as = affinity(state.cache[inp]) # for a in as @@ -134,17 +135,35 @@ function affinity(t::Thunk) #end end +struct Options + options::NamedTuple +end +Options(;options...) = Options((;options...)) +Options(options...) = Options((;options...)) + +function args_kwargs_to_pairs(args, kwargs) + args_kwargs = Pair{Union{Symbol,Nothing},Any}[] + for arg in args + push!(args_kwargs, nothing => arg) + end + for kwarg in kwargs + push!(args_kwargs, kwarg[1] => kwarg[2]) + end + return args_kwargs +end + """ - delayed(f; kwargs...)(args...) 
+ delayed(f, options=Options())(args...; kwargs...) -> Thunk + delayed(f; options...)(args...; kwargs...) -> Thunk Creates a [`Thunk`](@ref) object which can be executed later, which will call -`f` with `args`. `kwargs` controls various properties of the resulting `Thunk`. +`f` with `args` and `kwargs`. `options` controls various properties of the +resulting `Thunk`. """ -function delayed(f; kwargs...) - (args...) -> Thunk(f, args...; kwargs...) +function delayed(f, options::Options) + (args...; kwargs...) -> Thunk(f, args_kwargs_to_pairs(args, kwargs)...; options.options...) end - -delayedmap(f, xs...) = map(delayed(f), xs...) +delayed(f; kwargs...) = delayed(f, Options(;kwargs...)) "A future holding the result of a `Thunk`." struct ThunkFuture @@ -158,16 +177,16 @@ Base.wait(t::ThunkFuture) = Dagger.Sch.thunk_yield() do end function Base.fetch(t::ThunkFuture; proc=OSProc(), raw=false) error, value = Dagger.Sch.thunk_yield() do - if raw - fetch(t.future) - else - move(proc, fetch(t.future)) - end + fetch(t.future) end if error throw(value) end - value + if raw + return value + else + return move(proc, value) + end end Base.put!(t::ThunkFuture, x; error=false) = put!(t.future, (error, x)) @@ -198,15 +217,59 @@ ThunkFailedException(thunk, origin, ex::E) where E = ThunkFailedException{E}(convert(WeakThunk, thunk), convert(WeakThunk, origin), ex) function Base.showerror(io::IO, ex::ThunkFailedException) t = unwrap_weak(ex.thunk) - o = unwrap_weak(ex.origin) - t_str = t !== nothing ? "$t" : "?" - o_str = o !== nothing ? "$o" : "?" + + # Find root-cause thunk + last_tfex = ex + failed_tasks = Union{Thunk,Nothing}[] + while last_tfex.ex isa ThunkFailedException && unwrap_weak(last_tfex.ex.origin) !== nothing + push!(failed_tasks, unwrap_weak(last_tfex.thunk)) + last_tfex = last_tfex.ex + end + o = unwrap_weak(last_tfex.origin) + root_ex = last_tfex.ex + + function thunk_string(t) + if t === nothing + return "Thunk(?)" + end + Tinputs = Any[] + for (_, input) in t.inputs + input = unwrap_weak(input) + if istask(input) + push!(Tinputs, "Thunk(id=$(input.id))") + else + push!(Tinputs, input) + end + end + t_sig = if length(Tinputs) <= 4 + "$(t.f)($(join(Tinputs, ", ")))" + else + "$(t.f)($(length(Tinputs)) inputs...)" + end + return "Thunk(id=$(t.id), $t_sig)" + end + t_str = thunk_string(t) + o_str = thunk_string(o) t_id = t !== nothing ? t.id : '?' o_id = o !== nothing ? o.id : '?' - println(io, "ThunkFailedException ($t failure", - (o !== nothing && t != o) ? " due to a failure in $o)" : ")", - ":") - Base.showerror(io, ex.ex) + println(io, "ThunkFailedException:") + println(io, " Root Exception Type: $(typeof(root_ex))") + println(io, " Root Exception:") + Base.showerror(io, root_ex); println(io) + if t !== o + println(io, " Root Thunk: $o_str") + if length(failed_tasks) <= 4 + for i in failed_tasks + i_str = thunk_string(i) + println(io, " Inner Thunk: $i_str") + end + else + println(io, " ...") + println(io, " $(length(failed_tasks)) Inner Thunks...") + println(io, " ...") + end + end + print(io, " This Thunk: $t_str") end """ @@ -224,6 +287,7 @@ mutable struct EagerThunk end Base.isready(t::EagerThunk) = isready(t.future) Base.wait(t::EagerThunk) = wait(t.future) + function Base.fetch(t::EagerThunk; raw=false) if raw fetch(t.future; raw=true) @@ -240,6 +304,9 @@ function Base.fetch(t::EagerThunk; raw=false) end end end + +istask(t::EagerThunk) = true + function Base.show(io::IO, t::EagerThunk) print(io, "EagerThunk ($(isready(t) ? 
"finished" : "running"))") end @@ -259,20 +326,21 @@ end const EAGER_ID_COUNTER = Threads.Atomic{UInt64}(1) eager_next_id() = Threads.atomic_add!(EAGER_ID_COUNTER, one(UInt64)) -function _spawn(f, args...; options, kwargs...) +function _spawn(f, options::Options, args...) Dagger.Sch.init_eager() for arg in args - @assert !isa(arg, Thunk) "Cannot use Thunks in spawn" + @assert !isa(last(arg), Thunk) "Cannot use Thunks in spawn" end uid = eager_next_id() future = ThunkFuture() finalizer_ref = poolset(EagerThunkFinalizer(uid); device=MemPool.CPURAMDevice()) added_future = Future() - propagates = keys(options) - put!(Dagger.Sch.EAGER_THUNK_CHAN, (added_future, future, uid, finalizer_ref, f, (args...,), (;propagates, options..., kwargs...,))) + propagates = keys(options.options) + put!(Dagger.Sch.EAGER_THUNK_CHAN, (added_future, future, uid, finalizer_ref, f, (args...,), (;propagates, options.options...,))) thunk_ref = fetch(added_future) return (uid, future, finalizer_ref, thunk_ref) end + """ spawn(f, args...; kwargs...) -> EagerThunk @@ -282,26 +350,34 @@ an `EagerThunk`. Uses a scheduler running in the background to execute code. Note that `kwargs` are passed to the `Thunk` constructor, and are documented in its docstring. """ -function spawn(f, args...; processor=nothing, scope=nothing, kwargs...) +function spawn(f, args...; kwargs...) options = get_options() + if length(args) >= 1 && first(args) isa Options + options = merge(options, first(args).options) + args = args[2:end] + end + processor = haskey(options, :processor) ? options.processor : nothing + scope = haskey(options, :scope) ? options.scope : nothing if !isnothing(processor) || !isnothing(scope) f = tochunk(f, something(processor, get_options(:processor, OSProc())), - something(scope, get_options(:scope, AnyScope()))) + something(scope, get_options(:scope, DefaultScope()))) end + args_kwargs = args_kwargs_to_pairs(args, kwargs) uid, future, finalizer_ref, thunk_ref = if myid() == 1 - _spawn(f, args...; options, kwargs...) + _spawn(f, Options(options), args_kwargs...) else - remotecall_fetch(_spawn, 1, f, args...; options, kwargs...) + remotecall_fetch(_spawn, 1, f, Options(options), args_kwargs...) end return EagerThunk(uid, future, finalizer_ref, thunk_ref) end """ - @par [opts] f(args...) -> Thunk + @par [opts] f(args...; kwargs...) -> Thunk -Convenience macro to call `Dagger.delayed` on `f` with arguments `args`. -May also be called with a series of assignments like so: +Convenience macro to call `Dagger.delayed` on `f` with arguments `args` and +keyword arguments `kwargs`. May also be called with a series of assignments +like so: ```julia x = @par begin @@ -339,19 +415,38 @@ macro spawn(exs...) _par(ex; lazy=false, opts=opts) end +struct ExpandedBroadcast{F} end +(eb::ExpandedBroadcast{F})(args...) 
where F = + Base.materialize(Base.broadcasted(F, args...)) +replace_broadcast(ex) = ex +function replace_broadcast(fn::Symbol) + if startswith(string(fn), '.') + return :($ExpandedBroadcast{$(Symbol(string(fn)[2:end]))}()) + end + return fn +end + function _par(ex::Expr; lazy=true, recur=true, opts=()) if ex.head == :call && recur - f = ex.args[1] - args = ex.args[2:end] + f = replace_broadcast(ex.args[1]) + if length(ex.args) >= 2 && Meta.isexpr(ex.args[2], :parameters) + args = ex.args[3:end] + kwargs = ex.args[2] + else + args = ex.args[2:end] + kwargs = Expr(:parameters) + end opts = esc.(opts) + args_ex = _par.(args; lazy=lazy, recur=false) + kwargs_ex = _par.(kwargs.args; lazy=lazy, recur=false) if lazy - return :(Dagger.delayed($(esc(f)); $(opts...))($(_par.(args; lazy=lazy, recur=false)...))) + return :(Dagger.delayed($(esc(f)), $Options(;$(opts...)))($(args_ex...); $(kwargs_ex...))) else sync_var = esc(Base.sync_varname) @gensym result return quote - let args = ($(_par.(args; lazy=lazy, recur=false)...),) - $result = $spawn($(esc(f)), args...; $(opts...)) + let args = ($(args_ex...),) + $result = $spawn($(esc(f)), $Options(;$(opts...)), args...; $(kwargs_ex...)) if $(Expr(:islocal, sync_var)) put!($sync_var, schedule(Task(()->wait($result)))) end @@ -404,7 +499,15 @@ function Base.show(io::IO, z::Thunk) end print(io, "Thunk[$(z.id)]($f, ") if lvl > 0 - show(IOContext(io, :lazy_level => lvl-1), z.inputs) + inputs = Any[] + for (pos, input) in z.inputs + if pos === nothing + push!(inputs, input) + else + push!(inputs, pos => input) + end + end + show(IOContext(io, :lazy_level => lvl-1), inputs) else print(io, "...") end diff --git a/src/ui/graph.jl b/src/ui/graph.jl index d4cf0acee..033c4baff 100644 --- a/src/ui/graph.jl +++ b/src/ui/graph.jl @@ -142,8 +142,9 @@ write_edge(ctx, io, from::String, to::String) = println(io, "n_$from -> n_$to;") getargs!(d, t) = nothing function getargs!(d, t::Thunk) - d[t.id] = [filter(x->!istask(x[2]), collect(enumerate(t.inputs)))...,] - foreach(i->getargs!(d, i), t.inputs) + raw_inputs = map(last, t.inputs) + d[t.id] = [filter(x->!istask(x[2]), collect(enumerate(raw_inputs)))...,] + foreach(i->getargs!(d, i), raw_inputs) end function write_dag(io, logs::Vector, t=nothing) ctx = (proc_to_color = Dict{Processor,String}(), diff --git a/src/utils/locked-object.jl b/src/utils/locked-object.jl new file mode 100644 index 000000000..e00eaac2d --- /dev/null +++ b/src/utils/locked-object.jl @@ -0,0 +1,16 @@ +# Copied from CUDA.jl + +struct LockedObject{T} + lock::ReentrantLock + payload::T +end +LockedObject(payload) = LockedObject(Threads.ReentrantLock(), payload) + +function Base.lock(f, x::LockedObject) + lock(x.lock) + try + return f(x.payload) + finally + unlock(x.lock) + end +end diff --git a/test/array.jl b/test/array.jl index d44dd9af8..452d92f2c 100644 --- a/test/array.jl +++ b/test/array.jl @@ -1,5 +1,7 @@ using LinearAlgebra, SparseArrays, Random, SharedArrays import Dagger: chunks, DArray, domainchunks, treereduce_nd +import Distributed: myid, procs +import Statistics: mean @testset "treereduce_nd" begin xs = rand(1:10, 8,8,8) @@ -9,13 +11,13 @@ import Dagger: chunks, DArray, domainchunks, treereduce_nd end @testset "DArray constructor" begin - x = compute(rand(Blocks(2,2), 3,3)) + x = fetch(rand(Blocks(2,2), 3,3)) @test collect(x) == DArray{Float64, 2}(x.domain, x.subdomains, x.chunks) |> collect end @testset "rand" begin function test_rand(X) - X1 = compute(X) + X1 = fetch(X) X2 = collect(X1) @test isa(X, Dagger.ArrayOp) @@ -36,6 +38,7 @@ end r 
= collect(R) @test r[1:10] != r[11:20] end + @testset "sum" begin X = ones(Blocks(10, 10), 100, 100) @test sum(X) == 10000 @@ -43,19 +46,35 @@ end @test sum(Y) == 0 end +@testset "prod" begin + x = randn(100, 100) + X = distribute(x, Blocks(10, 10)) + @test prod(X) == prod(x) + Y = zeros(Blocks(10, 10), 100, 100) + @test prod(Y) == 0 +end + +@testset "mean" begin + x = randn(100, 100) + X = distribute(x, Blocks(10, 10)) + @test mean(X) ≈ mean(x) + Y = zeros(Blocks(10, 10), 100, 100) + @test mean(Y) == 0 +end + @testset "distributing an array" begin function test_dist(X) X1 = Distribute(Blocks(10, 20), X) @test X1 == X - Xc = compute(X1) + Xc = fetch(X1) @test chunks(Xc) |> size == (10, 5) @test domainchunks(Xc) |> size == (10, 5) @test map(x->size(x) == (10, 20), domainchunks(Xc)) |> all end x = [1 2; 3 4] @test Distribute(Blocks(1,1), x) == x - #test_dist(rand(100, 100)) - #test_dist(sprand(100, 100, 0.1)) + test_dist(rand(100, 100)) + test_dist(sprand(100, 100, 0.1)) x = distribute(rand(10), 2) @test collect(distribute(x, 3)) == collect(x) @@ -66,7 +85,7 @@ end x, y = size(X) X1 = Distribute(Blocks(10, 20), X) @test X1' == X' - Xc = compute(X1') + Xc = fetch(X1') @test chunks(Xc) |> size == (div(y, 20), div(x,10)) @test domainchunks(Xc) |> size == (div(y, 20), div(x, 10)) @test map(x->size(x) == (20, 10), domainchunks(Xc)) |> all @@ -81,9 +100,9 @@ end function test_mul(X) tol = 1e-12 X1 = Distribute(Blocks(10, 20), X) - @test_throws DimensionMismatch compute(X1*X1) - X2 = compute(X1'*X1) - X3 = compute(X1*X1') + @test_throws DimensionMismatch fetch(X1*X1) + X2 = fetch(X1'*X1) + X3 = fetch(X1*X1') @test norm(collect(X2) - X'X) < tol @test norm(collect(X3) - X*X') < tol @test chunks(X2) |> size == (2, 2) @@ -100,7 +119,7 @@ end end @testset "matrix powers" begin - x = compute(rand(Blocks(4,4), 16, 16)) + x = fetch(rand(Blocks(4,4), 16, 16)) @test collect(x^1) == collect(x) @test collect(x^2) == collect(x*x) @test collect(x^3) == collect(x*x*x) @@ -113,7 +132,7 @@ end @test hcat(m,m) == collect(hcat(x,x)) @test vcat(m,m) == collect(vcat(x,x)) @test hcat(m,m) == collect(hcat(x,y)) - @test_throws DimensionMismatch compute(vcat(x,y)) + @test_throws DimensionMismatch fetch(vcat(x,y)) end @testset "scale" begin @@ -137,10 +156,9 @@ end @test collect(X[[], []]) == x[[], []] @testset "dimensionality reduction" begin - # THESE NEED FIXING!! 
@test vec(collect(X[ragged_idx, 5])) == vec(x[ragged_idx, 5]) @test vec(collect(X[5, ragged_idx])) == vec(x[5, ragged_idx]) - @test collect(X[5, 5]) == x[5,5] + @test X[5, 5] == x[5,5] end end @@ -150,7 +168,7 @@ end y = rand(10, 10) xs = distribute(y, Blocks(2,2)) for i=1:10, j=1:10 - @test compute(xs[i:j, j:i]) == y[i:j, j:i] + @test fetch(xs[i:j, j:i]) == y[i:j, j:i] end end @@ -200,8 +218,8 @@ end @test collect(sort(y)) == x x = ones(10) - y = compute(Distribute(Blocks(3), x)) - #@test map(x->length(collect(x)), compute(sort(y)).chunks) == [3,3,3,1] + y = fetch(Distribute(Blocks(3), x)) + @test_broken map(x->length(collect(x)), fetch(sort(y)).chunks) == [3,3,3,1] end using MemPool @@ -219,20 +237,9 @@ using MemPool @test aff[2] == sizeof(Int)*10 end -@testset "show_plan" begin - @test !isempty(Dagger.show_plan(Dagger.Thunk(()->10))) -end - -#= -@testset "darray distributed refcount" begin - D2 = remotecall_fetch(2, compute(Distribute(Blocks(10, 20), rand(40,40)))) do D - D2 = D - end - @test size(collect(D2)) == (40,40) - GC.gc() - @test size(collect(D2)) == (40,40) -end -=# +#=@testset "show_plan" begin + @test !isempty(Dagger.show_plan(Dagger.spawn(()->10))) +end=# @testset "sharedarray" begin A = SharedArray{Int}((1024,)) diff --git a/test/fakeproc.jl b/test/fakeproc.jl index 54328697b..3f7c0b0b5 100644 --- a/test/fakeproc.jl +++ b/test/fakeproc.jl @@ -1,7 +1,7 @@ @everywhere begin using Dagger -import Dagger: ThreadProc +import Dagger: OSProc, ThreadProc struct FakeProc <: Dagger.Processor owner::Int @@ -18,6 +18,7 @@ fakesum(xs...) = FakeVal(sum(map(y->y.x, xs))) Dagger.iscompatible_func(proc::FakeProc, opts, f) = true Dagger.iscompatible_arg(proc::FakeProc, opts, ::Type{<:Integer}) = true Dagger.iscompatible_arg(proc::FakeProc, opts, ::Type{<:FakeVal}) = true +Dagger.move(from_proc::OSProc, to_proc::FakeProc, x::Integer) = FakeVal(x) Dagger.move(from_proc::ThreadProc, to_proc::FakeProc, x::Integer) = FakeVal(x) Dagger.execute!(proc::FakeProc, func, args...) 
= FakeVal(42+func(args...).x) diff --git a/test/logging.jl b/test/logging.jl index c4431bba5..bb97811d0 100644 --- a/test/logging.jl +++ b/test/logging.jl @@ -3,18 +3,17 @@ import TimespanLogging: Timespan, Event, Events, LocalEventLog, MultiEventLog @testset "Logging" begin @testset "LocalEventLog" begin - @testset "ArrayOp" begin + @testset "Basics" begin ctx = Context() log = LocalEventLog() ctx.log_sink = log - sz = 2^4 - bsz = 2^2 - X = rand(Float32, sz, sz) - XD = Distribute(Blocks(bsz, bsz), X) + X = rand(Float32, 4, 3) + a = delayed(sum)(X) + b = delayed(sum)(X) + c = delayed(+)(a,b) - dag = XD*XD - collect(ctx, dag) + compute(ctx, c) logs = TimespanLogging.get_logs!(log) @test logs isa Vector{Timespan} @test length(logs) > 0 @@ -48,10 +47,13 @@ import TimespanLogging: Timespan, Event, Events, LocalEventLog, MultiEventLog end @testset "Automatic Plan Rendering" begin - x = compute(rand(Blocks(2,2),4,4)) + X = rand(Float32, 4, 3) + a = delayed(sum)(X) + b = delayed(sum)(X) + c = delayed(+)(a,b) mktemp() do path, io ctx = Context(;log_sink=LocalEventLog(),log_file=path) - compute(ctx, x * x) + compute(ctx, c) plan = String(read(io)) @test occursin("digraph {", plan) @test occursin("Move:", plan) @@ -73,8 +75,11 @@ import TimespanLogging: Timespan, Event, Events, LocalEventLog, MultiEventLog ml[:psat] = Dagger.Events.ProcessorSaturation() ctx.log_sink = ml - A = rand(Blocks(4, 4), 16, 16) - collect(ctx, A*A) + X = rand(Float32, 4, 3) + a = delayed(sum)(X) + b = delayed(sum)(X) + c = delayed(+)(a,b) + compute(ctx, c) logs = TimespanLogging.get_logs!(ml) for w in keys(logs) diff --git a/test/mutation.jl b/test/mutation.jl index 48b192476..f076719d4 100644 --- a/test/mutation.jl +++ b/test/mutation.jl @@ -41,11 +41,12 @@ end @testset "@mutable" begin w = first(workers()) + @assert w != 1 "Not enough workers to test mutability" x = remotecall_fetch(w) do Dagger.@mutable Ref{Int}() end @test fetch(Dagger.@spawn (x->x[] = myid())(x)) == w - @test_throws_unwrap Dagger.ThunkFailedException fetch(Dagger.@spawn single=last(workers()) (x->x[] = myid())(x)) + @test_throws_unwrap Dagger.ThunkFailedException fetch(Dagger.@spawn single=1 (x->x[] = myid())(x)) end # @testset "@mutable" @testset "Shard" begin diff --git a/test/options.jl b/test/options.jl index 79c3449b8..93cf58680 100644 --- a/test/options.jl +++ b/test/options.jl @@ -70,7 +70,7 @@ end @test fetch(Dagger.@spawn sf(obj)) == 0 @test fetch(Dagger.@spawn sf(obj)) == 0 end - Dagger.with_options(;scope=Dagger.ExactScope(Dagger.ThreadProc(1,1)), processor=Dagger.ThreadProc(1,1), meta=true) do + Dagger.with_options(;scope=Dagger.ExactScope(Dagger.ThreadProc(1,1)), processor=OSProc(1), meta=true) do @test fetch(Dagger.@spawn sf(obj)) == 43 @test fetch(Dagger.@spawn sf(obj)) == 43 end diff --git a/test/processors.jl b/test/processors.jl index 75870cb4e..9efcc1a4b 100644 --- a/test/processors.jl +++ b/test/processors.jl @@ -37,9 +37,9 @@ end end @testset "Processor exhaustion" begin opts = ThunkOptions(proclist=[OptOutProc]) - @test_throws_unwrap Dagger.ThunkFailedException ex isa Dagger.Sch.SchedulingException ex.reason="No processors available, try making proclist more liberal" collect(delayed(sum; options=opts)([1,2,3])) + @test_throws_unwrap Dagger.ThunkFailedException ex isa Dagger.Sch.SchedulingException ex.reason="No processors available, try widening scope" collect(delayed(sum; options=opts)([1,2,3])) opts = ThunkOptions(proclist=(proc)->false) - @test_throws_unwrap Dagger.ThunkFailedException ex isa Dagger.Sch.SchedulingException 
ex.reason="No processors available, try making proclist more liberal" collect(delayed(sum; options=opts)([1,2,3])) + @test_throws_unwrap Dagger.ThunkFailedException ex isa Dagger.Sch.SchedulingException ex.reason="No processors available, try widening scope" collect(delayed(sum; options=opts)([1,2,3])) opts = ThunkOptions(proclist=nothing) @test collect(delayed(sum; options=opts)([1,2,3])) == 6 end diff --git a/test/scheduler.jl b/test/scheduler.jl index 96611b82d..28ddcb5c6 100644 --- a/test/scheduler.jl +++ b/test/scheduler.jl @@ -50,7 +50,7 @@ function dynamic_get_dag(x...) end function dynamic_add_thunk(x) h = sch_handle() - id = Dagger.Sch.add_thunk!(h, x) do y + id = Dagger.Sch.add_thunk!(h, nothing=>x) do y y+1 end wait(h, id) @@ -58,7 +58,7 @@ function dynamic_add_thunk(x) end function dynamic_add_thunk_self_dominated(x) h = sch_handle() - id = Dagger.Sch.add_thunk!(h, h.thunk_id, x) do y + id = Dagger.Sch.add_thunk!(h, nothing=>h.thunk_id, nothing=>x) do y y+1 end return fetch(h, id) @@ -205,7 +205,7 @@ end # until we end up on a non-blocked worker h = Dagger.Sch.sch_handle() wkrs = Dagger.Sch.exec!(_list_workers, h) - id = Dagger.Sch.add_thunk!(testfun, h, i) + id = Dagger.Sch.add_thunk!(testfun, h, nothing=>i) return fetch(h, id) end return myid() diff --git a/test/scopes.jl b/test/scopes.jl index 6b5b6252f..d5d7e0a6b 100644 --- a/test/scopes.jl +++ b/test/scopes.jl @@ -31,6 +31,17 @@ es1_ch = Dagger.tochunk(nothing, OSProc(), es1) es2_ch = Dagger.tochunk(nothing, OSProc(), es2) + os1 = ExactScope(OSProc(1)) + + @testset "Default Scope" begin + ds = DefaultScope() + for (s1, s2) in ((ds, es1), (es1, ds)) + @test Dagger.constrain(s1, s2) == es1 + end + for (s1, s2) in ((ds, os1), (os1, ds)) + @test Dagger.constrain(s1, s2) isa Dagger.InvalidScope + end + end @testset "Node Scope" begin @everywhere node_scope_test(ch...) 
= Dagger.system_uuid() @@ -113,7 +124,7 @@ @test fetch(Dagger.@spawn exact_scope_test(us_es1_multi_ch)) == es1.processor # No inner scopes - @test_throws ArgumentError UnionScope() + @test UnionScope() isa UnionScope # Same inner scope @test fetch(Dagger.@spawn exact_scope_test(us_es1_ch, us_es1_ch)) == es1.processor @@ -128,7 +139,104 @@ @test es1 in us_res.scopes @test !(es2 in us_res.scopes) end + @testset "Processor Type Scope" begin + pts_th = ProcessorTypeScope(Dagger.ThreadProc) + pts_os = ProcessorTypeScope(Dagger.OSProc) + + @test Dagger.constrain(pts_th, es1) == es1 + @test Dagger.constrain(pts_th, os1) isa Dagger.InvalidScope + + @test Dagger.constrain(pts_os, es1) isa Dagger.InvalidScope + @test Dagger.constrain(pts_os, os1) == os1 + + # Duplicate + pts_th_dup = Dagger.constrain(pts_th, pts_th) + @test Dagger.constrain(pts_th_dup, es1) == es1 + @test Dagger.constrain(pts_th_dup, os1) isa Dagger.InvalidScope + + # Empty intersection + pts_all = Dagger.constrain(pts_th, pts_os) + @test Dagger.constrain(pts_all, es1) isa Dagger.InvalidScope + @test Dagger.constrain(pts_all, os1) isa Dagger.InvalidScope + end # TODO: Test scope propagation + @testset "scope helper" begin + @test Dagger.scope(:any) isa AnyScope + @test Dagger.scope(:default) == DefaultScope() + @test_throws ArgumentError Dagger.scope(:blah) + @test Dagger.scope(()) == UnionScope() + + @test Dagger.scope(worker=wid1) == + Dagger.scope(workers=[wid1]) == + ProcessScope(wid1) + @test Dagger.scope(workers=[wid1,wid2]) == UnionScope([ProcessScope(wid1), + ProcessScope(wid2)]) + @test Dagger.scope(workers=[]) == UnionScope() + + @test Dagger.scope(thread=1) == + Dagger.scope(threads=[1]) == + UnionScope([ExactScope(Dagger.ThreadProc(w,1)) for w in procs()]) + @test Dagger.scope(threads=[1,2]) == UnionScope([ExactScope(Dagger.ThreadProc(w,t)) for t in [1,2] for w in procs()]) + @test Dagger.scope(threads=[]) == UnionScope() + + @test Dagger.scope(worker=wid1,thread=1) == + Dagger.scope(thread=1,worker=wid1) == + Dagger.scope(workers=[wid1],thread=1) == + Dagger.scope(worker=wid1,threads=[1]) == + Dagger.scope(workers=[wid1],threads=[1]) == + ExactScope(Dagger.ThreadProc(wid1,1)) + + @test_throws ArgumentError Dagger.scope(blah=1) + @test_throws ArgumentError Dagger.scope(thread=1, blah=1) + + @test Dagger.scope(worker=1,thread=1) == + Dagger.scope((worker=1,thread=1)) == + Dagger.scope(((worker=1,thread=1),)) + @test Dagger.scope((worker=1,thread=1),(worker=wid1,thread=2)) == + Dagger.scope(((worker=1,thread=1),(worker=wid1,thread=2),)) == + Dagger.scope(((worker=1,thread=1),), ((worker=wid1,thread=2),)) == + UnionScope([ExactScope(Dagger.ThreadProc(1, 1)), + ExactScope(Dagger.ThreadProc(wid1, 2))]) + @test_throws ArgumentError Dagger.scope((;blah=1)) + @test_throws ArgumentError Dagger.scope((thread=1, blah=1)) + + @testset "custom handler" begin + @eval begin + Dagger.scope_key_precedence(::Val{:gpu}) = 1 + Dagger.scope_key_precedence(::Val{:rocm}) = 2 + Dagger.scope_key_precedence(::Val{:cuda}) = 2 + + # Some fake scopes to use as sentinels + Dagger.to_scope(::Val{:gpu}, sc::NamedTuple) = ExactScope(Dagger.ThreadProc(1, sc.device)) + Dagger.to_scope(::Val{:rocm}, sc::NamedTuple) = ExactScope(Dagger.ThreadProc($wid1, sc.gpu)) + Dagger.to_scope(::Val{:cuda}, sc::NamedTuple) = ExactScope(Dagger.ThreadProc($wid2, sc.gpu)) + end + + @test Dagger.scope(gpu=1,device=2) == + Dagger.scope(device=2,gpu=1) == + Dagger.scope(gpu=1,device=2,blah=3) == + Dagger.scope((gpu=1,device=2,blah=3)) == + ExactScope(Dagger.ThreadProc(1, 2)) + 
@test Dagger.scope((gpu=1,device=2),(worker=1,thread=1)) == + Dagger.scope((worker=1,thread=1),(device=2,gpu=1)) == + UnionScope([ExactScope(Dagger.ThreadProc(1, 2)), + ExactScope(Dagger.ThreadProc(1, 1))]) + @test Dagger.scope((gpu=1,device=2),(device=3,gpu=1)) == + UnionScope([ExactScope(Dagger.ThreadProc(1, 2)), + ExactScope(Dagger.ThreadProc(1, 3))]) + + @test Dagger.scope(rocm=1,gpu=2) == + Dagger.scope(gpu=2,rocm=1) == + ExactScope(Dagger.ThreadProc(wid1, 2)) + @test Dagger.scope(cuda=1,gpu=2) == + Dagger.scope(gpu=2,cuda=1) == + ExactScope(Dagger.ThreadProc(wid2, 2)) + @test_throws ArgumentError Dagger.scope(rocm=1,cuda=1,gpu=2) + @test_throws ArgumentError Dagger.scope(gpu=2,rocm=1,cuda=1) + @test_throws ArgumentError Dagger.scope((rocm=1,cuda=1,gpu=2)) + end + end + rmprocs([wid1, wid2]) end diff --git a/test/thunk.jl b/test/thunk.jl index 1055819a2..c99a2bbfb 100644 --- a/test/thunk.jl +++ b/test/thunk.jl @@ -6,7 +6,7 @@ import Dagger: Chunk function dynamic_fib(n) n <= 1 && return n - t = Dagger.spawn(dynamic_fib, n-1) + t = Dagger.@spawn dynamic_fib(n-1) y = dynamic_fib(n-2) return (fetch(t)::Int) + y end @@ -21,6 +21,7 @@ import Dagger: Chunk end MulProc() = MulProc(myid()) Dagger.get_parent(mp::MulProc) = OSProc(mp.owner) + Dagger.move(src::MulProc, dest::Dagger.OSProc, x::Function) = Base.:* Dagger.move(src::MulProc, dest::Dagger.ThreadProc, x::Function) = Base.:* end @@ -54,13 +55,27 @@ end a = @spawn x + x @test a isa Dagger.EagerThunk b = @spawn sum([x,1,2]) - c = spawn(*, a, b) + c = @spawn a * b @test c isa Dagger.EagerThunk @test fetch(a) == 4 @test fetch(b) == 5 @test fetch(c) == 20 end @test Dagger.Sch.EAGER_CONTEXT[] isa Context + @testset "keyword arguments" begin + A = rand(4, 4) + @test fetch(@spawn sum(A; dims=1)) ≈ sum(A; dims=1) + + @test_throws_unwrap Dagger.ThunkFailedException fetch(@spawn sum(A; fakearg=2)) + + @test fetch(@spawn reduce(+, A; dims=1, init=2.0)) ≈ + reduce(+, A; dims=1, init=2.0) + end + @testset "broadcast" begin + A, B = rand(4), rand(4) + @test fetch(@spawn A .+ B) ≈ A .+ B + @test fetch(@spawn A .* B) ≈ A .* B + end @testset "waiting" begin a = @spawn sleep(1) @test !isready(a) @@ -95,9 +110,9 @@ end end ex = Dagger.Sch.unwrap_nested_exception(ex) ex_str = sprint(io->Base.showerror(io,ex)) - @test occursin(r"^ThunkFailedException \(Thunk.*failure\):", ex_str) + @test occursin(r"^ThunkFailedException:", ex_str) @test occursin("Test", ex_str) - @test !occursin("due to a failure in", ex_str) + @test !occursin("Root Thunk", ex_str) ex = try fetch(b) @@ -106,8 +121,8 @@ end end ex = Dagger.Sch.unwrap_nested_exception(ex) ex_str = sprint(io->Base.showerror(io,ex)) - @test occursin(r"Thunk.*failure due to a failure in", ex_str) @test occursin("Test", ex_str) + @test occursin("Root Thunk", ex_str) end @testset "single dependent" begin a = @spawn error("Test") @@ -155,7 +170,7 @@ end end end @testset "remote spawn" begin - a = fetch(Distributed.@spawnat 2 Dagger.spawn(+, 1, 2)) + a = fetch(Distributed.@spawnat 2 Dagger.@spawn 1+2) @test Dagger.Sch.EAGER_INIT[] @test fetch(Distributed.@spawnat 2 !(Dagger.Sch.EAGER_INIT[])) @test a isa Dagger.EagerThunk @@ -197,7 +212,7 @@ end a = delayed(+; processor=Dagger.ThreadProc(1,1))(1,2) @test a.f isa Chunk @test a.f.processor isa Dagger.ThreadProc - @test a.f.scope isa AnyScope + @test a.f.scope == DefaultScope() a = delayed(+; processor=Dagger.ThreadProc(1,1), scope=NodeScope())(1,2) @test a.f isa Chunk @@ -221,19 +236,19 @@ end end end @testset "eager API" begin - _a = Dagger.spawn(+, 1, 2; 
scope=NodeScope()) + _a = Dagger.@spawn scope=NodeScope() 1+2 a = Dagger.Sch._find_thunk(_a) @test a.f isa Chunk @test a.f.processor isa OSProc @test a.f.scope isa NodeScope - _a = Dagger.spawn(+, 1, 2; processor=Dagger.ThreadProc(1,1)) + _a = Dagger.@spawn processor=Dagger.ThreadProc(1,1) 1+2 a = Dagger.Sch._find_thunk(_a) @test a.f isa Chunk @test a.f.processor isa Dagger.ThreadProc - @test a.f.scope isa AnyScope + @test a.f.scope == DefaultScope() - _a = Dagger.spawn(+, 1, 2; processor=Dagger.ThreadProc(1,1), scope=NodeScope()) + _a = Dagger.@spawn processor=Dagger.ThreadProc(1,1) scope=NodeScope() 1+2 a = Dagger.Sch._find_thunk(_a) @test a.f isa Chunk @test a.f.processor isa Dagger.ThreadProc @@ -245,12 +260,12 @@ end s = p -> p == Dagger.ThreadProc(1, 1) f = (x) -> 10 + x - g = (x) -> fetch(Dagger.spawn(f, x; proclist=s)) - fetch(Dagger.spawn(g, 10; proclist=s)) + g = (x) -> fetch(Dagger.@spawn proclist=s f(x)) + fetch(Dagger.@spawn proclist=s g(10)) end @testset "no cross-scheduler Thunk usage" begin a = delayed(+)(1,2) - @test_throws Exception Dagger.spawn(identity, a) + @test_throws Exception Dagger.@spawn identity(a) end @testset "@sync support" begin result = Dagger.@spawn sleep(1)