From ac95f8aa8973c0648ae4d1028dac1517d357cbc3 Mon Sep 17 00:00:00 2001
From: Julian P Samaroo
Date: Mon, 26 Sep 2022 06:58:51 -0500
Subject: [PATCH 01/44] Add missing check to walk_data

Co-authored-by: krynju
---
 src/sch/util.jl | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/sch/util.jl b/src/sch/util.jl
index f6fffa360..9b675a654 100644
--- a/src/sch/util.jl
+++ b/src/sch/util.jl
@@ -413,6 +413,11 @@ Walks the data contained in `x` in DFS fashion, and executes `f` at each object
 that hasn't yet been seen.
 """
 function walk_data(f, @nospecialize(x))
+    action = f(x)
+    if action !== missing
+        return action
+    end
+
     seen = IdDict{Any,Nothing}()
     to_visit = Any[x]

From 142eae364df670832c01e31b7aad1de856ea16c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Krystian=20Guli=C5=84ski?=
Date: Sat, 5 Nov 2022 13:15:56 +0100
Subject: [PATCH 02/44] Update Project.toml

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index f09176147..7c8c02e14 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "Dagger"
 uuid = "d58978e5-989f-55fb-8d15-ea34adc7bf54"
-version = "0.16.1"
+version = "0.16.2"

 [deps]
 Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"

From 39200fab2a65a076aa66077c50ae29a7ba5623c5 Mon Sep 17 00:00:00 2001
From: Julian P Samaroo
Date: Thu, 10 Nov 2022 10:15:00 -0600
Subject: [PATCH 03/44] signature: Don't capture input arguments

---
 src/sch/util.jl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/sch/util.jl b/src/sch/util.jl
index 9b675a654..85d2a8978 100644
--- a/src/sch/util.jl
+++ b/src/sch/util.jl
@@ -252,7 +252,9 @@ end

 chunktype(x) = typeof(x)
 function signature(task::Thunk, state)
     sig = Any[chunktype(task.f)]
-    append!(sig, collect_task_inputs(state, task))
+    for input in collect_task_inputs(state, task)
+        push!(sig, chunktype(input))
+    end
     sig
 end

From 66bf970a2272cd95cff03ddd07eaa1ce1ae1c66c Mon Sep 17 00:00:00 2001
From: Julian P Samaroo
Date: Thu, 10 Nov 2022 12:23:02 -0600
Subject: [PATCH 04/44] chunks: Allow weak Chunk references in Thunk args

Implement WeakChunk like WeakThunk

Swap Chunk for WeakChunk in eager thunk submission
---
 src/chunks.jl    | 10 ++++++++++
 src/sch/Sch.jl   |  2 +-
 src/sch/eager.jl |  5 ++++-
 src/sch/util.jl  |  2 +-
 4 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/src/chunks.jl b/src/chunks.jl
index 9745cb598..31098f3b8 100644
--- a/src/chunks.jl
+++ b/src/chunks.jl
@@ -266,6 +266,16 @@ function savechunk(data, dir, f)
     Chunk{typeof(data),typeof(fr),typeof(proc),typeof(scope)}(typeof(data), domain(data), fr, proc, scope, true)
 end

+struct WeakChunk
+    x::WeakRef
+end
+WeakChunk(c::Chunk) = WeakChunk(WeakRef(c))
+unwrap_weak(c::WeakChunk) = c.x.value
+function unwrap_weak_checked(c::WeakChunk)
+    c = unwrap_weak(c)
+    @assert c !== nothing
+    return c
+end
 Base.@deprecate_binding AbstractPart Union{Chunk, Thunk}
 Base.@deprecate_binding Part Chunk

diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl
index b68a519c8..d9e7e242a 100644
--- a/src/sch/Sch.jl
+++ b/src/sch/Sch.jl
@@ -8,7 +8,7 @@ import Statistics: mean
 import Random: randperm

 import ..Dagger
-import ..Dagger: Context, Processor, Thunk, WeakThunk, ThunkFuture, ThunkFailedException, Chunk, OSProc, AnyScope
+import ..Dagger: Context, Processor, Thunk, WeakThunk, ThunkFuture, ThunkFailedException, Chunk, WeakChunk, OSProc, AnyScope
 import ..Dagger: order, dependents, noffspring, istask, inputs, unwrap_weak_checked, affinity, tochunk, timespan_start, timespan_finish, procs, move, chunktype, processor,
default_enabled, get_processors, get_parent, execute!, rmprocs!, addprocs!, thunk_processor, constrain, cputhreadtime const OneToMany = Dict{Thunk, Set{Thunk}} diff --git a/src/sch/eager.jl b/src/sch/eager.jl index 0b15543e2..e1126cbc8 100644 --- a/src/sch/eager.jl +++ b/src/sch/eager.jl @@ -90,7 +90,10 @@ function eager_thunk() added_future, future, uid, ref, f, args, opts = take!(EAGER_THUNK_CHAN) # preserve inputs until they enter the scheduler tid = GC.@preserve args begin - _args = map(x->x isa Dagger.EagerThunk ? ThunkID(EAGER_ID_MAP[x.uid], x.thunk_ref) : x, args) + _args = map(x->x isa Dagger.EagerThunk ? ThunkID(EAGER_ID_MAP[x.uid], x.thunk_ref) : + x isa Dagger.Chunk ? WeakChunk(x) : + x, + args) add_thunk!(f, h, _args...; future=future, ref=ref, opts...) end EAGER_ID_MAP[uid] = tid.id diff --git a/src/sch/util.jl b/src/sch/util.jl index 85d2a8978..e1704b11b 100644 --- a/src/sch/util.jl +++ b/src/sch/util.jl @@ -113,7 +113,7 @@ function reschedule_inputs!(state, thunk, seen=Set{Thunk}()) input in seen && continue # Unseen - if istask(input) || (input isa Chunk) + if istask(input) || isa(input, Chunk) push!(get!(()->Set{Thunk}(), state.waiting_data, input), thunk) end istask(input) || continue From 6f606ce6ef82ff4473c78c5f51cebb73156c1ac2 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Sun, 13 Nov 2022 17:37:11 -0600 Subject: [PATCH 05/44] DaggerWebDash: Add Mux 1.x to compat --- lib/DaggerWebDash/Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/DaggerWebDash/Project.toml b/lib/DaggerWebDash/Project.toml index fb1d99d1c..18d196a85 100644 --- a/lib/DaggerWebDash/Project.toml +++ b/lib/DaggerWebDash/Project.toml @@ -19,7 +19,7 @@ TimespanLogging = "a526e669-04d3-4846-9525-c66122c55f63" Dagger = "0.16" JSON3 = "1" MemPool = "0.3, 0.4" -Mux = "0.7" +Mux = "0.7, 1" ProfileSVG = "0.2" StructTypes = "1" Tables = "1" From 188de76821513ec946cc5790ea7ad7b7187e3171 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krystian=20Guli=C5=84ski?= Date: Mon, 14 Nov 2022 16:19:26 +0100 Subject: [PATCH 06/44] Add DTables.jl CI run (#366) --- .buildkite/pipeline.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 8f3bb2f07..f46473b40 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -93,3 +93,16 @@ steps: BENCHMARK_SCALE: "5:5:50" artifacts: - benchmarks/result* + - label: DTables.jl stability test + timeout_in_minutes: 20 + plugins: + - JuliaCI/julia#v1: + version: "1.8" + env: + JULIA_NUM_THREADS: "4" + agents: + queue: "juliaecosystem" + sandbox.jl: "true" + os: linux + arch: x86_64 + command: "git clone https://github.com/JuliaParallel/DTables.jl.git ; julia -t4 -e 'using Pkg; Pkg.activate(\"DTables.jl\"); Pkg.develop(;path=\".\"); Pkg.instantiate(); Pkg.test()'" From 66d51472cdb60cf18dcddbc6536a0a4abd97402d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krystian=20Guli=C5=84ski?= Date: Mon, 5 Dec 2022 19:21:10 +0100 Subject: [PATCH 07/44] Update Project.toml --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 7c8c02e14..4052255f3 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Dagger" uuid = "d58978e5-989f-55fb-8d15-ea34adc7bf54" -version = "0.16.2" +version = "0.16.3" [deps] Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" From 4e8b20990583b8b09eb760e6be5ab2cac62f34f8 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Fri, 3 Feb 2023 09:00:10 -0600 Subject: [PATCH 08/44] thunk: Improve 
ThunkFailedException printing --- src/thunk.jl | 58 ++++++++++++++++++++++++++++++++++++++++++++------- test/thunk.jl | 6 +++--- 2 files changed, 54 insertions(+), 10 deletions(-) diff --git a/src/thunk.jl b/src/thunk.jl index 72fb2576f..6168ff330 100644 --- a/src/thunk.jl +++ b/src/thunk.jl @@ -193,15 +193,59 @@ ThunkFailedException(thunk, origin, ex::E) where E = ThunkFailedException{E}(convert(WeakThunk, thunk), convert(WeakThunk, origin), ex) function Base.showerror(io::IO, ex::ThunkFailedException) t = unwrap_weak(ex.thunk) - o = unwrap_weak(ex.origin) - t_str = t !== nothing ? "$t" : "?" - o_str = o !== nothing ? "$o" : "?" + + # Find root-cause thunk + last_tfex = ex + failed_tasks = Union{Thunk,Nothing}[] + while last_tfex.ex isa ThunkFailedException && unwrap_weak(last_tfex.ex.origin) !== nothing + push!(failed_tasks, unwrap_weak(last_tfex.thunk)) + last_tfex = last_tfex.ex + end + o = unwrap_weak(last_tfex.origin) + root_ex = last_tfex.ex + + function thunk_string(t) + if t === nothing + return "Thunk(?)" + end + Tinputs = Any[] + for input in t.inputs + input = unwrap_weak(input) + if istask(input) + push!(Tinputs, "Thunk(id=$(input.id))") + else + push!(Tinputs, input) + end + end + t_sig = if length(Tinputs) <= 4 + "$(t.f)($(join(Tinputs, ", "))))" + else + "$(t.f)($(length(Tinputs)) inputs...)" + end + return "Thunk(id=$(t.id), $t_sig" + end + t_str = thunk_string(t) + o_str = thunk_string(o) t_id = t !== nothing ? t.id : '?' o_id = o !== nothing ? o.id : '?' - println(io, "ThunkFailedException ($t failure", - (o !== nothing && t != o) ? " due to a failure in $o)" : ")", - ":") - Base.showerror(io, ex.ex) + println(io, "ThunkFailedException:") + println(io, " Root Exception Type: $(typeof(root_ex))") + println(io, " Root Exception:") + Base.showerror(io, root_ex); println(io) + if t !== o + println(io, " Root Thunk: $o_str") + if length(failed_tasks) <= 4 + for i in failed_tasks + i_str = thunk_string(i) + println(io, " Inner Thunk: $i_str") + end + else + println(io, " ...") + println(io, " $(length(failed_tasks)) Inner Thunks...") + println(io, " ...") + end + end + print(io, " This Thunk: $t_str") end """ diff --git a/test/thunk.jl b/test/thunk.jl index 1055819a2..3df8d36ef 100644 --- a/test/thunk.jl +++ b/test/thunk.jl @@ -95,9 +95,9 @@ end end ex = Dagger.Sch.unwrap_nested_exception(ex) ex_str = sprint(io->Base.showerror(io,ex)) - @test occursin(r"^ThunkFailedException \(Thunk.*failure\):", ex_str) + @test occursin(r"^ThunkFailedException:", ex_str) @test occursin("Test", ex_str) - @test !occursin("due to a failure in", ex_str) + @test !occursin("Root Thunk", ex_str) ex = try fetch(b) @@ -106,8 +106,8 @@ end end ex = Dagger.Sch.unwrap_nested_exception(ex) ex_str = sprint(io->Base.showerror(io,ex)) - @test occursin(r"Thunk.*failure due to a failure in", ex_str) @test occursin("Test", ex_str) + @test occursin("Root Thunk", ex_str) end @testset "single dependent" begin a = @spawn error("Test") From 2ebb51b54168beaa6841b47c474d5962ec32b3f8 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Fri, 3 Feb 2023 09:00:32 -0600 Subject: [PATCH 09/44] Sch: Fix task error-related assertion --- src/sch/Sch.jl | 17 ++++++++++++++++- src/sch/util.jl | 5 ++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl index d9e7e242a..1fb0652d7 100644 --- a/src/sch/Sch.jl +++ b/src/sch/Sch.jl @@ -610,7 +610,22 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) end task = pop!(state.ready) timespan_start(ctx, :schedule, task.id, 
(;thunk_id=task.id)) - @assert !haskey(state.cache, task) + if haskey(state.cache, task) + if haskey(state.errored, task) + # An error was eagerly propagated to this task + finish_failed!(state, task) + else + # This shouldn't have happened + iob = IOBuffer() + println(iob, "Scheduling inconsistency: Task being scheduled is already cached!") + println(iob, " Task: $(task.id)") + println(iob, " Cache Entry: $(typeof(state.cache[task]))") + ex = SchedulingException(String(take!(iob))) + state.cache[task] = ex + state.errored[task] = true + end + @goto pop_task + end opts = merge(ctx.options, task.options) sig = signature(task, state) diff --git a/src/sch/util.jl b/src/sch/util.jl index e1704b11b..864ac65e2 100644 --- a/src/sch/util.jl +++ b/src/sch/util.jl @@ -145,6 +145,9 @@ function set_failed!(state, origin, thunk=origin) filter!(x->x!==thunk, state.ready) state.cache[thunk] = ThunkFailedException(thunk, origin, state.cache[origin]) state.errored[thunk] = true + finish_failed!(state, thunk, origin) +end +function finish_failed!(state, thunk, origin=nothing) fill_registered_futures!(state, thunk, true) if haskey(state.waiting_data, thunk) for dep in state.waiting_data[thunk] @@ -152,7 +155,7 @@ function set_failed!(state, origin, thunk=origin) delete!(state.waiting, dep) haskey(state.errored, dep) && continue - set_failed!(state, origin, dep) + origin !== nothing && set_failed!(state, origin, dep) end delete!(state.waiting_data, thunk) end From 42bd36fe57d68647d273777a282b0a75a173d339 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Fri, 24 Feb 2023 08:50:58 -0600 Subject: [PATCH 10/44] checkpoint: Use at-invokelatest --- src/sch/Sch.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl index 1fb0652d7..9386e9d34 100644 --- a/src/sch/Sch.jl +++ b/src/sch/Sch.jl @@ -6,6 +6,7 @@ import MemPool: DRef, StorageResource import MemPool: poolset, storage_available, storage_capacity, storage_utilized, externally_varying import Statistics: mean import Random: randperm +import Base: @invokelatest import ..Dagger import ..Dagger: Context, Processor, Thunk, WeakThunk, ThunkFuture, ThunkFailedException, Chunk, WeakChunk, OSProc, AnyScope @@ -527,7 +528,7 @@ function scheduler_run(ctx, state::ComputeState, d::Thunk, options) state.errored[node] = thunk_failed if node.options !== nothing && node.options.checkpoint !== nothing try - node.options.checkpoint(node, res) + @invokelatest node.options.checkpoint(node, res) catch err report_catch_error(err, "Thunk checkpoint failed") end @@ -892,7 +893,7 @@ function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state) end if thunk.options !== nothing && thunk.options.restore !== nothing try - result = thunk.options.restore(thunk) + result = @invokelatest thunk.options.restore(thunk) if result isa Chunk state.cache[thunk] = result state.errored[thunk] = false From f3986fc3d0fe3ebdb98824e33c54e2b1d896812d Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Fri, 24 Feb 2023 10:15:47 -0600 Subject: [PATCH 11/44] at-spawn: Support broadcasting --- src/thunk.jl | 13 ++++++++++++- test/thunk.jl | 5 +++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/thunk.jl b/src/thunk.jl index 6168ff330..f35ca5d3c 100644 --- a/src/thunk.jl +++ b/src/thunk.jl @@ -367,9 +367,20 @@ macro spawn(exs...) _par(ex; lazy=false, opts=opts) end +struct ExpandedBroadcast{F} end +(eb::ExpandedBroadcast{F})(args...) 
where F = + Base.materialize(Base.broadcasted(F, args...)) +replace_broadcast(ex) = ex +function replace_broadcast(fn::Symbol) + if startswith(string(fn), '.') + return :($ExpandedBroadcast{$(Symbol(string(fn)[2:end]))}()) + end + return fn +end + function _par(ex::Expr; lazy=true, recur=true, opts=()) if ex.head == :call && recur - f = ex.args[1] + f = replace_broadcast(ex.args[1]) args = ex.args[2:end] opts = esc.(opts) if lazy diff --git a/test/thunk.jl b/test/thunk.jl index 3df8d36ef..339f63fd3 100644 --- a/test/thunk.jl +++ b/test/thunk.jl @@ -61,6 +61,11 @@ end @test fetch(c) == 20 end @test Dagger.Sch.EAGER_CONTEXT[] isa Context + @testset "broadcast" begin + A, B = rand(4), rand(4) + @test fetch(@spawn A .+ B) ≈ A .+ B + @test fetch(@spawn A .* B) ≈ A .* B + end @testset "waiting" begin a = @spawn sleep(1) @test !isready(a) From 22378a19bba2955cc190715e65759d7fb99e3227 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Thu, 2 Mar 2023 13:23:57 -0600 Subject: [PATCH 12/44] DaggerWebDash: Updates for recent HTTP.jl --- lib/DaggerWebDash/Project.toml | 2 ++ lib/DaggerWebDash/src/d3.jl | 47 +++++++++++++++++----------------- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/lib/DaggerWebDash/Project.toml b/lib/DaggerWebDash/Project.toml index 18d196a85..1dccdec2f 100644 --- a/lib/DaggerWebDash/Project.toml +++ b/lib/DaggerWebDash/Project.toml @@ -6,6 +6,7 @@ version = "0.1.2" [deps] Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" Mux = "a975b10e-0019-58db-a62f-e48ff68538c9" @@ -17,6 +18,7 @@ TimespanLogging = "a526e669-04d3-4846-9525-c66122c55f63" [compat] Dagger = "0.16" +HTTP = "1.7" JSON3 = "1" MemPool = "0.3, 0.4" Mux = "0.7, 1" diff --git a/lib/DaggerWebDash/src/d3.jl b/lib/DaggerWebDash/src/d3.jl index bc77991cd..00d2ffb33 100644 --- a/lib/DaggerWebDash/src/d3.jl +++ b/lib/DaggerWebDash/src/d3.jl @@ -1,4 +1,7 @@ -using Mux, MemPool, Distributed, Sockets +using Mux, Sockets +import HTTP +import HTTP: WebSockets +using MemPool, Distributed struct LinePlot core_key::Symbol @@ -172,22 +175,22 @@ function client_handler(sock, id, port, port_range, config_updated, config, seek if id != myid() fsock_ready = Base.Event() worker_host, worker_port = worker_host_port(id, port, port_range) - Mux.HTTP.WebSockets.open("ws://$worker_host:$worker_port/data_feed") do _fsock + WebSockets.open("ws://$worker_host:$worker_port/data_feed") do _fsock fsock = _fsock @debug "D3R forwarder for $id ready" notify(fsock_ready) - while !eof(fsock) && isopen(fsock) + while true try - bytes = readavailable(fsock) + bytes = WebSockets.receive(fsock) if length(bytes) == 0 sleep(0.1) continue end data = String(bytes) #@info "D3R forwarder for $id received data" - write(sock, data) + WebSockets.send(sock, data) catch err - if err isa Mux.WebSockets.WebSocketClosedError || err isa Base.IOError + if err isa WebSockets.WebSocketError && err.message isa WebSockets.CloseFrameBody # Force-close client and forwarder @async close(sock) @async close(fsock) @@ -203,25 +206,25 @@ function client_handler(sock, id, port, port_range, config_updated, config, seek end if id == myid() @debug "D3R client for $id sending initial config" - write(sock, JSON3.write((;cmd="data", payload=sanitize(D3R_LOGS[port])))) + WebSockets.send(sock, JSON3.write((;cmd="data", payload=sanitize(D3R_LOGS[port])))) _workers = workers() if 
!(myid() in _workers) # FIXME: Get this from the Context _workers = vcat(myid(), _workers) end - write(sock, JSON3.write((;cmd="config", payload=sanitize((;myid=myid(),workers=_workers,ctxs=config))))) + WebSockets.send(sock, JSON3.write((;cmd="config", payload=sanitize((;myid=myid(),workers=_workers,ctxs=config))))) end push!(get!(()->[], D3R_CLIENT_SOCKETS[port], id), sock) @debug "D3R client for $id ready" - while !eof(sock) && isopen(sock) + while true try - data = String(read(sock)) + data = String(WebSockets.receive(sock)) @debug "D3R client for $id received: $data" if id == myid() #= FIXME if config_updated[] config_updated[] = false - write(sock, JSON3.write((;cmd="config", payload=sanitize(config)))) + WebSockets.send(sock, JSON3.write((;cmd="config", payload=sanitize(config)))) end =# if seek_store !== nothing @@ -231,7 +234,7 @@ function client_handler(sock, id, port, port_range, config_updated, config, seek for (idx,key) in enumerate(Tables.columnnames(raw_logs)) logs[key] = Tables.columns(raw_logs)[idx] end - write(sock, JSON3.write((;cmd="data", payload=sanitize(logs)))) + WebSockets.send(sock, JSON3.write((;cmd="data", payload=sanitize(logs)))) continue end m = match(r"seek\(([0-9]*),([0-9]*)\)", data) @@ -242,19 +245,19 @@ function client_handler(sock, id, port, port_range, config_updated, config, seek for (idx,key) in enumerate(Tables.columnnames(raw_logs)) logs[key] = Tables.columns(raw_logs)[idx] end - write(sock, JSON3.write((;cmd="data", payload=sanitize(logs)))) + WebSockets.send(sock, JSON3.write((;cmd="data", payload=sanitize(logs)))) continue end end if data == "data" - write(sock, JSON3.write((;cmd="data", payload=sanitize(D3R_LOGS[port])))) + WebSockets.send(sock, JSON3.write((;cmd="data", payload=sanitize(D3R_LOGS[port])))) end else @debug "D3R client sending to forwarder: $data" - write(fsock, data) + WebSockets.send(fsock, data) end catch err - if err isa Mux.WebSockets.WebSocketClosedError || err isa Base.IOError + if err isa WebSockets.WebSocketError && err.message isa WebSockets.CloseFrameBody idx = findfirst(x->x==sock, D3R_CLIENT_SOCKETS[port][id]) if idx !== nothing deleteat!(D3R_CLIENT_SOCKETS[port][id], idx) @@ -273,11 +276,9 @@ end function TimespanLogging.Events.creation_hook(d3r::D3Renderer, log) for sock in get!(()->[], get!(()->Dict{Int,Vector{Any}}(), D3R_CLIENT_SOCKETS, d3r.port), myid()) try - if isopen(sock) - write(sock, JSON3.write((;cmd="add", payload=sanitize(log)))) - end + WebSockets.send(sock, JSON3.write((;cmd="add", payload=sanitize(log)))) catch err - if err isa Mux.WebSockets.WebSocketClosedError + if err isa WebSockets.WebSocketError && err.message isa WebSockets.CloseFrameBody continue end rethrow(err) @@ -287,11 +288,9 @@ end function TimespanLogging.Events.deletion_hook(d3r::D3Renderer, idx) for sock in get!(()->[], get!(()->Dict{Int,Vector{Any}}(), D3R_CLIENT_SOCKETS, d3r.port), myid()) try - if isopen(sock) - write(sock, JSON3.write((;cmd="delete", payload=idx))) - end + WebSockets.send(sock, JSON3.write((;cmd="delete", payload=idx))) catch err - if err isa Mux.WebSockets.WebSocketClosedError + if err isa WebSockets.WebSocketError && err.message isa WebSockets.CloseFrameBody continue end rethrow(err) From 345b70bf428d8328e18da23880ead8a8eb701e51 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Thu, 2 Mar 2023 13:24:25 -0600 Subject: [PATCH 13/44] docs: Update scheduler viz docs --- docs/src/logging.md | 2 +- docs/src/scheduler-visualization.md | 26 ++++++++++++++------------ 2 files changed, 15 insertions(+), 13 
deletions(-) diff --git a/docs/src/logging.md b/docs/src/logging.md index 951af63b1..81f97490b 100644 --- a/docs/src/logging.md +++ b/docs/src/logging.md @@ -32,7 +32,7 @@ called. Let's construct one: ```julia ctx = Context() -ml = TimspanLogging.MultiEventLog() +ml = TimespanLogging.MultiEventLog() # Add the BytesAllocd consumer to the log as `:bytes` ml[:bytes] = Dagger.Events.BytesAllocd() diff --git a/docs/src/scheduler-visualization.md b/docs/src/scheduler-visualization.md index 1696aac5d..234f54c92 100644 --- a/docs/src/scheduler-visualization.md +++ b/docs/src/scheduler-visualization.md @@ -16,14 +16,16 @@ easiest way to get started with the web dashboard for new users. For manual usage, the following snippet of code will suffice: ```julia +using Dagger, DaggerWebDash, TimespanLogging + ctx = Context() # or `ctx = Dagger.Sch.eager_context()` for eager API usage -ml = Dagger.MultiEventLog() +ml = TimespanLogging.MultiEventLog() ## Add some logging events of interest -ml[:core] = Dagger.Events.CoreMetrics() -ml[:id] = Dagger.Events.IDMetrics() -ml[:timeline] = Dagger.Events.TimelineMetrics() +ml[:core] = TimespanLogging.Events.CoreMetrics() +ml[:id] = TimespanLogging.Events.IDMetrics() +ml[:timeline] = TimespanLogging.Events.TimelineMetrics() # ... # (Optional) Enable profile flamegraph generation with ProfileSVG @@ -31,7 +33,7 @@ ml[:profile] = DaggerWebDash.ProfileMetrics() ctx.profile = true # Create a LogWindow; necessary for real-time event updates -lw = Dagger.Events.LogWindow(20*10^9, :core) +lw = TimespanLogging.Events.LogWindow(20*10^9, :core) ml.aggregators[:logwindow] = lw # Create the D3Renderer server on port 8080 @@ -40,20 +42,20 @@ d3r = DaggerWebDash.D3Renderer(8080) ## Add some plots! Rendered top-down in order # Show an overview of all generated events as a Gantt chart -push!(d3r, GanttPlot(:core, :id, :esat, :psat; title="Overview")) +push!(d3r, DaggerWebDash.GanttPlot(:core, :id, :esat, :psat; title="Overview")) # Show various numerical events as line plots over time -push!(d3r, LinePlot(:core, :wsat, "Worker Saturation", "Running Tasks")) -push!(d3r, LinePlot(:core, :loadavg, "CPU Load Average", "Average Running Threads")) -push!(d3r, LinePlot(:core, :bytes, "Allocated Bytes", "Bytes")) -push!(d3r, LinePlot(:core, :mem, "Available Memory", "% Free")) +push!(d3r, DaggerWebDash.LinePlot(:core, :wsat, "Worker Saturation", "Running Tasks")) +push!(d3r, DaggerWebDash.LinePlot(:core, :loadavg, "CPU Load Average", "Average Running Threads")) +push!(d3r, DaggerWebDash.LinePlot(:core, :bytes, "Allocated Bytes", "Bytes")) +push!(d3r, DaggerWebDash.LinePlot(:core, :mem, "Available Memory", "% Free")) # Show a graph rendering of compute tasks and data movement between them # Note: Profile events are ignored if absent from the log -push!(d3r, GraphPlot(:core, :id, :timeline, :profile, "DAG")) +push!(d3r, DaggerWebDash.GraphPlot(:core, :id, :timeline, :profile, "DAG")) # TODO: Not yet functional -#push!(d3r, ProfileViewer(:core, :profile, "Profile Viewer")) +#push!(d3r, DaggerWebDash.ProfileViewer(:core, :profile, "Profile Viewer")) # Add the D3Renderer as a consumer of special events generated by LogWindow push!(lw.creation_handlers, d3r) From 8d6b234c6f30802a38e720dcd00b79ebb8092126 Mon Sep 17 00:00:00 2001 From: Madhu Patel <54492585+Madhupatel08@users.noreply.github.com> Date: Fri, 17 Mar 2023 02:24:48 +0530 Subject: [PATCH 14/44] Updating README.md (#383) --- README.md | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git 
a/README.md b/README.md index 1f3833cca..b472f0e78 100644 --- a/README.md +++ b/README.md @@ -17,10 +17,14 @@ At the core of Dagger.jl is a scheduler heavily inspired by [Dask](https://docs. ## Installation -You can install Dagger by typing +Dagger.jl can be installed using the Julia package manager. Enter the Pkg REPL mode by typing "]" in the Julia REPL and then run: ```julia -julia> ] add Dagger +pkg> add Dagger +``` +Or, equivalently, via the Pkg API: +```julia +julia> import Pkg; Pkg.add("Dagger") ``` ## Usage @@ -37,6 +41,34 @@ b = Dagger.@spawn rand(a, 4) c = Dagger.@spawn sum(b) fetch(c) # some number! ``` +## Contributing Guide +[![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg?style=flat-square)](http://makeapullrequest.com) +[![GitHub issues](https://img.shields.io/github/issues/JuliaParallel/Dagger.jl)](https://github.com/JuliaParallel/Dagger.jl/issues) +[![GitHub contributors](https://img.shields.io/github/contributors/JuliaParallel/Dagger.jl)](https://github.com/JuliaParallel/Dagger.jl/graphs/contributors) + +Contributions are encouraged. + +There are several ways to contribute to our project: + +**Reporting Bugs**: If you find a bug, please open an issue and describe the problem. Make sure to include steps to reproduce the issue and any error messages you receive regarding that issue. + +**Fixing Bugs**: If you'd like to fix a bug, please create a pull request with your changes. Make sure to include a description of the problem and how your changes will address it. + +Additional examples and documentation improvements are also very welcome. + +## Resources +List of recommended Dagger.jl resources: +- Docs [![][docs-master-img]][docs-master-url] +- Videos + - [Distributed Computing with Dagger.jl](https://youtu.be/capjmjVHfMU) + - [Easy, Featureful Parallelism with Dagger.jl](https://youtu.be/t3S8W6A4Ago) + - [Easier parallel Julia workflow with Dagger.jl](https://youtu.be/VrqzOsav61w) + - [Dagger.jl Development and Roadmap](https://youtu.be/G0Y62ysFbDk) + +## Help and Discussion +For help and discussion, we suggest asking in the following places: + +[Julia Discourse](https://discourse.julialang.org/c/domain/parallel/34) and on the [Julia Slack](https://julialang.org/slack/) in the `#distributed` channel. ## Acknowledgements From 96aa4dc0c225729760cc16d417f8c1c30c4290c7 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Sat, 25 Feb 2023 13:09:46 -0600 Subject: [PATCH 15/44] Add scope for union of processor type Adds ProcessorTypeScope(T), which matches processors that are a subtype of T. In the process, also expands the scope system to support lazily-evaluated scoping behavior, such as doing a subtype check or checking for `default_enabled`, via "taints". 
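A minimal sketch of the intended usage, mirroring the tests this patch adds below (the processor instances here are illustrative):

```julia
using Dagger

# ProcessorTypeScope "taints" a scope so that, when it is later constrained
# against a concrete processor, only processors of the requested type match.
ts = ProcessorTypeScope(Dagger.ThreadProc)

es = ExactScope(Dagger.ThreadProc(1, 1))  # a CPU thread processor
os = ExactScope(Dagger.OSProc(1))         # a whole-process processor

Dagger.constrain(ts, es)  # returns `es` (ThreadProc passes the type check)
Dagger.constrain(ts, os)  # returns a Dagger.InvalidScope (OSProc is rejected)
```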
--- src/sch/Sch.jl | 8 +++--- src/sch/util.jl | 8 +++--- src/scopes.jl | 71 +++++++++++++++++++++++++++++++++++++++++++++++-- src/thunk.jl | 6 ++--- test/scopes.jl | 31 +++++++++++++++++++++ test/thunk.jl | 5 ++-- 6 files changed, 115 insertions(+), 14 deletions(-) diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl index 9386e9d34..3f3dc03de 100644 --- a/src/sch/Sch.jl +++ b/src/sch/Sch.jl @@ -9,7 +9,7 @@ import Random: randperm import Base: @invokelatest import ..Dagger -import ..Dagger: Context, Processor, Thunk, WeakThunk, ThunkFuture, ThunkFailedException, Chunk, WeakChunk, OSProc, AnyScope +import ..Dagger: Context, Processor, Thunk, WeakThunk, ThunkFuture, ThunkFailedException, Chunk, WeakChunk, OSProc, DefaultScope import ..Dagger: order, dependents, noffspring, istask, inputs, unwrap_weak_checked, affinity, tochunk, timespan_start, timespan_finish, procs, move, chunktype, processor, default_enabled, get_processors, get_parent, execute!, rmprocs!, addprocs!, thunk_processor, constrain, cputhreadtime const OneToMany = Dict{Thunk, Set{Thunk}} @@ -634,7 +634,7 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) scope = if task.f isa Chunk task.f.scope else - AnyScope() + DefaultScope() end for input in task.inputs input = unwrap_weak_checked(input) @@ -693,7 +693,7 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) end end end - state.cache[task] = SchedulingException("No processors available, try making proclist more liberal") + state.cache[task] = SchedulingException("No processors available, try widening scope") state.errored[task] = true set_failed!(state, task) @goto pop_task @@ -723,7 +723,7 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) if procs_found push!(failed_scheduling, task) else - state.cache[task] = SchedulingException("No processors available, try making proclist more liberal") + state.cache[task] = SchedulingException("No processors available, try widening scope") state.errored[task] = true set_failed!(state, task) end diff --git a/src/sch/util.jl b/src/sch/util.jl index 864ac65e2..e688ed410 100644 --- a/src/sch/util.jl +++ b/src/sch/util.jl @@ -14,7 +14,7 @@ function get_propagated_options(thunk) nt = NamedTuple() for key in thunk.propagates value = if key == :scope - isa(thunk.f, Chunk) ? thunk.f.scope : AnyScope() + isa(thunk.f, Chunk) ? thunk.f.scope : DefaultScope() elseif key == :processor isa(thunk.f, Chunk) ? thunk.f.processor : OSProc() elseif key in fieldnames(Thunk) @@ -285,7 +285,7 @@ function can_use_proc(task, gproc, proc, opts, scope) # Check against single if opts.single !== nothing if gproc.pid != opts.single - @debug "Rejected $proc: gproc.pid != single" + @debug "[$(task.id)] Rejected $proc: gproc.pid ($(gproc.pid)) != single ($(opts.single))" return false end end @@ -293,10 +293,12 @@ function can_use_proc(task, gproc, proc, opts, scope) # Check scope proc_scope = Dagger.ExactScope(proc) if constrain(scope, proc_scope) isa Dagger.InvalidScope - @debug "Rejected $proc: Task scope ($scope) vs. 
processor scope ($proc_scope)" + @debug "[$(task.id)] Rejected $proc: Not contained in task scope ($scope)" return false end + @debug "[$(task.id)] Accepted $proc" + return true end diff --git a/src/scopes.jl b/src/scopes.jl index 2b203f566..386806a88 100644 --- a/src/scopes.jl +++ b/src/scopes.jl @@ -1,10 +1,28 @@ -export AnyScope, UnionScope, NodeScope, ProcessScope, ExactScope +export AnyScope, DefaultScope, UnionScope, NodeScope, ProcessScope, ExactScope, ProcessorTypeScope abstract type AbstractScope end -"Default scope that is unconstrained." +"Widest scope that contains all processors." struct AnyScope <: AbstractScope end +abstract type AbstractScopeTaint end + +"Taints a scope for later evaluation." +struct TaintScope <: AbstractScope + scope::AbstractScope + taints::Set{AbstractScopeTaint} +end +Base.:(==)(ts1::TaintScope, ts2::TaintScope) = + ts1.scope == ts2.scope && + length(ts1.taints) == length(ts2.taints) && + all(collect(ts1.taints) .== collect(ts2.taints)) + +struct DefaultEnabledTaint <: AbstractScopeTaint end + +"Default scope that contains the set of `default_enabled` processors." +DefaultScope() = TaintScope(AnyScope(), + Set{AbstractScopeTaint}([DefaultEnabledTaint()])) + "Union of two or more scopes." struct UnionScope <: AbstractScope scopes::Tuple @@ -35,6 +53,13 @@ end ProcessScope(p::OSProc) = ProcessScope(p.pid) ProcessScope() = ProcessScope(myid()) +"Scoped to any processor with a given supertype." +struct ProcessorTypeTaint{T} <: AbstractScopeTaint end + +ProcessorTypeScope(T) = + TaintScope(AnyScope(), + Set{AbstractScopeTaint}([ProcessorTypeTaint{T}()])) + "Scoped to a specific processor." struct ExactScope <: AbstractScope parent::ProcessScope @@ -58,7 +83,46 @@ Base.isless(x, ::AnyScope) = true constrain(::AnyScope, ::AnyScope) = AnyScope() constrain(::AnyScope, y) = y +# N.B. TaintScope taints constraining (until encountering an `ExactScope`) to +# allow lazy evaluation of the taint matching on the final processor +taint_match(::DefaultEnabledTaint, x::Processor) = default_enabled(x) +taint_match(::ProcessorTypeTaint{T}, x::Processor) where T = x isa T +Base.isless(::TaintScope, ::TaintScope) = false +Base.isless(::TaintScope, ::AnyScope) = true +Base.isless(::TaintScope, x) = false +Base.isless(x, ::TaintScope) = true +function constrain(x::TaintScope, y::TaintScope) + scope = constrain(x.scope, y.scope) + if scope isa InvalidScope + return scope + end + taints = Set{AbstractScopeTaint}() + for tx in x.taints + push!(taints, tx) + end + for ty in y.taints + push!(taints, ty) + end + return TaintScope(scope, taints) +end +function constrain(x::TaintScope, y) + scope = constrain(x.scope, y) + if scope isa InvalidScope + return scope + end + return TaintScope(scope, x.taints) +end +function constrain(x::TaintScope, y::ExactScope) + for taint in x.taints + if !taint_match(taint, y.processor) + return InvalidScope(x, y) + end + end + return constrain(x.scope, y) +end + Base.isless(::UnionScope, ::UnionScope) = false +Base.isless(::UnionScope, ::TaintScope) = true Base.isless(::UnionScope, ::AnyScope) = true function constrain(x::UnionScope, y::UnionScope) zs = Vector{AbstractScope}() @@ -75,12 +139,14 @@ end constrain(x::UnionScope, y) = constrain(x, UnionScope((y,))) Base.isless(::NodeScope, ::NodeScope) = false +Base.isless(::NodeScope, ::TaintScope) = true Base.isless(::NodeScope, ::AnyScope) = true constrain(x::NodeScope, y::NodeScope) = x == y ? 
y : InvalidScope(x, y) Base.isless(::ProcessScope, ::ProcessScope) = false Base.isless(::ProcessScope, ::NodeScope) = true +Base.isless(::ProcessScope, ::TaintScope) = true Base.isless(::ProcessScope, ::AnyScope) = true constrain(x::ProcessScope, y::ProcessScope) = x == y ? y : InvalidScope(x, y) @@ -90,6 +156,7 @@ constrain(x::NodeScope, y::ProcessScope) = Base.isless(::ExactScope, ::ExactScope) = false Base.isless(::ExactScope, ::ProcessScope) = true Base.isless(::ExactScope, ::NodeScope) = true +Base.isless(::ExactScope, ::TaintScope) = true Base.isless(::ExactScope, ::AnyScope) = true constrain(x::ExactScope, y::ExactScope) = x == y ? y : InvalidScope(x, y) diff --git a/src/thunk.jl b/src/thunk.jl index f35ca5d3c..66b8f88d3 100644 --- a/src/thunk.jl +++ b/src/thunk.jl @@ -39,7 +39,7 @@ arguments are still passed as-is. - `processor::Processor=OSProc()` - The processor associated with `f`. Useful if `f` is a callable struct that exists on a given processor and should be transferred appropriately. -- `scope::Dagger.AbstractScope=AnyScope()` - The scope associated with `f`. +- `scope::Dagger.AbstractScope=DefaultScope()` - The scope associated with `f`. Useful if `f` is a function or callable struct that may only be transferred to, and executed within, the specified scope. @@ -80,7 +80,7 @@ mutable struct Thunk if !isa(f, Chunk) && (!isnothing(processor) || !isnothing(scope)) f = tochunk(f, something(processor, OSProc()), - something(scope, AnyScope())) + something(scope, DefaultScope())) end xs = Any[xs...] if options !== nothing @@ -315,7 +315,7 @@ function spawn(f, args...; processor=nothing, scope=nothing, kwargs...) if !isnothing(processor) || !isnothing(scope) f = tochunk(f, something(processor, get_options(:processor, OSProc())), - something(scope, get_options(:scope, AnyScope()))) + something(scope, get_options(:scope, DefaultScope()))) end uid, future, finalizer_ref, thunk_ref = if myid() == 1 _spawn(f, args...; options, kwargs...) diff --git a/test/scopes.jl b/test/scopes.jl index 6b5b6252f..3a9303019 100644 --- a/test/scopes.jl +++ b/test/scopes.jl @@ -31,6 +31,17 @@ es1_ch = Dagger.tochunk(nothing, OSProc(), es1) es2_ch = Dagger.tochunk(nothing, OSProc(), es2) + os1 = ExactScope(OSProc(1)) + + @testset "Default Scope" begin + ds = DefaultScope() + for (s1, s2) in ((ds, es1), (es1, ds)) + @test Dagger.constrain(s1, s2) == es1 + end + for (s1, s2) in ((ds, os1), (os1, ds)) + @test Dagger.constrain(s1, s2) isa Dagger.InvalidScope + end + end @testset "Node Scope" begin @everywhere node_scope_test(ch...) 
= Dagger.system_uuid() @@ -128,6 +139,26 @@ @test es1 in us_res.scopes @test !(es2 in us_res.scopes) end + @testset "Processor Type Scope" begin + pts_th = ProcessorTypeScope(Dagger.ThreadProc) + pts_os = ProcessorTypeScope(Dagger.OSProc) + + @test Dagger.constrain(pts_th, es1) == es1 + @test Dagger.constrain(pts_th, os1) isa Dagger.InvalidScope + + @test Dagger.constrain(pts_os, es1) isa Dagger.InvalidScope + @test Dagger.constrain(pts_os, os1) == os1 + + # Duplicate + pts_th_dup = Dagger.constrain(pts_th, pts_th) + @test Dagger.constrain(pts_th_dup, es1) == es1 + @test Dagger.constrain(pts_th_dup, os1) isa Dagger.InvalidScope + + # Empty intersection + pts_all = Dagger.constrain(pts_th, pts_os) + @test Dagger.constrain(pts_all, es1) isa Dagger.InvalidScope + @test Dagger.constrain(pts_all, os1) isa Dagger.InvalidScope + end # TODO: Test scope propagation rmprocs([wid1, wid2]) diff --git a/test/thunk.jl b/test/thunk.jl index 339f63fd3..28933b3b8 100644 --- a/test/thunk.jl +++ b/test/thunk.jl @@ -21,6 +21,7 @@ import Dagger: Chunk end MulProc() = MulProc(myid()) Dagger.get_parent(mp::MulProc) = OSProc(mp.owner) + Dagger.move(src::MulProc, dest::Dagger.OSProc, x::Function) = Base.:* Dagger.move(src::MulProc, dest::Dagger.ThreadProc, x::Function) = Base.:* end @@ -202,7 +203,7 @@ end a = delayed(+; processor=Dagger.ThreadProc(1,1))(1,2) @test a.f isa Chunk @test a.f.processor isa Dagger.ThreadProc - @test a.f.scope isa AnyScope + @test a.f.scope == DefaultScope() a = delayed(+; processor=Dagger.ThreadProc(1,1), scope=NodeScope())(1,2) @test a.f isa Chunk @@ -236,7 +237,7 @@ end a = Dagger.Sch._find_thunk(_a) @test a.f isa Chunk @test a.f.processor isa Dagger.ThreadProc - @test a.f.scope isa AnyScope + @test a.f.scope == DefaultScope() _a = Dagger.spawn(+, 1, 2; processor=Dagger.ThreadProc(1,1), scope=NodeScope()) a = Dagger.Sch._find_thunk(_a) From 0b108ee762a9f34141f1d0370670ff5b817c0cd3 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Sun, 26 Feb 2023 08:50:11 -0600 Subject: [PATCH 16/44] Deprecate proclist and single in favor of scope Also changes behavior such that proclist and single override scope when set, to prevent issues with mixing proclist/single with scope. --- src/sch/Sch.jl | 42 ++++++++++++++++++++++--------------- src/sch/eager.jl | 4 ++-- src/sch/util.jl | 52 ++++++++++++++++++++++++++++------------------ test/fakeproc.jl | 3 ++- test/processors.jl | 4 ++-- 5 files changed, 63 insertions(+), 42 deletions(-) diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl index 3f3dc03de..e5c7f8546 100644 --- a/src/sch/Sch.jl +++ b/src/sch/Sch.jl @@ -9,7 +9,7 @@ import Random: randperm import Base: @invokelatest import ..Dagger -import ..Dagger: Context, Processor, Thunk, WeakThunk, ThunkFuture, ThunkFailedException, Chunk, WeakChunk, OSProc, DefaultScope +import ..Dagger: Context, Processor, Thunk, WeakThunk, ThunkFuture, ThunkFailedException, Chunk, WeakChunk, OSProc, AnyScope, DefaultScope import ..Dagger: order, dependents, noffspring, istask, inputs, unwrap_weak_checked, affinity, tochunk, timespan_start, timespan_finish, procs, move, chunktype, processor, default_enabled, get_processors, get_parent, execute!, rmprocs!, addprocs!, thunk_processor, constrain, cputhreadtime const OneToMany = Dict{Thunk, Set{Thunk}} @@ -143,13 +143,13 @@ end Stores DAG-global options to be passed to the Dagger.Sch scheduler. # Arguments -- `single::Int=0`: Force all work onto worker with specified id. `0` disables -this option. 
-- `proclist=nothing`: Force scheduler to use one or more processors that are -instances/subtypes of a contained type. Alternatively, a function can be -supplied, and the function will be called with a processor as the sole -argument and should return a `Bool` result to indicate whether or not to use -the given processor. `nothing` enables all default processors. +- `single::Int=0`: (Deprecated) Force all work onto worker with specified id. +`0` disables this option. +- `proclist=nothing`: (Deprecated) Force scheduler to use one or more +processors that are instances/subtypes of a contained type. Alternatively, a +function can be supplied, and the function will be called with a processor as +the sole argument and should return a `Bool` result to indicate whether or not +to use the given processor. `nothing` enables all default processors. - `allow_errors::Bool=true`: Allow thunks to error without affecting non-dependent thunks. - `checkpoint=nothing`: If not `nothing`, uses the provided function to save @@ -176,11 +176,11 @@ end Stores Thunk-local options to be passed to the Dagger.Sch scheduler. # Arguments -- `single::Int=0`: Force thunk onto worker with specified id. `0` disables this -option. -- `proclist=nothing`: Force thunk to use one or more processors that are -instances/subtypes of a contained type. Alternatively, a function can be -supplied, and the function will be called with a processor as the sole +- `single::Int=0`: (Deprecated) Force thunk onto worker with specified id. `0` +disables this option. +- `proclist=nothing`: (Deprecated) Force thunk to use one or more processors +that are instances/subtypes of a contained type. Alternatively, a function can +be supplied, and the function will be called with a processor as the sole argument and should return a `Bool` result to indicate whether or not to use the given processor. `nothing` enables all default processors. - `time_util::Dict{Type,Any}=Dict{Type,Any}()`: Indicates the maximum expected @@ -634,7 +634,12 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) scope = if task.f isa Chunk task.f.scope else - DefaultScope() + if task.options.proclist !== nothing + # proclist overrides scope selection + AnyScope() + else + DefaultScope() + end end for input in task.inputs input = unwrap_weak_checked(input) @@ -682,7 +687,8 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) for proc in local_procs gproc = get_parent(proc) - if can_use_proc(task, gproc, proc, opts, scope) + can_use, scope = can_use_proc(task, gproc, proc, opts, scope) + if can_use has_cap, est_time_util, est_alloc_util = has_capacity(state, proc, gproc.pid, opts.time_util, opts.alloc_util, sig) if has_cap # Schedule task onto proc @@ -706,7 +712,8 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) cap, extra_util = nothing, nothing procs_found = false # N.B. 
if we only have one processor, we need to select it now - if can_use_proc(task, entry.gproc, entry.proc, opts, scope) + can_use, scope = can_use_proc(task, entry.gproc, entry.proc, opts, scope) + if can_use has_cap, est_time_util, est_alloc_util = has_capacity(state, entry.proc, entry.gproc.pid, opts.time_util, opts.alloc_util, sig) if has_cap selected_entry = entry @@ -730,7 +737,8 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) @goto pop_task end - if can_use_proc(task, entry.gproc, entry.proc, opts, scope) + can_use, scope = can_use_proc(task, entry.gproc, entry.proc, opts, scope) + if can_use has_cap, est_time_util, est_alloc_util = has_capacity(state, entry.proc, entry.gproc.pid, opts.time_util, opts.alloc_util, sig) if has_cap # Select this processor diff --git a/src/sch/eager.jl b/src/sch/eager.jl index e1126cbc8..7e15f1e74 100644 --- a/src/sch/eager.jl +++ b/src/sch/eager.jl @@ -18,12 +18,12 @@ function init_eager() ctx = eager_context() @async try sopts = SchedulerOptions(;allow_errors=true) - topts = ThunkOptions(;single=1) + scope = Dagger.ExactScope(Dagger.ThreadProc(1, 1)) atexit() do EAGER_FORCE_KILL[] = true close(EAGER_THUNK_CHAN) end - Dagger.compute(ctx, Dagger.delayed(eager_thunk; options=topts)(); options=sopts) + Dagger.compute(ctx, Dagger.delayed(eager_thunk; scope)(); options=sopts) catch err iob = IOContext(IOBuffer(), :color=>true) println(iob, "Error in eager scheduler:") diff --git a/src/sch/util.jl b/src/sch/util.jl index e688ed410..855b4d90c 100644 --- a/src/sch/util.jl +++ b/src/sch/util.jl @@ -263,43 +263,55 @@ end function can_use_proc(task, gproc, proc, opts, scope) # Check against proclist - if opts.proclist === nothing - if !default_enabled(proc) - @debug "Rejected $proc: !default_enabled(proc)" - return false - end - elseif opts.proclist isa Function - if !Base.invokelatest(opts.proclist, proc) - @debug "Rejected $proc: proclist(proc) == false" - return false + if opts.proclist !== nothing + @warn "The `proclist` option is deprecated, please use scopes instead\nSee https://juliaparallel.org/Dagger.jl/stable/scopes/ for details" maxlog=1 + if opts.proclist isa Function + if !Base.invokelatest(opts.proclist, proc) + @debug "[$(task.id)] Rejected $proc: proclist(proc) == false" + return false, scope + end + scope = constrain(scope, Dagger.ExactScope(proc)) + elseif opts.proclist isa Vector + if !(typeof(proc) in opts.proclist) + @debug "[$(task.id)] Rejected $proc: !(typeof(proc) in proclist)" + return false, scope + end + scope = constrain(scope, + Dagger.UnionScope(map(Dagger.ProcessorTypeScope, opts.proclist))) + else + throw(SchedulingException("proclist must be a Function, Vector, or nothing")) end - elseif opts.proclist isa Vector - if !(typeof(proc) in opts.proclist) - @debug "Rejected $proc: !(typeof(proc) in proclist)" - return false + if scope isa Dagger.InvalidScope + @debug "[$(task.id)] Rejected $proc: Not contained in task scope ($scope)" + return false, scope end - else - throw(SchedulingException("proclist must be a Function, Vector, or nothing")) end # Check against single if opts.single !== nothing + @warn "The `single` option is deprecated, please use scopes instead\nSee https://juliaparallel.org/Dagger.jl/stable/scopes/ for details" maxlog=1 if gproc.pid != opts.single @debug "[$(task.id)] Rejected $proc: gproc.pid ($(gproc.pid)) != single ($(opts.single))" - return false + return false, scope + end + scope = constrain(scope, Dagger.ProcessScope(opts.single)) + if scope isa Dagger.InvalidScope + @debug "[$(task.id)] Rejected 
$proc: Not contained in task scope ($scope)" + return false, scope end end - # Check scope + # Check against scope proc_scope = Dagger.ExactScope(proc) if constrain(scope, proc_scope) isa Dagger.InvalidScope @debug "[$(task.id)] Rejected $proc: Not contained in task scope ($scope)" - return false + return false, scope end - @debug "[$(task.id)] Accepted $proc" + @label accept - return true + @debug "[$(task.id)] Accepted $proc" + return true, scope end function has_capacity(state, p, gp, time_util, alloc_util, sig) diff --git a/test/fakeproc.jl b/test/fakeproc.jl index 54328697b..3f7c0b0b5 100644 --- a/test/fakeproc.jl +++ b/test/fakeproc.jl @@ -1,7 +1,7 @@ @everywhere begin using Dagger -import Dagger: ThreadProc +import Dagger: OSProc, ThreadProc struct FakeProc <: Dagger.Processor owner::Int @@ -18,6 +18,7 @@ fakesum(xs...) = FakeVal(sum(map(y->y.x, xs))) Dagger.iscompatible_func(proc::FakeProc, opts, f) = true Dagger.iscompatible_arg(proc::FakeProc, opts, ::Type{<:Integer}) = true Dagger.iscompatible_arg(proc::FakeProc, opts, ::Type{<:FakeVal}) = true +Dagger.move(from_proc::OSProc, to_proc::FakeProc, x::Integer) = FakeVal(x) Dagger.move(from_proc::ThreadProc, to_proc::FakeProc, x::Integer) = FakeVal(x) Dagger.execute!(proc::FakeProc, func, args...) = FakeVal(42+func(args...).x) diff --git a/test/processors.jl b/test/processors.jl index 75870cb4e..9efcc1a4b 100644 --- a/test/processors.jl +++ b/test/processors.jl @@ -37,9 +37,9 @@ end end @testset "Processor exhaustion" begin opts = ThunkOptions(proclist=[OptOutProc]) - @test_throws_unwrap Dagger.ThunkFailedException ex isa Dagger.Sch.SchedulingException ex.reason="No processors available, try making proclist more liberal" collect(delayed(sum; options=opts)([1,2,3])) + @test_throws_unwrap Dagger.ThunkFailedException ex isa Dagger.Sch.SchedulingException ex.reason="No processors available, try widening scope" collect(delayed(sum; options=opts)([1,2,3])) opts = ThunkOptions(proclist=(proc)->false) - @test_throws_unwrap Dagger.ThunkFailedException ex isa Dagger.Sch.SchedulingException ex.reason="No processors available, try making proclist more liberal" collect(delayed(sum; options=opts)([1,2,3])) + @test_throws_unwrap Dagger.ThunkFailedException ex isa Dagger.Sch.SchedulingException ex.reason="No processors available, try widening scope" collect(delayed(sum; options=opts)([1,2,3])) opts = ThunkOptions(proclist=nothing) @test collect(delayed(sum; options=opts)([1,2,3])) == 6 end From 43178ee7a8a61100dd2764aa6ca26f3b5c40f7f0 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Sun, 26 Feb 2023 11:23:18 -0600 Subject: [PATCH 17/44] tests: Robustify mutation test --- test/mutation.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/mutation.jl b/test/mutation.jl index 48b192476..f076719d4 100644 --- a/test/mutation.jl +++ b/test/mutation.jl @@ -41,11 +41,12 @@ end @testset "@mutable" begin w = first(workers()) + @assert w != 1 "Not enough workers to test mutability" x = remotecall_fetch(w) do Dagger.@mutable Ref{Int}() end @test fetch(Dagger.@spawn (x->x[] = myid())(x)) == w - @test_throws_unwrap Dagger.ThunkFailedException fetch(Dagger.@spawn single=last(workers()) (x->x[] = myid())(x)) + @test_throws_unwrap Dagger.ThunkFailedException fetch(Dagger.@spawn single=1 (x->x[] = myid())(x)) end # @testset "@mutable" @testset "Shard" begin From 5cd574cbe84897f4843ea75a1ad65006fdff555a Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Fri, 3 Mar 2023 16:59:29 -0600 Subject: [PATCH 18/44] scopes: Add show methods --- 
src/scopes.jl | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/scopes.jl b/src/scopes.jl index 386806a88..62850c628 100644 --- a/src/scopes.jl +++ b/src/scopes.jl @@ -73,6 +73,47 @@ struct InvalidScope <: AbstractScope y::AbstractScope end +# Show methods + +function Base.show(io::IO, scope::UnionScope) + indent = io isa IOContext ? get(io, :indent, 0) : 0 + println(io, "UnionScope:") + indent += 2 + inner_io = IOContext(io, :indent => indent) + if length(scope.scopes) == 0 + print(io, " "^indent, "empty") + return + end + for (idx, inner_scope) in enumerate(scope.scopes) + print(io, " "^indent); print(inner_io, inner_scope) + idx < length(scope.scopes) && println(io) + end +end +function Base.show(io::IO, scope::TaintScope) + indent = io isa IOContext ? get(io, :indent, 0) : 0 + println(io, "TaintScope:") + indent += 2 + inner_io = IOContext(io, :indent => indent) + print(io, " "^indent, "scope: "); println(inner_io, scope.scope) + if length(scope.taints) == 0 + print(io, " "^indent, "no taints") + return + end + println(io, " "^indent, "taints: ") + indent += 4 + inner_io = IOContext(io, :indent => indent) + for (idx, taint) in enumerate(scope.taints) + print(io, " "^indent); print(inner_io, taint) + idx < length(scope.taints) && println(io) + end +end +Base.show(io::IO, scope::NodeScope) = + print(io, "NodeScope: node == $(scope.uuid)") +Base.show(io::IO, scope::ProcessScope) = + print(io, "ProcessScope: worker == $(scope.wid)") +Base.show(io::IO, scope::ExactScope) = + print(io, "ExactScope: processor == $(scope.processor)") + # Comparisons and constraint checking constrain(x, y) = x < y ? constrain(y, x) : throw(MethodError(constrain, x, y)) From 49eea9899972fa652ff3774240407e835f717320 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Fri, 3 Mar 2023 16:59:53 -0600 Subject: [PATCH 19/44] scopes: Add Dagger.scope helper --- src/scopes.jl | 134 +++++++++++++++++++++++++++++++++++++++++++++++++ test/scopes.jl | 79 ++++++++++++++++++++++++++++- 2 files changed, 212 insertions(+), 1 deletion(-) diff --git a/src/scopes.jl b/src/scopes.jl index 62850c628..8212bafda 100644 --- a/src/scopes.jl +++ b/src/scopes.jl @@ -205,3 +205,137 @@ constrain(x::ProcessScope, y::ExactScope) = x == y.parent ? y : InvalidScope(x, y) constrain(x::NodeScope, y::ExactScope) = x == y.parent.parent ? y : InvalidScope(x, y) + +### Scopes helper + +""" + scope(scs...) -> AbstractScope + scope(;scs...) -> AbstractScope + +Constructs an `AbstractScope` from a set of scope specifiers. Each element in +`scs` is a separate specifier; if `scs` is empty, an empty `UnionScope()` is +produced; if `scs` has one element, then exactly one specifier is constructed; +if `scs` has more than one element, a `UnionScope` of the scopes specified by +`scs` is constructed. A variety of specifiers can be passed to construct a +scope: +- `:any` - Constructs an `AnyScope()` +- `:default` - Constructs a `DefaultScope()` +- `(scs...,)` - Constructs a `UnionScope` of scopes, each specified by `scs` +- `thread=tid` or `threads=[tids...]` - Constructs an `ExactScope` or `UnionScope` containing all `Dagger.ThreadProc`s with thread ID `tid`/`tids` across all workers. +- `worker=wid` or `workers=[wids...]` - Constructs a `ProcessScope` or `UnionScope` containing all `Dagger.ThreadProc`s with worker ID `wid`/`wids` across all threads. 
+- `thread=tid`/`threads=tids` and `worker=wid`/`workers=wids` - Constructs an `ExactScope`, `ProcessScope`, or `UnionScope` containing all `Dagger.ThreadProc`s with worker ID `wid`/`wids` and threads `tid`/`tids`. + +Aside from the worker and thread specifiers, it's possible to add custom +specifiers for scoping to other kinds of processors (like GPUs) or providing +different ways to specify a scope. Specifier selection is determined by a +precedence ordering: by default, all specifiers have precedence `0`, which can +be changed by defining `scope_key_precedence(::Val{spec}) = precedence` (where +`spec` is the specifier as a `Symbol)`. The specifier with the highest +precedence in a set of specifiers is used to determine the scope by calling +`to_scope(::Val{spec}, sc::NamedTuple)` (where `sc` is the full set of +specifiers), which should be overriden for each custom specifier, and which +returns an `AbstractScope`. For example: + +```julia +# Setup a GPU specifier +Dagger.scope_key_precedence(::Val{:gpu}) = 1 +Dagger.to_scope(::Val{:gpu}, sc::NamedTuple) = ExactScope(MyGPUDevice(sc.worker, sc.gpu)) + +# Generate an `ExactScope` for `MyGPUDevice` on worker 2, device 3 +Dagger.scope(gpu=3, worker=2) +``` +""" +scope(scs...) = simplified_union_scope(map(to_scope, scs)) +scope(; kwargs...) = to_scope((;kwargs...)) + +function simplified_union_scope(scopes) + if length(scopes) == 1 + return only(scopes) + else + return UnionScope(scopes) + end +end +to_scope(scope::AbstractScope) = scope +function to_scope(sc::Symbol) + if sc == :any + return AnyScope() + elseif sc == :default + return DefaultScope() + else + throw(ArgumentError("Cannot construct scope from: $(repr(sc))")) + end +end +function to_scope(sc::NamedTuple) + if isempty(sc) + return UnionScope() + end + + # FIXME: node and nodes + known_keys = (:worker, :workers, :thread, :threads) + unknown_keys = filter(key->!in(key, known_keys), keys(sc)) + if length(unknown_keys) > 0 + # Hand off construction if unknown members encountered + precs = map(scope_key_precedence, map(Val, unknown_keys)) + max_prec = maximum(precs) + if length(findall(prec->prec==max_prec, precs)) > 1 + throw(ArgumentError("Incompatible scope specifiers detected: $unknown_keys")) + end + max_prec_key = unknown_keys[argmax(precs)] + return to_scope(Val(max_prec_key), sc) + end + + workers = if haskey(sc, :worker) + Int[sc.worker] + elseif haskey(sc, :workers) + Int[sc.workers...] + else + nothing + end + threads = if haskey(sc, :thread) + Int[sc.thread] + elseif haskey(sc, :threads) + Int[sc.threads...] 
+ else + nothing + end + + # Simple cases + if workers !== nothing && threads !== nothing + subscopes = AbstractScope[] + for w in workers, t in threads + push!(subscopes, ExactScope(ThreadProc(w, t))) + end + return simplified_union_scope(subscopes) + elseif workers !== nothing && threads === nothing + subscopes = AbstractScope[ProcessScope(w) for w in workers] + return simplified_union_scope(subscopes) + end + + # More complex cases that require querying the cluster + # FIXME: Use per-field scope taint + if workers === nothing + workers = procs() + end + subscopes = AbstractScope[] + for w in workers + if threads === nothing + threads = map(c->c.tid, + filter(c->c isa ThreadProc, + collect(children(OSProc(w))))) + end + for t in threads + push!(subscopes, ExactScope(ThreadProc(w, t))) + end + end + return simplified_union_scope(subscopes) +end +to_scope(scs::Tuple) = + simplified_union_scope(map(to_scope, scs)) +to_scope(sc) = + throw(ArgumentError("Cannot construct scope from: $sc")) + +to_scope(::Val{key}, sc::NamedTuple) where key = + throw(ArgumentError("Scope construction not implemented for key: $key")) + +# Base case for all Dagger-owned keys +scope_key_precedence(::Val) = 0 diff --git a/test/scopes.jl b/test/scopes.jl index 3a9303019..d5d7e0a6b 100644 --- a/test/scopes.jl +++ b/test/scopes.jl @@ -124,7 +124,7 @@ @test fetch(Dagger.@spawn exact_scope_test(us_es1_multi_ch)) == es1.processor # No inner scopes - @test_throws ArgumentError UnionScope() + @test UnionScope() isa UnionScope # Same inner scope @test fetch(Dagger.@spawn exact_scope_test(us_es1_ch, us_es1_ch)) == es1.processor @@ -161,5 +161,82 @@ end # TODO: Test scope propagation + @testset "scope helper" begin + @test Dagger.scope(:any) isa AnyScope + @test Dagger.scope(:default) == DefaultScope() + @test_throws ArgumentError Dagger.scope(:blah) + @test Dagger.scope(()) == UnionScope() + + @test Dagger.scope(worker=wid1) == + Dagger.scope(workers=[wid1]) == + ProcessScope(wid1) + @test Dagger.scope(workers=[wid1,wid2]) == UnionScope([ProcessScope(wid1), + ProcessScope(wid2)]) + @test Dagger.scope(workers=[]) == UnionScope() + + @test Dagger.scope(thread=1) == + Dagger.scope(threads=[1]) == + UnionScope([ExactScope(Dagger.ThreadProc(w,1)) for w in procs()]) + @test Dagger.scope(threads=[1,2]) == UnionScope([ExactScope(Dagger.ThreadProc(w,t)) for t in [1,2] for w in procs()]) + @test Dagger.scope(threads=[]) == UnionScope() + + @test Dagger.scope(worker=wid1,thread=1) == + Dagger.scope(thread=1,worker=wid1) == + Dagger.scope(workers=[wid1],thread=1) == + Dagger.scope(worker=wid1,threads=[1]) == + Dagger.scope(workers=[wid1],threads=[1]) == + ExactScope(Dagger.ThreadProc(wid1,1)) + + @test_throws ArgumentError Dagger.scope(blah=1) + @test_throws ArgumentError Dagger.scope(thread=1, blah=1) + + @test Dagger.scope(worker=1,thread=1) == + Dagger.scope((worker=1,thread=1)) == + Dagger.scope(((worker=1,thread=1),)) + @test Dagger.scope((worker=1,thread=1),(worker=wid1,thread=2)) == + Dagger.scope(((worker=1,thread=1),(worker=wid1,thread=2),)) == + Dagger.scope(((worker=1,thread=1),), ((worker=wid1,thread=2),)) == + UnionScope([ExactScope(Dagger.ThreadProc(1, 1)), + ExactScope(Dagger.ThreadProc(wid1, 2))]) + @test_throws ArgumentError Dagger.scope((;blah=1)) + @test_throws ArgumentError Dagger.scope((thread=1, blah=1)) + + @testset "custom handler" begin + @eval begin + Dagger.scope_key_precedence(::Val{:gpu}) = 1 + Dagger.scope_key_precedence(::Val{:rocm}) = 2 + Dagger.scope_key_precedence(::Val{:cuda}) = 2 + + # Some fake 
scopes to use as sentinels
+                Dagger.to_scope(::Val{:gpu}, sc::NamedTuple) = ExactScope(Dagger.ThreadProc(1, sc.device))
+                Dagger.to_scope(::Val{:rocm}, sc::NamedTuple) = ExactScope(Dagger.ThreadProc($wid1, sc.gpu))
+                Dagger.to_scope(::Val{:cuda}, sc::NamedTuple) = ExactScope(Dagger.ThreadProc($wid2, sc.gpu))
+            end
+
+            @test Dagger.scope(gpu=1,device=2) ==
+                  Dagger.scope(device=2,gpu=1) ==
+                  Dagger.scope(gpu=1,device=2,blah=3) ==
+                  Dagger.scope((gpu=1,device=2,blah=3)) ==
+                  ExactScope(Dagger.ThreadProc(1, 2))
+            @test Dagger.scope((gpu=1,device=2),(worker=1,thread=1)) ==
+                  Dagger.scope((worker=1,thread=1),(device=2,gpu=1)) ==
+                  UnionScope([ExactScope(Dagger.ThreadProc(1, 2)),
+                              ExactScope(Dagger.ThreadProc(1, 1))])
+            @test Dagger.scope((gpu=1,device=2),(device=3,gpu=1)) ==
+                  UnionScope([ExactScope(Dagger.ThreadProc(1, 2)),
+                              ExactScope(Dagger.ThreadProc(1, 3))])
+
+            @test Dagger.scope(rocm=1,gpu=2) ==
+                  Dagger.scope(gpu=2,rocm=1) ==
+                  ExactScope(Dagger.ThreadProc(wid1, 2))
+            @test Dagger.scope(cuda=1,gpu=2) ==
+                  Dagger.scope(gpu=2,cuda=1) ==
+                  ExactScope(Dagger.ThreadProc(wid2, 2))
+            @test_throws ArgumentError Dagger.scope(rocm=1,cuda=1,gpu=2)
+            @test_throws ArgumentError Dagger.scope(gpu=2,rocm=1,cuda=1)
+            @test_throws ArgumentError Dagger.scope((rocm=1,cuda=1,gpu=2))
+        end
+    end
+
     rmprocs([wid1, wid2])
 end

From d264e160c88183306bd28a3f4082e3782f562a69 Mon Sep 17 00:00:00 2001
From: Julian P Samaroo
Date: Fri, 10 Mar 2023 10:44:07 -0600
Subject: [PATCH 20/44] scopes: Unique subscopes in UnionScope

---
 src/scopes.jl | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/src/scopes.jl b/src/scopes.jl
index 8212bafda..6a5af1e5f 100644
--- a/src/scopes.jl
+++ b/src/scopes.jl
@@ -26,11 +26,37 @@ DefaultScope() = TaintScope(AnyScope(),
 "Union of two or more scopes."
 struct UnionScope <: AbstractScope
     scopes::Tuple
+    function UnionScope(scopes::Tuple)
+        scope_set = Set{AbstractScope}()
+        for scope in scopes
+            if scope isa UnionScope
+                for subscope in scope.scopes
+                    push!(scope_set, subscope)
+                end
+            else
+                push!(scope_set, scope)
+            end
+        end
+        return new((collect(scope_set)...,))
+    end
 end
 UnionScope(scopes...) = UnionScope((scopes...,))
 UnionScope(scopes::Vector{<:AbstractScope}) = UnionScope((scopes...,))
 UnionScope(s::AbstractScope) = UnionScope((s,))
-UnionScope() = throw(ArgumentError("Cannot construct empty UnionScope"))
+UnionScope() = UnionScope(())
+
+function Base.:(==)(us1::UnionScope, us2::UnionScope)
+    if length(us1.scopes) != length(us2.scopes)
+        return false
+    end
+    scopes = Set{AbstractScope}()
+    for scope in us1.scopes
+        if !(scope in us2.scopes)
+            return false
+        end
+    end
+    return true
+end
 
 "Scoped to the same physical node."
 struct NodeScope <: AbstractScope

From fa7f8288abc4238e0c82e67adb69d726043f2470 Mon Sep 17 00:00:00 2001
From: Julian P Samaroo
Date: Tue, 4 Apr 2023 20:25:00 -0500
Subject: [PATCH 21/44] docs: Update scope docs

---
 docs/src/index.md      | 13 +++++++------
 docs/src/processors.md | 33 ++------------------------------
 docs/src/scopes.md     | 25 ++++++++++++++-----------
 3 files changed, 23 insertions(+), 48 deletions(-)

diff --git a/docs/src/index.md b/docs/src/index.md
index 29bdccfeb..7b2cb8307 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -198,15 +198,17 @@ While Dagger generally "just works", sometimes one needs to exert some more
 fine-grained control over how the scheduler allocates work.
There are two parallel mechanisms to achieve this: Scheduler options (from `Dagger.Sch.SchedulerOptions`) and Thunk options (from -`Dagger.Sch.ThunkOptions`). These two options structs generally contain the -same options, with the difference being that Scheduler options operate +`Dagger.Sch.ThunkOptions`). These two options structs contain many shared +options, with the difference being that Scheduler options operate globally across an entire DAG, and Thunk options operate on a thunk-by-thunk -basis. Scheduler options can be constructed and passed to `collect()` or -`compute()` as the keyword argument `options` for lazy API usage: +basis. + +Scheduler options can be constructed and passed to `collect()` or `compute()` +as the keyword argument `options` for lazy API usage: ```julia t = @par 1+2 -opts = Dagger.Sch.ThunkOptions(;single=1) # Execute on worker 1 +opts = Dagger.Sch.SchedulerOptions(;single=1) # Execute on worker 1 compute(t; options=opts) @@ -219,7 +221,6 @@ Thunk options can be passed to `@spawn/spawn`, `@par`, and `delayed` similarly: # Execute on worker 1 Dagger.@spawn single=1 1+2 - Dagger.spawn(+, 1, 2; single=1) opts = Dagger.Sch.ThunkOptions(;single=1) diff --git a/docs/src/processors.md b/docs/src/processors.md index e55fd2119..6e74ecc81 100644 --- a/docs/src/processors.md +++ b/docs/src/processors.md @@ -76,42 +76,13 @@ processor B. This mechanism uses Julia's Serialization library to serialize and deserialize data, so data must be serializable for this mechanism to work properly. -### Future: Hierarchy Generic Path Move - -NOTE: This used to be the default move behavior, but was removed because it -wasn't considered helpful, and there were not any processor implementations -that made use of it. - -Movement of data between any two processors is decomposable into a sequence of -"moves" between a child and its parent, termed a "generic path move". Movement -of data may also take "shortcuts" between nodes in the tree which are not -directly connected if enabled by libraries or the user, which may make use of -IPC mechanisms to transfer data more directly and efficiently (such as -Infiniband, GPU RDMA, NVLINK, etc.). All data is considered local to some -processor, and may only be operated on by another processor by first doing an -explicit move operation to that processor. - ## Processor Selection By default, Dagger uses the CPU to process work, typically single-threaded per cluster node. However, Dagger allows access to a wider range of hardware and software acceleration techniques, such as multithreading and GPUs. These more advanced (but performant) accelerators are disabled by default, but can easily -be enabled by using Scheduler/Thunk options in the `proclist` field. If -`nothing`, all default processors will be used. If a vector of types, only the -processor types contained in `options.proclist` will be used to compute all or -a given thunk. If a function, it will be called for each processor (with the -processor as the argument) until it returns `true`. - -```julia -opts = Dagger.Sch.ThunkOptions(;proclist=nothing) # default behavior -# OR -opts = Dagger.Sch.ThunkOptions(;proclist=[DaggerGPU.CuArrayProc]) # only execute on CuArrayProc -# OR -opts = Dagger.Sch.ThunkOptions(;proclist=(proc)->(proc isa Dagger.ThreadProc && proc.tid == 3)) # only run on ThreadProc with thread ID 3 - -t = Dagger.@par options=opts sum(X) # do sum(X) on the specified processor -``` +be enabled by using scopes (see [Scopes](@ref) for details). 
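+
+As a rough sketch of scope-based processor selection (a hypothetical example:
+`X` is assumed to be some existing array, and the `scope` option and
+`Dagger.scope` helper are described in [Scopes](@ref)):
+
+```julia
+# Restrict this task to thread 3 on worker 2
+Dagger.@spawn scope=Dagger.scope(worker=2, thread=3) sum(X)
+```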
## Resource Control @@ -137,7 +108,7 @@ sufficient resources become available by thunks completing execution. The [DaggerGPU.jl](https://github.com/JuliaGPU/DaggerGPU.jl) package can be imported to enable GPU acceleration for NVIDIA and AMD GPUs, when available. The processors provided by that package are not enabled by default, but may be -enabled via `options.proclist` as usual. +enabled via custom scopes ([Scopes](@ref)). ### Future: Network Devices and Topology diff --git a/docs/src/scopes.md b/docs/src/scopes.md index 01c1a3414..bed60b506 100644 --- a/docs/src/scopes.md +++ b/docs/src/scopes.md @@ -15,16 +15,16 @@ considered valid. Let's take the example of a webcam handle generated by VideoIO.jl. This handle is a C pointer, and thus has process scope. We can open the handle on a given -process, and set the scope of the resulting data to a `ProcessScope()`, which -defaults to the current Julia process: +process, and set the scope of the resulting data to be locked to the current +process with `Dagger.scope` to construct a `ProcessScope`: ```julia -using VideoIO +using VideoIO, Distributed function get_handle() handle = VideoIO.opencamera() proc = Dagger.thunk_processor() - scope = ProcessScope() + scope = Dagger.scope(worker=myid()) # constructs a `ProcessScope` return Dagger.tochunk(handle, proc, scope) end @@ -41,7 +41,7 @@ cam_frame = Dagger.@spawn read(cam_handle) The `cam_frame` task is executed within any processor on the same process that the `cam_handle` task was executed on. Of course, the resulting camera frame is -*not* scoped to anywhere specific (denoted as `AnyScope()`), and thus +*not* scoped to anywhere specific (denoted as `AnyScope`), and thus computations on it may execute anywhere. You may also encounter situations where you want to use a callable struct (such @@ -53,13 +53,15 @@ using Flux m = Chain(...) # If `m` is only safe to transfer to and execute on this process, # we can set a `ProcessScope` on it: -result = Dagger.@spawn scope=ProcessScope() m(rand(8,8)) +result = Dagger.@spawn scope=Dagger.scope(worker=myid()) m(rand(8,8)) ``` Setting a scope on the function treats it as a regular piece of data (like the arguments to the function), so it participates in the scoping rules described in the following sections all the same. +[`Dagger.scope`](@ref) + Now, let's try out some other kinds of scopes, starting with `NodeScope`. This scope encompasses the server that one or more Julia processes may be running on. Say we want to use memory mapping (mmap) to more efficiently send arrays @@ -75,6 +77,7 @@ function generate() arr = Mmap.mmap(path, Matrix{Int}, (64,64)) fill!(arr, 1) Mmap.sync!(arr) + # Note: Dagger.scope() does not yet support node scopes Dagger.tochunk(path, Dagger.thunk_processor(), NodeScope()) end @@ -87,14 +90,14 @@ a = Dagger.@spawn generate() @assert fetch(Dagger.@spawn consume(a)) == 64*64 ``` -Whatever server `a` executed on, `b` will also execute on! +Whatever server `a` executed on, `b` will also execute on it! Finally, we come to the "lowest" scope on the scope hierarchy, the `ExactScope`. This scope specifies one exact processor as the bounding scope, -and is typically useful in certain limited cases. We won't provide an example -here, because you don't usually need to ever use this scope, but if you already -understand the `NodeScope` and `ProcessScope`, the `ExactScope` should be easy -to figure out. +and is typically useful in certain limited cases (such as data existing only on +a specific GPU). 
We won't provide an example here, because you don't usually +need to ever use this scope, but if you already understand the `NodeScope` and +`ProcessScope`, the `ExactScope` should be easy to figure out. ## Union Scopes From 023a4cd9ce1a0ccaa688adb611e70c6653d616e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felipe=20de=20Alc=C3=A2ntara=20Tom=C3=A9?= Date: Tue, 18 Apr 2023 14:36:11 -0300 Subject: [PATCH 22/44] Changing the receive and yield function to accomodate new MPI implementations --- lib/DaggerMPI/src/DaggerMPI.jl | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/lib/DaggerMPI/src/DaggerMPI.jl b/lib/DaggerMPI/src/DaggerMPI.jl index 727b375c8..56fa764db 100644 --- a/lib/DaggerMPI/src/DaggerMPI.jl +++ b/lib/DaggerMPI/src/DaggerMPI.jl @@ -18,6 +18,8 @@ const MPI_PROCESSORS = Ref{Int}(-1) const PREVIOUS_PROCESSORS = Set() + + function initialize(comm::MPI.Comm=MPI.COMM_WORLD; color_algo=SimpleColoring()) @assert MPI_PROCESSORS[] == -1 "DaggerMPI already initialized" @@ -51,7 +53,7 @@ function finalize() empty!(Dagger.PROCESSOR_CALLBACKS) empty!(Dagger.OSPROC_PROCESSOR_CACHE) i = 1 - for proc in PREVIOUS_PROCESSORS + for proc in PREVIOUS_PROCESSORS Dagger.add_processor_callback!("old_processor_$i") do return proc end @@ -72,12 +74,24 @@ end Dagger.get_parent(proc::MPIProcessor) = Dagger.OSProc() Dagger.default_enabled(proc::MPIProcessor) = true + "Busy-loop Irecv that yields to other tasks." function recv_yield(src, tag, comm) - while true - got, value, _ = MPI.irecv(src, tag, comm) + while true + (got, msg, stat) = MPI.Improbe(src, tag, comm, MPI.Status) if got - return value + count = MPI.Get_count(stat, UInt8) + buf = Array{UInt8}(undef, count) + req = MPI.Imrecv!(MPI.Buffer(buf), msg) + while true + finish = MPI.Test(req) + if finish + value = MPI.deserialize(buf) + return value + end + Core.print("[$(MPI.Comm_rank(comm))] message is not ready on $tag\n") + yield() + end end # TODO: Sigmoidal backoff yield() From 284a374bf36144fe12ad1cad1e9d8cc29172beb3 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Fri, 24 Feb 2023 20:00:52 -0600 Subject: [PATCH 23/44] ThreadProc: Mark task sticky --- src/processor.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/processor.jl b/src/processor.jl index 91091a190..3a47ba863 100644 --- a/src/processor.jl +++ b/src/processor.jl @@ -159,6 +159,7 @@ function execute!(proc::ThreadProc, @nospecialize(f), @nospecialize(args...)) TimespanLogging.prof_task_put!(tls.sch_handle.thunk_id.id) f(args...) end + task.sticky = true ret = ccall(:jl_set_task_tid, Cint, (Any, Cint), task, proc.tid-1) if ret == 0 error("jl_set_task_tid == 0") From 44913e8446b363ae2f035d64f306b51d3bf2404e Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Tue, 28 Feb 2023 18:11:26 -0600 Subject: [PATCH 24/44] ThreadProc: Use at-invokelatest --- src/processor.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/processor.jl b/src/processor.jl index 3a47ba863..85e58620f 100644 --- a/src/processor.jl +++ b/src/processor.jl @@ -1,5 +1,7 @@ export OSProc, Context, addprocs!, rmprocs! +import Base: @invokelatest + """ Processor @@ -157,7 +159,7 @@ function execute!(proc::ThreadProc, @nospecialize(f), @nospecialize(args...)) task = Task() do set_tls!(tls) TimespanLogging.prof_task_put!(tls.sch_handle.thunk_id.id) - f(args...) + @invokelatest f(args...) 
end task.sticky = true ret = ccall(:jl_set_task_tid, Cint, (Any, Cint), task, proc.tid-1) From 96b2c6b4c2b5eecad4201d4465eaccc2125943d9 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Tue, 18 Apr 2023 16:44:26 -0700 Subject: [PATCH 25/44] thunk: Dont move (error,value) tuple --- src/thunk.jl | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/src/thunk.jl b/src/thunk.jl index 66b8f88d3..3b4aa4900 100644 --- a/src/thunk.jl +++ b/src/thunk.jl @@ -154,16 +154,16 @@ Base.wait(t::ThunkFuture) = Dagger.Sch.thunk_yield() do end function Base.fetch(t::ThunkFuture; proc=OSProc(), raw=false) error, value = Dagger.Sch.thunk_yield() do - if raw - fetch(t.future) - else - move(proc, fetch(t.future)) - end + fetch(t.future) end if error throw(value) end - value + if raw + return value + else + return move(proc, value) + end end Base.put!(t::ThunkFuture, x; error=false) = put!(t.future, (error, x)) @@ -263,13 +263,7 @@ mutable struct EagerThunk end Base.isready(t::EagerThunk) = isready(t.future) Base.wait(t::EagerThunk) = wait(t.future) -function Base.fetch(t::EagerThunk; raw=false) - if raw - fetch(t.future; raw=true) - else - move(OSProc(), fetch(t.future)) - end -end +Base.fetch(t::EagerThunk; raw=false) = fetch(t.future; raw) function Base.show(io::IO, t::EagerThunk) print(io, "EagerThunk ($(isready(t) ? "finished" : "running"))") end From bcf3f32133842ce9cb3eadfda1b879410e32d0ff Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Fri, 21 Apr 2023 19:48:31 -0700 Subject: [PATCH 26/44] Sch: Generate guaranteed unique uid --- src/sch/Sch.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl index e5c7f8546..db6ec880c 100644 --- a/src/sch/Sch.jl +++ b/src/sch/Sch.jl @@ -97,8 +97,10 @@ struct ComputeState chan::RemoteChannel{Channel{Any}} end +const UID_COUNTER = Threads.Atomic{UInt64}(1) + function start_state(deps::Dict, node_order, chan) - state = ComputeState(rand(UInt64), + state = ComputeState(Threads.atomic_add!(UID_COUNTER, UInt64(1)), OneToMany(), deps, Vector{Thunk}(undef, 0), From 57cbaf012dca5b7b8d33b22a0feeeb3150da2fcc Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Fri, 21 Apr 2023 19:55:13 -0700 Subject: [PATCH 27/44] Sch: Add at-dagdebug helper macro --- src/sch/Sch.jl | 15 +++++++++++++++ src/sch/dynamic.jl | 2 ++ src/sch/util.jl | 48 +++++++++++++++++++++++++++++++++++++++------- 3 files changed, 58 insertions(+), 7 deletions(-) diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl index db6ec880c..1008eb8b3 100644 --- a/src/sch/Sch.jl +++ b/src/sch/Sch.jl @@ -467,6 +467,8 @@ function scheduler_init(ctx, state::ComputeState, d::Thunk, options, deps) end function scheduler_run(ctx, state::ComputeState, d::Thunk, options) + @dagdebug nothing :global "Initializing scheduler" uid=state.uid + safepoint(state) # Loop while we still have thunks to execute @@ -480,12 +482,14 @@ function scheduler_run(ctx, state::ComputeState, d::Thunk, options) isempty(state.running) && continue timespan_start(ctx, :take, 0, 0) + @dagdebug nothing :take "Waiting for results" chan_value = take!(state.chan) # get result of completed thunk timespan_finish(ctx, :take, 0, 0) if chan_value isa RescheduleSignal continue end pid, proc, thunk_id, (res, metadata) = chan_value + @dagdebug thunk_id :take "Got finished task" gproc = OSProc(pid) safepoint(state) lock(state.lock) do @@ -559,6 +563,8 @@ function scheduler_run(ctx, state::ComputeState, d::Thunk, options) return value, errored end function scheduler_exit(ctx, 
state::ComputeState, options) + @dagdebug nothing :global "Tearing down scheduler" uid=state.uid + close(state.chan) notify(state.halt) @sync for p in procs_to_use(ctx) @@ -697,6 +703,7 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) # FIXME: est_time_util = est_time_util isa MaxUtilization ? cap : est_time_util push!(get!(()->Vector{Tuple{Thunk,<:Any,<:Any}}(), to_fire, (gproc, proc)), (task, est_time_util, est_alloc_util)) state.worker_time_pressure[gproc.pid][proc] += est_time_util + @dagdebug task :schedule "Scheduling to $gproc -> $proc" @goto pop_task end end @@ -1066,6 +1073,8 @@ function do_task(to_proc, comm) end timespan_finish(ctx, :storage_wait, thunk_id, (;f, to_proc, device=typeof(to_storage))) + @dagdebug thunk_id :execute "Moving data" + # Initiate data transfers for function and arguments transfer_time = Threads.Atomic{UInt64}(0) transfer_size = Threads.Atomic{UInt64}(0) @@ -1146,6 +1155,8 @@ function do_task(to_proc, comm) # FIXME #gcnum_start = Base.gc_num() + @dagdebug thunk_id :execute "Executing" + result_meta = try # Set TLS variables Dagger.set_tls!(( @@ -1180,6 +1191,7 @@ function do_task(to_proc, comm) bt = catch_backtrace() RemoteException(myid(), CapturedException(ex, bt)) end + threadtime = cputhreadtime() - threadtime_start # FIXME: This is not a realistic measure of max. required memory #gc_allocd = min(max(UInt64(Base.gc_num().allocd) - UInt64(gcnum_start.allocd), UInt64(0)), UInt64(1024^4)) @@ -1189,6 +1201,9 @@ function do_task(to_proc, comm) pop!(TASKS_RUNNING, thunk_id) notify(TASK_SYNC) end + + @dagdebug thunk_id :execute "Returning" + # TODO: debug_storage("Releasing $to_storage_name") metadata = ( time_pressure=real_time_util[], diff --git a/src/sch/dynamic.jl b/src/sch/dynamic.jl index 80d88091b..4378eb660 100644 --- a/src/sch/dynamic.jl +++ b/src/sch/dynamic.jl @@ -207,9 +207,11 @@ function _add_thunk!(ctx, state, task, tid, (f, args, kwargs, future, ref)) thunk_id = ThunkID(thunk.id, thunk_ref) state.thunk_dict[thunk.id] = WeakThunk(thunk) reschedule_inputs!(state, thunk) + @dagdebug thunk :submit "Added to scheduler" if future !== nothing # Ensure we attach a future before the thunk is scheduled _register_future!(ctx, state, task, tid, (future, thunk_id, false)) + @dagdebug thunk :submit "Registered future" end if ref !== nothing # Preserve the `EagerThunkFinalizer` through `thunk` diff --git a/src/sch/util.jl b/src/sch/util.jl index 855b4d90c..37d70cff0 100644 --- a/src/sch/util.jl +++ b/src/sch/util.jl @@ -1,3 +1,37 @@ +const DAGDEBUG_CATEGORIES = Symbol[:global, :submit, :schedule, :scope, + :take, :execute] +macro dagdebug(thunk, category, msg, args...) 
+ cat_sym = category.value + @gensym id + debug_ex_id = :(@debug "[$($id)] ($($(repr(cat_sym)))) $($msg)" _module=Dagger _file=$(string(__source__.file)) _line=$(__source__.line)) + append!(debug_ex_id.args, args) + debug_ex_noid = :(@debug "($($(repr(cat_sym)))) $($msg)" _module=Dagger _file=$(string(__source__.file)) _line=$(__source__.line)) + append!(debug_ex_noid.args, args) + esc(quote + let $id = -1 + if $thunk isa Integer + $id = Int($thunk) + elseif $thunk isa Thunk + $id = $thunk.id + elseif $thunk === nothing + $id = 0 + else + @warn "Unsupported thunk argument to @dagdebug: $(typeof($thunk))" + $id = -1 + end + if $id > 0 + if $(QuoteNode(cat_sym)) in $DAGDEBUG_CATEGORIES + $debug_ex_id + end + elseif $id == 0 + if $(QuoteNode(cat_sym)) in $DAGDEBUG_CATEGORIES + $debug_ex_noid + end + end + end + end) +end + """ unwrap_nested_exception(err::Exception) -> Bool @@ -267,13 +301,13 @@ function can_use_proc(task, gproc, proc, opts, scope) @warn "The `proclist` option is deprecated, please use scopes instead\nSee https://juliaparallel.org/Dagger.jl/stable/scopes/ for details" maxlog=1 if opts.proclist isa Function if !Base.invokelatest(opts.proclist, proc) - @debug "[$(task.id)] Rejected $proc: proclist(proc) == false" + @dagdebug task :scope "Rejected $proc: proclist(proc) == false" return false, scope end scope = constrain(scope, Dagger.ExactScope(proc)) elseif opts.proclist isa Vector if !(typeof(proc) in opts.proclist) - @debug "[$(task.id)] Rejected $proc: !(typeof(proc) in proclist)" + @dagdebug task :scope "Rejected $proc: !(typeof(proc) in proclist)" return false, scope end scope = constrain(scope, @@ -282,7 +316,7 @@ function can_use_proc(task, gproc, proc, opts, scope) throw(SchedulingException("proclist must be a Function, Vector, or nothing")) end if scope isa Dagger.InvalidScope - @debug "[$(task.id)] Rejected $proc: Not contained in task scope ($scope)" + @dagdebug task :scope "Rejected $proc: Not contained in task scope ($scope)" return false, scope end end @@ -291,12 +325,12 @@ function can_use_proc(task, gproc, proc, opts, scope) if opts.single !== nothing @warn "The `single` option is deprecated, please use scopes instead\nSee https://juliaparallel.org/Dagger.jl/stable/scopes/ for details" maxlog=1 if gproc.pid != opts.single - @debug "[$(task.id)] Rejected $proc: gproc.pid ($(gproc.pid)) != single ($(opts.single))" + @dagdebug task :scope "Rejected $proc: gproc.pid ($(gproc.pid)) != single ($(opts.single))" return false, scope end scope = constrain(scope, Dagger.ProcessScope(opts.single)) if scope isa Dagger.InvalidScope - @debug "[$(task.id)] Rejected $proc: Not contained in task scope ($scope)" + @dagdebug task :scope "Rejected $proc: Not contained in task scope ($scope)" return false, scope end end @@ -304,13 +338,13 @@ function can_use_proc(task, gproc, proc, opts, scope) # Check against scope proc_scope = Dagger.ExactScope(proc) if constrain(scope, proc_scope) isa Dagger.InvalidScope - @debug "[$(task.id)] Rejected $proc: Not contained in task scope ($scope)" + @dagdebug task :scope "Rejected $proc: Not contained in task scope ($scope)" return false, scope end @label accept - @debug "[$(task.id)] Accepted $proc" + @dagdebug task :scope "Accepted $proc" return true, scope end From 9dd7a89856807fe021862ecb478d3b20d4d225d1 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Fri, 24 Feb 2023 19:50:51 -0600 Subject: [PATCH 28/44] Add worker-local task stealing and occupancy limiting The worker scheduler would previously assume that it was fine to schedule 
infinite amounts of work onto the same processor at once, which is only efficient when tasks do lots of `yield`ing. Because most tasks do not actually exhibit low occupancy, we want to teach at least the worker scheduler to limit its eagerness when executing high-occupancy tasks. This commit teaches `@spawn` and the worker scheduler about a new `occupancy` task option, which (on the user side) is a value between 0 and 1 which approximates how fully the task occupies the processor. If the occupancy is 0.2, then 5 such tasks can execute concurrently and fully occupy the processor. Processors now operate primarily from a single controlling task per processor, and work is executed in a lowest-occupancy-first manner to attempt to maximize throughput. With processors using occupancy estimates to limit oversubscription, it's now quite easy for tasks to become starved for work. This commit also adds work-stealing logic to each processor, allowing a starved processor to steal scope-compatible tasks from other busy processors. Processors will be able to steal so long as they are not fully occupied. --- Project.toml | 1 + docs/src/index.md | 36 ++++ src/Dagger.jl | 1 + src/processor.jl | 6 +- src/sch/Sch.jl | 407 ++++++++++++++++++++++++++++++++----- src/sch/eager.jl | 66 +++--- src/sch/util.jl | 14 +- src/utils/locked-object.jl | 16 ++ 8 files changed, 455 insertions(+), 92 deletions(-) create mode 100644 src/utils/locked-object.jl diff --git a/Project.toml b/Project.toml index 4052255f3..a890bbc3d 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "0.16.3" [deps] Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" ContextVariablesX = "6add18c4-b38d-439d-96f6-d6bc489c04c5" +DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" diff --git a/docs/src/index.md b/docs/src/index.md index 7b2cb8307..2316268f0 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -227,5 +227,41 @@ opts = Dagger.Sch.ThunkOptions(;single=1) delayed(+)(1, 2; options=opts) ``` +### Core vs. Worker Schedulers + +Dagger's scheduler is really two kinds of entities: the "core" scheduler, and +"worker" schedulers: + +The core scheduler runs on worker 1, thread 1, and is the entrypoint to tasks +which have been submitted. The core scheduler manages all task dependencies, +notifies calls to `wait` and `fetch` of task completion, and generally performs +initial task placement. The core scheduler has cached information about each +worker and their processors, and uses that information (together with metrics +about previous tasks and other aspects of the Dagger runtime) to generate a +near-optimal just-in-time task schedule. + +The worker schedulers each run as a set of tasks across all workers and all +processors, and handles data movement and task execution. Once the core +scheduler has scheduled and launched a task, it arrives at the worker scheduler +for handling. The worker scheduler will pass the task to a queue for the +assigned processor, where it will wait until the processor has a sufficient +amount of "occupancy" for the task. Once the processor is ready for the task, +it will first fetch all arguments to the task from other workers, and then it +will execute the task, package the result into a `Chunk`, and pass that back to +the core scheduler. 
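+
+As a sketch, a task that spends most of its time waiting (e.g. on I/O) can
+declare a low occupancy so that several such tasks may share one processor.
+This assumes the `occupancy` option (a `Dict` mapping a processor type to a
+ratio between 0 and 1) and a hypothetical I/O-bound function `fetch_data`:
+
+```julia
+# ~0.2 occupancy lets roughly 5 such tasks run concurrently on one thread
+Dagger.@spawn occupancy=Dict(Dagger.ThreadProc=>0.2) fetch_data()
+```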
+ +### Workload Balancing + +In general, Dagger's core scheduler tries to balance workloads as much as +possible across all the available processors, but it can fail to do so +effectively when either the cached per-processor information is outdated, or +when the estimates about the task's behavior are inaccurate. To minimize the +impact of this potential workload imbalance, the worker schedulers' processors +will attempt to steal tasks from each other when they are under-occupied. Tasks +will only be stolen if their [scope](`Scopes`) matches the processor attempting +the steal, so tasks with wider scopes have better balancing potential. + +### Scheduler/Thunk Options + [`Dagger.Sch.SchedulerOptions`](@ref) [`Dagger.Sch.ThunkOptions`](@ref) diff --git a/src/Dagger.jl b/src/Dagger.jl index 63b0834b1..17d460d2e 100644 --- a/src/Dagger.jl +++ b/src/Dagger.jl @@ -36,6 +36,7 @@ include("chunks.jl") include("compute.jl") include("utils/clock.jl") include("utils/system_uuid.jl") +include("utils/locked-object.jl") include("sch/Sch.jl"); using .Sch # Array computations diff --git a/src/processor.jl b/src/processor.jl index 85e58620f..0c1d05214 100644 --- a/src/processor.jl +++ b/src/processor.jl @@ -313,8 +313,7 @@ get_tls() = ( sch_uid=task_local_storage(:_dagger_sch_uid), sch_handle=task_local_storage(:_dagger_sch_handle), processor=thunk_processor(), - time_utilization=task_local_storage(:_dagger_time_utilization), - alloc_utilization=task_local_storage(:_dagger_alloc_utilization), + task_spec=task_local_storage(:_dagger_task_spec), ) """ @@ -326,6 +325,5 @@ function set_tls!(tls) task_local_storage(:_dagger_sch_uid, tls.sch_uid) task_local_storage(:_dagger_sch_handle, tls.sch_handle) task_local_storage(:_dagger_processor, tls.processor) - task_local_storage(:_dagger_time_utilization, tls.time_utilization) - task_local_storage(:_dagger_alloc_utilization, tls.alloc_utilization) + task_local_storage(:_dagger_task_spec, tls.task_spec) end diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl index 1008eb8b3..778d8ee42 100644 --- a/src/sch/Sch.jl +++ b/src/sch/Sch.jl @@ -9,8 +9,11 @@ import Random: randperm import Base: @invokelatest import ..Dagger -import ..Dagger: Context, Processor, Thunk, WeakThunk, ThunkFuture, ThunkFailedException, Chunk, WeakChunk, OSProc, AnyScope, DefaultScope +import ..Dagger: Context, Processor, Thunk, WeakThunk, ThunkFuture, ThunkFailedException, Chunk, WeakChunk, OSProc, AnyScope, DefaultScope, LockedObject import ..Dagger: order, dependents, noffspring, istask, inputs, unwrap_weak_checked, affinity, tochunk, timespan_start, timespan_finish, procs, move, chunktype, processor, default_enabled, get_processors, get_parent, execute!, rmprocs!, addprocs!, thunk_processor, constrain, cputhreadtime +import DataStructures: PriorityQueue, enqueue!, dequeue_pair!, peek + +import ..Dagger const OneToMany = Dict{Thunk, Set{Thunk}} @@ -185,16 +188,20 @@ that are instances/subtypes of a contained type. Alternatively, a function can be supplied, and the function will be called with a processor as the sole argument and should return a `Bool` result to indicate whether or not to use the given processor. `nothing` enables all default processors. -- `time_util::Dict{Type,Any}=Dict{Type,Any}()`: Indicates the maximum expected -time utilization for this thunk. Each keypair maps a processor type to the -utilization, where the value can be a real (approximately the number of -nanoseconds taken), or `MaxUtilization()` (utilizes all processors of this -type). 
By default, the scheduler assumes that this thunk only uses one -processor. -- `alloc_util::Dict{Type,UInt64}=Dict{Type,UInt64}()`: Indicates the maximum -expected memory utilization for this thunk. Each keypair maps a processor type -to the utilization, where the value is an integer representing approximately -the maximum number of bytes allocated at any one time. +- `time_util::Dict{Type,Any}`: Indicates the maximum expected time utilization +for this thunk. Each keypair maps a processor type to the utilization, where +the value can be a real (approximately the number of nanoseconds taken), or +`MaxUtilization()` (utilizes all processors of this type). By default, the +scheduler assumes that this thunk only uses one processor. +- `alloc_util::Dict{Type,UInt64}`: Indicates the maximum expected memory +utilization for this thunk. Each keypair maps a processor type to the +utilization, where the value is an integer representing approximately the +maximum number of bytes allocated at any one time. +- `occupancy::Dict{Type,Real}`: Indicates the maximum expected processor +occupancy for this thunk. Each keypair maps a processor type to the +utilization, where the value can be a real between 0 and 1 (the occupancy +ratio, where 1 is full occupancy). By default, the scheduler assumes that this +thunk has full occupancy. - `allow_errors::Bool=true`: Allow this thunk to error without affecting non-dependent thunks. - `checkpoint=nothing`: If not `nothing`, uses the provided function to save @@ -217,15 +224,13 @@ Base.@kwdef struct ThunkOptions proclist = nothing time_util::Union{Dict{Type,Any},Nothing} = nothing alloc_util::Union{Dict{Type,UInt64},Nothing} = nothing + occupancy::Union{Dict{Type,Real},Nothing} = nothing allow_errors::Union{Bool,Nothing} = nothing checkpoint = nothing restore = nothing storage::Union{Chunk,Nothing} = nothing end -# Eager scheduling -include("eager.jl") - """ Base.merge(sopts::SchedulerOptions, topts::ThunkOptions) -> ThunkOptions @@ -239,6 +244,7 @@ function Base.merge(sopts::SchedulerOptions, topts::ThunkOptions) proclist, topts.time_util, topts.alloc_util, + topts.occupancy, allow_errors, topts.checkpoint, topts.restore, @@ -271,6 +277,7 @@ function populate_defaults(opts::ThunkOptions, Tf, Targs) maybe_default(:proclist), maybe_default(:time_util), maybe_default(:alloc_util), + maybe_default(:occupancy), maybe_default(:allow_errors), maybe_default(:checkpoint), maybe_default(:restore), @@ -281,6 +288,9 @@ end function cleanup(ctx) end +# Eager scheduling +include("eager.jl") + const WORKER_MONITOR_LOCK = Threads.ReentrantLock() const WORKER_MONITOR_TASKS = Dict{Int,Task}() const WORKER_MONITOR_CHANS = Dict{Int,Dict{UInt64,RemoteChannel}}() @@ -605,7 +615,7 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) populate_processor_cache_list!(state, procs) # Schedule tasks - to_fire = Dict{Tuple{OSProc,<:Processor},Vector{Tuple{Thunk,<:Any,<:Any}}}() + to_fire = Dict{Tuple{OSProc,<:Processor},Vector{Tuple{Thunk,<:Any,<:Any,UInt64,UInt32}}}() failed_scheduling = Thunk[] # Select a new task and get its options @@ -697,11 +707,15 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) gproc = get_parent(proc) can_use, scope = can_use_proc(task, gproc, proc, opts, scope) if can_use - has_cap, est_time_util, est_alloc_util = has_capacity(state, proc, gproc.pid, opts.time_util, opts.alloc_util, sig) + has_cap, est_time_util, est_alloc_util, est_occupancy = + has_capacity(state, proc, gproc.pid, opts.time_util, opts.alloc_util, opts.occupancy, sig) if has_cap # 
Schedule task onto proc # FIXME: est_time_util = est_time_util isa MaxUtilization ? cap : est_time_util - push!(get!(()->Vector{Tuple{Thunk,<:Any,<:Any}}(), to_fire, (gproc, proc)), (task, est_time_util, est_alloc_util)) + proc_tasks = get!(to_fire, (gproc, proc)) do + Vector{Tuple{Thunk,<:Any,<:Any,UInt64,UInt32}}() + end + push!(proc_tasks, (task, scope, est_time_util, est_alloc_util, est_occupancy)) state.worker_time_pressure[gproc.pid][proc] += est_time_util @dagdebug task :schedule "Scheduling to $gproc -> $proc" @goto pop_task @@ -723,7 +737,8 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) # N.B. if we only have one processor, we need to select it now can_use, scope = can_use_proc(task, entry.gproc, entry.proc, opts, scope) if can_use - has_cap, est_time_util, est_alloc_util = has_capacity(state, entry.proc, entry.gproc.pid, opts.time_util, opts.alloc_util, sig) + has_cap, est_time_util, est_alloc_util, est_occupancy = + has_capacity(state, entry.proc, entry.gproc.pid, opts.time_util, opts.alloc_util, opts.occupancy, sig) if has_cap selected_entry = entry else @@ -748,7 +763,8 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) can_use, scope = can_use_proc(task, entry.gproc, entry.proc, opts, scope) if can_use - has_cap, est_time_util, est_alloc_util = has_capacity(state, entry.proc, entry.gproc.pid, opts.time_util, opts.alloc_util, sig) + has_cap, est_time_util, est_alloc_util, est_occupancy = + has_capacity(state, entry.proc, entry.gproc.pid, opts.time_util, opts.alloc_util, opts.occupancy, sig) if has_cap # Select this processor selected_entry = entry @@ -767,7 +783,10 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) # Schedule task onto proc gproc, proc = entry.gproc, entry.proc est_time_util = est_time_util isa MaxUtilization ? cap : est_time_util - push!(get!(()->Vector{Tuple{Thunk,<:Any,<:Any}}(), to_fire, (gproc, proc)), (task, est_time_util, est_alloc_util)) + proc_tasks = get!(to_fire, (gproc, proc)) do + Vector{Tuple{Thunk,<:Any,<:Any,UInt64,UInt32}}() + end + push!(proc_tasks, (task, scope, est_time_util, est_alloc_util, est_occupancy)) # Proceed to next entry to spread work state.procs_cache_list[] = state.procs_cache_list[].next @@ -885,13 +904,13 @@ function evict_chunks!(log_sink, chunks::Set{Chunk}) nothing end -fire_task!(ctx, thunk::Thunk, p, state; time_util=10^9, alloc_util=10^6) = - fire_task!(ctx, (thunk, time_util, alloc_util), p, state) -fire_task!(ctx, (thunk, time_util, alloc_util)::Tuple{Thunk,<:Any}, p, state) = - fire_tasks!(ctx, [(thunk, time_util, alloc_util)], p, state) +fire_task!(ctx, thunk::Thunk, p, state; scope=AnyScope(), time_util=10^9, alloc_util=10^6, occupancy=typemax(UInt32)) = + fire_task!(ctx, (thunk, scope, time_util, alloc_util, occupancy), p, state) +fire_task!(ctx, (thunk, scope, time_util, alloc_util, occupancy)::Tuple{Thunk,<:Any}, p, state) = + fire_tasks!(ctx, [(thunk, scope, time_util, alloc_util, occupancy)], p, state) function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state) to_send = [] - for (thunk, time_util, alloc_util) in thunks + for (thunk, scope, time_util, alloc_util, occupancy) in thunks push!(state.running, thunk) state.running_on[thunk] = gproc if thunk.cache && thunk.cache_ref !== nothing @@ -939,8 +958,9 @@ function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state) sch_handle = SchedulerHandle(ThunkID(thunk.id, nothing), state.worker_chans[gproc.pid]...) # TODO: De-dup common fields (log_sink, uid, etc.) 
- push!(to_send, Any[thunk.id, time_util, alloc_util, chunktype(thunk.f), data, thunk.get_result, - thunk.persist, thunk.cache, thunk.meta, options, + push!(to_send, Any[thunk.id, time_util, alloc_util, occupancy, + scope, chunktype(thunk.f), data, + thunk.get_result, thunk.persist, thunk.cache, thunk.meta, options, propagated, ids, (log_sink=ctx.log_sink, profile=ctx.profile), sch_handle, state.uid]) @@ -966,38 +986,319 @@ function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state) end end +@static if VERSION >= v"1.9" +const Doorbell = Base.Event +else +# We need a sticky, resetable signal +mutable struct Doorbell + waiter::Union{Task,Nothing} + @atomic sleeping::Int + Doorbell() = new(nothing, 0) +end +function Base.wait(db::Doorbell) + db.waiter = current_task() + while true + _, succ = @atomicreplace db.sleeping 0 => 1 + if succ + # No messages, wait for someone to wake us + wait() + end + _, succ = @atomicreplace db.sleeping 2 => 0 + if succ + # We had a notification + return + end + end +end +function Base.notify(db::Doorbell) + while true + if (@atomic db.sleeping) == 2 + # Doorbell already rung + return + end + + _, succ = @atomicreplace db.sleeping 0 => 2 + if succ + # Task was definitely busy, we're done + return + end + + _, succ = @atomicreplace db.sleeping 1 => 2 + if succ + # Task was sleeping, wake it and wait for it to awaken + waiter = db.waiter + @assert waiter !== nothing + waiter::Task + schedule(waiter) + while true + sleep_value = @atomic db.sleeping + if sleep_value == 0 || sleep_value == 2 + return + end + #if waiter._state === Base.task_state_runnable && t.queue === nothing + # schedule(waiter) + #else + yield() + #end + end + end + end +end +end + +struct ProcessorInternalState + ctx::Context + proc::Processor + queue::LockedObject{PriorityQueue{Vector{Any}, UInt32, Base.Order.ForwardOrdering}} + reschedule::Doorbell + tasks::Dict{Int,Task} + proc_occupancy::Base.RefValue{UInt32} + time_pressure::Base.RefValue{UInt64} +end +struct ProcessorState + state::ProcessorInternalState + runner::Task +end + +const PROCESSOR_TASK_STATE = LockedObject(Dict{UInt64,Dict{Processor,ProcessorState}}()) + +function proc_states(f::Base.Callable, uid::UInt64) + lock(PROCESSOR_TASK_STATE) do all_states + if !haskey(all_states, uid) + all_states[uid] = Dict{Processor,ProcessorState}() + end + our_states = all_states[uid] + return f(our_states) + end +end +proc_states(f::Base.Callable) = + proc_states(f, task_local_storage(:_dagger_sch_uid)::UInt64) + +task_tid_for_processor(::Processor) = nothing +task_tid_for_processor(proc::Dagger.ThreadProc) = proc.tid + +stealing_permitted(::Processor) = true +stealing_permitted(proc::Dagger.ThreadProc) = proc.owner != 1 || proc.tid != 1 + +proc_has_occupancy(proc_occupancy, task_occupancy) = + UInt64(task_occupancy) + UInt64(proc_occupancy) <= typemax(UInt32) + +function start_processor_runner!(istate::ProcessorInternalState, uid::UInt64, return_queue::RemoteChannel) + to_proc = istate.proc + proc_run_task = @task begin + ctx = istate.ctx + tasks = istate.tasks + proc_occupancy = istate.proc_occupancy + time_pressure = istate.time_pressure + + while isopen(return_queue) + # Wait for new tasks + timespan_start(ctx, :proc_run_wait, to_proc, nothing) + wait(istate.reschedule) + @static if VERSION >= v"1.9" + reset(istate.reschedule) + end + timespan_finish(ctx, :proc_run_wait, to_proc, nothing) + + # Fetch a new task to execute + timespan_start(ctx, :proc_run_fetch, to_proc, nothing) + task_and_occupancy = lock(istate.queue) do queue + 
# Only steal if there are multiple queued tasks, to prevent + # ping-pong of tasks between empty queues + if length(queue) == 0 + return nothing + end + _, occupancy = peek(queue) + if proc_has_occupancy(proc_occupancy[], occupancy) + return dequeue_pair!(queue) + end + return nothing + end + if task_and_occupancy === nothing + timespan_finish(ctx, :proc_run_fetch, to_proc, nothing) + + if !stealing_permitted(to_proc) + continue + end + + if proc_occupancy[] == typemax(UInt32) + continue + end + + # Try to steal a task + timespan_start(ctx, :steal_local, to_proc, nothing) + + # Try to steal from local queues randomly + # TODO: Prioritize stealing from busiest processors + states = collect(proc_states(values, uid)) + # TODO: Try to pre-allocate this + P = randperm(length(states)) + for state in getindex.(Ref(states), P) + other_istate = state.state + if other_istate.proc === to_proc + continue + end + # FIXME: We need to lock two queues to compare occupancies + proc_occupancy_cached = lock(istate.queue) do _ + proc_occupancy[] + end + task_and_occupancy = lock(other_istate.queue) do queue + if length(queue) == 0 + return nothing + end + task, occupancy = peek(queue) + scope = task[5] + if !isa(constrain(scope, Dagger.ExactScope(to_proc)), + Dagger.InvalidScope) && + typemax(UInt32) - proc_occupancy_cached >= occupancy + # Compatible, steal this task + return dequeue_pair!(queue) + end + return nothing + end + if task_and_occupancy !== nothing + from_proc = other_istate.proc + thunk_id = task[1] + @dagdebug thunk_id :execute "Stolen from $from_proc by $to_proc" + timespan_finish(ctx, :steal_local, to_proc, (;from_proc, thunk_id)) + # TODO: Keep stealing until we hit full occupancy? + @goto execute + end + end + timespan_finish(ctx, :steal_local, to_proc, nothing) + + # TODO: Try to steal from remote queues + + continue + end + + @label execute + task, task_occupancy = task_and_occupancy + thunk_id = task[1] + time_util = task[2] + timespan_finish(ctx, :proc_run_fetch, to_proc, (;thunk_id, proc_occupancy=proc_occupancy[], task_occupancy)) + + # Execute the task and return its result + t = @task begin + result = try + do_task(to_proc, task) + catch err + bt = catch_backtrace() + (CapturedException(err, bt), nothing) + finally + lock(istate.queue) do _ + delete!(tasks, thunk_id) + proc_occupancy[] -= task_occupancy + time_pressure[] -= time_util + end + notify(istate.reschedule) + end + try + put!(return_queue, (myid(), to_proc, thunk_id, result)) + catch err + if unwrap_nested_exception(err) isa InvalidStateException || !isopen(return_queue) + @dagdebug thunk_id :execute "Return queue is closed, failing to put result" chan=return_queue exception=(err, catch_backtrace()) + else + rethrow(err) + end + end + end + lock(istate.queue) do _ + tid = task_tid_for_processor(to_proc) + if tid !== nothing + t.sticky = true + ret = ccall(:jl_set_task_tid, Cint, (Any, Cint), t, tid-1) + else + t.sticky = false + end + tasks[thunk_id] = errormonitor(schedule(t)) + proc_occupancy[] += task_occupancy + time_pressure[] += time_util + end + end + end + tid = task_tid_for_processor(to_proc) + if tid !== nothing + proc_run_task.sticky = true + ret = ccall(:jl_set_task_tid, Cint, (Any, Cint), proc_run_task, tid-1) + else + proc_run_task.sticky = false + end + return errormonitor(schedule(proc_run_task)) +end + """ - do_tasks(to_proc, chan, tasks) + do_tasks(to_proc, return_queue, tasks) -Executes a batch of tasks on `to_proc`. 
+Executes a batch of tasks on `to_proc`, returning their results through +`return_queue`. """ -function do_tasks(to_proc, chan, tasks) - for task in tasks - thunk_id = task[1] - should_launch = lock(TASK_SYNC) do - # Already running; don't try to re-launch - if !(thunk_id in TASKS_RUNNING) - push!(TASKS_RUNNING, thunk_id) - true - else - false +function do_tasks(to_proc, return_queue, tasks) + # FIXME: This is terrible + ctx_vars = first(tasks)[15] + ctx = Context(Processor[]; log_sink=ctx_vars.log_sink, profile=ctx_vars.profile) + uid = first(tasks)[17] + state = proc_states(uid) do states + get!(states, to_proc) do + queue = PriorityQueue{Vector{Any}, UInt32}() + queue_locked = LockedObject(queue) + reschedule = Doorbell() + istate = ProcessorInternalState(ctx, to_proc, + queue_locked, reschedule, + Dict{Int,Task}(), + Ref(UInt32(0)), Ref(UInt64(0))) + runner = start_processor_runner!(istate, uid, return_queue) + @static if VERSION < v"1.9" + reschedule.waiter = runner end + return ProcessorState(istate, runner) end - should_launch || continue - @async begin - try - result = do_task(to_proc, task) - put!(chan, (myid(), to_proc, thunk_id, result)) - catch ex - bt = catch_backtrace() - put!(chan, (myid(), to_proc, thunk_id, (CapturedException(ex, bt), nothing))) + end + istate = state.state + lock(istate.queue) do queue + for task in tasks + thunk_id = task[1] + occupancy = task[4] + timespan_start(ctx, :enqueue, (;to_proc, thunk_id), nothing) + should_launch = lock(TASK_SYNC) do + # Already running; don't try to re-launch + if !(thunk_id in TASKS_RUNNING) + push!(TASKS_RUNNING, thunk_id) + true + else + false + end end + should_launch || continue + enqueue!(queue, task, occupancy) + timespan_finish(ctx, :enqueue, (;to_proc, thunk_id), nothing) + end + end + notify(istate.reschedule) + + # Kick other processors to make them steal + # TODO: Alternatively, automatically balance work instead of blindly enqueueing + states = collect(proc_states(values, uid)) + P = randperm(length(states)) + for other_state in getindex.(Ref(states), P) + other_istate = other_state.state + if other_istate.proc === to_proc + continue end + notify(other_istate.reschedule) end end -"Executes a single task on `to_proc`." -function do_task(to_proc, comm) - thunk_id, est_time_util, est_alloc_util, Tf, data, send_result, persist, cache, meta, options, propagated, ids, ctx_vars, sch_handle, uid = comm + +""" + do_task(to_proc, task_desc) -> Any + +Executes a single task specified by `task_desc` on `to_proc`. +""" +function do_task(to_proc, task_desc) + thunk_id, est_time_util, est_alloc_util, est_occupancy, + scope, Tf, data, + send_result, persist, cache, meta, + options, propagated, ids, ctx_vars, sch_handle, uid = task_desc ctx = Context(Processor[]; log_sink=ctx_vars.log_sink, profile=ctx_vars.profile) from_proc = OSProc() @@ -1031,7 +1332,6 @@ function do_task(to_proc, comm) "[$(myid()), $thunk_id] $f($Tdata) $msg: $est_alloc_util | $real_alloc_util/$storage_cap" end end - lock(TASK_SYNC) do while true # Get current time utilization for the selected processor @@ -1163,8 +1463,7 @@ function do_task(to_proc, comm) sch_uid=uid, sch_handle=sch_handle, processor=to_proc, - time_utilization=est_time_util, - alloc_utilization=est_alloc_util, + task_spec=task_desc, )) res = Dagger.with_options(propagated) do @@ -1215,7 +1514,7 @@ function do_task(to_proc, comm) gc_allocd=(isa(result_meta, Chunk) ? result_meta.handle.size : 0), transfer_rate=(transfer_size[] > 0 && transfer_time[] > 0) ? 
round(UInt64, transfer_size[] / (transfer_time[] / 10^9)) : nothing, ) - (result_meta, metadata) + return (result_meta, metadata) end end # module Sch diff --git a/src/sch/eager.jl b/src/sch/eager.jl index 7e15f1e74..94fc7e541 100644 --- a/src/sch/eager.jl +++ b/src/sch/eager.jl @@ -19,11 +19,13 @@ function init_eager() @async try sopts = SchedulerOptions(;allow_errors=true) scope = Dagger.ExactScope(Dagger.ThreadProc(1, 1)) + topts = ThunkOptions(;occupancy=Dict(Dagger.ThreadProc=>0)) atexit() do EAGER_FORCE_KILL[] = true close(EAGER_THUNK_CHAN) end - Dagger.compute(ctx, Dagger.delayed(eager_thunk; scope)(); options=sopts) + Dagger.compute(ctx, Dagger.delayed(eager_thunk; scope, options=topts)(); + options=sopts) catch err iob = IOContext(IOBuffer(), :color=>true) println(iob, "Error in eager scheduler:") @@ -37,41 +39,47 @@ function init_eager() end end -"Adjusts the scheduler's cached pressure indicators for the specified worker by -the specified amount, and signals the scheduler to try scheduling again if -pressure decreased." -function adjust_pressure!(h::SchedulerHandle, proc::Processor, pressure) - uid = Dagger.get_tls().sch_uid - lock(TASK_SYNC) do - PROCESSOR_TIME_UTILIZATION[uid][proc][] += pressure - notify(TASK_SYNC) - end - exec!(_adjust_pressure!, h, myid(), proc, pressure) -end -function _adjust_pressure!(ctx, state, task, tid, (pid, proc, pressure)) - state.worker_time_pressure[pid][proc] += pressure - if pressure < 0 - put!(state.chan, RescheduleSignal()) - end - nothing -end - -"Allows a thunk to safely wait on another thunk, by temporarily reducing its -effective pressure to 0." +""" +Allows a thunk to safely wait on another thunk by temporarily reducing its +effective occupancy to 0, which allows a newly-spawned task to run. +""" function thunk_yield(f) if Dagger.in_thunk() h = sch_handle() tls = Dagger.get_tls() - proc = tls.processor - util = tls.time_utilization - adjust_pressure!(h, proc, -util) + proc = Dagger.thunk_processor() + proc_istate = proc_states(tls.sch_uid) do states + states[proc].state + end + task_occupancy = tls.task_spec[4] + + # Decrease our occupancy and inform the processor to reschedule + lock(proc_istate.queue) do _ + proc_istate.proc_occupancy[] -= task_occupancy + @assert 0 <= proc_istate.proc_occupancy[] <= typemax(UInt32) + end + notify(proc_istate.reschedule) try - f() + # Run the yielding code + return f() finally - adjust_pressure!(h, proc, util) + # Wait for processor to have occupancy to run this task + while true + ready = lock(proc_istate.queue) do _ + @assert 0 <= proc_istate.proc_occupancy[] <= typemax(UInt32) + if proc_has_occupancy(proc_istate.proc_occupancy[], task_occupancy) + proc_istate.proc_occupancy[] += task_occupancy + @assert 0 <= proc_istate.proc_occupancy[] <= typemax(UInt32) + return true + end + return false + end + ready && break + yield() + end end else - f() + return f() end end @@ -83,8 +91,6 @@ function eager_thunk() nothing end tls = Dagger.get_tls() - # Don't apply pressure from this thunk - adjust_pressure!(h, tls.processor, -tls.time_utilization) while isopen(EAGER_THUNK_CHAN) try added_future, future, uid, ref, f, args, opts = take!(EAGER_THUNK_CHAN) diff --git a/src/sch/util.jl b/src/sch/util.jl index 37d70cff0..83b22a817 100644 --- a/src/sch/util.jl +++ b/src/sch/util.jl @@ -348,7 +348,7 @@ function can_use_proc(task, gproc, proc, opts, scope) return true, scope end -function has_capacity(state, p, gp, time_util, alloc_util, sig) +function has_capacity(state, p, gp, time_util, alloc_util, occupancy, sig) 
T = typeof(p) # FIXME: MaxUtilization est_time_util = round(UInt64, if time_util !== nothing && haskey(time_util, T) @@ -359,8 +359,14 @@ function has_capacity(state, p, gp, time_util, alloc_util, sig) est_alloc_util = if alloc_util !== nothing && haskey(alloc_util, T) alloc_util[T] else - get(state.signature_alloc_cost, sig, 0) - end + get(state.signature_alloc_cost, sig, UInt64(0)) + end::UInt64 + est_occupancy = if occupancy !== nothing && haskey(occupancy, T) + # Clamp to 0-1, and scale between 0 and `typemax(UInt32)` + Base.unsafe_trunc(UInt32, clamp(occupancy[T], 0, 1) * typemax(UInt32)) + else + typemax(UInt32) + end::UInt32 #= FIXME: Estimate if cached data can be swapped to storage storage = storage_resource(p) real_alloc_util = state.worker_storage_pressure[gp][storage] @@ -369,7 +375,7 @@ function has_capacity(state, p, gp, time_util, alloc_util, sig) return false, est_time_util, est_alloc_util end =# - return true, est_time_util, est_alloc_util + return true, est_time_util, est_alloc_util, est_occupancy end function populate_processor_cache_list!(state, procs) diff --git a/src/utils/locked-object.jl b/src/utils/locked-object.jl new file mode 100644 index 000000000..e00eaac2d --- /dev/null +++ b/src/utils/locked-object.jl @@ -0,0 +1,16 @@ +# Copied from CUDA.jl + +struct LockedObject{T} + lock::ReentrantLock + payload::T +end +LockedObject(payload) = LockedObject(Threads.ReentrantLock(), payload) + +function Base.lock(f, x::LockedObject) + lock(x.lock) + try + return f(x.payload) + finally + unlock(x.lock) + end +end From 14dc2b5498e2a4dae70ecc3c786f3743ef318ee2 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Fri, 28 Apr 2023 15:31:04 -0500 Subject: [PATCH 29/44] Sch: Use at-invokelatest for move --- src/sch/Sch.jl | 11 ++++++++--- test/options.jl | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl index 778d8ee42..116f2878b 100644 --- a/src/sch/Sch.jl +++ b/src/sch/Sch.jl @@ -1394,7 +1394,8 @@ function do_task(to_proc, task_desc) # TODO: Choose "closest" processor of same type first some_proc = first(keys(CHUNK_CACHE[x])) some_x = CHUNK_CACHE[x][some_proc] - move(some_proc, to_proc, some_x) + @dagdebug thunk_id :move "Cache hit for argument $id at $some_proc: $some_x" + @invokelatest move(some_proc, to_proc, some_x) end) else nothing @@ -1406,13 +1407,16 @@ function do_task(to_proc, task_desc) else # Fetch it time_start = time_ns() - _x = move(to_proc, x) + from_proc = processor(x) + _x = @invokelatest move(from_proc, to_proc, x) time_finish = time_ns() if x.handle.size !== nothing Threads.atomic_add!(transfer_time, time_finish - time_start) Threads.atomic_add!(transfer_size, x.handle.size) end + @dagdebug thunk_id :move "Cache miss for argument $id at $from_proc" + # Update cache lock(TASK_SYNC) do CHUNK_CACHE[x] = Dict{Processor,Any}() @@ -1422,8 +1426,9 @@ function do_task(to_proc, task_desc) _x end else - move(to_proc, x) + @invokelatest move(to_proc, x) end + @dagdebug thunk_id :move "Moved argument $id to $to_proc: $x" timespan_finish(ctx, :move, (;thunk_id, id), (;f, id, data=x); tasks=[Base.current_task()]) return x end diff --git a/test/options.jl b/test/options.jl index 79c3449b8..93cf58680 100644 --- a/test/options.jl +++ b/test/options.jl @@ -70,7 +70,7 @@ end @test fetch(Dagger.@spawn sf(obj)) == 0 @test fetch(Dagger.@spawn sf(obj)) == 0 end - Dagger.with_options(;scope=Dagger.ExactScope(Dagger.ThreadProc(1,1)), processor=Dagger.ThreadProc(1,1), meta=true) do + 
Dagger.with_options(;scope=Dagger.ExactScope(Dagger.ThreadProc(1,1)), processor=OSProc(1), meta=true) do @test fetch(Dagger.@spawn sf(obj)) == 43 @test fetch(Dagger.@spawn sf(obj)) == 43 end From b9ce129fbc0dab59c82b3c07596f538cb4c459cb Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Mon, 15 May 2023 10:34:15 -0500 Subject: [PATCH 30/44] Add keyword argument support APIs like `delayed` and `spawn` assumed that passed kwargs were to be treated as options to the scheduler, which is both somewhat confusing for users, and precludes passing kwargs to user functions. This commit changes those APIs, as well as `@spawn`, to instead pass kwargs directly to the user's function. Options are now passed in an `Options` struct to `delayed` and `spawn` as the second argument (the first being the function), while `@spawn` still keeps them before the call (which is generally more convenient). Internally, `Thunk`'s `inputs` field is now a `Vector{Pair{Union{Symbol,Nothing},Any}}`, where the second element of each pair is the argument, while the first element is a position; if `nothing`, it's a positional argument, and if a `Symbol`, then it's a kwarg. --- docs/src/checkpointing.md | 6 +-- docs/src/index.md | 45 +++++++++--------- docs/src/propagation.md | 2 +- src/array/darray.jl | 2 +- src/array/map-reduce.jl | 10 ++-- src/array/matrix.jl | 10 ++-- src/array/operators.jl | 2 +- src/array/setindex.jl | 2 +- src/compute.jl | 4 +- src/processor.jl | 12 ++--- src/sch/Sch.jl | 52 +++++++++++++++------ src/sch/dynamic.jl | 12 ++--- src/sch/eager.jl | 19 +++++--- src/sch/fault-handler.jl | 2 +- src/sch/util.jl | 25 +++++----- src/thunk.jl | 98 ++++++++++++++++++++++++++++----------- src/ui/graph.jl | 5 +- test/scheduler.jl | 6 +-- test/thunk.jl | 27 +++++++---- 19 files changed, 216 insertions(+), 125 deletions(-) diff --git a/docs/src/checkpointing.md b/docs/src/checkpointing.md index 9b3f7b618..97dd8e8ed 100644 --- a/docs/src/checkpointing.md +++ b/docs/src/checkpointing.md @@ -54,9 +54,9 @@ Let's see how we'd modify the above example to use checkpointing: ```julia using Serialization + X = compute(randn(Blocks(128,128), 1024, 1024)) -Y = [delayed(sum; options=Dagger.Sch.ThunkOptions(; -checkpoint=(thunk,result)->begin +Y = [delayed(sum; checkpoint=(thunk,result)->begin open("checkpoint-$idx.bin", "w") do io serialize(io, collect(result)) end @@ -64,7 +64,7 @@ end, restore=(thunk)->begin open("checkpoint-$idx.bin", "r") do io Dagger.tochunk(deserialize(io)) end -end))(chunk) for (idx,chunk) in enumerate(X.chunks)] +end)(chunk) for (idx,chunk) in enumerate(X.chunks)] inner(x...) = sqrt(sum(x)) Z = delayed(inner)(Y...) z = collect(Z) diff --git a/docs/src/index.md b/docs/src/index.md index 2316268f0..45cc68827 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -2,32 +2,34 @@ ## Usage -The main function for using Dagger is `spawn`: +The main entrypoint to Dagger is `@spawn`: -`Dagger.spawn(f, args...; options...)` +`Dagger.@spawn [option=value]... f(args...; kwargs...)` -or `@spawn` for the more convenient macro form: +or `spawn` if it's more convenient: -`Dagger.@spawn [option=value]... f(args...)` +`Dagger.spawn(f, Dagger.Options(options), args...; kwargs...)` When called, it creates an `EagerThunk` (also known as a "thunk" or "task") -object representing a call to function `f` with the arguments `args`. If it is -called with other thunks as inputs, such as in `Dagger.@spawn f(Dagger.@spawn -g())`, then the function `f` gets passed the results of those input thunks. 
If -those thunks aren't yet finished executing, then the execution of `f` waits on -all of its input thunks to complete before executing. +object representing a call to function `f` with the arguments `args` and +keyword arguments `kwargs`. If it is called with other thunks as args/kwargs, +such as in `Dagger.@spawn f(Dagger.@spawn g())`, then the function `f` gets +passed the results of those input thunks, once they're available. If those +thunks aren't yet finished executing, then the execution of `f` waits on all of +its input thunks to complete before executing. The key point is that, for each argument to a thunk, if the argument is an `EagerThunk`, it'll be executed before this node and its result will be passed into the function `f`. If the argument is *not* an `EagerThunk` (instead, some other type of Julia object), it'll be passed as-is to the function `f`. -Thunks don't accept regular keyword arguments for the function `f`. Instead, -the `options` kwargs are passed to the scheduler to control its behavior: +The `Options` struct in the second argument position is optional; if provided, +it is passed to the scheduler to control its behavior. `Options` contains a +`NamedTuple` of option key-value pairs, which can be any of: - Any field in `Dagger.Sch.ThunkOptions` (see [Scheduler and Thunk options](@ref)) - `meta::Bool` -- Pass the input `Chunk` objects themselves to `f` and not the value contained in them -There are also some extra kwargs that can be passed, although they're considered advanced options to be used only by developers or library authors: +There are also some extra optionss that can be passed, although they're considered advanced options to be used only by developers or library authors: - `get_result::Bool` -- return the actual result to the scheduler instead of `Chunk` objects. Used when `f` explicitly constructs a Chunk or when return value is small (e.g. in case of reduce) - `persist::Bool` -- the result of this Thunk should not be released after it becomes unused in the DAG - `cache::Bool` -- cache the result of this Thunk such that if the thunk is evaluated again, one can just reuse the cached value. If it’s been removed from cache, recompute the value. @@ -133,10 +135,10 @@ via `@par` or `delayed`. The above computation can be executed with the lazy API by substituting `@spawn` with `@par` and `fetch` with `collect`: ```julia -p = @par add1(4) -q = @par add2(p) -r = @par add1(3) -s = @par combine(p, q, r) +p = Dagger.@par add1(4) +q = Dagger.@par add2(p) +r = Dagger.@par add1(3) +s = Dagger.@par combine(p, q, r) @assert collect(s) == 16 ``` @@ -144,7 +146,7 @@ s = @par combine(p, q, r) or similarly, in block form: ```julia -s = @par begin +s = Dagger.@par begin p = add1(4) q = add2(p) r = add1(3) @@ -159,7 +161,7 @@ operation, you can call `compute` on the thunk. 
This will return a `Chunk` object which references the result (see [Chunks](@ref) for more details): ```julia -x = @par 1+2 +x = Dagger.@par 1+2 cx = compute(x) cx::Chunk @assert collect(cx) == 3 @@ -207,7 +209,7 @@ Scheduler options can be constructed and passed to `collect()` or `compute()` as the keyword argument `options` for lazy API usage: ```julia -t = @par 1+2 +t = Dagger.@par 1+2 opts = Dagger.Sch.SchedulerOptions(;single=1) # Execute on worker 1 compute(t; options=opts) @@ -221,10 +223,9 @@ Thunk options can be passed to `@spawn/spawn`, `@par`, and `delayed` similarly: # Execute on worker 1 Dagger.@spawn single=1 1+2 -Dagger.spawn(+, 1, 2; single=1) +Dagger.spawn(+, Dagger.Options(;single=1), 1, 2) -opts = Dagger.Sch.ThunkOptions(;single=1) -delayed(+)(1, 2; options=opts) +delayed(+; single=1)(1, 2) ``` ### Core vs. Worker Schedulers diff --git a/docs/src/propagation.md b/docs/src/propagation.md index c47ba1353..ce00af4f2 100644 --- a/docs/src/propagation.md +++ b/docs/src/propagation.md @@ -1,6 +1,6 @@ # Option Propagation -Most options passed to Dagger are passed via `delayed` or `Dagger.@spawn` +Most options passed to Dagger are passed via `@spawn/spawn` or `delayed` directly. This works well when an option only needs to be set for a single thunk, but is cumbersome when the same option needs to be set on multiple thunks, or set recursively on thunks spawned within other thunks. Thankfully, diff --git a/src/array/darray.jl b/src/array/darray.jl index 7f1b8c6bd..0ba1895ca 100644 --- a/src/array/darray.jl +++ b/src/array/darray.jl @@ -251,7 +251,7 @@ function thunkize(ctx::Context, c::DArray; persist=true) if persist foreach(persist!, thunks) end - Thunk(thunks...; meta=true) do results... + Thunk(map(thunk->nothing=>thunk, thunks)...; meta=true) do results... t = eltype(results[1]) DArray(t, dmn, dmnchunks, reshape(Union{Chunk,Thunk}[results...], sz)) diff --git a/src/array/map-reduce.jl b/src/array/map-reduce.jl index e39019ed0..ca659b7ea 100644 --- a/src/array/map-reduce.jl +++ b/src/array/map-reduce.jl @@ -19,7 +19,7 @@ function stage(ctx::Context, node::Map) f = node.f for i=eachindex(domains) inps = map(x->chunks(x)[i], inputs) - thunks[i] = Thunk((args...) -> map(f, args...), inps...) + thunks[i] = Thunk((args...) -> map(f, args...), map(inp->nothing=>inp, inps)...) end DArray(Any, domain(primary), domainchunks(primary), thunks) end @@ -40,8 +40,8 @@ end function stage(ctx::Context, r::ReduceBlock) inp = stage(ctx, r.input) - reduced_parts = map(x -> Thunk(r.op, x; get_result=r.get_result), chunks(inp)) - Thunk((xs...) -> r.op_master(xs), reduced_parts...; meta=true) + reduced_parts = map(x -> Thunk(r.op, nothing=>x; get_result=r.get_result), chunks(inp)) + Thunk((xs...) 
-> r.op_master(xs), map(part->nothing=>part, reduced_parts)...; meta=true) end reduceblock_async(f, x::ArrayOp; get_result=true) = ReduceBlock(f, f, x, get_result) @@ -126,10 +126,10 @@ function stage(ctx::Context, r::Reducedim) inp = cached_stage(ctx, r.input) thunks = let op = r.op, dims=r.dims # do reducedim on each block - tmp = map(p->Thunk(b->reduce(op,b,dims=dims), p), chunks(inp)) + tmp = map(p->Thunk(b->reduce(op,b,dims=dims), nothing=>p), chunks(inp)) # combine the results in tree fashion treereducedim(tmp, r.dims) do x,y - Thunk(op, x,y) + Thunk(op, nothing=>x, nothing=>y) end end c = domainchunks(inp) diff --git a/src/array/matrix.jl b/src/array/matrix.jl index 686dfb3b8..b2febe908 100644 --- a/src/array/matrix.jl +++ b/src/array/matrix.jl @@ -17,10 +17,10 @@ function size(x::Transpose) end transpose(x::ArrayOp) = Transpose(transpose, x) -transpose(x::Union{Chunk, Thunk}) = Thunk(transpose, x) +transpose(x::Union{Chunk, Thunk}) = Thunk(transpose, nothing=>x) adjoint(x::ArrayOp) = Transpose(adjoint, x) -adjoint(x::Union{Chunk, Thunk}) = Thunk(adjoint, x) +adjoint(x::Union{Chunk, Thunk}) = Thunk(adjoint, nothing=>x) function adjoint(x::ArrayDomain{2}) d = indexes(x) @@ -91,8 +91,8 @@ function (+)(a::ArrayDomain, b::ArrayDomain) a end -(*)(a::Union{Chunk, Thunk}, b::Union{Chunk, Thunk}) = Thunk(*, a,b) -(+)(a::Union{Chunk, Thunk}, b::Union{Chunk, Thunk}) = Thunk(+, a,b) +(*)(a::Union{Chunk, Thunk}, b::Union{Chunk, Thunk}) = Thunk(*, nothing=>a, nothing=>b) +(+)(a::Union{Chunk, Thunk}, b::Union{Chunk, Thunk}) = Thunk(+, nothing=>a, nothing=>b) # we define our own matmat and matvec multiply # for computing the new domains and thunks. @@ -211,7 +211,7 @@ end function _scale(l, r) res = similar(r, Any) for i=1:length(l) - res[i,:] = map(x->Thunk((a,b) -> Diagonal(a)*b, l[i], x), r[i,:]) + res[i,:] = map(x->Thunk((a,b) -> Diagonal(a)*b, nothing=>l[i], nothing=>x), r[i,:]) end res end diff --git a/src/array/operators.jl b/src/array/operators.jl index 6bfe95ae6..007947d7d 100644 --- a/src/array/operators.jl +++ b/src/array/operators.jl @@ -107,7 +107,7 @@ Base.@deprecate mappart(args...) mapchunk(args...) function stage(ctx::Context, node::MapChunk) inputs = map(x->cached_stage(ctx, x), node.input) thunks = map(map(chunks, inputs)...) do ps... - Thunk(node.f, ps...) + Thunk(node.f, map(p->nothing=>p, ps)...) end DArray(Any, domain(inputs[1]), domainchunks(inputs[1]), thunks) diff --git a/src/array/setindex.jl b/src/array/setindex.jl index e2c842901..695519025 100644 --- a/src/array/setindex.jl +++ b/src/array/setindex.jl @@ -35,7 +35,7 @@ function stage(ctx::Context, sidx::SetIndex) local_dmn = ArrayDomain(map(x->x[2], idx_and_dmn)) s = subdmns[idx...] part_to_set = sidx.val - ps[idx...] = Thunk(ps[idx...]) do p + ps[idx...] = Thunk(nothing=>ps[idx...]) do p q = copy(p) q[indexes(project(s, local_dmn))...] .= part_to_set q diff --git a/src/compute.jl b/src/compute.jl index d895f79db..4934712e7 100644 --- a/src/compute.jl +++ b/src/compute.jl @@ -97,7 +97,7 @@ function dependents(node::Thunk) if !haskey(deps, next) deps[next] = Set{Thunk}() end - for inp in inputs(next) + for (_, inp) in next.inputs if istask(inp) || (inp isa Chunk) s = get!(()->Set{Thunk}(), deps, inp) push!(s, next) @@ -165,7 +165,7 @@ function order(node::Thunk, ndeps) haskey(output, next) && continue s += 1 output[next] = s - parents = filter(istask, inputs(next)) + parents = filter(istask, map(last, next.inputs)) if !isempty(parents) # If parents is empty, sort! 
should be a no-op, but raises an ambiguity error # when InlineStrings.jl is loaded (at least, version 1.1.0), because InlineStrings diff --git a/src/processor.jl b/src/processor.jl index 0c1d05214..024c66c53 100644 --- a/src/processor.jl +++ b/src/processor.jl @@ -31,11 +31,11 @@ function delete_processor_callback!(name::Symbol) end """ - execute!(proc::Processor, f, args...) -> Any + execute!(proc::Processor, f, args...; kwargs...) -> Any -Executes the function `f` with arguments `args` on processor `proc`. This -function can be overloaded by `Processor` subtypes to allow executing function -calls differently than normal Julia. +Executes the function `f` with arguments `args` and keyword arguments `kwargs` +on processor `proc`. This function can be overloaded by `Processor` subtypes to +allow executing function calls differently than normal Julia. """ function execute! end @@ -154,12 +154,12 @@ end iscompatible(proc::ThreadProc, opts, f, args...) = true iscompatible_func(proc::ThreadProc, opts, f) = true iscompatible_arg(proc::ThreadProc, opts, x) = true -function execute!(proc::ThreadProc, @nospecialize(f), @nospecialize(args...)) +function execute!(proc::ThreadProc, @nospecialize(f), @nospecialize(args...); @nospecialize(kwargs...)) tls = get_tls() task = Task() do set_tls!(tls) TimespanLogging.prof_task_put!(tls.sch_handle.thunk_id.id) - @invokelatest f(args...) + @invokelatest f(args...; kwargs...) end task.sticky = true ret = ccall(:jl_set_task_tid, Cint, (Any, Cint), task, proc.tid-1) diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl index 116f2878b..9597ced67 100644 --- a/src/sch/Sch.jl +++ b/src/sch/Sch.jl @@ -130,7 +130,7 @@ function start_state(deps::Dict, node_order, chan) for k in sort(collect(keys(deps)), by=node_order) if istask(k) - waiting = Set{Thunk}(Iterators.filter(istask, inputs(k))) + waiting = Set{Thunk}(Iterators.filter(istask, map(last, inputs(k)))) if isempty(waiting) push!(state.ready, k) else @@ -659,7 +659,7 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) DefaultScope() end end - for input in task.inputs + for (_,input) in task.inputs input = unwrap_weak_checked(input) chunk = if istask(input) state.cache[input] @@ -688,7 +688,7 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) @goto fallback end - inputs = collect_task_inputs(state, task) + inputs = map(last, collect_task_inputs(state, task)) opts = populate_defaults(opts, chunktype(task.f), map(chunktype, inputs)) local_procs, costs = estimate_task_costs(state, local_procs, task, inputs) scheduled = false @@ -945,10 +945,13 @@ function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state) ids = Int[0] data = Any[thunk.f] - for (idx, x) in enumerate(thunk.inputs) + positions = Union{Symbol,Nothing}[] + for (idx, pos_x) in enumerate(thunk.inputs) + pos, x = pos_x x = unwrap_weak_checked(x) push!(ids, istask(x) ? x.id : -idx) push!(data, istask(x) ? state.cache[x] : x) + push!(positions, pos) end toptions = thunk.options !== nothing ? 
thunk.options : ThunkOptions() options = merge(ctx.options, toptions) @@ -961,7 +964,7 @@ function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state) push!(to_send, Any[thunk.id, time_util, alloc_util, occupancy, scope, chunktype(thunk.f), data, thunk.get_result, thunk.persist, thunk.cache, thunk.meta, options, - propagated, ids, + propagated, ids, positions, (log_sink=ctx.log_sink, profile=ctx.profile), sch_handle, state.uid]) end @@ -1093,6 +1096,7 @@ function start_processor_runner!(istate::ProcessorInternalState, uid::UInt64, re while isopen(return_queue) # Wait for new tasks + @dagdebug nothing :processor "Waiting for tasks" timespan_start(ctx, :proc_run_wait, to_proc, nothing) wait(istate.reschedule) @static if VERSION >= v"1.9" @@ -1101,22 +1105,27 @@ function start_processor_runner!(istate::ProcessorInternalState, uid::UInt64, re timespan_finish(ctx, :proc_run_wait, to_proc, nothing) # Fetch a new task to execute + @dagdebug nothing :processor "Trying to dequeue" timespan_start(ctx, :proc_run_fetch, to_proc, nothing) task_and_occupancy = lock(istate.queue) do queue # Only steal if there are multiple queued tasks, to prevent # ping-pong of tasks between empty queues if length(queue) == 0 + @dagdebug nothing :processor "Nothing to dequeue" return nothing end _, occupancy = peek(queue) - if proc_has_occupancy(proc_occupancy[], occupancy) - return dequeue_pair!(queue) + if !proc_has_occupancy(proc_occupancy[], occupancy) + @dagdebug nothing :processor "Insufficient occupancy" proc_occupancy=proc_occupancy[] task_occupancy=occupancy + return nothing end - return nothing + return dequeue_pair!(queue) end if task_and_occupancy === nothing timespan_finish(ctx, :proc_run_fetch, to_proc, nothing) + @dagdebug nothing :processor "Failed to dequeue" + if !stealing_permitted(to_proc) continue end @@ -1125,6 +1134,8 @@ function start_processor_runner!(istate::ProcessorInternalState, uid::UInt64, re continue end + @dagdebug nothing :processor "Trying to steal" + # Try to steal a task timespan_start(ctx, :steal_local, to_proc, nothing) @@ -1159,7 +1170,7 @@ function start_processor_runner!(istate::ProcessorInternalState, uid::UInt64, re if task_and_occupancy !== nothing from_proc = other_istate.proc thunk_id = task[1] - @dagdebug thunk_id :execute "Stolen from $from_proc by $to_proc" + @dagdebug thunk_id :processor "Stolen from $from_proc by $to_proc" timespan_finish(ctx, :steal_local, to_proc, (;from_proc, thunk_id)) # TODO: Keep stealing until we hit full occupancy? @goto execute @@ -1177,6 +1188,7 @@ function start_processor_runner!(istate::ProcessorInternalState, uid::UInt64, re thunk_id = task[1] time_util = task[2] timespan_finish(ctx, :proc_run_fetch, to_proc, (;thunk_id, proc_occupancy=proc_occupancy[], task_occupancy)) + @dagdebug thunk_id :processor "Dequeued task" # Execute the task and return its result t = @task begin @@ -1234,10 +1246,12 @@ Executes a batch of tasks on `to_proc`, returning their results through `return_queue`. 
""" function do_tasks(to_proc, return_queue, tasks) + @dagdebug nothing :processor "Enqueuing task batch" batch_size=length(tasks) + # FIXME: This is terrible - ctx_vars = first(tasks)[15] + ctx_vars = first(tasks)[16] ctx = Context(Processor[]; log_sink=ctx_vars.log_sink, profile=ctx_vars.profile) - uid = first(tasks)[17] + uid = first(tasks)[18] state = proc_states(uid) do states get!(states, to_proc) do queue = PriorityQueue{Vector{Any}, UInt32}() @@ -1272,6 +1286,7 @@ function do_tasks(to_proc, return_queue, tasks) should_launch || continue enqueue!(queue, task, occupancy) timespan_finish(ctx, :enqueue, (;to_proc, thunk_id), nothing) + @dagdebug thunk_id :processor "Enqueued task" end end notify(istate.reschedule) @@ -1287,6 +1302,7 @@ function do_tasks(to_proc, return_queue, tasks) end notify(other_istate.reschedule) end + @dagdebug nothing :processor "Kicked processors" end """ @@ -1298,7 +1314,7 @@ function do_task(to_proc, task_desc) thunk_id, est_time_util, est_alloc_util, est_occupancy, scope, Tf, data, send_result, persist, cache, meta, - options, propagated, ids, ctx_vars, sch_handle, uid = task_desc + options, propagated, ids, positions, ctx_vars, sch_handle, uid = task_desc ctx = Context(Processor[]; log_sink=ctx_vars.log_sink, profile=ctx_vars.profile) from_proc = OSProc() @@ -1442,6 +1458,16 @@ function do_task(to_proc, task_desc) end f = popfirst!(fetched) @assert !(f isa Chunk) "Failed to unwrap thunk function" + fetched_args = Any[] + fetched_kwargs = Pair{Symbol,Any}[] + for (idx, x) in enumerate(fetched) + pos = positions[idx] + if pos === nothing + push!(fetched_args, x) + else + push!(fetched_kwargs, pos => x) + end + end #= FIXME: If MaxUtilization, stop processors and wait if (est_time_util isa MaxUtilization) && (real_time_util > 0) @@ -1473,7 +1499,7 @@ function do_task(to_proc, task_desc) res = Dagger.with_options(propagated) do # Execute - execute!(to_proc, f, fetched...) + execute!(to_proc, f, fetched_args...; fetched_kwargs...) end # Check if result is safe to store diff --git a/src/sch/dynamic.jl b/src/sch/dynamic.jl index 4378eb660..679514b83 100644 --- a/src/sch/dynamic.jl +++ b/src/sch/dynamic.jl @@ -144,7 +144,7 @@ function _register_future!(ctx, state, task, tid, (future, id, check)::Tuple{Thu if t == target return true end - for input in t.inputs + for (_, input) in t.inputs # N.B. Skips expired tasks input = Dagger.unwrap_weak(input) istask(input) || continue @@ -195,13 +195,13 @@ function _get_dag_ids(ctx, state, task, tid, _) end "Adds a new Thunk to the DAG." -add_thunk!(f, h::SchedulerHandle, args...; future=nothing, ref=nothing, kwargs...) = - exec!(_add_thunk!, h, f, args, kwargs, future, ref) -function _add_thunk!(ctx, state, task, tid, (f, args, kwargs, future, ref)) +add_thunk!(f, h::SchedulerHandle, args...; future=nothing, ref=nothing, options...) = + exec!(_add_thunk!, h, f, args, options, future, ref) +function _add_thunk!(ctx, state, task, tid, (f, args, options, future, ref)) timespan_start(ctx, :add_thunk, tid, 0) - _args = map(arg->arg isa ThunkID ? state.thunk_dict[arg.id] : arg, args) + _args = map(pos_arg->pos_arg[1] => (pos_arg[2] isa ThunkID ? state.thunk_dict[pos_arg[2].id] : pos_arg[2]), args) GC.@preserve _args begin - thunk = Thunk(f, _args...; kwargs...) + thunk = Thunk(f, _args...; options...) 
# Create a `DRef` to `thunk` so that the caller can preserve it thunk_ref = poolset(thunk; size=64, device=MemPool.CPURAMDevice()) thunk_id = ThunkID(thunk.id, thunk_ref) diff --git a/src/sch/eager.jl b/src/sch/eager.jl index 94fc7e541..1bafc0874 100644 --- a/src/sch/eager.jl +++ b/src/sch/eager.jl @@ -19,12 +19,13 @@ function init_eager() @async try sopts = SchedulerOptions(;allow_errors=true) scope = Dagger.ExactScope(Dagger.ThreadProc(1, 1)) - topts = ThunkOptions(;occupancy=Dict(Dagger.ThreadProc=>0)) atexit() do EAGER_FORCE_KILL[] = true close(EAGER_THUNK_CHAN) end - Dagger.compute(ctx, Dagger.delayed(eager_thunk; scope, options=topts)(); + opts = Dagger.Options((;scope, + occupancy=Dict(Dagger.ThreadProc=>0))) + Dagger.compute(ctx, Dagger.delayed(eager_thunk, opts)(); options=sopts) catch err iob = IOContext(IOBuffer(), :color=>true) @@ -96,10 +97,16 @@ function eager_thunk() added_future, future, uid, ref, f, args, opts = take!(EAGER_THUNK_CHAN) # preserve inputs until they enter the scheduler tid = GC.@preserve args begin - _args = map(x->x isa Dagger.EagerThunk ? ThunkID(EAGER_ID_MAP[x.uid], x.thunk_ref) : - x isa Dagger.Chunk ? WeakChunk(x) : - x, - args) + _args = map(args) do pos_x + pos, x = pos_x + if x isa Dagger.EagerThunk + return pos => ThunkID(EAGER_ID_MAP[x.uid], x.thunk_ref) + elseif x isa Dagger.Chunk + return pos => WeakChunk(x) + else + return pos => x + end + end add_thunk!(f, h, _args...; future=future, ref=ref, opts...) end EAGER_ID_MAP[uid] = tid.id diff --git a/src/sch/fault-handler.jl b/src/sch/fault-handler.jl index 401f47096..0782af3a2 100644 --- a/src/sch/fault-handler.jl +++ b/src/sch/fault-handler.jl @@ -44,7 +44,7 @@ function handle_fault(ctx, state, deadproc) # Remove thunks from state.ready that have inputs on the deadlist for idx in length(state.ready):-1:1 rt = state.ready[idx] - if any((input in deadlist) for input in rt.inputs) + if any((input in deadlist) for input in map(last, rt.inputs)) deleteat!(state.ready, idx) end end diff --git a/src/sch/util.jl b/src/sch/util.jl index 83b22a817..67c0ffb2b 100644 --- a/src/sch/util.jl +++ b/src/sch/util.jl @@ -1,5 +1,5 @@ const DAGDEBUG_CATEGORIES = Symbol[:global, :submit, :schedule, :scope, - :take, :execute] + :take, :execute, :processor] macro dagdebug(thunk, category, msg, args...) cat_sym = category.value @gensym id @@ -78,7 +78,7 @@ end of all chunks that can now be evicted from workers." function cleanup_inputs!(state, node) to_evict = Set{Chunk}() - for inp in node.inputs + for (_, inp) in node.inputs inp = unwrap_weak_checked(inp) if !istask(inp) && !(inp isa Chunk) continue @@ -142,7 +142,7 @@ function reschedule_inputs!(state, thunk, seen=Set{Thunk}()) continue end w = get!(()->Set{Thunk}(), state.waiting, thunk) - for input in thunk.inputs + for (_, input) in thunk.inputs input = unwrap_weak_checked(input) input in seen && continue @@ -228,7 +228,7 @@ function print_sch_status(io::IO, state, thunk; offset=0, limit=5, max_inputs=3) print(io, "($(status_string(thunk))) ") end println(io, "$(thunk.id): $(thunk.f)") - for (idx,input) in enumerate(thunk.inputs) + for (idx, (_, input)) in enumerate(thunk.inputs) if input isa WeakThunk input = unwrap_weak(input) if input === nothing @@ -289,10 +289,13 @@ end chunktype(x) = typeof(x) function signature(task::Thunk, state) sig = Any[chunktype(task.f)] - for input in collect_task_inputs(state, task) - push!(sig, chunktype(input)) + for (pos, input) in collect_task_inputs(state, task) + # N.B. 
Skips kwargs + if pos === nothing + push!(sig, chunktype(input)) + end end - sig + return sig end function can_use_proc(task, gproc, proc, opts, scope) @@ -420,12 +423,12 @@ end "Collects all arguments for `task`, converting Thunk inputs to Chunks." function collect_task_inputs(state, task) - inputs = Any[] - for input in task.inputs + inputs = Pair{Union{Symbol,Nothing},Any}[] + for (pos, input) in task.inputs input = unwrap_weak_checked(input) - push!(inputs, istask(input) ? state.cache[input] : input) + push!(inputs, pos => (istask(input) ? state.cache[input] : input)) end - inputs + return inputs end """ diff --git a/src/thunk.jl b/src/thunk.jl index 3b4aa4900..b426c3a20 100644 --- a/src/thunk.jl +++ b/src/thunk.jl @@ -1,4 +1,4 @@ -export Thunk, delayed, delayedmap +export Thunk, delayed const ID_COUNTER = Threads.Atomic{Int}(1) next_id() = Threads.atomic_add!(ID_COUNTER, 1) @@ -50,7 +50,7 @@ If omitted, options can also be specified by passing key-value pairs as """ mutable struct Thunk f::Any # usually a Function, but could be any callable - inputs::Vector{Any} # TODO: Use `ImmutableArray` in 1.8 + inputs::Vector{Pair{Union{Symbol,Nothing},Any}} # TODO: Use `ImmutableArray` in 1.8 id::Int get_result::Bool # whether the worker should send the result or only the metadata meta::Bool @@ -83,6 +83,7 @@ mutable struct Thunk something(scope, DefaultScope())) end xs = Any[xs...] + @assert all(x->x isa Pair, xs) if options !== nothing @assert isempty(kwargs) new(f, xs, id, get_result, meta, persist, cache, cache_ref, @@ -105,7 +106,7 @@ function affinity(t::Thunk) aff_vec = affinity(t.cache_ref) else aff = Dict{OSProc,Int}() - for inp in inputs(t) + for (_, inp) in t.inputs #if haskey(state.cache, inp) # as = affinity(state.cache[inp]) # for a in as @@ -130,17 +131,35 @@ function affinity(t::Thunk) #end end +struct Options + options::NamedTuple +end +Options(;options...) = Options((;options...)) +Options(options...) = Options((;options...)) + +function args_kwargs_to_pairs(args, kwargs) + args_kwargs = Pair{Union{Symbol,Nothing},Any}[] + for arg in args + push!(args_kwargs, nothing => arg) + end + for kwarg in kwargs + push!(args_kwargs, kwarg[1] => kwarg[2]) + end + return args_kwargs +end + """ - delayed(f; kwargs...)(args...) + delayed(f, options=Options())(args...; kwargs...) -> Thunk + delayed(f; options...)(args...; kwargs...) -> Thunk Creates a [`Thunk`](@ref) object which can be executed later, which will call -`f` with `args`. `kwargs` controls various properties of the resulting `Thunk`. +`f` with `args` and `kwargs`. `options` controls various properties of the +resulting `Thunk`. """ -function delayed(f; kwargs...) - (args...) -> Thunk(f, args...; kwargs...) +function delayed(f, options::Options) + (args...; kwargs...) -> Thunk(f, args_kwargs_to_pairs(args, kwargs)...; options.options...) end - -delayedmap(f, xs...) = map(delayed(f), xs...) +delayed(f; kwargs...) = delayed(f, Options(;kwargs...)) "A future holding the result of a `Thunk`." 
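# --- Illustrative sketch by the editor, not part of the patch above. It assumes
# --- only the `Options`, `delayed`, and `spawn`/`@spawn` forms introduced in this
# --- commit; the array `A` and the `single=1` option are placeholder choices.
using Dagger
A = rand(4, 4)
# Lazy API: scheduler options ride in the second argument; keyword arguments
# are now forwarded to the user's function (`sum` here).
t = Dagger.delayed(sum, Dagger.Options(;single=1))(A; dims=1)
@assert collect(t) == sum(A; dims=1)
# Eager API: options go before the call in the macro form, or in an `Options`
# argument for the function form.
@assert fetch(Dagger.@spawn single=1 sum(A; dims=1)) == sum(A; dims=1)
@assert fetch(Dagger.spawn(sum, Dagger.Options(;single=1), A; dims=1)) == sum(A; dims=1)
# --- End of sketch.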
struct ThunkFuture @@ -209,7 +228,7 @@ function Base.showerror(io::IO, ex::ThunkFailedException) return "Thunk(?)" end Tinputs = Any[] - for input in t.inputs + for (_, input) in t.inputs input = unwrap_weak(input) if istask(input) push!(Tinputs, "Thunk(id=$(input.id))") @@ -218,11 +237,11 @@ function Base.showerror(io::IO, ex::ThunkFailedException) end end t_sig = if length(Tinputs) <= 4 - "$(t.f)($(join(Tinputs, ", "))))" + "$(t.f)($(join(Tinputs, ", ")))" else "$(t.f)($(length(Tinputs)) inputs...)" end - return "Thunk(id=$(t.id), $t_sig" + return "Thunk(id=$(t.id), $t_sig)" end t_str = thunk_string(t) o_str = thunk_string(o) @@ -281,20 +300,21 @@ end const EAGER_ID_COUNTER = Threads.Atomic{UInt64}(1) eager_next_id() = Threads.atomic_add!(EAGER_ID_COUNTER, one(UInt64)) -function _spawn(f, args...; options, kwargs...) +function _spawn(f, options::Options, args...) Dagger.Sch.init_eager() for arg in args - @assert !isa(arg, Thunk) "Cannot use Thunks in spawn" + @assert !isa(last(arg), Thunk) "Cannot use Thunks in spawn" end uid = eager_next_id() future = ThunkFuture() finalizer_ref = poolset(EagerThunkFinalizer(uid); device=MemPool.CPURAMDevice()) added_future = Future() - propagates = keys(options) - put!(Dagger.Sch.EAGER_THUNK_CHAN, (added_future, future, uid, finalizer_ref, f, (args...,), (;propagates, options..., kwargs...,))) + propagates = keys(options.options) + put!(Dagger.Sch.EAGER_THUNK_CHAN, (added_future, future, uid, finalizer_ref, f, (args...,), (;propagates, options.options...,))) thunk_ref = fetch(added_future) return (uid, future, finalizer_ref, thunk_ref) end + """ spawn(f, args...; kwargs...) -> EagerThunk @@ -304,26 +324,34 @@ an `EagerThunk`. Uses a scheduler running in the background to execute code. Note that `kwargs` are passed to the `Thunk` constructor, and are documented in its docstring. """ -function spawn(f, args...; processor=nothing, scope=nothing, kwargs...) +function spawn(f, args...; kwargs...) options = get_options() + if length(args) >= 1 && first(args) isa Options + options = merge(options, first(args).options) + args = args[2:end] + end + processor = haskey(options, :processor) ? options.processor : nothing + scope = haskey(options, :scope) ? options.scope : nothing if !isnothing(processor) || !isnothing(scope) f = tochunk(f, something(processor, get_options(:processor, OSProc())), something(scope, get_options(:scope, DefaultScope()))) end + args_kwargs = args_kwargs_to_pairs(args, kwargs) uid, future, finalizer_ref, thunk_ref = if myid() == 1 - _spawn(f, args...; options, kwargs...) + _spawn(f, Options(options), args_kwargs...) else - remotecall_fetch(_spawn, 1, f, args...; options, kwargs...) + remotecall_fetch(_spawn, 1, f, Options(options), args_kwargs...) end return EagerThunk(uid, future, finalizer_ref, thunk_ref) end """ - @par [opts] f(args...) -> Thunk + @par [opts] f(args...; kwargs...) -> Thunk -Convenience macro to call `Dagger.delayed` on `f` with arguments `args`. -May also be called with a series of assignments like so: +Convenience macro to call `Dagger.delayed` on `f` with arguments `args` and +keyword arguments `kwargs`. 
May also be called with a series of assignments +like so: ```julia x = @par begin @@ -375,16 +403,24 @@ end function _par(ex::Expr; lazy=true, recur=true, opts=()) if ex.head == :call && recur f = replace_broadcast(ex.args[1]) - args = ex.args[2:end] + if length(ex.args) >= 2 && Meta.isexpr(ex.args[2], :parameters) + args = ex.args[3:end] + kwargs = ex.args[2] + else + args = ex.args[2:end] + kwargs = Expr(:parameters) + end opts = esc.(opts) + args_ex = _par.(args; lazy=lazy, recur=false) + kwargs_ex = _par.(kwargs.args; lazy=lazy, recur=false) if lazy - return :(Dagger.delayed($(esc(f)); $(opts...))($(_par.(args; lazy=lazy, recur=false)...))) + return :(Dagger.delayed($(esc(f)), $Options(;$(opts...)))($(args_ex...); $(kwargs_ex...))) else sync_var = esc(Base.sync_varname) @gensym result return quote - let args = ($(_par.(args; lazy=lazy, recur=false)...),) - $result = $spawn($(esc(f)), args...; $(opts...)) + let args = ($(args_ex...),) + $result = $spawn($(esc(f)), $Options(;$(opts...)), args...; $(kwargs_ex...)) if $(Expr(:islocal, sync_var)) put!($sync_var, schedule(Task(()->wait($result)))) end @@ -437,7 +473,15 @@ function Base.show(io::IO, z::Thunk) end print(io, "Thunk[$(z.id)]($f, ") if lvl > 0 - show(IOContext(io, :lazy_level => lvl-1), z.inputs) + inputs = Any[] + for (pos, input) in z.inputs + if pos === nothing + push!(inputs, input) + else + push!(inputs, pos => input) + end + end + show(IOContext(io, :lazy_level => lvl-1), inputs) else print(io, "...") end diff --git a/src/ui/graph.jl b/src/ui/graph.jl index d4cf0acee..033c4baff 100644 --- a/src/ui/graph.jl +++ b/src/ui/graph.jl @@ -142,8 +142,9 @@ write_edge(ctx, io, from::String, to::String) = println(io, "n_$from -> n_$to;") getargs!(d, t) = nothing function getargs!(d, t::Thunk) - d[t.id] = [filter(x->!istask(x[2]), collect(enumerate(t.inputs)))...,] - foreach(i->getargs!(d, i), t.inputs) + raw_inputs = map(last, t.inputs) + d[t.id] = [filter(x->!istask(x[2]), collect(enumerate(raw_inputs)))...,] + foreach(i->getargs!(d, i), raw_inputs) end function write_dag(io, logs::Vector, t=nothing) ctx = (proc_to_color = Dict{Processor,String}(), diff --git a/test/scheduler.jl b/test/scheduler.jl index 96611b82d..28ddcb5c6 100644 --- a/test/scheduler.jl +++ b/test/scheduler.jl @@ -50,7 +50,7 @@ function dynamic_get_dag(x...) 
end function dynamic_add_thunk(x) h = sch_handle() - id = Dagger.Sch.add_thunk!(h, x) do y + id = Dagger.Sch.add_thunk!(h, nothing=>x) do y y+1 end wait(h, id) @@ -58,7 +58,7 @@ function dynamic_add_thunk(x) end function dynamic_add_thunk_self_dominated(x) h = sch_handle() - id = Dagger.Sch.add_thunk!(h, h.thunk_id, x) do y + id = Dagger.Sch.add_thunk!(h, nothing=>h.thunk_id, nothing=>x) do y y+1 end return fetch(h, id) @@ -205,7 +205,7 @@ end # until we end up on a non-blocked worker h = Dagger.Sch.sch_handle() wkrs = Dagger.Sch.exec!(_list_workers, h) - id = Dagger.Sch.add_thunk!(testfun, h, i) + id = Dagger.Sch.add_thunk!(testfun, h, nothing=>i) return fetch(h, id) end return myid() diff --git a/test/thunk.jl b/test/thunk.jl index 28933b3b8..c99a2bbfb 100644 --- a/test/thunk.jl +++ b/test/thunk.jl @@ -6,7 +6,7 @@ import Dagger: Chunk function dynamic_fib(n) n <= 1 && return n - t = Dagger.spawn(dynamic_fib, n-1) + t = Dagger.@spawn dynamic_fib(n-1) y = dynamic_fib(n-2) return (fetch(t)::Int) + y end @@ -55,13 +55,22 @@ end a = @spawn x + x @test a isa Dagger.EagerThunk b = @spawn sum([x,1,2]) - c = spawn(*, a, b) + c = @spawn a * b @test c isa Dagger.EagerThunk @test fetch(a) == 4 @test fetch(b) == 5 @test fetch(c) == 20 end @test Dagger.Sch.EAGER_CONTEXT[] isa Context + @testset "keyword arguments" begin + A = rand(4, 4) + @test fetch(@spawn sum(A; dims=1)) ≈ sum(A; dims=1) + + @test_throws_unwrap Dagger.ThunkFailedException fetch(@spawn sum(A; fakearg=2)) + + @test fetch(@spawn reduce(+, A; dims=1, init=2.0)) ≈ + reduce(+, A; dims=1, init=2.0) + end @testset "broadcast" begin A, B = rand(4), rand(4) @test fetch(@spawn A .+ B) ≈ A .+ B @@ -161,7 +170,7 @@ end end end @testset "remote spawn" begin - a = fetch(Distributed.@spawnat 2 Dagger.spawn(+, 1, 2)) + a = fetch(Distributed.@spawnat 2 Dagger.@spawn 1+2) @test Dagger.Sch.EAGER_INIT[] @test fetch(Distributed.@spawnat 2 !(Dagger.Sch.EAGER_INIT[])) @test a isa Dagger.EagerThunk @@ -227,19 +236,19 @@ end end end @testset "eager API" begin - _a = Dagger.spawn(+, 1, 2; scope=NodeScope()) + _a = Dagger.@spawn scope=NodeScope() 1+2 a = Dagger.Sch._find_thunk(_a) @test a.f isa Chunk @test a.f.processor isa OSProc @test a.f.scope isa NodeScope - _a = Dagger.spawn(+, 1, 2; processor=Dagger.ThreadProc(1,1)) + _a = Dagger.@spawn processor=Dagger.ThreadProc(1,1) 1+2 a = Dagger.Sch._find_thunk(_a) @test a.f isa Chunk @test a.f.processor isa Dagger.ThreadProc @test a.f.scope == DefaultScope() - _a = Dagger.spawn(+, 1, 2; processor=Dagger.ThreadProc(1,1), scope=NodeScope()) + _a = Dagger.@spawn processor=Dagger.ThreadProc(1,1) scope=NodeScope() 1+2 a = Dagger.Sch._find_thunk(_a) @test a.f isa Chunk @test a.f.processor isa Dagger.ThreadProc @@ -251,12 +260,12 @@ end s = p -> p == Dagger.ThreadProc(1, 1) f = (x) -> 10 + x - g = (x) -> fetch(Dagger.spawn(f, x; proclist=s)) - fetch(Dagger.spawn(g, 10; proclist=s)) + g = (x) -> fetch(Dagger.@spawn proclist=s f(x)) + fetch(Dagger.@spawn proclist=s g(10)) end @testset "no cross-scheduler Thunk usage" begin a = delayed(+)(1,2) - @test_throws Exception Dagger.spawn(identity, a) + @test_throws Exception Dagger.@spawn identity(a) end @testset "@sync support" begin result = Dagger.@spawn sleep(1) From 1c8878d709c9e6eeabb812e5368b84b75f270593 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Thu, 18 May 2023 08:35:51 -0500 Subject: [PATCH 31/44] CI: Add Julia 1.9 --- .buildkite/pipeline.yml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml 
b/.buildkite/pipeline.yml index f46473b40..871e51a8b 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -35,12 +35,22 @@ steps: julia_args: "--threads=1" - JuliaCI/julia-coverage#v1: codecov: true + - label: Julia 1.9 + timeout_in_minutes: 60 + <<: *test + plugins: + - JuliaCI/julia#v1: + version: "1.9" + - JuliaCI/julia-test#v1: + julia_args: "--threads=1" + - JuliaCI/julia-coverage#v1: + codecov: true - label: Julia nightly timeout_in_minutes: 60 <<: *test plugins: - JuliaCI/julia#v1: - version: "1.9-nightly" + version: "1.10-nightly" - JuliaCI/julia-test#v1: julia_args: "--threads=1" - JuliaCI/julia-coverage#v1: From 0ca17034e673d114f8a6ed9ae014dc83929d0731 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felipe=20de=20Alc=C3=A2ntara=20Tom=C3=A9?= Date: Thu, 25 May 2023 08:36:40 -0300 Subject: [PATCH 32/44] Included the array on mock hashing scheme and changed deprecated single directive --- lib/DaggerMPI/src/DaggerMPI.jl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/DaggerMPI/src/DaggerMPI.jl b/lib/DaggerMPI/src/DaggerMPI.jl index 56fa764db..7fb55bdac 100644 --- a/lib/DaggerMPI/src/DaggerMPI.jl +++ b/lib/DaggerMPI/src/DaggerMPI.jl @@ -1,8 +1,10 @@ module DaggerMPI - using Dagger using MPI +bcastDict = Base.Dict{Int32, MPI.Buffer}() + + struct MPIProcessor{P,C} <: Dagger.Processor proc::P comm::MPI.Comm @@ -18,8 +20,6 @@ const MPI_PROCESSORS = Ref{Int}(-1) const PREVIOUS_PROCESSORS = Set() - - function initialize(comm::MPI.Comm=MPI.COMM_WORLD; color_algo=SimpleColoring()) @assert MPI_PROCESSORS[] == -1 "DaggerMPI already initialized" @@ -89,7 +89,6 @@ function recv_yield(src, tag, comm) value = MPI.deserialize(buf) return value end - Core.print("[$(MPI.Comm_rank(comm))] message is not ready on $tag\n") yield() end end @@ -144,7 +143,7 @@ function Dagger.move(from_proc::MPIProcessor, to_proc::Dagger.Processor, x::Dagg @assert Dagger.chunktype(x) <: MPIColoredValue x_value = fetch(x) rank = MPI.Comm_rank(from_proc.comm) - tag = abs(Base.unsafe_trunc(Int32, Dagger.get_task_hash(:input) >> 32)) + tag = abs(Base.unsafe_trunc(Int32, Dagger.get_task_hash(:input) >> 32)) if rank == x_value.color # FIXME: Broadcast send @sync for other in 0:(MPI.Comm_size(from_proc.comm)-1) From 3426eac3ce9cb21a78cf8c869a3761b7de258b3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felipe=20de=20Alc=C3=A2ntara=20Tom=C3=A9?= Date: Fri, 9 Jun 2023 17:37:20 -0300 Subject: [PATCH 33/44] [Temporary] Changes to the nonblocking broadcast impletations and wrappers --- lib/DaggerMPI/src/DaggerMPI.jl | 79 ++++++++++++++++++++++++++++------ 1 file changed, 65 insertions(+), 14 deletions(-) diff --git a/lib/DaggerMPI/src/DaggerMPI.jl b/lib/DaggerMPI/src/DaggerMPI.jl index 7fb55bdac..5cdbe5ccc 100644 --- a/lib/DaggerMPI/src/DaggerMPI.jl +++ b/lib/DaggerMPI/src/DaggerMPI.jl @@ -2,9 +2,6 @@ module DaggerMPI using Dagger using MPI -bcastDict = Base.Dict{Int32, MPI.Buffer}() - - struct MPIProcessor{P,C} <: Dagger.Processor proc::P comm::MPI.Comm @@ -18,6 +15,8 @@ end const MPI_PROCESSORS = Ref{Int}(-1) +const BCAST_VALUES = Dict{Any, Any}() + const PREVIOUS_PROCESSORS = Set() function initialize(comm::MPI.Comm=MPI.COMM_WORLD; color_algo=SimpleColoring()) @@ -74,9 +73,64 @@ end Dagger.get_parent(proc::MPIProcessor) = Dagger.OSProc() Dagger.default_enabled(proc::MPIProcessor) = true +function IbcastAPI(data, root::Integer, comm::MPI.Comm, req::MPI.AbstractRequest=MPI.Request()) + @assert MPI.isnull(req) + buf = MPI.Buffer(data) + @debug "[$(MPI.Comm_rank(comm))] Entered API call with root 
$root" + MPI.API.MPI_Ibcast(buf.data, buf.count, buf.datatype, Cint(root), comm, req) + MPI.setbuffer!(req, buf) + @debug "[$(MPI.Comm_rank(comm))] Ended API call with root $root" + return req +end + +function IbcastSend(obj, root, tag, comm, req::MPI.AbstractRequest=MPI.Request()) + @debug "[$(MPI.Comm_rank(comm))] Started IbcastSend on tag [$tag]" + count = Ref{Cint}() + sendTag = Ref{Cint}() + buf = MPI.serialize(obj) + count[] = length(buf) + sendTag[] = tag + IbcastAPI(count, root, comm) + IbcastAPI(sendTag, root, comm) + IbcastAPI(buf, root, comm) +end + +function IbcastRecv(root, tag, comm, req::MPI.AbstractRequest=MPI.Request()) + count = Ref{Cint}() + tagRec = Ref{Cint}() + req = IbcastAPI(count, root, comm, req) + while true + finish = MPI.test(req) + if finish + break + end + end + req = IbcastAPI(tagRec, root, comm, req) + while true + finish = MPI.Test(req) + if finish + break + end + end + buf = Array{UInt8}(undef, count[]) + req = IbcastAPI(buf, root, comm, req) + while true + finish = MPI.test(req) + if finish + break + end + end + BCAST_VALUES[tagRec[]] = MPI.deserialize(buf) + @debug "[$(MPI.Comm_rank(comm))] Received [$tag] on IbcastRecv" +end + +function Ibcast_yield() +end + + "Busy-loop Irecv that yields to other tasks." -function recv_yield(src, tag, comm) +function recv_yield(src, tag, comm) while true (got, msg, stat) = MPI.Improbe(src, tag, comm, MPI.Status) if got @@ -145,19 +199,16 @@ function Dagger.move(from_proc::MPIProcessor, to_proc::Dagger.Processor, x::Dagg rank = MPI.Comm_rank(from_proc.comm) tag = abs(Base.unsafe_trunc(Int32, Dagger.get_task_hash(:input) >> 32)) if rank == x_value.color - # FIXME: Broadcast send - @sync for other in 0:(MPI.Comm_size(from_proc.comm)-1) - other == rank && continue - @async begin - @debug "[$rank] Starting bcast send to [$other] on $tag" - MPI.isend(x_value.value, other, tag, from_proc.comm) - @debug "[$rank] Finished bcast send to [$other] on $tag" - end - end + @debug "[$rank] Starting bcast send on $tag" + IbcastSend(x_value.value, x_value.color, tag, from_proc.comm) + @debug "[$rank] Finished bcast send on $tag" return Dagger.move(from_proc.proc, to_proc, x_value.value) else @debug "[$rank] Starting bcast recv on $tag" - value = recv_yield(x_value.color, tag, from_proc.comm) + while !haskey(BCAST_VALUES, tag) + IbcastRecv(x_value.color, tag, from_proc.comm) + end + value = BCAST_VALUES[tag] @debug "[$rank] Finished bcast recv on $tag" return Dagger.move(from_proc.proc, to_proc, value) end From 6dc697540f7c6f64811a4a16aec2479c058fba39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felipe=20de=20Alc=C3=A2ntara=20Tom=C3=A9?= Date: Fri, 9 Jun 2023 17:39:02 -0300 Subject: [PATCH 34/44] Changes regarding allocation, mapping, reducing and indexing for the DArray type --- src/array/alloc.jl | 2 +- src/array/darray.jl | 57 +++++++++++++++++++---------------------- src/array/getindex.jl | 4 +-- src/array/map-reduce.jl | 18 ++++++------- src/array/matrix.jl | 15 ++++++----- src/compute.jl | 5 ---- src/thunk.jl | 3 +++ 7 files changed, 50 insertions(+), 54 deletions(-) diff --git a/src/array/alloc.jl b/src/array/alloc.jl index b47af6f56..39ca60396 100644 --- a/src/array/alloc.jl +++ b/src/array/alloc.jl @@ -25,7 +25,7 @@ end function stage(ctx, a::AllocateArray) alloc(idx, sz) = a.f(idx, a.eltype, sz) - thunks = [delayed(alloc)(i, size(x)) for (i, x) in enumerate(a.domainchunks)] + thunks = [Dagger.@spawn alloc(i, size(x)) for (i, x) in enumerate(a.domainchunks)] DArray(a.eltype,a.domain, a.domainchunks, thunks) end diff --git 
a/src/array/darray.jl b/src/array/darray.jl index 7f1b8c6bd..068d325d9 100644 --- a/src/array/darray.jl +++ b/src/array/darray.jl @@ -1,4 +1,4 @@ -import Base: == +import Base: ==, fetch using Serialization import Serialization: serialize, deserialize @@ -78,13 +78,14 @@ domain(x::AbstractArray) = ArrayDomain([1:l for l in size(x)]) abstract type ArrayOp{T, N} <: AbstractArray{T, N} end Base.IndexStyle(::Type{<:ArrayOp}) = IndexCartesian() -compute(ctx, x::ArrayOp; options=nothing) = - compute(ctx, cached_stage(ctx, x)::DArray; options=options) -collect(ctx::Context, x::ArrayOp; options=nothing) = - collect(ctx, compute(ctx, x; options=options); options=options) +collect(x::ArrayOp) = collect(fetch(x)) -collect(x::ArrayOp; options=nothing) = collect(Context(global_context()), x; options=options) +Base.fetch(x::ArrayOp) = fetch(cached_stage(Context(global_context()), x)::DArray) + +collect(x::Computation) = collect(fetch(x)) + +Base.fetch(x::Computation) = fetch(cached_stage(Context(global_context()), x)) function Base.show(io::IO, ::MIME"text/plain", x::ArrayOp) write(io, string(typeof(x))) @@ -113,7 +114,7 @@ An N-dimensional distributed array of element type T, with a concatenation funct mutable struct DArray{T,N,F} <: ArrayOp{T, N} domain::ArrayDomain{N} subdomains::AbstractArray{ArrayDomain{N}, N} - chunks::AbstractArray{Union{Chunk,Thunk}, N} + chunks::AbstractArray{Any, N} concat::F function DArray{T,N,F}(domain, subdomains, chunks, concat::Function) where {T, N,F} new(domain, subdomains, chunks, concat) @@ -135,8 +136,8 @@ domainchunks(d::DArray) = d.subdomains size(x::DArray) = size(domain(x)) stage(ctx, c::DArray) = c -function collect(ctx::Context, d::DArray; tree=false, options=nothing) - a = compute(ctx, d; options=options) +function collect(d::DArray; tree=false) + a = fetch(d) if isempty(d.chunks) return Array{eltype(d)}(undef, size(d)...) @@ -144,9 +145,9 @@ function collect(ctx::Context, d::DArray; tree=false, options=nothing) dimcatfuncs = [(x...) -> d.concat(x..., dims=i) for i in 1:ndims(d)] if tree - collect(treereduce_nd(delayed.(dimcatfuncs), a.chunks)) + collect(fetch(treereduce_nd(map(x -> ((args...,) -> Dagger.@spawn x(args...)) , dimcatfuncs), a.chunks))) else - treereduce_nd(dimcatfuncs, asyncmap(collect, a.chunks)) + treereduce_nd(dimcatfuncs, asyncmap(fetch, a.chunks)) end end @@ -209,12 +210,12 @@ _cumsum(x::AbstractArray) = length(x) == 0 ? Int[] : cumsum(x) function lookup_parts(ps::AbstractArray, subdmns::DomainBlocks{N}, d::ArrayDomain{N}) where N groups = map(group_indices, subdmns.cumlength, indexes(d)) sz = map(length, groups) - pieces = Array{Union{Chunk,Thunk}}(undef, sz) + pieces = Array{Any}(undef, sz) for i = CartesianIndices(sz) idx_and_dmn = map(getindex, groups, i.I) idx = map(x->x[1], idx_and_dmn) dmn = ArrayDomain(map(x->x[2], idx_and_dmn)) - pieces[i] = delayed(getindex)(ps[idx...], project(subdmns[idx...], dmn)) + pieces[i] = Dagger.@spawn getindex(ps[idx...], project(subdmns[idx...], dmn)) end out_cumlength = map(g->_cumsum(map(x->length(x[2]), g)), groups) out_dmn = DomainBlocks(ntuple(x->1,Val(N)), out_cumlength) @@ -222,40 +223,37 @@ function lookup_parts(ps::AbstractArray, subdmns::DomainBlocks{N}, d::ArrayDomai end -""" +#=""" compute(ctx::Context, x::DArray; persist=true, options=nothing) A `DArray` object may contain a thunk in it, in which case we first turn it into a `Thunk` and then compute it. 
""" -function compute(ctx::Context, x::DArray; persist=true, options=nothing) +function fetch(x::DArray; persist=true, options=nothing) thunk = thunkize(ctx, x, persist=persist) - if isa(thunk, Thunk) - compute(ctx, thunk; options=options) + if isa(thunk, EagerThunk) + fetch(thunk; options=options) else x end end +=# """ thunkize(ctx::Context, c::DArray; persist=true) If a `DArray` tree has a `Thunk` in it, make the whole thing a big thunk. """ -function thunkize(ctx::Context, c::DArray; persist=true) +function Base.fetch(c::DArray) if any(istask, chunks(c)) thunks = chunks(c) sz = size(thunks) dmn = domain(c) dmnchunks = domainchunks(c) - if persist - foreach(persist!, thunks) - end - Thunk(thunks...; meta=true) do results... + fetch(Dagger.spawn(thunks...; meta=true) do results... t = eltype(results[1]) - DArray(t, dmn, dmnchunks, - reshape(Union{Chunk,Thunk}[results...], sz)) - end + DArray(t, dmn, dmnchunks, reshape(Any[results...], sz)) + end) else c end @@ -335,19 +333,18 @@ function stage(ctx::Context, d::Distribute) cs = map(d.domainchunks) do idx chunks = cached_stage(ctx, x[idx]).chunks shape = size(chunks) - (delayed() do shape, parts... + Dagger.spawn(shape, chunks...) do shape, parts... if prod(shape) == 0 return Array{T}(undef, shape) end dimcatfuncs = [(x...) -> concat(x..., dims=i) for i in 1:length(shape)] ps = reshape(Any[parts...], shape) collect(treereduce_nd(dimcatfuncs, ps)) - end)(shape, chunks...) + end end else - cs = map(c -> delayed(identity)(d.data[c]), d.domainchunks) + cs = map(c -> Dagger.@spawn identity(d.data[c]), d.domainchunks) end - DArray( eltype(d.data), domain(d.data), @@ -357,7 +354,7 @@ function stage(ctx::Context, d::Distribute) end function distribute(x::AbstractArray, dist) - compute(Distribute(dist, x)) + fetch(Distribute(dist, x)) end function distribute(x::AbstractArray{T,N}, n::NTuple{N}) where {T,N} diff --git a/src/array/getindex.jl b/src/array/getindex.jl index d04ae024f..89d38bb97 100644 --- a/src/array/getindex.jl +++ b/src/array/getindex.jl @@ -34,9 +34,9 @@ end function stage(ctx::Context, gidx::GetIndexScalar) inp = cached_stage(ctx, gidx.input) s = view(inp, ArrayDomain(gidx.idx)) - delayed(identity)(collect(s)[1]) + Dagger.@spawn identity(collect(s)[1]) end Base.getindex(c::ArrayOp, idx::ArrayDomain) = GetIndex(c, indexes(idx)) Base.getindex(c::ArrayOp, idx...) = GetIndex(c, idx) -Base.getindex(c::ArrayOp, idx::Integer...) = compute(GetIndexScalar(c, idx)) +Base.getindex(c::ArrayOp, idx::Integer...) = fetch(GetIndexScalar(c, idx)) diff --git a/src/array/map-reduce.jl b/src/array/map-reduce.jl index e39019ed0..0381519d2 100644 --- a/src/array/map-reduce.jl +++ b/src/array/map-reduce.jl @@ -19,7 +19,7 @@ function stage(ctx::Context, node::Map) f = node.f for i=eachindex(domains) inps = map(x->chunks(x)[i], inputs) - thunks[i] = Thunk((args...) -> map(f, args...), inps...) + thunks[i] = Dagger.@spawn map(f, inps...) end DArray(Any, domain(primary), domainchunks(primary), thunks) end @@ -40,16 +40,16 @@ end function stage(ctx::Context, r::ReduceBlock) inp = stage(ctx, r.input) - reduced_parts = map(x -> Thunk(r.op, x; get_result=r.get_result), chunks(inp)) - Thunk((xs...) -> r.op_master(xs), reduced_parts...; meta=true) + reduced_parts = map(x -> (Dagger.@spawn get_result=r.get_result r.op(x)), chunks(inp)) + r_op_master(args...,) = r.op_master(args) + Dagger.@spawn meta=true r_op_master(reduced_parts...) 
end reduceblock_async(f, x::ArrayOp; get_result=true) = ReduceBlock(f, f, x, get_result) reduceblock_async(f, g::Function, x::ArrayOp; get_result=true) = ReduceBlock(f, g, x, get_result) -reduceblock(f, x::ArrayOp) = compute(reduceblock_async(f, x)) -reduceblock(f, g::Function, x::ArrayOp) = - compute(reduceblock_async(f, g, x)) +reduceblock(f, x::ArrayOp) = reduceblock_async(f, x) +reduceblock(f, g::Function, x::ArrayOp) = reduceblock_async(f, g, x) reduce_async(f::Function, x::ArrayOp) = reduceblock_async(xs->reduce(f,xs), xs->reduce(f,xs), x) @@ -115,7 +115,7 @@ end function reduce(f::Function, x::ArrayOp; dims = nothing) if dims === nothing - return compute(reduce_async(f,x)) + return fetch(reduce_async(f,x)) elseif dims isa Int dims = (dims,) end @@ -126,10 +126,10 @@ function stage(ctx::Context, r::Reducedim) inp = cached_stage(ctx, r.input) thunks = let op = r.op, dims=r.dims # do reducedim on each block - tmp = map(p->Thunk(b->reduce(op,b,dims=dims), p), chunks(inp)) + tmp = map(p->Dagger.spawn(b->reduce(op,b,dims=dims), p), chunks(inp)) # combine the results in tree fashion treereducedim(tmp, r.dims) do x,y - Thunk(op, x,y) + Dagger.@spawn op(x,y) end end c = domainchunks(inp) diff --git a/src/array/matrix.jl b/src/array/matrix.jl index 686dfb3b8..436f0d321 100644 --- a/src/array/matrix.jl +++ b/src/array/matrix.jl @@ -17,10 +17,10 @@ function size(x::Transpose) end transpose(x::ArrayOp) = Transpose(transpose, x) -transpose(x::Union{Chunk, Thunk}) = Thunk(transpose, x) +#transpose(x::Union{Chunk, Thunk}) = Thunk(transpose, x) adjoint(x::ArrayOp) = Transpose(adjoint, x) -adjoint(x::Union{Chunk, Thunk}) = Thunk(adjoint, x) +#adjoint(x::Union{Chunk, Thunk}) = Thunk(adjoint, x) function adjoint(x::ArrayDomain{2}) d = indexes(x) @@ -32,7 +32,7 @@ function adjoint(x::ArrayDomain{1}) end function _ctranspose(x::AbstractArray) - Any[delayed(adjoint)(x[j,i]) for i=1:size(x,2), j=1:size(x,1)] + Any[Dagger.@spawn adjoint(x[j,i]) for i=1:size(x,2), j=1:size(x,1)] end function stage(ctx::Context, node::Transpose) @@ -91,8 +91,9 @@ function (+)(a::ArrayDomain, b::ArrayDomain) a end -(*)(a::Union{Chunk, Thunk}, b::Union{Chunk, Thunk}) = Thunk(*, a,b) -(+)(a::Union{Chunk, Thunk}, b::Union{Chunk, Thunk}) = Thunk(+, a,b) +#(*)(a::Union{Chunk, EagerThunk}, b::Union{Chunk, EagerThunk}) = Thunk(*, a,b) + +#(+)(a::Union{Chunk, Thunk}, b::Union{Chunk, Thunk}) = Thunk(+, a,b) # we define our own matmat and matvec multiply # for computing the new domains and thunks. 
@@ -176,7 +177,7 @@ function stage(ctx::Context, mul::MatMul) a, b = stage_operands(ctx, mul, mul.a, mul.b) d = domain(a)*domain(b) DArray(Any, d, domainchunks(a)*domainchunks(b), - _mul(chunks(a), chunks(b); T=Thunk)) + _mul(chunks(a), chunks(b); T=Any)) end Base.power_by_squaring(x::DArray, i::Int) = foldl(*, ntuple(idx->x, i)) @@ -211,7 +212,7 @@ end function _scale(l, r) res = similar(r, Any) for i=1:length(l) - res[i,:] = map(x->Thunk((a,b) -> Diagonal(a)*b, l[i], x), r[i,:]) + res[i,:] = map(x->Dagger.spawn((a,b) -> Diagonal(a)*b, l[i], x), r[i,:]) end res end diff --git a/src/compute.jl b/src/compute.jl index d895f79db..0490e3dc6 100644 --- a/src/compute.jl +++ b/src/compute.jl @@ -12,11 +12,6 @@ collect(d::Union{Chunk,Thunk}; options=nothing) = abstract type Computation end -compute(ctx, c::Computation; options=nothing) = - compute(ctx, stage(ctx, c); options=options) -collect(c::Computation; options=nothing) = - collect(Context(global_context()), c; options=options) - """ compute(ctx::Context, d::Thunk; options=nothing) -> Chunk diff --git a/src/thunk.jl b/src/thunk.jl index 603f53ac3..cb2eecaa0 100644 --- a/src/thunk.jl +++ b/src/thunk.jl @@ -240,6 +240,9 @@ function Base.fetch(t::EagerThunk; raw=false) end end end + +istask(t::EagerThunk) = true + function Base.show(io::IO, t::EagerThunk) print(io, "EagerThunk ($(isready(t) ? "finished" : "running"))") end From 86eb14cb75b5ee4e42ac70ebfce1641a9f834acf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felipe=20de=20Alc=C3=A2ntara=20Tom=C3=A9?= Date: Fri, 9 Jun 2023 17:39:02 -0300 Subject: [PATCH 35/44] Changes regarding allocation, mapping, reducing and indexing for the DArray type --- src/array/alloc.jl | 2 +- src/array/darray.jl | 57 +++++++++++++++++++---------------------- src/array/getindex.jl | 4 +-- src/array/map-reduce.jl | 18 ++++++------- src/array/matrix.jl | 15 ++++++----- src/compute.jl | 5 ---- src/thunk.jl | 3 +++ 7 files changed, 50 insertions(+), 54 deletions(-) diff --git a/src/array/alloc.jl b/src/array/alloc.jl index b47af6f56..39ca60396 100644 --- a/src/array/alloc.jl +++ b/src/array/alloc.jl @@ -25,7 +25,7 @@ end function stage(ctx, a::AllocateArray) alloc(idx, sz) = a.f(idx, a.eltype, sz) - thunks = [delayed(alloc)(i, size(x)) for (i, x) in enumerate(a.domainchunks)] + thunks = [Dagger.@spawn alloc(i, size(x)) for (i, x) in enumerate(a.domainchunks)] DArray(a.eltype,a.domain, a.domainchunks, thunks) end diff --git a/src/array/darray.jl b/src/array/darray.jl index 0ba1895ca..068d325d9 100644 --- a/src/array/darray.jl +++ b/src/array/darray.jl @@ -1,4 +1,4 @@ -import Base: == +import Base: ==, fetch using Serialization import Serialization: serialize, deserialize @@ -78,13 +78,14 @@ domain(x::AbstractArray) = ArrayDomain([1:l for l in size(x)]) abstract type ArrayOp{T, N} <: AbstractArray{T, N} end Base.IndexStyle(::Type{<:ArrayOp}) = IndexCartesian() -compute(ctx, x::ArrayOp; options=nothing) = - compute(ctx, cached_stage(ctx, x)::DArray; options=options) -collect(ctx::Context, x::ArrayOp; options=nothing) = - collect(ctx, compute(ctx, x; options=options); options=options) +collect(x::ArrayOp) = collect(fetch(x)) -collect(x::ArrayOp; options=nothing) = collect(Context(global_context()), x; options=options) +Base.fetch(x::ArrayOp) = fetch(cached_stage(Context(global_context()), x)::DArray) + +collect(x::Computation) = collect(fetch(x)) + +Base.fetch(x::Computation) = fetch(cached_stage(Context(global_context()), x)) function Base.show(io::IO, ::MIME"text/plain", x::ArrayOp) write(io, string(typeof(x))) @@ -113,7 
+114,7 @@ An N-dimensional distributed array of element type T, with a concatenation funct mutable struct DArray{T,N,F} <: ArrayOp{T, N} domain::ArrayDomain{N} subdomains::AbstractArray{ArrayDomain{N}, N} - chunks::AbstractArray{Union{Chunk,Thunk}, N} + chunks::AbstractArray{Any, N} concat::F function DArray{T,N,F}(domain, subdomains, chunks, concat::Function) where {T, N,F} new(domain, subdomains, chunks, concat) @@ -135,8 +136,8 @@ domainchunks(d::DArray) = d.subdomains size(x::DArray) = size(domain(x)) stage(ctx, c::DArray) = c -function collect(ctx::Context, d::DArray; tree=false, options=nothing) - a = compute(ctx, d; options=options) +function collect(d::DArray; tree=false) + a = fetch(d) if isempty(d.chunks) return Array{eltype(d)}(undef, size(d)...) @@ -144,9 +145,9 @@ function collect(ctx::Context, d::DArray; tree=false, options=nothing) dimcatfuncs = [(x...) -> d.concat(x..., dims=i) for i in 1:ndims(d)] if tree - collect(treereduce_nd(delayed.(dimcatfuncs), a.chunks)) + collect(fetch(treereduce_nd(map(x -> ((args...,) -> Dagger.@spawn x(args...)) , dimcatfuncs), a.chunks))) else - treereduce_nd(dimcatfuncs, asyncmap(collect, a.chunks)) + treereduce_nd(dimcatfuncs, asyncmap(fetch, a.chunks)) end end @@ -209,12 +210,12 @@ _cumsum(x::AbstractArray) = length(x) == 0 ? Int[] : cumsum(x) function lookup_parts(ps::AbstractArray, subdmns::DomainBlocks{N}, d::ArrayDomain{N}) where N groups = map(group_indices, subdmns.cumlength, indexes(d)) sz = map(length, groups) - pieces = Array{Union{Chunk,Thunk}}(undef, sz) + pieces = Array{Any}(undef, sz) for i = CartesianIndices(sz) idx_and_dmn = map(getindex, groups, i.I) idx = map(x->x[1], idx_and_dmn) dmn = ArrayDomain(map(x->x[2], idx_and_dmn)) - pieces[i] = delayed(getindex)(ps[idx...], project(subdmns[idx...], dmn)) + pieces[i] = Dagger.@spawn getindex(ps[idx...], project(subdmns[idx...], dmn)) end out_cumlength = map(g->_cumsum(map(x->length(x[2]), g)), groups) out_dmn = DomainBlocks(ntuple(x->1,Val(N)), out_cumlength) @@ -222,40 +223,37 @@ function lookup_parts(ps::AbstractArray, subdmns::DomainBlocks{N}, d::ArrayDomai end -""" +#=""" compute(ctx::Context, x::DArray; persist=true, options=nothing) A `DArray` object may contain a thunk in it, in which case we first turn it into a `Thunk` and then compute it. """ -function compute(ctx::Context, x::DArray; persist=true, options=nothing) +function fetch(x::DArray; persist=true, options=nothing) thunk = thunkize(ctx, x, persist=persist) - if isa(thunk, Thunk) - compute(ctx, thunk; options=options) + if isa(thunk, EagerThunk) + fetch(thunk; options=options) else x end end +=# """ thunkize(ctx::Context, c::DArray; persist=true) If a `DArray` tree has a `Thunk` in it, make the whole thing a big thunk. """ -function thunkize(ctx::Context, c::DArray; persist=true) +function Base.fetch(c::DArray) if any(istask, chunks(c)) thunks = chunks(c) sz = size(thunks) dmn = domain(c) dmnchunks = domainchunks(c) - if persist - foreach(persist!, thunks) - end - Thunk(map(thunk->nothing=>thunk, thunks)...; meta=true) do results... + fetch(Dagger.spawn(thunks...; meta=true) do results... t = eltype(results[1]) - DArray(t, dmn, dmnchunks, - reshape(Union{Chunk,Thunk}[results...], sz)) - end + DArray(t, dmn, dmnchunks, reshape(Any[results...], sz)) + end) else c end @@ -335,19 +333,18 @@ function stage(ctx::Context, d::Distribute) cs = map(d.domainchunks) do idx chunks = cached_stage(ctx, x[idx]).chunks shape = size(chunks) - (delayed() do shape, parts... + Dagger.spawn(shape, chunks...) do shape, parts... 
if prod(shape) == 0 return Array{T}(undef, shape) end dimcatfuncs = [(x...) -> concat(x..., dims=i) for i in 1:length(shape)] ps = reshape(Any[parts...], shape) collect(treereduce_nd(dimcatfuncs, ps)) - end)(shape, chunks...) + end end else - cs = map(c -> delayed(identity)(d.data[c]), d.domainchunks) + cs = map(c -> Dagger.@spawn identity(d.data[c]), d.domainchunks) end - DArray( eltype(d.data), domain(d.data), @@ -357,7 +354,7 @@ function stage(ctx::Context, d::Distribute) end function distribute(x::AbstractArray, dist) - compute(Distribute(dist, x)) + fetch(Distribute(dist, x)) end function distribute(x::AbstractArray{T,N}, n::NTuple{N}) where {T,N} diff --git a/src/array/getindex.jl b/src/array/getindex.jl index d04ae024f..89d38bb97 100644 --- a/src/array/getindex.jl +++ b/src/array/getindex.jl @@ -34,9 +34,9 @@ end function stage(ctx::Context, gidx::GetIndexScalar) inp = cached_stage(ctx, gidx.input) s = view(inp, ArrayDomain(gidx.idx)) - delayed(identity)(collect(s)[1]) + Dagger.@spawn identity(collect(s)[1]) end Base.getindex(c::ArrayOp, idx::ArrayDomain) = GetIndex(c, indexes(idx)) Base.getindex(c::ArrayOp, idx...) = GetIndex(c, idx) -Base.getindex(c::ArrayOp, idx::Integer...) = compute(GetIndexScalar(c, idx)) +Base.getindex(c::ArrayOp, idx::Integer...) = fetch(GetIndexScalar(c, idx)) diff --git a/src/array/map-reduce.jl b/src/array/map-reduce.jl index ca659b7ea..0381519d2 100644 --- a/src/array/map-reduce.jl +++ b/src/array/map-reduce.jl @@ -19,7 +19,7 @@ function stage(ctx::Context, node::Map) f = node.f for i=eachindex(domains) inps = map(x->chunks(x)[i], inputs) - thunks[i] = Thunk((args...) -> map(f, args...), map(inp->nothing=>inp, inps)...) + thunks[i] = Dagger.@spawn map(f, inps...) end DArray(Any, domain(primary), domainchunks(primary), thunks) end @@ -40,16 +40,16 @@ end function stage(ctx::Context, r::ReduceBlock) inp = stage(ctx, r.input) - reduced_parts = map(x -> Thunk(r.op, nothing=>x; get_result=r.get_result), chunks(inp)) - Thunk((xs...) -> r.op_master(xs), map(part->nothing=>part, reduced_parts)...; meta=true) + reduced_parts = map(x -> (Dagger.@spawn get_result=r.get_result r.op(x)), chunks(inp)) + r_op_master(args...,) = r.op_master(args) + Dagger.@spawn meta=true r_op_master(reduced_parts...) 
end reduceblock_async(f, x::ArrayOp; get_result=true) = ReduceBlock(f, f, x, get_result) reduceblock_async(f, g::Function, x::ArrayOp; get_result=true) = ReduceBlock(f, g, x, get_result) -reduceblock(f, x::ArrayOp) = compute(reduceblock_async(f, x)) -reduceblock(f, g::Function, x::ArrayOp) = - compute(reduceblock_async(f, g, x)) +reduceblock(f, x::ArrayOp) = reduceblock_async(f, x) +reduceblock(f, g::Function, x::ArrayOp) = reduceblock_async(f, g, x) reduce_async(f::Function, x::ArrayOp) = reduceblock_async(xs->reduce(f,xs), xs->reduce(f,xs), x) @@ -115,7 +115,7 @@ end function reduce(f::Function, x::ArrayOp; dims = nothing) if dims === nothing - return compute(reduce_async(f,x)) + return fetch(reduce_async(f,x)) elseif dims isa Int dims = (dims,) end @@ -126,10 +126,10 @@ function stage(ctx::Context, r::Reducedim) inp = cached_stage(ctx, r.input) thunks = let op = r.op, dims=r.dims # do reducedim on each block - tmp = map(p->Thunk(b->reduce(op,b,dims=dims), nothing=>p), chunks(inp)) + tmp = map(p->Dagger.spawn(b->reduce(op,b,dims=dims), p), chunks(inp)) # combine the results in tree fashion treereducedim(tmp, r.dims) do x,y - Thunk(op, nothing=>x, nothing=>y) + Dagger.@spawn op(x,y) end end c = domainchunks(inp) diff --git a/src/array/matrix.jl b/src/array/matrix.jl index b2febe908..436f0d321 100644 --- a/src/array/matrix.jl +++ b/src/array/matrix.jl @@ -17,10 +17,10 @@ function size(x::Transpose) end transpose(x::ArrayOp) = Transpose(transpose, x) -transpose(x::Union{Chunk, Thunk}) = Thunk(transpose, nothing=>x) +#transpose(x::Union{Chunk, Thunk}) = Thunk(transpose, x) adjoint(x::ArrayOp) = Transpose(adjoint, x) -adjoint(x::Union{Chunk, Thunk}) = Thunk(adjoint, nothing=>x) +#adjoint(x::Union{Chunk, Thunk}) = Thunk(adjoint, x) function adjoint(x::ArrayDomain{2}) d = indexes(x) @@ -32,7 +32,7 @@ function adjoint(x::ArrayDomain{1}) end function _ctranspose(x::AbstractArray) - Any[delayed(adjoint)(x[j,i]) for i=1:size(x,2), j=1:size(x,1)] + Any[Dagger.@spawn adjoint(x[j,i]) for i=1:size(x,2), j=1:size(x,1)] end function stage(ctx::Context, node::Transpose) @@ -91,8 +91,9 @@ function (+)(a::ArrayDomain, b::ArrayDomain) a end -(*)(a::Union{Chunk, Thunk}, b::Union{Chunk, Thunk}) = Thunk(*, nothing=>a, nothing=>b) -(+)(a::Union{Chunk, Thunk}, b::Union{Chunk, Thunk}) = Thunk(+, nothing=>a, nothing=>b) +#(*)(a::Union{Chunk, EagerThunk}, b::Union{Chunk, EagerThunk}) = Thunk(*, a,b) + +#(+)(a::Union{Chunk, Thunk}, b::Union{Chunk, Thunk}) = Thunk(+, a,b) # we define our own matmat and matvec multiply # for computing the new domains and thunks. 
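For context on the API shift running through this patch (lazy `delayed`/`Thunk` graphs giving way to eager `Dagger.@spawn`/`Dagger.spawn` tasks, and `compute`/`collect` giving way to `fetch`), here is a minimal sketch of the two styles against a stock Dagger.jl session; the variable names are illustrative only:

    using Dagger

    A = rand(4, 4)

    # Lazy style (what these hunks remove): build a Thunk, then force it.
    t_lazy = Dagger.delayed(sum)(A)
    r_lazy = collect(t_lazy)          # computes the graph and returns sum(A)

    # Eager style (what these hunks switch to): the task starts running as
    # soon as it is spawned; fetch waits for and returns its result.
    t_eager = Dagger.@spawn sum(A)
    r_eager = fetch(t_eager)

    r_lazy == r_eager                 # true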
@@ -176,7 +177,7 @@ function stage(ctx::Context, mul::MatMul) a, b = stage_operands(ctx, mul, mul.a, mul.b) d = domain(a)*domain(b) DArray(Any, d, domainchunks(a)*domainchunks(b), - _mul(chunks(a), chunks(b); T=Thunk)) + _mul(chunks(a), chunks(b); T=Any)) end Base.power_by_squaring(x::DArray, i::Int) = foldl(*, ntuple(idx->x, i)) @@ -211,7 +212,7 @@ end function _scale(l, r) res = similar(r, Any) for i=1:length(l) - res[i,:] = map(x->Thunk((a,b) -> Diagonal(a)*b, nothing=>l[i], nothing=>x), r[i,:]) + res[i,:] = map(x->Dagger.spawn((a,b) -> Diagonal(a)*b, l[i], x), r[i,:]) end res end diff --git a/src/compute.jl b/src/compute.jl index 4934712e7..af034cb81 100644 --- a/src/compute.jl +++ b/src/compute.jl @@ -12,11 +12,6 @@ collect(d::Union{Chunk,Thunk}; options=nothing) = abstract type Computation end -compute(ctx, c::Computation; options=nothing) = - compute(ctx, stage(ctx, c); options=options) -collect(c::Computation; options=nothing) = - collect(Context(global_context()), c; options=options) - """ compute(ctx::Context, d::Thunk; options=nothing) -> Chunk diff --git a/src/thunk.jl b/src/thunk.jl index b426c3a20..c89e89523 100644 --- a/src/thunk.jl +++ b/src/thunk.jl @@ -283,6 +283,9 @@ end Base.isready(t::EagerThunk) = isready(t.future) Base.wait(t::EagerThunk) = wait(t.future) Base.fetch(t::EagerThunk; raw=false) = fetch(t.future; raw) + +istask(t::EagerThunk) = true + function Base.show(io::IO, t::EagerThunk) print(io, "EagerThunk ($(isready(t) ? "finished" : "running"))") end From dd1a595d1949e4fb453d376e395ac659fb4982db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felipe=20de=20Alc=C3=A2ntara=20Tom=C3=A9?= Date: Sat, 17 Jun 2023 00:55:22 -0300 Subject: [PATCH 36/44] Finished array implementation, having problems with the darray distribute function, revising testing forthe darray --- src/array/darray.jl | 4 ++-- src/array/getindex.jl | 3 ++- src/array/operators.jl | 5 ++--- src/array/setindex.jl | 4 ++-- src/array/sort.jl | 43 +++++++++++++++++++++--------------------- 5 files changed, 30 insertions(+), 29 deletions(-) diff --git a/src/array/darray.jl b/src/array/darray.jl index 068d325d9..63123827e 100644 --- a/src/array/darray.jl +++ b/src/array/darray.jl @@ -250,7 +250,7 @@ function Base.fetch(c::DArray) sz = size(thunks) dmn = domain(c) dmnchunks = domainchunks(c) - fetch(Dagger.spawn(thunks...; meta=true) do results... + fetch(Dagger.spawn(thunks...) do results... t = eltype(results[1]) DArray(t, dmn, dmnchunks, reshape(Any[results...], sz)) end) @@ -343,7 +343,7 @@ function stage(ctx::Context, d::Distribute) end end else - cs = map(c -> Dagger.@spawn identity(d.data[c]), d.domainchunks) + cs = map(c -> (Dagger.@spawn identity(d.data[c])), d.domainchunks) end DArray( eltype(d.data), diff --git a/src/array/getindex.jl b/src/array/getindex.jl index 89d38bb97..5f0809267 100644 --- a/src/array/getindex.jl +++ b/src/array/getindex.jl @@ -16,6 +16,7 @@ function stage(ctx::Context, gidx::GetIndex) gidx.idx[i] end for i in 1:length(gidx.idx)] + @debug "$(idxs)" # Figure out output dimension view(inp, ArrayDomain(idxs)) end @@ -39,4 +40,4 @@ end Base.getindex(c::ArrayOp, idx::ArrayDomain) = GetIndex(c, indexes(idx)) Base.getindex(c::ArrayOp, idx...) = GetIndex(c, idx) -Base.getindex(c::ArrayOp, idx::Integer...) = fetch(GetIndexScalar(c, idx)) +Base.getindex(c::ArrayOp, idx::Integer...) 
= collect(GetIndexScalar(c, idx)) diff --git a/src/array/operators.jl b/src/array/operators.jl index 007947d7d..640db4ea9 100644 --- a/src/array/operators.jl +++ b/src/array/operators.jl @@ -90,8 +90,7 @@ function stage(ctx::Context, node::BCast) end blcks = DomainBlocks(map(_->1, size(node)), cumlengths) - thunks = broadcast(delayed((args...)->broadcast(bc.f, args...); ), - args2...) + thunks = broadcast((args3...)->Dagger.spawn((args...)->broadcast(bc.f, args...), args3...), args2...) DArray(eltype(node), domain(node), blcks, thunks) end @@ -107,7 +106,7 @@ Base.@deprecate mappart(args...) mapchunk(args...) function stage(ctx::Context, node::MapChunk) inputs = map(x->cached_stage(ctx, x), node.input) thunks = map(map(chunks, inputs)...) do ps... - Thunk(node.f, map(p->nothing=>p, ps)...) + Dagger.spawn(node.f, map(p->nothing=>p, ps)...) end DArray(Any, domain(inputs[1]), domainchunks(inputs[1]), thunks) diff --git a/src/array/setindex.jl b/src/array/setindex.jl index 695519025..45ae35c64 100644 --- a/src/array/setindex.jl +++ b/src/array/setindex.jl @@ -28,14 +28,14 @@ function stage(ctx::Context, sidx::SetIndex) groups = map(group_indices, subdmns.cumlength, indexes(d)) sz = map(length, groups) - pieces = Array{Union{Chunk, Thunk}}(undef, sz) + pieces = Array{Any}(undef, sz) for i = CartesianIndices(sz) idx_and_dmn = map(getindex, groups, i.I) idx = map(x->x[1], idx_and_dmn) local_dmn = ArrayDomain(map(x->x[2], idx_and_dmn)) s = subdmns[idx...] part_to_set = sidx.val - ps[idx...] = Thunk(nothing=>ps[idx...]) do p + ps[idx...] = Dagger.spawn(ps[idx...]) do p q = copy(p) q[indexes(project(s, local_dmn))...] .= part_to_set q diff --git a/src/array/sort.jl b/src/array/sort.jl index cbc9b21f1..c9beed70e 100644 --- a/src/array/sort.jl +++ b/src/array/sort.jl @@ -3,6 +3,7 @@ using Distributed using StatsBase + function getmedians(x, n) q,r = divrem(length(x), n+1) @@ -18,8 +19,10 @@ function getmedians(x, n) end function sortandsample_array(ord, xs, nsamples, presorted=false) + r = sample(1:length(xs), min(length(xs), nsamples), replace=false, ordered=true) + if !presorted sorted = sort(xs, order=ord) chunk = tochunk(sorted) @@ -28,6 +31,7 @@ function sortandsample_array(ord, xs, nsamples, presorted=false) chunk = nothing # avoid communicating metadata if already sorted samples = chunk[r] end + (chunk, samples) end @@ -89,14 +93,14 @@ end function collect_merge(merge, group) #delayed((xs...) -> treereduce(merge, Any[xs...]))(group...) 
- t = treereduce(delayed(merge), group) + t = treereduce(merge, group) end # Given sorted chunks, splits each chunk according to splitters # then merges corresponding splits together to form length(splitters) + 1 sorted chunks # these chunks will be in turn sorted function splitmerge(chunks, splitters, merge, by, sub, ord) - c1 = map(c->splitchunk(c, splitters, by, sub, ord), chunks) + c1 = map(c->splitchunk(c, splitters, by, sub, ord), chunks) map(cs->collect_merge(merge, cs), transpose_vecvec(c1)) end @@ -117,15 +121,15 @@ function splitchunk(c, splitters, by, sub, ord) 1:j end - between = map((hi, lo) -> delayed(x->sub(x, getbetween(by(x), hi, lo, ord)))(c), + between = map((hi, lo) -> Dagger.spawn(x->sub(x, getbetween(by(x), hi, lo, ord)), c), splitters[1:end-1], splitters[2:end]) hi = splitters[1] lo = splitters[end] - a = delayed(x->sub(x, getlt(by(x), hi, ord)))(c) + a = Dagger.spawn(x->sub(x, getlt(by(x), hi, ord)), c) b = between - c = delayed(x->sub(x, getgt(by(x), lo, ord)))(c) - return vcat(a, b, c) + c = Dagger.spawn(x->sub(x, getgt(by(x), lo, ord)), c) + return map(fetch, vcat(a, b, c)) #[delayed(c->sub(c, getlt(by(c), hi, ord)))(c); # between; delayed(c->sub(c, getgt(by(c), lo, ord)))(c)] @@ -252,6 +256,8 @@ Each chunk in turn is sorted. # Returns A tuple of `(chunk, samples)` where `chunk` is the `Dagger.Chunk` object. `chunk` can be `nothing` if no change to the initial array was made (e.g. it was already sorted) """ + + function dsort_chunks(cs, nchunks=length(cs), nsamples=2000; merge = merge_sorted, by=identity, @@ -263,17 +269,17 @@ function dsort_chunks(cs, nchunks=length(cs), nsamples=2000; sortandsample = (x,ns, presorted)->sortandsample_array(order, x,ns, presorted), affinities=workers(), ) - if splitters !== nothing + if splitters != nothing # this means splitters are given beforehand nsamples = 0 # no samples needed end # first sort each chunk and sample nsamples elements from each chunk - cs1 = map(c->delayed(sortandsample)(c, nsamples, chunks_presorted), cs) - + cs1 = map(c->Dagger.spawn(sortandsample, c, nsamples, chunks_presorted), cs) batchsize = max(2, batchsize) - # collect the samples - xs = collect(treereduce(delayed(vcat), cs1)) + + xs = treereduce((cs...)->Dagger.spawn(vcat, cs...), cs1) + xs = map(fetch, fetch(xs)) if length(cs1) == 1 xs = [xs] end @@ -283,11 +289,10 @@ function dsort_chunks(cs, nchunks=length(cs), nsamples=2000; samples = reduce((a,b)->merge_sorted(order, a, b), map(x->x[2], xs)) splitters = getmedians(samples, nchunks-1) end - # if first(x) === nothing this means that # the chunk was already sorted, so just # use the input chunk as-is - cs2 = map((x,c) -> x === nothing ? c : x, map(first, xs), cs) + cs2 = map((x,c) -> x === nothing ? c : x, map(first, xs), cs) # main sort routine. At this point: # we know the splitters we want to use, @@ -304,7 +309,7 @@ function dsort_chunks(cs, nchunks=length(cs), nsamples=2000; end function propagate_affinity!(c, aff) - if !isa(c, Thunk) + if !isa(c, EagerThunk) return end if c.affinity !== nothing @@ -326,17 +331,13 @@ function Base.sort(v::ArrayOp; batchsize=max(2, nworkers()), nsamples=2000, order::Ordering=default_ord) - v1 = compute(v) + v1 = fetch(v) ord = Base.Sort.ord(lt,by,rev,order) nchunks = nchunks === nothing ? length(v1.chunks) : nchunks cs = dsort_chunks(v1.chunks, nchunks, nsamples, order=ord, merge=(x,y)->merge_sorted(ord, x,y)) - foreach(persist!, cs) - t=delayed((xs...)->[xs...]; meta=true)(cs...) 
- # `compute(t)` only computes references to materialized version of `cs` - # we don't want the scheduler to think that `cs`s' job is done - # so we call persist! above - chunks = compute(t) + t=Dagger.spawn((xs...)->[xs...], cs...) + chunks = fetch(t) dmn = ArrayDomain((1:sum(length(domain(c)) for c in chunks),)) DArray(eltype(v1), dmn, map(domain, chunks), chunks) end From 1268410ae3466934675fe6b26cd4179dd2b8d143 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felipe=20de=20Alc=C3=A2ntara=20Tom=C3=A9?= Date: Sat, 17 Jun 2023 00:56:10 -0300 Subject: [PATCH 37/44] Finished array implementation, having problems with the darray distribute function, revising testing forthe darray --- test/array.jl | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/test/array.jl b/test/array.jl index d44dd9af8..fa30e8c9b 100644 --- a/test/array.jl +++ b/test/array.jl @@ -1,5 +1,6 @@ using LinearAlgebra, SparseArrays, Random, SharedArrays import Dagger: chunks, DArray, domainchunks, treereduce_nd +import Distributed: myid, procs @testset "treereduce_nd" begin xs = rand(1:10, 8,8,8) @@ -9,13 +10,13 @@ import Dagger: chunks, DArray, domainchunks, treereduce_nd end @testset "DArray constructor" begin - x = compute(rand(Blocks(2,2), 3,3)) + x = fetch(rand(Blocks(2,2), 3,3)) @test collect(x) == DArray{Float64, 2}(x.domain, x.subdomains, x.chunks) |> collect end @testset "rand" begin function test_rand(X) - X1 = compute(X) + X1 = fetch(X) X2 = collect(X1) @test isa(X, Dagger.ArrayOp) @@ -38,16 +39,16 @@ end end @testset "sum" begin X = ones(Blocks(10, 10), 100, 100) - @test sum(X) == 10000 + @test fetch(sum(X)) == 10000 Y = zeros(Blocks(10, 10), 100, 100) - @test sum(Y) == 0 + @test fetch(sum(Y)) == 0 end @testset "distributing an array" begin function test_dist(X) X1 = Distribute(Blocks(10, 20), X) @test X1 == X - Xc = compute(X1) + Xc = fetch(X1) @test chunks(Xc) |> size == (10, 5) @test domainchunks(Xc) |> size == (10, 5) @test map(x->size(x) == (10, 20), domainchunks(Xc)) |> all @@ -66,7 +67,7 @@ end x, y = size(X) X1 = Distribute(Blocks(10, 20), X) @test X1' == X' - Xc = compute(X1') + Xc = fetch(X1') @test chunks(Xc) |> size == (div(y, 20), div(x,10)) @test domainchunks(Xc) |> size == (div(y, 20), div(x, 10)) @test map(x->size(x) == (20, 10), domainchunks(Xc)) |> all @@ -77,13 +78,13 @@ end test_transpose(sprand(100, 120, 0.1)) end -@testset "matrix-matrix multiply" begin +#=@testset "matrix-matrix multiply" begin function test_mul(X) tol = 1e-12 X1 = Distribute(Blocks(10, 20), X) - @test_throws DimensionMismatch compute(X1*X1) - X2 = compute(X1'*X1) - X3 = compute(X1*X1') + @test_throws DimensionMismatch fetch(X1*X1) + X2 = fetch(X1'*X1) + X3 = fetch(X1*X1') @test norm(collect(X2) - X'X) < tol @test norm(collect(X3) - X*X') < tol @test chunks(X2) |> size == (2, 2) @@ -100,11 +101,11 @@ end end @testset "matrix powers" begin - x = compute(rand(Blocks(4,4), 16, 16)) + x = fetch(rand(Blocks(4,4), 16, 16)) @test collect(x^1) == collect(x) @test collect(x^2) == collect(x*x) @test collect(x^3) == collect(x*x*x) -end +end=# @testset "concat" begin m = rand(75,75) @@ -113,16 +114,16 @@ end @test hcat(m,m) == collect(hcat(x,x)) @test vcat(m,m) == collect(vcat(x,x)) @test hcat(m,m) == collect(hcat(x,y)) - @test_throws DimensionMismatch compute(vcat(x,y)) + @test_throws DimensionMismatch fetch(vcat(x,y)) end -@testset "scale" begin +#=@testset "scale" begin x = rand(10,10) X = Distribute(Blocks(3,3), x) y = rand(10) @test Diagonal(y)*x == collect(Diagonal(y)*X) -end 
+end=# @testset "Getindex" begin function test_getindex(x) @@ -136,12 +137,12 @@ end @test collect(X[[], ragged_idx]) == x[[], ragged_idx] @test collect(X[[], []]) == x[[], []] - @testset "dimensionality reduction" begin + #=@testset "dimensionality reduction" begin # THESE NEED FIXING!! @test vec(collect(X[ragged_idx, 5])) == vec(x[ragged_idx, 5]) @test vec(collect(X[5, ragged_idx])) == vec(x[5, ragged_idx]) @test collect(X[5, 5]) == x[5,5] - end + end=# end test_getindex(rand(10,10)) @@ -150,7 +151,7 @@ end y = rand(10, 10) xs = distribute(y, Blocks(2,2)) for i=1:10, j=1:10 - @test compute(xs[i:j, j:i]) == y[i:j, j:i] + @test fetch(xs[i:j, j:i]) == y[i:j, j:i] end end @@ -195,12 +196,12 @@ end @test collect(sort(X)) == sort(x) @test collect(sort(X, rev=true)) == sort(x, rev=true) - x = [("A",1), ("A",2), ("B",1)] + #=x = [("A",1), ("A",2), ("B",1)] y = distribute(x, 3) - @test collect(sort(y)) == x + @test collect(sort(y)) == x=# x = ones(10) - y = compute(Distribute(Blocks(3), x)) + y = fetch(Distribute(Blocks(3), x)) #@test map(x->length(collect(x)), compute(sort(y)).chunks) == [3,3,3,1] end @@ -220,7 +221,7 @@ using MemPool end @testset "show_plan" begin - @test !isempty(Dagger.show_plan(Dagger.Thunk(()->10))) + @test !isempty(Dagger.show_plan(Dagger.spawn(()->10))) end #= From 4acb50c94f9a73161228851e0580f65341efd7f6 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Mon, 19 Jun 2023 14:45:49 -0500 Subject: [PATCH 38/44] DArray: Fix adj/transpose and matmul --- src/array/matrix.jl | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/array/matrix.jl b/src/array/matrix.jl index 436f0d321..a20020741 100644 --- a/src/array/matrix.jl +++ b/src/array/matrix.jl @@ -17,10 +17,10 @@ function size(x::Transpose) end transpose(x::ArrayOp) = Transpose(transpose, x) -#transpose(x::Union{Chunk, Thunk}) = Thunk(transpose, x) +transpose(x::Union{Chunk, EagerThunk}) = @spawn transpose(x) adjoint(x::ArrayOp) = Transpose(adjoint, x) -#adjoint(x::Union{Chunk, Thunk}) = Thunk(adjoint, x) +adjoint(x::Union{Chunk, EagerThunk}) = @spawn adjoint(x) function adjoint(x::ArrayDomain{2}) d = indexes(x) @@ -91,9 +91,12 @@ function (+)(a::ArrayDomain, b::ArrayDomain) a end -#(*)(a::Union{Chunk, EagerThunk}, b::Union{Chunk, EagerThunk}) = Thunk(*, a,b) +struct BinaryComputeOp{F} end +BinaryComputeOp{F}(x::Union{Chunk,EagerThunk}, y::Union{Chunk,EagerThunk}) where F = @spawn F(x, y) +BinaryComputeOp{F}(x, y) where F = F(x, y) -#(+)(a::Union{Chunk, Thunk}, b::Union{Chunk, Thunk}) = Thunk(+, a,b) +const AddComputeOp = BinaryComputeOp{+} +const MulComputeOp = BinaryComputeOp{*} # we define our own matmat and matvec multiply # for computing the new domains and thunks. 
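The `BinaryComputeOp` helper added just above gives the matrix-multiply code a single operator that works on both plain values and task handles: when both operands are a `Chunk`/`EagerThunk` it spawns the operation as a new task, otherwise it simply applies the wrapped function. A rough illustration of that dispatch (these names are internal to Dagger's array code and not exported, so the snippet is a sketch only):

    a = Dagger.@spawn rand(4, 4)      # EagerThunk
    b = Dagger.@spawn rand(4, 4)      # EagerThunk

    AddComputeOp(1.0, 2.0)            # plain arguments: falls through to +, gives 3.0
    t = MulComputeOp(a, b)            # two task handles: spawns *(a, b) as a task
    fetch(t)                          # the 4x4 matrix product

The `_mul` hunks that follow then assemble each output block as `treereduce(AddComputeOp, map(MulComputeOp, row, col))`, so blocks that are still tasks stay asynchronous while already-materialized blocks are combined immediately.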
@@ -102,7 +105,7 @@ function _mul(a::Matrix, b::Matrix; T=eltype(a)) n = size(a, 2) for i=1:size(a,1) for j=1:size(b, 2) - c[i,j] = treereduce(+, map(*, reshape(a[i,:], (n,)), b[:, j])) + c[i,j] = treereduce(AddComputeOp, map(MulComputeOp, reshape(a[i,:], (n,)), b[:, j])) end end c @@ -112,14 +115,14 @@ function _mul(a::Matrix, b::Vector; T=eltype(b)) c = Array{T}(undef, size(a,1)) n = size(a,2) for i=1:size(a,1) - c[i] = treereduce(+, map(*, reshape(a[i, :], (n,)), b)) + c[i] = treereduce(AddComputeOp, map(MulComputeOp, reshape(a[i, :], (n,)), b)) end c end function _mul(a::Vector, b::Vector; T=eltype(b)) @assert length(b) == 1 - [x * b[1] for x in a] + [MulComputeOp(x, b[1]) for x in a] end function promote_distribution(ctx::Context, m::MatMul, a,b) From fffcc01b2703da49f09506bde3b28e8f71ba7d48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felipe=20de=20Alc=C3=A2ntara=20Tom=C3=A9?= Date: Mon, 19 Jun 2023 17:43:35 -0300 Subject: [PATCH 39/44] Tests passing --- src/array/darray.jl | 4 ++-- src/array/getindex.jl | 3 +-- src/array/map-reduce.jl | 12 +++++------ src/array/sort.jl | 4 ++-- test/array.jl | 45 ++++++++++++++++------------------------- 5 files changed, 28 insertions(+), 40 deletions(-) diff --git a/src/array/darray.jl b/src/array/darray.jl index 63123827e..200854efa 100644 --- a/src/array/darray.jl +++ b/src/array/darray.jl @@ -250,7 +250,7 @@ function Base.fetch(c::DArray) sz = size(thunks) dmn = domain(c) dmnchunks = domainchunks(c) - fetch(Dagger.spawn(thunks...) do results... + fetch(Dagger.spawn(Options(meta=true), thunks...) do results... t = eltype(results[1]) DArray(t, dmn, dmnchunks, reshape(Any[results...], sz)) end) @@ -350,7 +350,7 @@ function stage(ctx::Context, d::Distribute) domain(d.data), d.domainchunks, cs - ) + ) end function distribute(x::AbstractArray, dist) diff --git a/src/array/getindex.jl b/src/array/getindex.jl index 5f0809267..89d38bb97 100644 --- a/src/array/getindex.jl +++ b/src/array/getindex.jl @@ -16,7 +16,6 @@ function stage(ctx::Context, gidx::GetIndex) gidx.idx[i] end for i in 1:length(gidx.idx)] - @debug "$(idxs)" # Figure out output dimension view(inp, ArrayDomain(idxs)) end @@ -40,4 +39,4 @@ end Base.getindex(c::ArrayOp, idx::ArrayDomain) = GetIndex(c, indexes(idx)) Base.getindex(c::ArrayOp, idx...) = GetIndex(c, idx) -Base.getindex(c::ArrayOp, idx::Integer...) = collect(GetIndexScalar(c, idx)) +Base.getindex(c::ArrayOp, idx::Integer...) 
= fetch(GetIndexScalar(c, idx)) diff --git a/src/array/map-reduce.jl b/src/array/map-reduce.jl index 0381519d2..030a388ab 100644 --- a/src/array/map-reduce.jl +++ b/src/array/map-reduce.jl @@ -53,16 +53,16 @@ reduceblock(f, g::Function, x::ArrayOp) = reduceblock_async(f, g, x) reduce_async(f::Function, x::ArrayOp) = reduceblock_async(xs->reduce(f,xs), xs->reduce(f,xs), x) -sum(x::ArrayOp; dims::Union{Int,Nothing} = nothing) = _sum(x, dims) +sum(x::ArrayOp; dims::Union{Int,Nothing} = nothing) = fetch(_sum(x, dims)) _sum(x, dims::Nothing) = reduceblock(sum, sum, x) _sum(x, dims::Int) = reduce(+, x; dims=dims) -sum(f::Function, x::ArrayOp) = reduceblock(a->sum(f, a), sum, x) -prod(x::ArrayOp) = reduceblock(prod, x) -prod(f::Function, x::ArrayOp) = reduceblock(a->prod(f, a), prod, x) +sum(f::Function, x::ArrayOp) = fetch(reduceblock(a->sum(f, a), sum, x)) +prod(x::ArrayOp) = fetch(reduceblock(prod, x)) +prod(f::Function, x::ArrayOp) = fetch(reduceblock(a->prod(f, a), prod, x)) -mean(x::ArrayOp) = reduceblock(mean, mean, x) +mean(x::ArrayOp) = fetch(reduceblock(mean, mean, x)) -mapreduce(f::Function, g::Function, x::ArrayOp) = reduce(g, map(f, x)) +mapreduce(f::Function, g::Function, x::ArrayOp) = reduce(g, map(f, x)) #think about fetching function mapreducebykey_seq(f, op, itr, dict=Dict()) for x in itr diff --git a/src/array/sort.jl b/src/array/sort.jl index c9beed70e..374f22412 100644 --- a/src/array/sort.jl +++ b/src/array/sort.jl @@ -93,7 +93,7 @@ end function collect_merge(merge, group) #delayed((xs...) -> treereduce(merge, Any[xs...]))(group...) - t = treereduce(merge, group) + t = treereduce((args...) -> (Dagger.@spawn merge(args...)), group) end # Given sorted chunks, splits each chunk according to splitters @@ -279,7 +279,7 @@ function dsort_chunks(cs, nchunks=length(cs), nsamples=2000; batchsize = max(2, batchsize) xs = treereduce((cs...)->Dagger.spawn(vcat, cs...), cs1) - xs = map(fetch, fetch(xs)) + xs = collect(fetch(xs)) if length(cs1) == 1 xs = [xs] end diff --git a/test/array.jl b/test/array.jl index fa30e8c9b..6e898448f 100644 --- a/test/array.jl +++ b/test/array.jl @@ -37,11 +37,12 @@ end r = collect(R) @test r[1:10] != r[11:20] end + @testset "sum" begin X = ones(Blocks(10, 10), 100, 100) - @test fetch(sum(X)) == 10000 + @test sum(X) == 10000 Y = zeros(Blocks(10, 10), 100, 100) - @test fetch(sum(Y)) == 0 + @test sum(Y) == 0 end @testset "distributing an array" begin @@ -55,8 +56,8 @@ end end x = [1 2; 3 4] @test Distribute(Blocks(1,1), x) == x - #test_dist(rand(100, 100)) - #test_dist(sprand(100, 100, 0.1)) + test_dist(rand(100, 100)) + test_dist(sprand(100, 100, 0.1)) x = distribute(rand(10), 2) @test collect(distribute(x, 3)) == collect(x) @@ -78,7 +79,7 @@ end test_transpose(sprand(100, 120, 0.1)) end -#=@testset "matrix-matrix multiply" begin +@testset "matrix-matrix multiply" begin function test_mul(X) tol = 1e-12 X1 = Distribute(Blocks(10, 20), X) @@ -105,7 +106,7 @@ end @test collect(x^1) == collect(x) @test collect(x^2) == collect(x*x) @test collect(x^3) == collect(x*x*x) -end=# +end @testset "concat" begin m = rand(75,75) @@ -117,13 +118,13 @@ end=# @test_throws DimensionMismatch fetch(vcat(x,y)) end -#=@testset "scale" begin +@testset "scale" begin x = rand(10,10) X = Distribute(Blocks(3,3), x) y = rand(10) @test Diagonal(y)*x == collect(Diagonal(y)*X) -end=# +end @testset "Getindex" begin function test_getindex(x) @@ -137,12 +138,11 @@ end=# @test collect(X[[], ragged_idx]) == x[[], ragged_idx] @test collect(X[[], []]) == x[[], []] - #=@testset "dimensionality 
reduction" begin - # THESE NEED FIXING!! + @testset "dimensionality reduction" begin @test vec(collect(X[ragged_idx, 5])) == vec(x[ragged_idx, 5]) @test vec(collect(X[5, ragged_idx])) == vec(x[5, ragged_idx]) - @test collect(X[5, 5]) == x[5,5] - end=# + @test X[5, 5] == x[5,5] + end end test_getindex(rand(10,10)) @@ -196,13 +196,13 @@ end @test collect(sort(X)) == sort(x) @test collect(sort(X, rev=true)) == sort(x, rev=true) - #=x = [("A",1), ("A",2), ("B",1)] + x = [("A",1), ("A",2), ("B",1)] y = distribute(x, 3) - @test collect(sort(y)) == x=# + @test collect(sort(y)) == x x = ones(10) y = fetch(Distribute(Blocks(3), x)) - #@test map(x->length(collect(x)), compute(sort(y)).chunks) == [3,3,3,1] + @test_broken map(x->length(collect(x)), fetch(sort(y)).chunks) == [3,3,3,1] end using MemPool @@ -220,20 +220,9 @@ using MemPool @test aff[2] == sizeof(Int)*10 end -@testset "show_plan" begin +#=@testset "show_plan" begin @test !isempty(Dagger.show_plan(Dagger.spawn(()->10))) -end - -#= -@testset "darray distributed refcount" begin - D2 = remotecall_fetch(2, compute(Distribute(Blocks(10, 20), rand(40,40)))) do D - D2 = D - end - @test size(collect(D2)) == (40,40) - GC.gc() - @test size(collect(D2)) == (40,40) -end -=# +end=# @testset "sharedarray" begin A = SharedArray{Int}((1024,)) From 08d23090807b445623f83c0d39c89a0acea6363f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felipe=20de=20Alc=C3=A2ntara=20Tom=C3=A9?= <62853093+fda-tome@users.noreply.github.com> Date: Mon, 19 Jun 2023 17:57:16 -0300 Subject: [PATCH 40/44] Apply suggestions from code review Co-authored-by: Julian Samaroo --- src/array/darray.jl | 6 +++--- src/array/sort.jl | 6 +----- test/array.jl | 2 +- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/array/darray.jl b/src/array/darray.jl index 200854efa..baae60c71 100644 --- a/src/array/darray.jl +++ b/src/array/darray.jl @@ -240,7 +240,7 @@ end =# """ - thunkize(ctx::Context, c::DArray; persist=true) + Base.fetch(c::DArray) If a `DArray` tree has a `Thunk` in it, make the whole thing a big thunk. """ @@ -340,7 +340,7 @@ function stage(ctx::Context, d::Distribute) dimcatfuncs = [(x...) -> concat(x..., dims=i) for i in 1:length(shape)] ps = reshape(Any[parts...], shape) collect(treereduce_nd(dimcatfuncs, ps)) - end + end end else cs = map(c -> (Dagger.@spawn identity(d.data[c])), d.domainchunks) @@ -350,7 +350,7 @@ function stage(ctx::Context, d::Distribute) domain(d.data), d.domainchunks, cs - ) + ) end function distribute(x::AbstractArray, dist) diff --git a/src/array/sort.jl b/src/array/sort.jl index 374f22412..8209e0e9d 100644 --- a/src/array/sort.jl +++ b/src/array/sort.jl @@ -19,10 +19,8 @@ function getmedians(x, n) end function sortandsample_array(ord, xs, nsamples, presorted=false) - r = sample(1:length(xs), min(length(xs), nsamples), replace=false, ordered=true) - if !presorted sorted = sort(xs, order=ord) chunk = tochunk(sorted) @@ -100,7 +98,7 @@ end # then merges corresponding splits together to form length(splitters) + 1 sorted chunks # these chunks will be in turn sorted function splitmerge(chunks, splitters, merge, by, sub, ord) - c1 = map(c->splitchunk(c, splitters, by, sub, ord), chunks) + c1 = map(c->splitchunk(c, splitters, by, sub, ord), chunks) map(cs->collect_merge(merge, cs), transpose_vecvec(c1)) end @@ -256,8 +254,6 @@ Each chunk in turn is sorted. # Returns A tuple of `(chunk, samples)` where `chunk` is the `Dagger.Chunk` object. `chunk` can be `nothing` if no change to the initial array was made (e.g. 
it was already sorted) """ - - function dsort_chunks(cs, nchunks=length(cs), nsamples=2000; merge = merge_sorted, by=identity, diff --git a/test/array.jl b/test/array.jl index 6e898448f..e04f8c235 100644 --- a/test/array.jl +++ b/test/array.jl @@ -40,7 +40,7 @@ end @testset "sum" begin X = ones(Blocks(10, 10), 100, 100) - @test sum(X) == 10000 + @test sum(X) == 10000 Y = zeros(Blocks(10, 10), 100, 100) @test sum(Y) == 0 end From 2a797b15bba2fb923b15c08f32a46802b6e65e45 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Mon, 19 Jun 2023 16:04:00 -0500 Subject: [PATCH 41/44] tests/logging: Remove reliance on DArray --- test/logging.jl | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/test/logging.jl b/test/logging.jl index c4431bba5..bb97811d0 100644 --- a/test/logging.jl +++ b/test/logging.jl @@ -3,18 +3,17 @@ import TimespanLogging: Timespan, Event, Events, LocalEventLog, MultiEventLog @testset "Logging" begin @testset "LocalEventLog" begin - @testset "ArrayOp" begin + @testset "Basics" begin ctx = Context() log = LocalEventLog() ctx.log_sink = log - sz = 2^4 - bsz = 2^2 - X = rand(Float32, sz, sz) - XD = Distribute(Blocks(bsz, bsz), X) + X = rand(Float32, 4, 3) + a = delayed(sum)(X) + b = delayed(sum)(X) + c = delayed(+)(a,b) - dag = XD*XD - collect(ctx, dag) + compute(ctx, c) logs = TimespanLogging.get_logs!(log) @test logs isa Vector{Timespan} @test length(logs) > 0 @@ -48,10 +47,13 @@ import TimespanLogging: Timespan, Event, Events, LocalEventLog, MultiEventLog end @testset "Automatic Plan Rendering" begin - x = compute(rand(Blocks(2,2),4,4)) + X = rand(Float32, 4, 3) + a = delayed(sum)(X) + b = delayed(sum)(X) + c = delayed(+)(a,b) mktemp() do path, io ctx = Context(;log_sink=LocalEventLog(),log_file=path) - compute(ctx, x * x) + compute(ctx, c) plan = String(read(io)) @test occursin("digraph {", plan) @test occursin("Move:", plan) @@ -73,8 +75,11 @@ import TimespanLogging: Timespan, Event, Events, LocalEventLog, MultiEventLog ml[:psat] = Dagger.Events.ProcessorSaturation() ctx.log_sink = ml - A = rand(Blocks(4, 4), 16, 16) - collect(ctx, A*A) + X = rand(Float32, 4, 3) + a = delayed(sum)(X) + b = delayed(sum)(X) + c = delayed(+)(a,b) + compute(ctx, c) logs = TimespanLogging.get_logs!(ml) for w in keys(logs) From b763d585496466cb918537e37bdad0f62b1db9bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felipe=20de=20Alc=C3=A2ntara=20Tom=C3=A9?= Date: Mon, 19 Jun 2023 18:21:13 -0300 Subject: [PATCH 42/44] Comments resolved --- src/array/darray.jl | 17 ----------------- src/array/map-reduce.jl | 14 +++++++------- src/array/sort.jl | 8 ++++---- test/array.jl | 17 +++++++++++++++++ 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/src/array/darray.jl b/src/array/darray.jl index baae60c71..a8575cb78 100644 --- a/src/array/darray.jl +++ b/src/array/darray.jl @@ -222,23 +222,6 @@ function lookup_parts(ps::AbstractArray, subdmns::DomainBlocks{N}, d::ArrayDomai pieces, out_dmn end - -#=""" - compute(ctx::Context, x::DArray; persist=true, options=nothing) - -A `DArray` object may contain a thunk in it, in which case -we first turn it into a `Thunk` and then compute it. 
-""" -function fetch(x::DArray; persist=true, options=nothing) - thunk = thunkize(ctx, x, persist=persist) - if isa(thunk, EagerThunk) - fetch(thunk; options=options) - else - x - end -end -=# - """ Base.fetch(c::DArray) diff --git a/src/array/map-reduce.jl b/src/array/map-reduce.jl index 030a388ab..76bdcd727 100644 --- a/src/array/map-reduce.jl +++ b/src/array/map-reduce.jl @@ -48,19 +48,19 @@ end reduceblock_async(f, x::ArrayOp; get_result=true) = ReduceBlock(f, f, x, get_result) reduceblock_async(f, g::Function, x::ArrayOp; get_result=true) = ReduceBlock(f, g, x, get_result) -reduceblock(f, x::ArrayOp) = reduceblock_async(f, x) -reduceblock(f, g::Function, x::ArrayOp) = reduceblock_async(f, g, x) +reduceblock(f, x::ArrayOp) = fetch(reduceblock_async(f, x)) +reduceblock(f, g::Function, x::ArrayOp) = fetch(reduceblock_async(f, g, x)) reduce_async(f::Function, x::ArrayOp) = reduceblock_async(xs->reduce(f,xs), xs->reduce(f,xs), x) -sum(x::ArrayOp; dims::Union{Int,Nothing} = nothing) = fetch(_sum(x, dims)) +sum(x::ArrayOp; dims::Union{Int,Nothing} = nothing) = _sum(x, dims) _sum(x, dims::Nothing) = reduceblock(sum, sum, x) _sum(x, dims::Int) = reduce(+, x; dims=dims) -sum(f::Function, x::ArrayOp) = fetch(reduceblock(a->sum(f, a), sum, x)) -prod(x::ArrayOp) = fetch(reduceblock(prod, x)) -prod(f::Function, x::ArrayOp) = fetch(reduceblock(a->prod(f, a), prod, x)) +sum(f::Function, x::ArrayOp) = reduceblock(a->sum(f, a), sum, x) +prod(x::ArrayOp) = reduceblock(prod, x) +prod(f::Function, x::ArrayOp) = reduceblock(a->prod(f, a), prod, x) -mean(x::ArrayOp) = fetch(reduceblock(mean, mean, x)) +mean(x::ArrayOp) = reduceblock(mean, mean, x) mapreduce(f::Function, g::Function, x::ArrayOp) = reduce(g, map(f, x)) #think about fetching diff --git a/src/array/sort.jl b/src/array/sort.jl index 8209e0e9d..7c989e05c 100644 --- a/src/array/sort.jl +++ b/src/array/sort.jl @@ -285,10 +285,10 @@ function dsort_chunks(cs, nchunks=length(cs), nsamples=2000; samples = reduce((a,b)->merge_sorted(order, a, b), map(x->x[2], xs)) splitters = getmedians(samples, nchunks-1) end - # if first(x) === nothing this means that - # the chunk was already sorted, so just - # use the input chunk as-is - cs2 = map((x,c) -> x === nothing ? c : x, map(first, xs), cs) + # if first(x) === nothing this means that + # the chunk was already sorted, so just + # use the input chunk as-is + cs2 = map((x,c) -> x === nothing ? c : x, map(first, xs), cs) # main sort routine. 
At this point: # we know the splitters we want to use, diff --git a/test/array.jl b/test/array.jl index e04f8c235..452d92f2c 100644 --- a/test/array.jl +++ b/test/array.jl @@ -1,6 +1,7 @@ using LinearAlgebra, SparseArrays, Random, SharedArrays import Dagger: chunks, DArray, domainchunks, treereduce_nd import Distributed: myid, procs +import Statistics: mean @testset "treereduce_nd" begin xs = rand(1:10, 8,8,8) @@ -45,6 +46,22 @@ end @test sum(Y) == 0 end +@testset "prod" begin + x = randn(100, 100) + X = distribute(x, Blocks(10, 10)) + @test prod(X) == prod(x) + Y = zeros(Blocks(10, 10), 100, 100) + @test prod(Y) == 0 +end + +@testset "mean" begin + x = randn(100, 100) + X = distribute(x, Blocks(10, 10)) + @test mean(X) ≈ mean(x) + Y = zeros(Blocks(10, 10), 100, 100) + @test mean(Y) == 0 +end + @testset "distributing an array" begin function test_dist(X) X1 = Distribute(Blocks(10, 20), X) From 846ff35f153420c3d4ff7fb8cbf02dd8b52a83a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felipe=20de=20Alc=C3=A2ntara=20Tom=C3=A9?= Date: Fri, 30 Jun 2023 17:38:12 -0300 Subject: [PATCH 43/44] Started MPI style implementation --- lib/DaggerMPI/src/DaggerMPI.jl | 25 ++++++++++++++++++++++--- src/array/darray.jl | 10 ++++++---- src/sch/Sch.jl | 9 +++++---- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/lib/DaggerMPI/src/DaggerMPI.jl b/lib/DaggerMPI/src/DaggerMPI.jl index 5cdbe5ccc..3335c3758 100644 --- a/lib/DaggerMPI/src/DaggerMPI.jl +++ b/lib/DaggerMPI/src/DaggerMPI.jl @@ -15,7 +15,7 @@ end const MPI_PROCESSORS = Ref{Int}(-1) -const BCAST_VALUES = Dict{Any, Any}() +#const BCAST_VALUES = Dict{Any, Any}() const PREVIOUS_PROCESSORS = Set() @@ -73,7 +73,26 @@ end Dagger.get_parent(proc::MPIProcessor) = Dagger.OSProc() Dagger.default_enabled(proc::MPIProcessor) = true -function IbcastAPI(data, root::Integer, comm::MPI.Comm, req::MPI.AbstractRequest=MPI.Request()) +struct MPI_Blocks{N} <: Blocks{N} end + +struct MPI_ArrayDomain{N} <: ArrayDomain{N} end + + +function distribute(x::AbstractArray, dist::MPI_Blocks, comm::MPI.comm=MPI.COMM_WORLD, root::Integer=0) + isroot = MPI.Comm_rank(comm) == root + #TODO: Make better load balancing + if isroot + + + + + + + + + + +#=function IbcastAPI(data, root::Integer, comm::MPI.Comm, req::MPI.AbstractRequest=MPI.Request()) @assert MPI.isnull(req) buf = MPI.Buffer(data) @debug "[$(MPI.Comm_rank(comm))] Entered API call with root $root" @@ -125,7 +144,7 @@ function IbcastRecv(root, tag, comm, req::MPI.AbstractRequest=MPI.Request()) end function Ibcast_yield() -end +end=# diff --git a/src/array/darray.jl b/src/array/darray.jl index 2e0b903e8..221600922 100644 --- a/src/array/darray.jl +++ b/src/array/darray.jl @@ -12,7 +12,6 @@ An `N`-dimensional domain over an array. struct ArrayDomain{N} indexes::NTuple{N, Any} end - include("../lib/domain-blocks.jl") @@ -305,7 +304,7 @@ Distribute(p::Blocks, data::AbstractArray) = function stage(ctx::Context, d::Distribute) if isa(d.data, ArrayOp) - # distributing a dsitributed array + # distributing a distributed array x = cached_stage(ctx, d.data) if d.domainchunks == domainchunks(x) return x # already properly distributed @@ -316,6 +315,7 @@ function stage(ctx::Context, d::Distribute) cs = map(d.domainchunks) do idx chunks = cached_stage(ctx, x[idx]).chunks shape = size(chunks) + hash = Base.hash(idx, Base.hash(stage, Base.hash(d.data))) Dagger.spawn(shape, chunks...) do shape, parts... 
if prod(shape) == 0 return Array{T}(undef, shape) @@ -326,8 +326,10 @@ function stage(ctx::Context, d::Distribute) end end else - cs = map(c -> (Dagger.@spawn identity(d.data[c])), d.domainchunks) - + cs = map(d.domainchunks) do c + hash = Base.hash(c, Base.hash(stage, Base.hash(d.data))) + Dagger.@spawn hash=hash identity(d.data[c]) + end end DArray( eltype(d.data), diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl index c1e67689d..7ea653f35 100644 --- a/src/sch/Sch.jl +++ b/src/sch/Sch.jl @@ -1318,10 +1318,11 @@ end Executes a single task specified by `task_desc` on `to_proc`. """ function do_task(to_proc, task_desc) - thunk_id, task_ hash, est_time_util, est_alloc_util, est_occupancy, - scope, Tf, data, - send_result, persist, cache, meta, - options, propagated, ids, hashes, positions, ctx_vars, sch_handle, sch_uid = task_desc + thunk_id, task_hash, est_time_util, est_alloc_util, est_occupancy, + scope, Tf, data, + send_result, persist, cache, meta, + options, propagated, ids, hashes, positions, + ctx_vars, sch_handle, sch_uid = task_desc ctx = Context(Processor[]; log_sink=ctx_vars.log_sink, profile=ctx_vars.profile) from_proc = OSProc() From 1ce3b7fb8207f7d540a62139e9af2dc0643fd68b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felipe=20de=20Alc=C3=A2ntara=20Tom=C3=A9?= Date: Wed, 19 Jul 2023 14:50:46 -0300 Subject: [PATCH 44/44] Rebasing MPI branch and DArray interface --- lib/DaggerMPI/src/DaggerMPI.jl | 124 +++++++++++++++++++++++++++++---- src/lib/util.jl | 2 + src/processor.jl | 1 + src/sch/Sch.jl | 8 ++- 4 files changed, 118 insertions(+), 17 deletions(-) diff --git a/lib/DaggerMPI/src/DaggerMPI.jl b/lib/DaggerMPI/src/DaggerMPI.jl index 3335c3758..aa12c2d91 100644 --- a/lib/DaggerMPI/src/DaggerMPI.jl +++ b/lib/DaggerMPI/src/DaggerMPI.jl @@ -1,5 +1,6 @@ module DaggerMPI using Dagger +import Base: reduce, fetch, cat using MPI struct MPIProcessor{P,C} <: Dagger.Processor @@ -28,14 +29,14 @@ function initialize(comm::MPI.Comm=MPI.COMM_WORLD; color_algo=SimpleColoring()) MPI.Init(; finalize_atexit=false) procs = Dagger.get_processors(OSProc()) i = 0 - empty!(Dagger.PROCESSOR_CALLBACKS) + #=empty!(Dagger.PROCESSOR_CALLBACKS) empty!(Dagger.OSPROC_PROCESSOR_CACHE) for proc in procs Dagger.add_processor_callback!("mpiprocessor_$i") do return MPIProcessor(proc, comm, color_algo) end i += 1 - end + end=# MPI_PROCESSORS[] = i # FIXME: Hack to populate new processors @@ -73,21 +74,113 @@ end Dagger.get_parent(proc::MPIProcessor) = Dagger.OSProc() Dagger.default_enabled(proc::MPIProcessor) = true -struct MPI_Blocks{N} <: Blocks{N} end - -struct MPI_ArrayDomain{N} <: ArrayDomain{N} end +export MPIBlocks +struct MPIBlocks{N} <: Dagger.AbstractSingleBlocks{N} + blocksize::NTuple{N, Int} +end +MPIBlocks(xs::Int...) 
= MPIBlocks(xs) -function distribute(x::AbstractArray, dist::MPI_Blocks, comm::MPI.comm=MPI.COMM_WORLD, root::Integer=0) +function Dagger.distribute(::Type{A}, + x::Union{AbstractArray, Nothing}, + dist::MPIBlocks, + comm::MPI.Comm=MPI.COMM_WORLD, + root::Integer=0) where {A<:AbstractArray{T, N}} where {T, N} isroot = MPI.Comm_rank(comm) == root + #TODO: Make better load balancing - if isroot - + data = Array{T, N}(undef, dist.blocksize) + if isroot + cs = Array{T, N}(undef, size(x)) + parts = partition(dist, domain(x)) + idx = 1 + for part in parts + cs[idx:(idx - 1 + prod(dist.blocksize))] = x[part] + idx += prod(dist.blocksize) + end + MPI.Scatter!(MPI.UBuffer(cs, div(length(cs), MPI.Comm_size(comm))), data, comm, root=root) + else + MPI.Scatter!(nothing, data, comm, root=root) + end + data = Dagger.tochunk(data) + Dagger.DArray( + T, + domain(data), + domain(data), + data, + dist + ) +end +function Dagger.distribute(::Type{A}, + dist::MPIBlocks, + comm::MPI.Comm=MPI.COMM_WORLD, + root::Integer=0) where {A<:AbstractArray{T, N}} where {T, N} + distribute(A, nothing, dist, comm, root) +end +function Dagger.distribute(x::AbstractArray, + dist::MPIBlocks, + comm::MPI.Comm=MPI.COMM_WORLD, + root::Integer=0) + distribute(typeof(x), x, dist, comm, root) +end + +export MPIBlocks, distribute + +function Base.reduce(f::Function, x::Dagger.DArray{T,N,MPIBlocks{N},F}; comm=MPI.COMM_WORLD, root=nothing, dims=nothing, acrossranks::Bool=true) where {T,N,F} + if dims === nothing + if !acrossranks + return fetch(reduce_async(f,x)) + elseif root === nothing + return MPI.Allreduce(fetch(reduce_async(f,x)), f, comm) + else + return MPI.Reduce(fetch(reduce_async(f,x)), f, comm; root) + end + else + if dims isa Int + dims = (dims,) + end + d = reduce(x.domain, dims=dims) + ds = reduce(x.subdomains[1], dims=dims) + if !acrossranks + thunks = Dagger.spawn(b->reduce(f, b, dims=dims), x.chunks[1]) + return Dagger.DArray(T, + d, + ds, + thunks, + x.partitioning) + else + tmp = collect(reduce(f, x, comm=comm, root=root, dims=dims, acrossranks=false)) + if root === nothing + h = UInt(0) + for dim in 1:N + if dim in dims + continue + end + h = hash(x.subdomains[1].indexes[dim], h) + end + h = abs(Base.unsafe_trunc(Int32, h)) + newc = MPI.Comm_split(comm, h, MPI.Comm_rank(comm)) + chunks = Dagger.tochunk(reshape(MPI.Allreduce(tmp, f, newc), size(tmp))) + else + rcvbuf = MPI.Reduce(tmp, f, comm; root) + if root === MPI.Comm_rank(comm) + chunks = Dagger.tochunk(reshape(rcvbuf, size(tmp))) + return Dagger.DArray(T, + d, + ds, + chunks, + x.partitioning) + end + return nothing + end + end + end +end @@ -219,15 +312,18 @@ function Dagger.move(from_proc::MPIProcessor, to_proc::Dagger.Processor, x::Dagg tag = abs(Base.unsafe_trunc(Int32, Dagger.get_task_hash(:input) >> 32)) if rank == x_value.color @debug "[$rank] Starting bcast send on $tag" - IbcastSend(x_value.value, x_value.color, tag, from_proc.comm) - @debug "[$rank] Finished bcast send on $tag" + @sync for other in 0:(MPI_Comm_size(from_proc.comm)-1) + other == rank && continue + @async begin + @debug "[$rank] Starting bcast send on $tag" + MPI.isend(x_value.value, other, tag, from_proc.comm) + @debug "[$rank] Finished bcast send on $tag" + end + end return Dagger.move(from_proc.proc, to_proc, x_value.value) else @debug "[$rank] Starting bcast recv on $tag" - while !haskey(BCAST_VALUES, tag) - IbcastRecv(x_value.color, tag, from_proc.comm) - end - value = BCAST_VALUES[tag] + value = recv_yield(x_value.color, tag, from_proc.comm) @debug "[$rank] Finished bcast recv on 
$tag" return Dagger.move(from_proc.proc, to_proc, value) end diff --git a/src/lib/util.jl b/src/lib/util.jl index 977007dd4..67cca689b 100644 --- a/src/lib/util.jl +++ b/src/lib/util.jl @@ -152,6 +152,8 @@ function setindex(x::NTuple{N}, idx, v) where N map(ifelse, ntuple(x->idx === x, Val(N)), ntuple(x->v, Val(N)), x) end +export treereducedim + function showloc(f, argcount) args = ntuple(x->Any, argcount) ms = methods(f) diff --git a/src/processor.jl b/src/processor.jl index c8cd6ca03..24739cd43 100644 --- a/src/processor.jl +++ b/src/processor.jl @@ -352,6 +352,7 @@ function set_tls!(tls) task_local_storage(:_dagger_input_hash, tls.input_hash) end task_local_storage(:_dagger_processor, get(tls, :processor, nothing)) + task_local_storage(:_dagger_task_spec, get(tls, :task_spec, nothing)) task_local_storage(:_dagger_time_utilization, get(tls, :time_utilization, nothing)) task_local_storage(:_dagger_alloc_utilization, get(tls, :alloc_utilization, nothing)) end diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl index 7ea653f35..587899f3b 100644 --- a/src/sch/Sch.jl +++ b/src/sch/Sch.jl @@ -1318,10 +1318,10 @@ end Executes a single task specified by `task_desc` on `to_proc`. """ function do_task(to_proc, task_desc) - thunk_id, task_hash, est_time_util, est_alloc_util, est_occupancy, - scope, Tf, data, + thunk_id, task_hash, est_time_util, est_alloc_util, + Tf, data, send_result, persist, cache, meta, - options, propagated, ids, hashes, positions, + options, propagated, ids, positions, hashes, ctx_vars, sch_handle, sch_uid = task_desc ctx = Context(Processor[]; log_sink=ctx_vars.log_sink, profile=ctx_vars.profile) @@ -1463,12 +1463,14 @@ function do_task(to_proc, task_desc) end end fetched = Any[] + @debug fetch_tasks for task in fetch_tasks push!(fetched, fetch_report(task)) end if meta append!(fetched, data[2:end]) end + f = popfirst!(fetched) @assert !(f isa Chunk) "Failed to unwrap thunk function" fetched_args = Any[]