diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 8f3bb2f07..871e51a8b 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -35,12 +35,22 @@ steps: julia_args: "--threads=1" - JuliaCI/julia-coverage#v1: codecov: true + - label: Julia 1.9 + timeout_in_minutes: 60 + <<: *test + plugins: + - JuliaCI/julia#v1: + version: "1.9" + - JuliaCI/julia-test#v1: + julia_args: "--threads=1" + - JuliaCI/julia-coverage#v1: + codecov: true - label: Julia nightly timeout_in_minutes: 60 <<: *test plugins: - JuliaCI/julia#v1: - version: "1.9-nightly" + version: "1.10-nightly" - JuliaCI/julia-test#v1: julia_args: "--threads=1" - JuliaCI/julia-coverage#v1: @@ -93,3 +103,16 @@ steps: BENCHMARK_SCALE: "5:5:50" artifacts: - benchmarks/result* + - label: DTables.jl stability test + timeout_in_minutes: 20 + plugins: + - JuliaCI/julia#v1: + version: "1.8" + env: + JULIA_NUM_THREADS: "4" + agents: + queue: "juliaecosystem" + sandbox.jl: "true" + os: linux + arch: x86_64 + command: "git clone https://github.com/JuliaParallel/DTables.jl.git ; julia -t4 -e 'using Pkg; Pkg.activate(\"DTables.jl\"); Pkg.develop(;path=\".\"); Pkg.instantiate(); Pkg.test()'" diff --git a/Project.toml b/Project.toml index f09176147..a890bbc3d 100644 --- a/Project.toml +++ b/Project.toml @@ -1,10 +1,11 @@ name = "Dagger" uuid = "d58978e5-989f-55fb-8d15-ea34adc7bf54" -version = "0.16.1" +version = "0.16.3" [deps] Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" ContextVariablesX = "6add18c4-b38d-439d-96f6-d6bc489c04c5" +DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" diff --git a/README.md b/README.md index 1f3833cca..b472f0e78 100644 --- a/README.md +++ b/README.md @@ -17,10 +17,14 @@ At the core of Dagger.jl is a scheduler heavily inspired by [Dask](https://docs. ## Installation -You can install Dagger by typing +Dagger.jl can be installed using the Julia package manager. Enter the Pkg REPL mode by typing "]" in the Julia REPL and then run: ```julia -julia> ] add Dagger +pkg> add Dagger +``` +Or, equivalently, via the Pkg API: +```julia +julia> import Pkg; Pkg.add("Dagger") ``` ## Usage @@ -37,6 +41,34 @@ b = Dagger.@spawn rand(a, 4) c = Dagger.@spawn sum(b) fetch(c) # some number! ``` +## Contributing Guide +[![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg?style=flat-square)](http://makeapullrequest.com) +[![GitHub issues](https://img.shields.io/github/issues/JuliaParallel/Dagger.jl)](https://github.com/JuliaParallel/Dagger.jl/issues) +[![GitHub contributors](https://img.shields.io/github/contributors/JuliaParallel/Dagger.jl)](https://github.com/JuliaParallel/Dagger.jl/graphs/contributors) + +Contributions are encouraged. + +There are several ways to contribute to our project: + +**Reporting Bugs**: If you find a bug, please open an issue and describe the problem. Make sure to include steps to reproduce the issue and any error messages you receive regarding that issue. + +**Fixing Bugs**: If you'd like to fix a bug, please create a pull request with your changes. Make sure to include a description of the problem and how your changes will address it. + +Additional examples and documentation improvements are also very welcome. 
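As a complement to the single-process snippet in the Usage section above, here is a minimal sketch of the same pattern spread across multiple Distributed workers; the worker count and array sizes are arbitrary illustration choices, not part of this PR:

```julia
# Minimal multi-worker sketch (assumes two extra local workers are wanted)
using Distributed
addprocs(2)
@everywhere using Dagger

a = Dagger.@spawn rand(4, 4)  # may be scheduled on any worker
b = Dagger.@spawn sum(a)      # waits on `a`, then runs wherever it is placed
fetch(b)                      # some number!
```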
+ +## Resources +List of recommended Dagger.jl resources: +- Docs [![][docs-master-img]][docs-master-url] +- Videos + - [Distributed Computing with Dagger.jl](https://youtu.be/capjmjVHfMU) + - [Easy, Featureful Parallelism with Dagger.jl](https://youtu.be/t3S8W6A4Ago) + - [Easier parallel Julia workflow with Dagger.jl](https://youtu.be/VrqzOsav61w) + - [Dagger.jl Development and Roadmap](https://youtu.be/G0Y62ysFbDk) + +## Help and Discussion +For help and discussion, we suggest asking in the following places: + +[Julia Discourse](https://discourse.julialang.org/c/domain/parallel/34) and on the [Julia Slack](https://julialang.org/slack/) in the `#distributed` channel. ## Acknowledgements diff --git a/docs/src/checkpointing.md b/docs/src/checkpointing.md index 9b3f7b618..97dd8e8ed 100644 --- a/docs/src/checkpointing.md +++ b/docs/src/checkpointing.md @@ -54,9 +54,9 @@ Let's see how we'd modify the above example to use checkpointing: ```julia using Serialization + X = compute(randn(Blocks(128,128), 1024, 1024)) -Y = [delayed(sum; options=Dagger.Sch.ThunkOptions(; -checkpoint=(thunk,result)->begin +Y = [delayed(sum; checkpoint=(thunk,result)->begin open("checkpoint-$idx.bin", "w") do io serialize(io, collect(result)) end @@ -64,7 +64,7 @@ end, restore=(thunk)->begin open("checkpoint-$idx.bin", "r") do io Dagger.tochunk(deserialize(io)) end -end))(chunk) for (idx,chunk) in enumerate(X.chunks)] +end)(chunk) for (idx,chunk) in enumerate(X.chunks)] inner(x...) = sqrt(sum(x)) Z = delayed(inner)(Y...) z = collect(Z) diff --git a/docs/src/index.md b/docs/src/index.md index 29bdccfeb..45cc68827 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -2,32 +2,34 @@ ## Usage -The main function for using Dagger is `spawn`: +The main entrypoint to Dagger is `@spawn`: -`Dagger.spawn(f, args...; options...)` +`Dagger.@spawn [option=value]... f(args...; kwargs...)` -or `@spawn` for the more convenient macro form: +or `spawn` if it's more convenient: -`Dagger.@spawn [option=value]... f(args...)` +`Dagger.spawn(f, Dagger.Options(options), args...; kwargs...)` When called, it creates an `EagerThunk` (also known as a "thunk" or "task") -object representing a call to function `f` with the arguments `args`. If it is -called with other thunks as inputs, such as in `Dagger.@spawn f(Dagger.@spawn -g())`, then the function `f` gets passed the results of those input thunks. If -those thunks aren't yet finished executing, then the execution of `f` waits on -all of its input thunks to complete before executing. +object representing a call to function `f` with the arguments `args` and +keyword arguments `kwargs`. If it is called with other thunks as args/kwargs, +such as in `Dagger.@spawn f(Dagger.@spawn g())`, then the function `f` gets +passed the results of those input thunks, once they're available. If those +thunks aren't yet finished executing, then the execution of `f` waits on all of +its input thunks to complete before executing. The key point is that, for each argument to a thunk, if the argument is an `EagerThunk`, it'll be executed before this node and its result will be passed into the function `f`. If the argument is *not* an `EagerThunk` (instead, some other type of Julia object), it'll be passed as-is to the function `f`. -Thunks don't accept regular keyword arguments for the function `f`. 
Instead, -the `options` kwargs are passed to the scheduler to control its behavior: +The `Options` struct in the second argument position is optional; if provided, +it is passed to the scheduler to control its behavior. `Options` contains a +`NamedTuple` of option key-value pairs, which can be any of: - Any field in `Dagger.Sch.ThunkOptions` (see [Scheduler and Thunk options](@ref)) - `meta::Bool` -- Pass the input `Chunk` objects themselves to `f` and not the value contained in them -There are also some extra kwargs that can be passed, although they're considered advanced options to be used only by developers or library authors: +There are also some extra optionss that can be passed, although they're considered advanced options to be used only by developers or library authors: - `get_result::Bool` -- return the actual result to the scheduler instead of `Chunk` objects. Used when `f` explicitly constructs a Chunk or when return value is small (e.g. in case of reduce) - `persist::Bool` -- the result of this Thunk should not be released after it becomes unused in the DAG - `cache::Bool` -- cache the result of this Thunk such that if the thunk is evaluated again, one can just reuse the cached value. If it’s been removed from cache, recompute the value. @@ -133,10 +135,10 @@ via `@par` or `delayed`. The above computation can be executed with the lazy API by substituting `@spawn` with `@par` and `fetch` with `collect`: ```julia -p = @par add1(4) -q = @par add2(p) -r = @par add1(3) -s = @par combine(p, q, r) +p = Dagger.@par add1(4) +q = Dagger.@par add2(p) +r = Dagger.@par add1(3) +s = Dagger.@par combine(p, q, r) @assert collect(s) == 16 ``` @@ -144,7 +146,7 @@ s = @par combine(p, q, r) or similarly, in block form: ```julia -s = @par begin +s = Dagger.@par begin p = add1(4) q = add2(p) r = add1(3) @@ -159,7 +161,7 @@ operation, you can call `compute` on the thunk. This will return a `Chunk` object which references the result (see [Chunks](@ref) for more details): ```julia -x = @par 1+2 +x = Dagger.@par 1+2 cx = compute(x) cx::Chunk @assert collect(cx) == 3 @@ -198,15 +200,17 @@ While Dagger generally "just works", sometimes one needs to exert some more fine-grained control over how the scheduler allocates work. There are two parallel mechanisms to achieve this: Scheduler options (from `Dagger.Sch.SchedulerOptions`) and Thunk options (from -`Dagger.Sch.ThunkOptions`). These two options structs generally contain the -same options, with the difference being that Scheduler options operate +`Dagger.Sch.ThunkOptions`). These two options structs contain many shared +options, with the difference being that Scheduler options operate globally across an entire DAG, and Thunk options operate on a thunk-by-thunk -basis. Scheduler options can be constructed and passed to `collect()` or -`compute()` as the keyword argument `options` for lazy API usage: +basis. 
+ +Scheduler options can be constructed and passed to `collect()` or `compute()` +as the keyword argument `options` for lazy API usage: ```julia -t = @par 1+2 -opts = Dagger.Sch.ThunkOptions(;single=1) # Execute on worker 1 +t = Dagger.@par 1+2 +opts = Dagger.Sch.SchedulerOptions(;single=1) # Execute on worker 1 compute(t; options=opts) @@ -219,12 +223,46 @@ Thunk options can be passed to `@spawn/spawn`, `@par`, and `delayed` similarly: # Execute on worker 1 Dagger.@spawn single=1 1+2 +Dagger.spawn(+, Dagger.Options(;single=1), 1, 2) -Dagger.spawn(+, 1, 2; single=1) - -opts = Dagger.Sch.ThunkOptions(;single=1) -delayed(+)(1, 2; options=opts) +delayed(+; single=1)(1, 2) ``` +### Core vs. Worker Schedulers + +Dagger's scheduler is really two kinds of entities: the "core" scheduler, and +"worker" schedulers: + +The core scheduler runs on worker 1, thread 1, and is the entrypoint to tasks +which have been submitted. The core scheduler manages all task dependencies, +notifies calls to `wait` and `fetch` of task completion, and generally performs +initial task placement. The core scheduler has cached information about each +worker and their processors, and uses that information (together with metrics +about previous tasks and other aspects of the Dagger runtime) to generate a +near-optimal just-in-time task schedule. + +The worker schedulers each run as a set of tasks across all workers and all +processors, and handles data movement and task execution. Once the core +scheduler has scheduled and launched a task, it arrives at the worker scheduler +for handling. The worker scheduler will pass the task to a queue for the +assigned processor, where it will wait until the processor has a sufficient +amount of "occupancy" for the task. Once the processor is ready for the task, +it will first fetch all arguments to the task from other workers, and then it +will execute the task, package the result into a `Chunk`, and pass that back to +the core scheduler. + +### Workload Balancing + +In general, Dagger's core scheduler tries to balance workloads as much as +possible across all the available processors, but it can fail to do so +effectively when either the cached per-processor information is outdated, or +when the estimates about the task's behavior are inaccurate. To minimize the +impact of this potential workload imbalance, the worker schedulers' processors +will attempt to steal tasks from each other when they are under-occupied. Tasks +will only be stolen if their [scope](`Scopes`) matches the processor attempting +the steal, so tasks with wider scopes have better balancing potential. + +### Scheduler/Thunk Options + [`Dagger.Sch.SchedulerOptions`](@ref) [`Dagger.Sch.ThunkOptions`](@ref) diff --git a/docs/src/logging.md b/docs/src/logging.md index 951af63b1..81f97490b 100644 --- a/docs/src/logging.md +++ b/docs/src/logging.md @@ -32,7 +32,7 @@ called. Let's construct one: ```julia ctx = Context() -ml = TimspanLogging.MultiEventLog() +ml = TimespanLogging.MultiEventLog() # Add the BytesAllocd consumer to the log as `:bytes` ml[:bytes] = Dagger.Events.BytesAllocd() diff --git a/docs/src/processors.md b/docs/src/processors.md index e55fd2119..6e74ecc81 100644 --- a/docs/src/processors.md +++ b/docs/src/processors.md @@ -76,42 +76,13 @@ processor B. This mechanism uses Julia's Serialization library to serialize and deserialize data, so data must be serializable for this mechanism to work properly. 
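To make the `move` mechanism described above concrete, here is a minimal sketch (not part of this PR) of a custom processor hooking into it; the `FPGAProc` name and the device-upload step are hypothetical placeholders:

```julia
using Dagger

# Hypothetical accelerator processor; only the names are invented here, while
# the extension points (`Dagger.get_parent`, `Dagger.move`) are the real ones.
struct FPGAProc <: Dagger.Processor
    owner::Int  # ID of the worker that owns the device
end
Dagger.get_parent(proc::FPGAProc) = Dagger.OSProc(proc.owner)

# Called when an argument produced on a CPU thread must be made usable by an
# `FPGAProc`; a custom processor overloads `move` whenever the data
# representation must change on the way in.
function Dagger.move(from::Dagger.ThreadProc, to::FPGAProc, x::Array)
    # e.g. upload `x` to device memory here; returning it unchanged keeps
    # this sketch runnable
    return x
end
```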
-### Future: Hierarchy Generic Path Move - -NOTE: This used to be the default move behavior, but was removed because it -wasn't considered helpful, and there were not any processor implementations -that made use of it. - -Movement of data between any two processors is decomposable into a sequence of -"moves" between a child and its parent, termed a "generic path move". Movement -of data may also take "shortcuts" between nodes in the tree which are not -directly connected if enabled by libraries or the user, which may make use of -IPC mechanisms to transfer data more directly and efficiently (such as -Infiniband, GPU RDMA, NVLINK, etc.). All data is considered local to some -processor, and may only be operated on by another processor by first doing an -explicit move operation to that processor. - ## Processor Selection By default, Dagger uses the CPU to process work, typically single-threaded per cluster node. However, Dagger allows access to a wider range of hardware and software acceleration techniques, such as multithreading and GPUs. These more advanced (but performant) accelerators are disabled by default, but can easily -be enabled by using Scheduler/Thunk options in the `proclist` field. If -`nothing`, all default processors will be used. If a vector of types, only the -processor types contained in `options.proclist` will be used to compute all or -a given thunk. If a function, it will be called for each processor (with the -processor as the argument) until it returns `true`. - -```julia -opts = Dagger.Sch.ThunkOptions(;proclist=nothing) # default behavior -# OR -opts = Dagger.Sch.ThunkOptions(;proclist=[DaggerGPU.CuArrayProc]) # only execute on CuArrayProc -# OR -opts = Dagger.Sch.ThunkOptions(;proclist=(proc)->(proc isa Dagger.ThreadProc && proc.tid == 3)) # only run on ThreadProc with thread ID 3 - -t = Dagger.@par options=opts sum(X) # do sum(X) on the specified processor -``` +be enabled by using scopes (see [Scopes](@ref) for details). ## Resource Control @@ -137,7 +108,7 @@ sufficient resources become available by thunks completing execution. The [DaggerGPU.jl](https://github.com/JuliaGPU/DaggerGPU.jl) package can be imported to enable GPU acceleration for NVIDIA and AMD GPUs, when available. The processors provided by that package are not enabled by default, but may be -enabled via `options.proclist` as usual. +enabled via custom scopes ([Scopes](@ref)). ### Future: Network Devices and Topology diff --git a/docs/src/propagation.md b/docs/src/propagation.md index c47ba1353..ce00af4f2 100644 --- a/docs/src/propagation.md +++ b/docs/src/propagation.md @@ -1,6 +1,6 @@ # Option Propagation -Most options passed to Dagger are passed via `delayed` or `Dagger.@spawn` +Most options passed to Dagger are passed via `@spawn/spawn` or `delayed` directly. This works well when an option only needs to be set for a single thunk, but is cumbersome when the same option needs to be set on multiple thunks, or set recursively on thunks spawned within other thunks. Thankfully, diff --git a/docs/src/scheduler-visualization.md b/docs/src/scheduler-visualization.md index 1696aac5d..234f54c92 100644 --- a/docs/src/scheduler-visualization.md +++ b/docs/src/scheduler-visualization.md @@ -16,14 +16,16 @@ easiest way to get started with the web dashboard for new users. 
For manual usage, the following snippet of code will suffice: ```julia +using Dagger, DaggerWebDash, TimespanLogging + ctx = Context() # or `ctx = Dagger.Sch.eager_context()` for eager API usage -ml = Dagger.MultiEventLog() +ml = TimespanLogging.MultiEventLog() ## Add some logging events of interest -ml[:core] = Dagger.Events.CoreMetrics() -ml[:id] = Dagger.Events.IDMetrics() -ml[:timeline] = Dagger.Events.TimelineMetrics() +ml[:core] = TimespanLogging.Events.CoreMetrics() +ml[:id] = TimespanLogging.Events.IDMetrics() +ml[:timeline] = TimespanLogging.Events.TimelineMetrics() # ... # (Optional) Enable profile flamegraph generation with ProfileSVG @@ -31,7 +33,7 @@ ml[:profile] = DaggerWebDash.ProfileMetrics() ctx.profile = true # Create a LogWindow; necessary for real-time event updates -lw = Dagger.Events.LogWindow(20*10^9, :core) +lw = TimespanLogging.Events.LogWindow(20*10^9, :core) ml.aggregators[:logwindow] = lw # Create the D3Renderer server on port 8080 @@ -40,20 +42,20 @@ d3r = DaggerWebDash.D3Renderer(8080) ## Add some plots! Rendered top-down in order # Show an overview of all generated events as a Gantt chart -push!(d3r, GanttPlot(:core, :id, :esat, :psat; title="Overview")) +push!(d3r, DaggerWebDash.GanttPlot(:core, :id, :esat, :psat; title="Overview")) # Show various numerical events as line plots over time -push!(d3r, LinePlot(:core, :wsat, "Worker Saturation", "Running Tasks")) -push!(d3r, LinePlot(:core, :loadavg, "CPU Load Average", "Average Running Threads")) -push!(d3r, LinePlot(:core, :bytes, "Allocated Bytes", "Bytes")) -push!(d3r, LinePlot(:core, :mem, "Available Memory", "% Free")) +push!(d3r, DaggerWebDash.LinePlot(:core, :wsat, "Worker Saturation", "Running Tasks")) +push!(d3r, DaggerWebDash.LinePlot(:core, :loadavg, "CPU Load Average", "Average Running Threads")) +push!(d3r, DaggerWebDash.LinePlot(:core, :bytes, "Allocated Bytes", "Bytes")) +push!(d3r, DaggerWebDash.LinePlot(:core, :mem, "Available Memory", "% Free")) # Show a graph rendering of compute tasks and data movement between them # Note: Profile events are ignored if absent from the log -push!(d3r, GraphPlot(:core, :id, :timeline, :profile, "DAG")) +push!(d3r, DaggerWebDash.GraphPlot(:core, :id, :timeline, :profile, "DAG")) # TODO: Not yet functional -#push!(d3r, ProfileViewer(:core, :profile, "Profile Viewer")) +#push!(d3r, DaggerWebDash.ProfileViewer(:core, :profile, "Profile Viewer")) # Add the D3Renderer as a consumer of special events generated by LogWindow push!(lw.creation_handlers, d3r) diff --git a/docs/src/scopes.md b/docs/src/scopes.md index 01c1a3414..bed60b506 100644 --- a/docs/src/scopes.md +++ b/docs/src/scopes.md @@ -15,16 +15,16 @@ considered valid. Let's take the example of a webcam handle generated by VideoIO.jl. This handle is a C pointer, and thus has process scope. 
We can open the handle on a given -process, and set the scope of the resulting data to a `ProcessScope()`, which -defaults to the current Julia process: +process, and set the scope of the resulting data to be locked to the current +process with `Dagger.scope` to construct a `ProcessScope`: ```julia -using VideoIO +using VideoIO, Distributed function get_handle() handle = VideoIO.opencamera() proc = Dagger.thunk_processor() - scope = ProcessScope() + scope = Dagger.scope(worker=myid()) # constructs a `ProcessScope` return Dagger.tochunk(handle, proc, scope) end @@ -41,7 +41,7 @@ cam_frame = Dagger.@spawn read(cam_handle) The `cam_frame` task is executed within any processor on the same process that the `cam_handle` task was executed on. Of course, the resulting camera frame is -*not* scoped to anywhere specific (denoted as `AnyScope()`), and thus +*not* scoped to anywhere specific (denoted as `AnyScope`), and thus computations on it may execute anywhere. You may also encounter situations where you want to use a callable struct (such @@ -53,13 +53,15 @@ using Flux m = Chain(...) # If `m` is only safe to transfer to and execute on this process, # we can set a `ProcessScope` on it: -result = Dagger.@spawn scope=ProcessScope() m(rand(8,8)) +result = Dagger.@spawn scope=Dagger.scope(worker=myid()) m(rand(8,8)) ``` Setting a scope on the function treats it as a regular piece of data (like the arguments to the function), so it participates in the scoping rules described in the following sections all the same. +[`Dagger.scope`](@ref) + Now, let's try out some other kinds of scopes, starting with `NodeScope`. This scope encompasses the server that one or more Julia processes may be running on. Say we want to use memory mapping (mmap) to more efficiently send arrays @@ -75,6 +77,7 @@ function generate() arr = Mmap.mmap(path, Matrix{Int}, (64,64)) fill!(arr, 1) Mmap.sync!(arr) + # Note: Dagger.scope() does not yet support node scopes Dagger.tochunk(path, Dagger.thunk_processor(), NodeScope()) end @@ -87,14 +90,14 @@ a = Dagger.@spawn generate() @assert fetch(Dagger.@spawn consume(a)) == 64*64 ``` -Whatever server `a` executed on, `b` will also execute on! +Whatever server `a` executed on, `b` will also execute on it! Finally, we come to the "lowest" scope on the scope hierarchy, the `ExactScope`. This scope specifies one exact processor as the bounding scope, -and is typically useful in certain limited cases. We won't provide an example -here, because you don't usually need to ever use this scope, but if you already -understand the `NodeScope` and `ProcessScope`, the `ExactScope` should be easy -to figure out. +and is typically useful in certain limited cases (such as data existing only on +a specific GPU). We won't provide an example here, because you don't usually +need to ever use this scope, but if you already understand the `NodeScope` and +`ProcessScope`, the `ExactScope` should be easy to figure out. 
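That said, for readers who want to see its shape, a minimal illustrative sketch follows; the worker and thread IDs are arbitrary and assume such a processor exists in your session:

```julia
using Dagger

# Pin a task to thread 1 of worker 2 (assumes worker 2 exists)
proc = Dagger.ThreadProc(2, 1)
scope = Dagger.ExactScope(proc)

result = Dagger.@spawn scope=scope sum(rand(8, 8))
fetch(result)
```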
## Union Scopes diff --git a/lib/DaggerMPI/src/DaggerMPI.jl b/lib/DaggerMPI/src/DaggerMPI.jl index 7f185ed2f..adffa583e 100644 --- a/lib/DaggerMPI/src/DaggerMPI.jl +++ b/lib/DaggerMPI/src/DaggerMPI.jl @@ -1,6 +1,6 @@ module DaggerMPI - using Dagger +import Base: reduce, fetch, cat using MPI struct MPIProcessor{P,C} <: Dagger.Processor @@ -16,6 +16,8 @@ end const MPI_PROCESSORS = Ref{Int}(-1) +#const BCAST_VALUES = Dict{Any, Any}() + const PREVIOUS_PROCESSORS = Set() @@ -28,14 +30,14 @@ function initialize(comm::MPI.Comm=MPI.COMM_WORLD; color_algo=SimpleColoring()) MPI.Init(; finalize_atexit=false) procs = Dagger.get_processors(OSProc()) i = 0 - empty!(Dagger.PROCESSOR_CALLBACKS) + #=empty!(Dagger.PROCESSOR_CALLBACKS) empty!(Dagger.OSPROC_PROCESSOR_CACHE) for proc in procs Dagger.add_processor_callback!("mpiprocessor_$i") do return MPIProcessor(proc, comm, color_algo) end i += 1 - end + end=# MPI_PROCESSORS[] = i # FIXME: Hack to populate new processors @@ -52,7 +54,7 @@ function finalize() empty!(Dagger.PROCESSOR_CALLBACKS) empty!(Dagger.OSPROC_PROCESSOR_CACHE) i = 1 - for proc in PREVIOUS_PROCESSORS + for proc in PREVIOUS_PROCESSORS Dagger.add_processor_callback!("old_processor_$i") do return proc end @@ -73,6 +75,113 @@ end Dagger.get_parent(proc::MPIProcessor) = Dagger.OSProc() Dagger.default_enabled(proc::MPIProcessor) = true +export MPIBlocks + +struct MPIBlocks{N} <: Dagger.AbstractSingleBlocks{N} + blocksize::NTuple{N, Int} +end +MPIBlocks(xs::Int...) = MPIBlocks(xs) + +function Dagger.distribute(::Type{A}, + x::Union{AbstractArray, Nothing}, + dist::MPIBlocks, + comm::MPI.Comm=MPI.COMM_WORLD, + root::Integer=0) where {A<:AbstractArray{T, N}} where {T, N} + isroot = MPI.Comm_rank(comm) == root + + #TODO: Make better load balancing + + data = Array{T, N}(undef, dist.blocksize) + if isroot + cs = Array{T, N}(undef, size(x)) + parts = partition(dist, domain(x)) + idx = 1 + for part in parts + cs[idx:(idx - 1 + prod(dist.blocksize))] = x[part] + idx += prod(dist.blocksize) + end + MPI.Scatter!(MPI.UBuffer(cs, div(length(cs), MPI.Comm_size(comm))), data, comm, root=root) + else + MPI.Scatter!(nothing, data, comm, root=root) + end + + data = Dagger.tochunk(data) + + Dagger.DArray( + T, + domain(data), + domain(data), + data, + dist + ) +end + +function Dagger.distribute(::Type{A}, + dist::MPIBlocks, + comm::MPI.Comm=MPI.COMM_WORLD, + root::Integer=0) where {A<:AbstractArray{T, N}} where {T, N} + distribute(A, nothing, dist, comm, root) +end + +function Dagger.distribute(x::AbstractArray, + dist::MPIBlocks, + comm::MPI.Comm=MPI.COMM_WORLD, + root::Integer=0) + distribute(typeof(x), x, dist, comm, root) +end + +export MPIBlocks, distribute + +function Base.reduce(f::Function, x::Dagger.DArray{T,N,MPIBlocks{N},F}; comm=MPI.COMM_WORLD, root=nothing, dims=nothing, acrossranks::Bool=true) where {T,N,F} + if dims === nothing + if !acrossranks + return fetch(reduce_async(f,x)) + elseif root === nothing + return MPI.Allreduce(fetch(reduce_async(f,x)), f, comm) + else + return MPI.Reduce(fetch(reduce_async(f,x)), f, comm; root) + end + else + if dims isa Int + dims = (dims,) + end + d = reduce(x.domain, dims=dims) + ds = reduce(x.subdomains[1], dims=dims) + if !acrossranks + thunks = Dagger.spawn(b->reduce(f, b, dims=dims), x.chunks[1]) + return Dagger.DArray(T, + d, + ds, + thunks, + x.partitioning) + else + tmp = collect(reduce(f, x, comm=comm, root=root, dims=dims, acrossranks=false)) + if root === nothing + h = UInt(0) + for dim in 1:N + if dim in dims + continue + end + h = 
hash(x.subdomains[1].indexes[dim], h) + end + h = abs(Base.unsafe_trunc(Int32, h)) + newc = MPI.Comm_split(comm, h, MPI.Comm_rank(comm)) + chunks = Dagger.tochunk(reshape(MPI.Allreduce(tmp, f, newc), size(tmp))) + else + rcvbuf = MPI.Reduce(tmp, f, comm; root) + if root === MPI.Comm_rank(comm) + chunks = Dagger.tochunk(reshape(rcvbuf, size(tmp))) + return Dagger.DArray(T, + d, + ds, + chunks, + x.partitioning) + end + return nothing + end + end + end +end "Busy-loop Irecv that yields to other tasks." function recv_yield(src, tag, comm) @@ -142,15 +251,15 @@ function Dagger.move(from_proc::MPIProcessor, to_proc::Dagger.Processor, x::Dagg @assert Dagger.chunktype(x) <: MPIColoredValue x_value = fetch(x) rank = MPI.Comm_rank(from_proc.comm) - tag = abs(Base.unsafe_trunc(Int32, Dagger.get_task_hash(:input) >> 32)) + tag = abs(Base.unsafe_trunc(Int32, Dagger.get_task_hash(:input) >> 32)) if rank == x_value.color - # FIXME: Broadcast send - @sync for other in 0:(MPI.Comm_size(from_proc.comm)-1) + @debug "[$rank] Starting bcast send on $tag" + @sync for other in 0:(MPI_Comm_size(from_proc.comm)-1) other == rank && continue @async begin - @debug "[$rank] Starting bcast send to [$other] on $tag" + @debug "[$rank] Starting bcast send on $tag" MPI.isend(x_value.value, other, tag, from_proc.comm) - @debug "[$rank] Finished bcast send to [$other] on $tag" + @debug "[$rank] Finished bcast send on $tag" end end return Dagger.move(from_proc.proc, to_proc, x_value.value) diff --git a/lib/DaggerWebDash/Project.toml b/lib/DaggerWebDash/Project.toml index fb1d99d1c..1dccdec2f 100644 --- a/lib/DaggerWebDash/Project.toml +++ b/lib/DaggerWebDash/Project.toml @@ -6,6 +6,7 @@ version = "0.1.2" [deps] Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" Mux = "a975b10e-0019-58db-a62f-e48ff68538c9" @@ -17,9 +18,10 @@ TimespanLogging = "a526e669-04d3-4846-9525-c66122c55f63" [compat] Dagger = "0.16" +HTTP = "1.7" JSON3 = "1" MemPool = "0.3, 0.4" -Mux = "0.7" +Mux = "0.7, 1" ProfileSVG = "0.2" StructTypes = "1" Tables = "1" diff --git a/lib/DaggerWebDash/src/d3.jl b/lib/DaggerWebDash/src/d3.jl index bc77991cd..00d2ffb33 100644 --- a/lib/DaggerWebDash/src/d3.jl +++ b/lib/DaggerWebDash/src/d3.jl @@ -1,4 +1,7 @@ -using Mux, MemPool, Distributed, Sockets +using Mux, Sockets +import HTTP +import HTTP: WebSockets +using MemPool, Distributed struct LinePlot core_key::Symbol @@ -172,22 +175,22 @@ function client_handler(sock, id, port, port_range, config_updated, config, seek if id != myid() fsock_ready = Base.Event() worker_host, worker_port = worker_host_port(id, port, port_range) - Mux.HTTP.WebSockets.open("ws://$worker_host:$worker_port/data_feed") do _fsock + WebSockets.open("ws://$worker_host:$worker_port/data_feed") do _fsock fsock = _fsock @debug "D3R forwarder for $id ready" notify(fsock_ready) - while !eof(fsock) && isopen(fsock) + while true try - bytes = readavailable(fsock) + bytes = WebSockets.receive(fsock) if length(bytes) == 0 sleep(0.1) continue end data = String(bytes) #@info "D3R forwarder for $id received data" - write(sock, data) + WebSockets.send(sock, data) catch err - if err isa Mux.WebSockets.WebSocketClosedError || err isa Base.IOError + if err isa WebSockets.WebSocketError && err.message isa WebSockets.CloseFrameBody # Force-close client and forwarder @async close(sock) @async close(fsock) @@ -203,25 +206,25 @@ 
function client_handler(sock, id, port, port_range, config_updated, config, seek end if id == myid() @debug "D3R client for $id sending initial config" - write(sock, JSON3.write((;cmd="data", payload=sanitize(D3R_LOGS[port])))) + WebSockets.send(sock, JSON3.write((;cmd="data", payload=sanitize(D3R_LOGS[port])))) _workers = workers() if !(myid() in _workers) # FIXME: Get this from the Context _workers = vcat(myid(), _workers) end - write(sock, JSON3.write((;cmd="config", payload=sanitize((;myid=myid(),workers=_workers,ctxs=config))))) + WebSockets.send(sock, JSON3.write((;cmd="config", payload=sanitize((;myid=myid(),workers=_workers,ctxs=config))))) end push!(get!(()->[], D3R_CLIENT_SOCKETS[port], id), sock) @debug "D3R client for $id ready" - while !eof(sock) && isopen(sock) + while true try - data = String(read(sock)) + data = String(WebSockets.receive(sock)) @debug "D3R client for $id received: $data" if id == myid() #= FIXME if config_updated[] config_updated[] = false - write(sock, JSON3.write((;cmd="config", payload=sanitize(config)))) + WebSockets.send(sock, JSON3.write((;cmd="config", payload=sanitize(config)))) end =# if seek_store !== nothing @@ -231,7 +234,7 @@ function client_handler(sock, id, port, port_range, config_updated, config, seek for (idx,key) in enumerate(Tables.columnnames(raw_logs)) logs[key] = Tables.columns(raw_logs)[idx] end - write(sock, JSON3.write((;cmd="data", payload=sanitize(logs)))) + WebSockets.send(sock, JSON3.write((;cmd="data", payload=sanitize(logs)))) continue end m = match(r"seek\(([0-9]*),([0-9]*)\)", data) @@ -242,19 +245,19 @@ function client_handler(sock, id, port, port_range, config_updated, config, seek for (idx,key) in enumerate(Tables.columnnames(raw_logs)) logs[key] = Tables.columns(raw_logs)[idx] end - write(sock, JSON3.write((;cmd="data", payload=sanitize(logs)))) + WebSockets.send(sock, JSON3.write((;cmd="data", payload=sanitize(logs)))) continue end end if data == "data" - write(sock, JSON3.write((;cmd="data", payload=sanitize(D3R_LOGS[port])))) + WebSockets.send(sock, JSON3.write((;cmd="data", payload=sanitize(D3R_LOGS[port])))) end else @debug "D3R client sending to forwarder: $data" - write(fsock, data) + WebSockets.send(fsock, data) end catch err - if err isa Mux.WebSockets.WebSocketClosedError || err isa Base.IOError + if err isa WebSockets.WebSocketError && err.message isa WebSockets.CloseFrameBody idx = findfirst(x->x==sock, D3R_CLIENT_SOCKETS[port][id]) if idx !== nothing deleteat!(D3R_CLIENT_SOCKETS[port][id], idx) @@ -273,11 +276,9 @@ end function TimespanLogging.Events.creation_hook(d3r::D3Renderer, log) for sock in get!(()->[], get!(()->Dict{Int,Vector{Any}}(), D3R_CLIENT_SOCKETS, d3r.port), myid()) try - if isopen(sock) - write(sock, JSON3.write((;cmd="add", payload=sanitize(log)))) - end + WebSockets.send(sock, JSON3.write((;cmd="add", payload=sanitize(log)))) catch err - if err isa Mux.WebSockets.WebSocketClosedError + if err isa WebSockets.WebSocketError && err.message isa WebSockets.CloseFrameBody continue end rethrow(err) @@ -287,11 +288,9 @@ end function TimespanLogging.Events.deletion_hook(d3r::D3Renderer, idx) for sock in get!(()->[], get!(()->Dict{Int,Vector{Any}}(), D3R_CLIENT_SOCKETS, d3r.port), myid()) try - if isopen(sock) - write(sock, JSON3.write((;cmd="delete", payload=idx))) - end + WebSockets.send(sock, JSON3.write((;cmd="delete", payload=idx))) catch err - if err isa Mux.WebSockets.WebSocketClosedError + if err isa WebSockets.WebSocketError && err.message isa WebSockets.CloseFrameBody continue end 
rethrow(err) diff --git a/src/Dagger.jl b/src/Dagger.jl index a9b947921..1be27cacf 100644 --- a/src/Dagger.jl +++ b/src/Dagger.jl @@ -37,6 +37,7 @@ include("compute.jl") include("utils/clock.jl") include("utils/system_uuid.jl") include("utils/uhash.jl") +include("utils/locked-object.jl") include("sch/Sch.jl"); using .Sch # Array computations diff --git a/src/array/alloc.jl b/src/array/alloc.jl index b47af6f56..39ca60396 100644 --- a/src/array/alloc.jl +++ b/src/array/alloc.jl @@ -25,7 +25,7 @@ end function stage(ctx, a::AllocateArray) alloc(idx, sz) = a.f(idx, a.eltype, sz) - thunks = [delayed(alloc)(i, size(x)) for (i, x) in enumerate(a.domainchunks)] + thunks = [Dagger.@spawn alloc(i, size(x)) for (i, x) in enumerate(a.domainchunks)] DArray(a.eltype,a.domain, a.domainchunks, thunks) end diff --git a/src/array/darray.jl b/src/array/darray.jl index 7f1b8c6bd..221600922 100644 --- a/src/array/darray.jl +++ b/src/array/darray.jl @@ -1,4 +1,4 @@ -import Base: == +import Base: ==, fetch using Serialization import Serialization: serialize, deserialize @@ -12,7 +12,6 @@ An `N`-dimensional domain over an array. struct ArrayDomain{N} indexes::NTuple{N, Any} end - include("../lib/domain-blocks.jl") @@ -78,13 +77,14 @@ domain(x::AbstractArray) = ArrayDomain([1:l for l in size(x)]) abstract type ArrayOp{T, N} <: AbstractArray{T, N} end Base.IndexStyle(::Type{<:ArrayOp}) = IndexCartesian() -compute(ctx, x::ArrayOp; options=nothing) = - compute(ctx, cached_stage(ctx, x)::DArray; options=options) -collect(ctx::Context, x::ArrayOp; options=nothing) = - collect(ctx, compute(ctx, x; options=options); options=options) +collect(x::ArrayOp) = collect(fetch(x)) + +Base.fetch(x::ArrayOp) = fetch(cached_stage(Context(global_context()), x)::DArray) -collect(x::ArrayOp; options=nothing) = collect(Context(global_context()), x; options=options) +collect(x::Computation) = collect(fetch(x)) + +Base.fetch(x::Computation) = fetch(cached_stage(Context(global_context()), x)) function Base.show(io::IO, ::MIME"text/plain", x::ArrayOp) write(io, string(typeof(x))) @@ -113,7 +113,7 @@ An N-dimensional distributed array of element type T, with a concatenation funct mutable struct DArray{T,N,F} <: ArrayOp{T, N} domain::ArrayDomain{N} subdomains::AbstractArray{ArrayDomain{N}, N} - chunks::AbstractArray{Union{Chunk,Thunk}, N} + chunks::AbstractArray{Any, N} concat::F function DArray{T,N,F}(domain, subdomains, chunks, concat::Function) where {T, N,F} new(domain, subdomains, chunks, concat) @@ -135,8 +135,8 @@ domainchunks(d::DArray) = d.subdomains size(x::DArray) = size(domain(x)) stage(ctx, c::DArray) = c -function collect(ctx::Context, d::DArray; tree=false, options=nothing) - a = compute(ctx, d; options=options) +function collect(d::DArray; tree=false) + a = fetch(d) if isempty(d.chunks) return Array{eltype(d)}(undef, size(d)...) @@ -144,9 +144,9 @@ function collect(ctx::Context, d::DArray; tree=false, options=nothing) dimcatfuncs = [(x...) -> d.concat(x..., dims=i) for i in 1:ndims(d)] if tree - collect(treereduce_nd(delayed.(dimcatfuncs), a.chunks)) + collect(fetch(treereduce_nd(map(x -> ((args...,) -> Dagger.@spawn x(args...)) , dimcatfuncs), a.chunks))) else - treereduce_nd(dimcatfuncs, asyncmap(collect, a.chunks)) + treereduce_nd(dimcatfuncs, asyncmap(fetch, a.chunks)) end end @@ -209,53 +209,33 @@ _cumsum(x::AbstractArray) = length(x) == 0 ? 
Int[] : cumsum(x) function lookup_parts(ps::AbstractArray, subdmns::DomainBlocks{N}, d::ArrayDomain{N}) where N groups = map(group_indices, subdmns.cumlength, indexes(d)) sz = map(length, groups) - pieces = Array{Union{Chunk,Thunk}}(undef, sz) + pieces = Array{Any}(undef, sz) for i = CartesianIndices(sz) idx_and_dmn = map(getindex, groups, i.I) idx = map(x->x[1], idx_and_dmn) dmn = ArrayDomain(map(x->x[2], idx_and_dmn)) - pieces[i] = delayed(getindex)(ps[idx...], project(subdmns[idx...], dmn)) + pieces[i] = Dagger.@spawn getindex(ps[idx...], project(subdmns[idx...], dmn)) end out_cumlength = map(g->_cumsum(map(x->length(x[2]), g)), groups) out_dmn = DomainBlocks(ntuple(x->1,Val(N)), out_cumlength) pieces, out_dmn end - """ - compute(ctx::Context, x::DArray; persist=true, options=nothing) - -A `DArray` object may contain a thunk in it, in which case -we first turn it into a `Thunk` and then compute it. -""" -function compute(ctx::Context, x::DArray; persist=true, options=nothing) - thunk = thunkize(ctx, x, persist=persist) - if isa(thunk, Thunk) - compute(ctx, thunk; options=options) - else - x - end -end - -""" - thunkize(ctx::Context, c::DArray; persist=true) + Base.fetch(c::DArray) If a `DArray` tree has a `Thunk` in it, make the whole thing a big thunk. """ -function thunkize(ctx::Context, c::DArray; persist=true) +function Base.fetch(c::DArray) if any(istask, chunks(c)) thunks = chunks(c) sz = size(thunks) dmn = domain(c) dmnchunks = domainchunks(c) - if persist - foreach(persist!, thunks) - end - Thunk(thunks...; meta=true) do results... + fetch(Dagger.spawn(Options(meta=true), thunks...) do results... t = eltype(results[1]) - DArray(t, dmn, dmnchunks, - reshape(Union{Chunk,Thunk}[results...], sz)) - end + DArray(t, dmn, dmnchunks, reshape(Any[results...], sz)) + end) else c end @@ -324,7 +304,7 @@ Distribute(p::Blocks, data::AbstractArray) = function stage(ctx::Context, d::Distribute) if isa(d.data, ArrayOp) - # distributing a dsitributed array + # distributing a distributed array x = cached_stage(ctx, d.data) if d.domainchunks == domainchunks(x) return x # already properly distributed @@ -335,19 +315,22 @@ function stage(ctx::Context, d::Distribute) cs = map(d.domainchunks) do idx chunks = cached_stage(ctx, x[idx]).chunks shape = size(chunks) - (delayed() do shape, parts... + hash = Base.hash(idx, Base.hash(stage, Base.hash(d.data))) + Dagger.spawn(shape, chunks...) do shape, parts... if prod(shape) == 0 return Array{T}(undef, shape) end dimcatfuncs = [(x...) -> concat(x..., dims=i) for i in 1:length(shape)] ps = reshape(Any[parts...], shape) collect(treereduce_nd(dimcatfuncs, ps)) - end)(shape, chunks...) 
+ end end else - cs = map(c -> delayed(identity)(d.data[c]), d.domainchunks) + cs = map(d.domainchunks) do c + hash = Base.hash(c, Base.hash(stage, Base.hash(d.data))) + Dagger.@spawn hash=hash identity(d.data[c]) + end end - DArray( eltype(d.data), domain(d.data), @@ -357,7 +340,7 @@ function stage(ctx::Context, d::Distribute) end function distribute(x::AbstractArray, dist) - compute(Distribute(dist, x)) + fetch(Distribute(dist, x)) end function distribute(x::AbstractArray{T,N}, n::NTuple{N}) where {T,N} diff --git a/src/array/getindex.jl b/src/array/getindex.jl index d04ae024f..89d38bb97 100644 --- a/src/array/getindex.jl +++ b/src/array/getindex.jl @@ -34,9 +34,9 @@ end function stage(ctx::Context, gidx::GetIndexScalar) inp = cached_stage(ctx, gidx.input) s = view(inp, ArrayDomain(gidx.idx)) - delayed(identity)(collect(s)[1]) + Dagger.@spawn identity(collect(s)[1]) end Base.getindex(c::ArrayOp, idx::ArrayDomain) = GetIndex(c, indexes(idx)) Base.getindex(c::ArrayOp, idx...) = GetIndex(c, idx) -Base.getindex(c::ArrayOp, idx::Integer...) = compute(GetIndexScalar(c, idx)) +Base.getindex(c::ArrayOp, idx::Integer...) = fetch(GetIndexScalar(c, idx)) diff --git a/src/array/map-reduce.jl b/src/array/map-reduce.jl index e39019ed0..76bdcd727 100644 --- a/src/array/map-reduce.jl +++ b/src/array/map-reduce.jl @@ -19,7 +19,7 @@ function stage(ctx::Context, node::Map) f = node.f for i=eachindex(domains) inps = map(x->chunks(x)[i], inputs) - thunks[i] = Thunk((args...) -> map(f, args...), inps...) + thunks[i] = Dagger.@spawn map(f, inps...) end DArray(Any, domain(primary), domainchunks(primary), thunks) end @@ -40,16 +40,16 @@ end function stage(ctx::Context, r::ReduceBlock) inp = stage(ctx, r.input) - reduced_parts = map(x -> Thunk(r.op, x; get_result=r.get_result), chunks(inp)) - Thunk((xs...) -> r.op_master(xs), reduced_parts...; meta=true) + reduced_parts = map(x -> (Dagger.@spawn get_result=r.get_result r.op(x)), chunks(inp)) + r_op_master(args...,) = r.op_master(args) + Dagger.@spawn meta=true r_op_master(reduced_parts...) 
end reduceblock_async(f, x::ArrayOp; get_result=true) = ReduceBlock(f, f, x, get_result) reduceblock_async(f, g::Function, x::ArrayOp; get_result=true) = ReduceBlock(f, g, x, get_result) -reduceblock(f, x::ArrayOp) = compute(reduceblock_async(f, x)) -reduceblock(f, g::Function, x::ArrayOp) = - compute(reduceblock_async(f, g, x)) +reduceblock(f, x::ArrayOp) = fetch(reduceblock_async(f, x)) +reduceblock(f, g::Function, x::ArrayOp) = fetch(reduceblock_async(f, g, x)) reduce_async(f::Function, x::ArrayOp) = reduceblock_async(xs->reduce(f,xs), xs->reduce(f,xs), x) @@ -62,7 +62,7 @@ prod(f::Function, x::ArrayOp) = reduceblock(a->prod(f, a), prod, x) mean(x::ArrayOp) = reduceblock(mean, mean, x) -mapreduce(f::Function, g::Function, x::ArrayOp) = reduce(g, map(f, x)) +mapreduce(f::Function, g::Function, x::ArrayOp) = reduce(g, map(f, x)) #think about fetching function mapreducebykey_seq(f, op, itr, dict=Dict()) for x in itr @@ -115,7 +115,7 @@ end function reduce(f::Function, x::ArrayOp; dims = nothing) if dims === nothing - return compute(reduce_async(f,x)) + return fetch(reduce_async(f,x)) elseif dims isa Int dims = (dims,) end @@ -126,10 +126,10 @@ function stage(ctx::Context, r::Reducedim) inp = cached_stage(ctx, r.input) thunks = let op = r.op, dims=r.dims # do reducedim on each block - tmp = map(p->Thunk(b->reduce(op,b,dims=dims), p), chunks(inp)) + tmp = map(p->Dagger.spawn(b->reduce(op,b,dims=dims), p), chunks(inp)) # combine the results in tree fashion treereducedim(tmp, r.dims) do x,y - Thunk(op, x,y) + Dagger.@spawn op(x,y) end end c = domainchunks(inp) diff --git a/src/array/matrix.jl b/src/array/matrix.jl index 686dfb3b8..48a77190e 100644 --- a/src/array/matrix.jl +++ b/src/array/matrix.jl @@ -17,10 +17,12 @@ function size(x::Transpose) end transpose(x::ArrayOp) = Transpose(transpose, x) -transpose(x::Union{Chunk, Thunk}) = Thunk(transpose, x) + +transpose(x::Union{Chunk, EagerThunk}) = @spawn transpose(x) adjoint(x::ArrayOp) = Transpose(adjoint, x) -adjoint(x::Union{Chunk, Thunk}) = Thunk(adjoint, x) +adjoint(x::Union{Chunk, EagerThunk}) = @spawn adjoint(x) + function adjoint(x::ArrayDomain{2}) d = indexes(x) @@ -32,7 +34,7 @@ function adjoint(x::ArrayDomain{1}) end function _ctranspose(x::AbstractArray) - Any[delayed(adjoint)(x[j,i]) for i=1:size(x,2), j=1:size(x,1)] + Any[Dagger.@spawn adjoint(x[j,i]) for i=1:size(x,2), j=1:size(x,1)] end function stage(ctx::Context, node::Transpose) @@ -91,8 +93,12 @@ function (+)(a::ArrayDomain, b::ArrayDomain) a end -(*)(a::Union{Chunk, Thunk}, b::Union{Chunk, Thunk}) = Thunk(*, a,b) -(+)(a::Union{Chunk, Thunk}, b::Union{Chunk, Thunk}) = Thunk(+, a,b) +struct BinaryComputeOp{F} end +BinaryComputeOp{F}(x::Union{Chunk,EagerThunk}, y::Union{Chunk,EagerThunk}) where F = @spawn F(x, y) +BinaryComputeOp{F}(x, y) where F = F(x, y) + +const AddComputeOp = BinaryComputeOp{+} +const MulComputeOp = BinaryComputeOp{*} # we define our own matmat and matvec multiply # for computing the new domains and thunks. 
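As a reviewer-facing note (not part of the diff): the `BinaryComputeOp` wrapper introduced above replaces the old `(*)`/`(+)` methods on `Chunk`/`Thunk`, applying the wrapped function eagerly to plain values and spawning a task when either operand is a `Chunk` or `EagerThunk`, roughly:

```julia
# Illustrative only; AddComputeOp/MulComputeOp are internal to Dagger's matrix code
MulComputeOp(2, 3)         # plain values: evaluated eagerly, returns 6
t = Dagger.@spawn rand(4, 4)
s = AddComputeOp(t, t)     # EagerThunk operand: returns a spawned task
fetch(s) == 2 .* fetch(t)  # true once both tasks finish
```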
@@ -101,7 +107,7 @@ function _mul(a::Matrix, b::Matrix; T=eltype(a)) n = size(a, 2) for i=1:size(a,1) for j=1:size(b, 2) - c[i,j] = treereduce(+, map(*, reshape(a[i,:], (n,)), b[:, j])) + c[i,j] = treereduce(AddComputeOp, map(MulComputeOp, reshape(a[i,:], (n,)), b[:, j])) end end c @@ -111,14 +117,14 @@ function _mul(a::Matrix, b::Vector; T=eltype(b)) c = Array{T}(undef, size(a,1)) n = size(a,2) for i=1:size(a,1) - c[i] = treereduce(+, map(*, reshape(a[i, :], (n,)), b)) + c[i] = treereduce(AddComputeOp, map(MulComputeOp, reshape(a[i, :], (n,)), b)) end c end function _mul(a::Vector, b::Vector; T=eltype(b)) @assert length(b) == 1 - [x * b[1] for x in a] + [MulComputeOp(x, b[1]) for x in a] end function promote_distribution(ctx::Context, m::MatMul, a,b) @@ -176,7 +182,7 @@ function stage(ctx::Context, mul::MatMul) a, b = stage_operands(ctx, mul, mul.a, mul.b) d = domain(a)*domain(b) DArray(Any, d, domainchunks(a)*domainchunks(b), - _mul(chunks(a), chunks(b); T=Thunk)) + _mul(chunks(a), chunks(b); T=Any)) end Base.power_by_squaring(x::DArray, i::Int) = foldl(*, ntuple(idx->x, i)) @@ -211,7 +217,7 @@ end function _scale(l, r) res = similar(r, Any) for i=1:length(l) - res[i,:] = map(x->Thunk((a,b) -> Diagonal(a)*b, l[i], x), r[i,:]) + res[i,:] = map(x->Dagger.spawn((a,b) -> Diagonal(a)*b, l[i], x), r[i,:]) end res end diff --git a/src/array/operators.jl b/src/array/operators.jl index 6bfe95ae6..640db4ea9 100644 --- a/src/array/operators.jl +++ b/src/array/operators.jl @@ -90,8 +90,7 @@ function stage(ctx::Context, node::BCast) end blcks = DomainBlocks(map(_->1, size(node)), cumlengths) - thunks = broadcast(delayed((args...)->broadcast(bc.f, args...); ), - args2...) + thunks = broadcast((args3...)->Dagger.spawn((args...)->broadcast(bc.f, args...), args3...), args2...) DArray(eltype(node), domain(node), blcks, thunks) end @@ -107,7 +106,7 @@ Base.@deprecate mappart(args...) mapchunk(args...) function stage(ctx::Context, node::MapChunk) inputs = map(x->cached_stage(ctx, x), node.input) thunks = map(map(chunks, inputs)...) do ps... - Thunk(node.f, ps...) + Dagger.spawn(node.f, map(p->nothing=>p, ps)...) end DArray(Any, domain(inputs[1]), domainchunks(inputs[1]), thunks) diff --git a/src/array/setindex.jl b/src/array/setindex.jl index e2c842901..45ae35c64 100644 --- a/src/array/setindex.jl +++ b/src/array/setindex.jl @@ -28,14 +28,14 @@ function stage(ctx::Context, sidx::SetIndex) groups = map(group_indices, subdmns.cumlength, indexes(d)) sz = map(length, groups) - pieces = Array{Union{Chunk, Thunk}}(undef, sz) + pieces = Array{Any}(undef, sz) for i = CartesianIndices(sz) idx_and_dmn = map(getindex, groups, i.I) idx = map(x->x[1], idx_and_dmn) local_dmn = ArrayDomain(map(x->x[2], idx_and_dmn)) s = subdmns[idx...] part_to_set = sidx.val - ps[idx...] = Thunk(ps[idx...]) do p + ps[idx...] = Dagger.spawn(ps[idx...]) do p q = copy(p) q[indexes(project(s, local_dmn))...] .= part_to_set q diff --git a/src/array/sort.jl b/src/array/sort.jl index cbc9b21f1..7c989e05c 100644 --- a/src/array/sort.jl +++ b/src/array/sort.jl @@ -3,6 +3,7 @@ using Distributed using StatsBase + function getmedians(x, n) q,r = divrem(length(x), n+1) @@ -28,6 +29,7 @@ function sortandsample_array(ord, xs, nsamples, presorted=false) chunk = nothing # avoid communicating metadata if already sorted samples = chunk[r] end + (chunk, samples) end @@ -89,7 +91,7 @@ end function collect_merge(merge, group) #delayed((xs...) -> treereduce(merge, Any[xs...]))(group...) - t = treereduce(delayed(merge), group) + t = treereduce((args...) 
-> (Dagger.@spawn merge(args...)), group) end # Given sorted chunks, splits each chunk according to splitters @@ -117,15 +119,15 @@ function splitchunk(c, splitters, by, sub, ord) 1:j end - between = map((hi, lo) -> delayed(x->sub(x, getbetween(by(x), hi, lo, ord)))(c), + between = map((hi, lo) -> Dagger.spawn(x->sub(x, getbetween(by(x), hi, lo, ord)), c), splitters[1:end-1], splitters[2:end]) hi = splitters[1] lo = splitters[end] - a = delayed(x->sub(x, getlt(by(x), hi, ord)))(c) + a = Dagger.spawn(x->sub(x, getlt(by(x), hi, ord)), c) b = between - c = delayed(x->sub(x, getgt(by(x), lo, ord)))(c) - return vcat(a, b, c) + c = Dagger.spawn(x->sub(x, getgt(by(x), lo, ord)), c) + return map(fetch, vcat(a, b, c)) #[delayed(c->sub(c, getlt(by(c), hi, ord)))(c); # between; delayed(c->sub(c, getgt(by(c), lo, ord)))(c)] @@ -263,17 +265,17 @@ function dsort_chunks(cs, nchunks=length(cs), nsamples=2000; sortandsample = (x,ns, presorted)->sortandsample_array(order, x,ns, presorted), affinities=workers(), ) - if splitters !== nothing + if splitters != nothing # this means splitters are given beforehand nsamples = 0 # no samples needed end # first sort each chunk and sample nsamples elements from each chunk - cs1 = map(c->delayed(sortandsample)(c, nsamples, chunks_presorted), cs) - + cs1 = map(c->Dagger.spawn(sortandsample, c, nsamples, chunks_presorted), cs) batchsize = max(2, batchsize) - # collect the samples - xs = collect(treereduce(delayed(vcat), cs1)) + + xs = treereduce((cs...)->Dagger.spawn(vcat, cs...), cs1) + xs = collect(fetch(xs)) if length(cs1) == 1 xs = [xs] end @@ -283,10 +285,9 @@ function dsort_chunks(cs, nchunks=length(cs), nsamples=2000; samples = reduce((a,b)->merge_sorted(order, a, b), map(x->x[2], xs)) splitters = getmedians(samples, nchunks-1) end - - # if first(x) === nothing this means that - # the chunk was already sorted, so just - # use the input chunk as-is + # if first(x) === nothing this means that + # the chunk was already sorted, so just + # use the input chunk as-is cs2 = map((x,c) -> x === nothing ? c : x, map(first, xs), cs) # main sort routine. At this point: @@ -304,7 +305,7 @@ function dsort_chunks(cs, nchunks=length(cs), nsamples=2000; end function propagate_affinity!(c, aff) - if !isa(c, Thunk) + if !isa(c, EagerThunk) return end if c.affinity !== nothing @@ -326,17 +327,13 @@ function Base.sort(v::ArrayOp; batchsize=max(2, nworkers()), nsamples=2000, order::Ordering=default_ord) - v1 = compute(v) + v1 = fetch(v) ord = Base.Sort.ord(lt,by,rev,order) nchunks = nchunks === nothing ? length(v1.chunks) : nchunks cs = dsort_chunks(v1.chunks, nchunks, nsamples, order=ord, merge=(x,y)->merge_sorted(ord, x,y)) - foreach(persist!, cs) - t=delayed((xs...)->[xs...]; meta=true)(cs...) - # `compute(t)` only computes references to materialized version of `cs` - # we don't want the scheduler to think that `cs`s' job is done - # so we call persist! above - chunks = compute(t) + t=Dagger.spawn((xs...)->[xs...], cs...) 
+ chunks = fetch(t) dmn = ArrayDomain((1:sum(length(domain(c)) for c in chunks),)) DArray(eltype(v1), dmn, map(domain, chunks), chunks) end diff --git a/src/chunks.jl b/src/chunks.jl index 5c3d1533f..b8f2a0cd0 100644 --- a/src/chunks.jl +++ b/src/chunks.jl @@ -267,6 +267,16 @@ function savechunk(data, dir, f) Chunk{typeof(data),typeof(fr),typeof(proc),typeof(scope)}(typeof(data), domain(data), fr, proc, scope, true) end +struct WeakChunk + x::WeakRef +end +WeakChunk(c::Chunk) = WeakChunk(WeakRef(c)) +unwrap_weak(c::WeakChunk) = c.x.value +function unwrap_weak_checked(c::WeakChunk) + c = unwrap_weak(c) + @assert c !== nothing + return c +end Base.@deprecate_binding AbstractPart Union{Chunk, Thunk} Base.@deprecate_binding Part Chunk diff --git a/src/compute.jl b/src/compute.jl index d895f79db..af034cb81 100644 --- a/src/compute.jl +++ b/src/compute.jl @@ -12,11 +12,6 @@ collect(d::Union{Chunk,Thunk}; options=nothing) = abstract type Computation end -compute(ctx, c::Computation; options=nothing) = - compute(ctx, stage(ctx, c); options=options) -collect(c::Computation; options=nothing) = - collect(Context(global_context()), c; options=options) - """ compute(ctx::Context, d::Thunk; options=nothing) -> Chunk @@ -97,7 +92,7 @@ function dependents(node::Thunk) if !haskey(deps, next) deps[next] = Set{Thunk}() end - for inp in inputs(next) + for (_, inp) in next.inputs if istask(inp) || (inp isa Chunk) s = get!(()->Set{Thunk}(), deps, inp) push!(s, next) @@ -165,7 +160,7 @@ function order(node::Thunk, ndeps) haskey(output, next) && continue s += 1 output[next] = s - parents = filter(istask, inputs(next)) + parents = filter(istask, map(last, next.inputs)) if !isempty(parents) # If parents is empty, sort! should be a no-op, but raises an ambiguity error # when InlineStrings.jl is loaded (at least, version 1.1.0), because InlineStrings diff --git a/src/lib/util.jl b/src/lib/util.jl index 977007dd4..67cca689b 100644 --- a/src/lib/util.jl +++ b/src/lib/util.jl @@ -152,6 +152,8 @@ function setindex(x::NTuple{N}, idx, v) where N map(ifelse, ntuple(x->idx === x, Val(N)), ntuple(x->v, Val(N)), x) end +export treereducedim + function showloc(f, argcount) args = ntuple(x->Any, argcount) ms = methods(f) diff --git a/src/processor.jl b/src/processor.jl index e99399dc0..24739cd43 100644 --- a/src/processor.jl +++ b/src/processor.jl @@ -1,5 +1,7 @@ export OSProc, Context, addprocs!, rmprocs! +import Base: @invokelatest + """ Processor @@ -29,11 +31,11 @@ function delete_processor_callback!(name::Symbol) end """ - execute!(proc::Processor, f, args...) -> Any + execute!(proc::Processor, f, args...; kwargs...) -> Any -Executes the function `f` with arguments `args` on processor `proc`. This -function can be overloaded by `Processor` subtypes to allow executing function -calls differently than normal Julia. +Executes the function `f` with arguments `args` and keyword arguments `kwargs` +on processor `proc`. This function can be overloaded by `Processor` subtypes to +allow executing function calls differently than normal Julia. """ function execute! end @@ -152,13 +154,14 @@ end iscompatible(proc::ThreadProc, opts, f, args...) 
= true iscompatible_func(proc::ThreadProc, opts, f) = true iscompatible_arg(proc::ThreadProc, opts, x) = true -function execute!(proc::ThreadProc, @nospecialize(f), @nospecialize(args...)) +function execute!(proc::ThreadProc, @nospecialize(f), @nospecialize(args...); @nospecialize(kwargs...)) tls = get_tls() task = Task() do set_tls!(tls) TimespanLogging.prof_task_put!(tls.sch_handle.thunk_id.id) - f(args...) + @invokelatest f(args...; kwargs...) end + task.sticky = true ret = ccall(:jl_set_task_tid, Cint, (Any, Cint), task, proc.tid-1) if ret == 0 error("jl_set_task_tid == 0") @@ -333,8 +336,7 @@ get_tls() = ( task_hash=task_local_storage(:_dagger_task_hash), input_hash=get(task_local_storage(), :_dagger_input_hash, nothing), processor=thunk_processor(), - time_utilization=task_local_storage(:_dagger_time_utilization), - alloc_utilization=task_local_storage(:_dagger_alloc_utilization), + task_spec=task_local_storage(:_dagger_task_spec), ) """ @@ -350,6 +352,7 @@ function set_tls!(tls) task_local_storage(:_dagger_input_hash, tls.input_hash) end task_local_storage(:_dagger_processor, get(tls, :processor, nothing)) + task_local_storage(:_dagger_task_spec, get(tls, :task_spec, nothing)) task_local_storage(:_dagger_time_utilization, get(tls, :time_utilization, nothing)) task_local_storage(:_dagger_alloc_utilization, get(tls, :alloc_utilization, nothing)) end diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl index bee0717ef..587899f3b 100644 --- a/src/sch/Sch.jl +++ b/src/sch/Sch.jl @@ -6,10 +6,14 @@ import MemPool: DRef, StorageResource import MemPool: poolset, storage_available, storage_capacity, storage_utilized, externally_varying import Statistics: mean import Random: randperm +import Base: @invokelatest import ..Dagger -import ..Dagger: Context, Processor, Thunk, WeakThunk, ThunkFuture, ThunkFailedException, Chunk, OSProc, AnyScope +import ..Dagger: Context, Processor, Thunk, WeakThunk, ThunkFuture, ThunkFailedException, Chunk, WeakChunk, OSProc, AnyScope, DefaultScope, LockedObject import ..Dagger: order, dependents, noffspring, istask, inputs, unwrap_weak_checked, affinity, tochunk, timespan_start, timespan_finish, procs, move, chunktype, processor, default_enabled, get_processors, get_parent, execute!, rmprocs!, addprocs!, thunk_processor, constrain, cputhreadtime, uhash +import DataStructures: PriorityQueue, enqueue!, dequeue_pair!, peek + +import ..Dagger const OneToMany = Dict{Thunk, Set{Thunk}} @@ -96,8 +100,10 @@ struct ComputeState chan::RemoteChannel{Channel{Any}} end +const UID_COUNTER = Threads.Atomic{UInt64}(1) + function start_state(deps::Dict, node_order, chan) - state = ComputeState(rand(UInt64), + state = ComputeState(Threads.atomic_add!(UID_COUNTER, UInt64(1)), OneToMany(), deps, Vector{Thunk}(undef, 0), @@ -124,7 +130,7 @@ function start_state(deps::Dict, node_order, chan) for k in sort(collect(keys(deps)), by=node_order) if istask(k) - waiting = Set{Thunk}(Iterators.filter(istask, inputs(k))) + waiting = Set{Thunk}(Iterators.filter(istask, map(last, inputs(k)))) if isempty(waiting) push!(state.ready, k) else @@ -142,13 +148,13 @@ end Stores DAG-global options to be passed to the Dagger.Sch scheduler. # Arguments -- `single::Int=0`: Force all work onto worker with specified id. `0` disables -this option. -- `proclist=nothing`: Force scheduler to use one or more processors that are -instances/subtypes of a contained type. 
Alternatively, a function can be
-supplied, and the function will be called with a processor as the sole
-argument and should return a `Bool` result to indicate whether or not to use
-the given processor. `nothing` enables all default processors.
+- `single::Int=0`: (Deprecated) Force all work onto worker with specified id.
+`0` disables this option.
+- `proclist=nothing`: (Deprecated) Force scheduler to use one or more
+processors that are instances/subtypes of a contained type. Alternatively, a
+function can be supplied, and the function will be called with a processor as
+the sole argument and should return a `Bool` result to indicate whether or not
+to use the given processor. `nothing` enables all default processors.
- `allow_errors::Bool=true`: Allow thunks to error without affecting
non-dependent thunks.
- `checkpoint=nothing`: If not `nothing`, uses the provided function to save
@@ -175,23 +181,27 @@ end
Stores Thunk-local options to be passed to the Dagger.Sch scheduler.
# Arguments
-- `single::Int=0`: Force thunk onto worker with specified id. `0` disables this
-option.
-- `proclist=nothing`: Force thunk to use one or more processors that are
-instances/subtypes of a contained type. Alternatively, a function can be
-supplied, and the function will be called with a processor as the sole
+- `single::Int=0`: (Deprecated) Force thunk onto worker with specified id. `0`
+disables this option.
+- `proclist=nothing`: (Deprecated) Force thunk to use one or more processors
+that are instances/subtypes of a contained type. Alternatively, a function can
+be supplied, and the function will be called with a processor as the sole
argument and should return a `Bool` result to indicate whether or not to use
the given processor. `nothing` enables all default processors.
-- `time_util::Dict{Type,Any}=Dict{Type,Any}()`: Indicates the maximum expected
-time utilization for this thunk. Each keypair maps a processor type to the
-utilization, where the value can be a real (approximately the number of
-nanoseconds taken), or `MaxUtilization()` (utilizes all processors of this
-type). By default, the scheduler assumes that this thunk only uses one
-processor.
-- `alloc_util::Dict{Type,UInt64}=Dict{Type,UInt64}()`: Indicates the maximum
-expected memory utilization for this thunk. Each keypair maps a processor type
-to the utilization, where the value is an integer representing approximately
-the maximum number of bytes allocated at any one time.
+- `time_util::Dict{Type,Any}`: Indicates the maximum expected time utilization
+for this thunk. Each keypair maps a processor type to the utilization, where
+the value can be a real (approximately the number of nanoseconds taken), or
+`MaxUtilization()` (utilizes all processors of this type). By default, the
+scheduler assumes that this thunk only uses one processor.
+- `alloc_util::Dict{Type,UInt64}`: Indicates the maximum expected memory
+utilization for this thunk. Each keypair maps a processor type to the
+utilization, where the value is an integer representing approximately the
+maximum number of bytes allocated at any one time.
+- `occupancy::Dict{Type,Real}`: Indicates the maximum expected processor
+occupancy for this thunk. Each keypair maps a processor type to the
+occupancy, where the value can be a real between 0 and 1 (the occupancy
+ratio, where 1 is full occupancy). By default, the scheduler assumes that this
+thunk has full occupancy.
- `allow_errors::Bool=true`: Allow this thunk to error without affecting
non-dependent thunks.
- `checkpoint=nothing`: If not `nothing`, uses the provided function to save @@ -214,15 +224,13 @@ Base.@kwdef struct ThunkOptions proclist = nothing time_util::Union{Dict{Type,Any},Nothing} = nothing alloc_util::Union{Dict{Type,UInt64},Nothing} = nothing + occupancy::Union{Dict{Type,Real},Nothing} = nothing allow_errors::Union{Bool,Nothing} = nothing checkpoint = nothing restore = nothing storage::Union{Chunk,Nothing} = nothing end -# Eager scheduling -include("eager.jl") - """ Base.merge(sopts::SchedulerOptions, topts::ThunkOptions) -> ThunkOptions @@ -236,6 +244,7 @@ function Base.merge(sopts::SchedulerOptions, topts::ThunkOptions) proclist, topts.time_util, topts.alloc_util, + topts.occupancy, allow_errors, topts.checkpoint, topts.restore, @@ -268,6 +277,7 @@ function populate_defaults(opts::ThunkOptions, Tf, Targs) maybe_default(:proclist), maybe_default(:time_util), maybe_default(:alloc_util), + maybe_default(:occupancy), maybe_default(:allow_errors), maybe_default(:checkpoint), maybe_default(:restore), @@ -278,6 +288,9 @@ end function cleanup(ctx) end +# Eager scheduling +include("eager.jl") + const WORKER_MONITOR_LOCK = Threads.ReentrantLock() const WORKER_MONITOR_TASKS = Dict{Int,Task}() const WORKER_MONITOR_CHANS = Dict{Int,Dict{UInt64,RemoteChannel}}() @@ -464,6 +477,8 @@ function scheduler_init(ctx, state::ComputeState, d::Thunk, options, deps) end function scheduler_run(ctx, state::ComputeState, d::Thunk, options) + @dagdebug nothing :global "Initializing scheduler" uid=state.uid + safepoint(state) # Loop while we still have thunks to execute @@ -477,12 +492,14 @@ function scheduler_run(ctx, state::ComputeState, d::Thunk, options) isempty(state.running) && continue timespan_start(ctx, :take, 0, 0) + @dagdebug nothing :take "Waiting for results" chan_value = take!(state.chan) # get result of completed thunk timespan_finish(ctx, :take, 0, 0) if chan_value isa RescheduleSignal continue end pid, proc, thunk_id, (res, metadata) = chan_value + @dagdebug thunk_id :take "Got finished task" gproc = OSProc(pid) safepoint(state) lock(state.lock) do @@ -527,7 +544,7 @@ function scheduler_run(ctx, state::ComputeState, d::Thunk, options) state.errored[node] = thunk_failed if node.options !== nothing && node.options.checkpoint !== nothing try - node.options.checkpoint(node, res) + @invokelatest node.options.checkpoint(node, res) catch err report_catch_error(err, "Thunk checkpoint failed") end @@ -556,6 +573,8 @@ function scheduler_run(ctx, state::ComputeState, d::Thunk, options) return value, errored end function scheduler_exit(ctx, state::ComputeState, options) + @dagdebug nothing :global "Tearing down scheduler" uid=state.uid + close(state.chan) notify(state.halt) @sync for p in procs_to_use(ctx) @@ -596,7 +615,7 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) populate_processor_cache_list!(state, procs) # Schedule tasks - to_fire = Dict{Tuple{OSProc,<:Processor},Vector{Tuple{Thunk,<:Any,<:Any}}}() + to_fire = Dict{Tuple{OSProc,<:Processor},Vector{Tuple{Thunk,<:Any,<:Any,UInt64,UInt32}}}() failed_scheduling = Thunk[] # Select a new task and get its options @@ -610,7 +629,22 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) end task = pop!(state.ready) timespan_start(ctx, :schedule, task.id, (;thunk_id=task.id)) - @assert !haskey(state.cache, task) + if haskey(state.cache, task) + if haskey(state.errored, task) + # An error was eagerly propagated to this task + finish_failed!(state, task) + else + # This shouldn't have happened + iob = IOBuffer() + println(iob, 
"Scheduling inconsistency: Task being scheduled is already cached!") + println(iob, " Task: $(task.id)") + println(iob, " Cache Entry: $(typeof(state.cache[task]))") + ex = SchedulingException(String(take!(iob))) + state.cache[task] = ex + state.errored[task] = true + end + @goto pop_task + end opts = merge(ctx.options, task.options) sig = signature(task, state) if task.hash == UInt(0) @@ -622,9 +656,14 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) scope = if task.f isa Chunk task.f.scope else - AnyScope() + if task.options.proclist !== nothing + # proclist overrides scope selection + AnyScope() + else + DefaultScope() + end end - for input in task.inputs + for (_,input) in task.inputs input = unwrap_weak_checked(input) chunk = if istask(input) state.cache[input] @@ -653,7 +692,7 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) @goto fallback end - inputs = collect_task_inputs(state, task) + inputs = map(last, collect_task_inputs(state, task)) opts = populate_defaults(opts, chunktype(task.f), map(chunktype, inputs)) local_procs, costs = estimate_task_costs(state, local_procs, task, inputs) scheduled = false @@ -670,18 +709,24 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) for proc in local_procs gproc = get_parent(proc) - if can_use_proc(task, gproc, proc, opts, scope) - has_cap, est_time_util, est_alloc_util = has_capacity(state, proc, gproc.pid, opts.time_util, opts.alloc_util, sig) + can_use, scope = can_use_proc(task, gproc, proc, opts, scope) + if can_use + has_cap, est_time_util, est_alloc_util, est_occupancy = + has_capacity(state, proc, gproc.pid, opts.time_util, opts.alloc_util, opts.occupancy, sig) if has_cap # Schedule task onto proc # FIXME: est_time_util = est_time_util isa MaxUtilization ? cap : est_time_util - push!(get!(()->Vector{Tuple{Thunk,<:Any,<:Any}}(), to_fire, (gproc, proc)), (task, est_time_util, est_alloc_util)) - state.worker_time_pressure[gproc.pid][proc] = get(state.worker_time_pressure[gproc.pid], proc, UInt64(0)) + est_time_util + proc_tasks = get!(to_fire, (gproc, proc)) do + Vector{Tuple{Thunk,<:Any,<:Any,UInt64,UInt32}}() + end + push!(proc_tasks, (task, scope, est_time_util, est_alloc_util, est_occupancy)) + state.worker_time_pressure[gproc.pid][proc] = get(state.worker_time_pressure[gproc.pid], proc, UInt64(0)) + est_time_util + @dagdebug task :schedule "Scheduling to $gproc -> $proc" @goto pop_task end end end - state.cache[task] = SchedulingException("No processors available, try making proclist more liberal") + state.cache[task] = SchedulingException("No processors available, try widening scope") state.errored[task] = true set_failed!(state, task) @goto pop_task @@ -694,8 +739,10 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) cap, extra_util = nothing, nothing procs_found = false # N.B. 
if we only have one processor, we need to select it now - if can_use_proc(task, entry.gproc, entry.proc, opts, scope) - has_cap, est_time_util, est_alloc_util = has_capacity(state, entry.proc, entry.gproc.pid, opts.time_util, opts.alloc_util, sig) + can_use, scope = can_use_proc(task, entry.gproc, entry.proc, opts, scope) + if can_use + has_cap, est_time_util, est_alloc_util, est_occupancy = + has_capacity(state, entry.proc, entry.gproc.pid, opts.time_util, opts.alloc_util, opts.occupancy, sig) if has_cap selected_entry = entry else @@ -711,15 +758,17 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) if procs_found push!(failed_scheduling, task) else - state.cache[task] = SchedulingException("No processors available, try making proclist more liberal") + state.cache[task] = SchedulingException("No processors available, try widening scope") state.errored[task] = true set_failed!(state, task) end @goto pop_task end - if can_use_proc(task, entry.gproc, entry.proc, opts, scope) - has_cap, est_time_util, est_alloc_util = has_capacity(state, entry.proc, entry.gproc.pid, opts.time_util, opts.alloc_util, sig) + can_use, scope = can_use_proc(task, entry.gproc, entry.proc, opts, scope) + if can_use + has_cap, est_time_util, est_alloc_util, est_occupancy = + has_capacity(state, entry.proc, entry.gproc.pid, opts.time_util, opts.alloc_util, opts.occupancy, sig) if has_cap # Select this processor selected_entry = entry @@ -738,7 +787,10 @@ function schedule!(ctx, state, procs=procs_to_use(ctx)) # Schedule task onto proc gproc, proc = entry.gproc, entry.proc est_time_util = est_time_util isa MaxUtilization ? cap : est_time_util - push!(get!(()->Vector{Tuple{Thunk,<:Any,<:Any}}(), to_fire, (gproc, proc)), (task, est_time_util, est_alloc_util)) + proc_tasks = get!(to_fire, (gproc, proc)) do + Vector{Tuple{Thunk,<:Any,<:Any,UInt64,UInt32}}() + end + push!(proc_tasks, (task, scope, est_time_util, est_alloc_util, est_occupancy)) # Proceed to next entry to spread work state.procs_cache_list[] = state.procs_cache_list[].next @@ -856,13 +908,13 @@ function evict_chunks!(log_sink, chunks::Set{Chunk}) nothing end -fire_task!(ctx, thunk::Thunk, p, state; time_util=10^9, alloc_util=10^6) = - fire_task!(ctx, (thunk, time_util, alloc_util), p, state) -fire_task!(ctx, (thunk, time_util, alloc_util)::Tuple{Thunk,<:Any}, p, state) = - fire_tasks!(ctx, [(thunk, time_util, alloc_util)], p, state) +fire_task!(ctx, thunk::Thunk, p, state; scope=AnyScope(), time_util=10^9, alloc_util=10^6, occupancy=typemax(UInt32)) = + fire_task!(ctx, (thunk, scope, time_util, alloc_util, occupancy), p, state) +fire_task!(ctx, (thunk, scope, time_util, alloc_util, occupancy)::Tuple{Thunk,<:Any}, p, state) = + fire_tasks!(ctx, [(thunk, scope, time_util, alloc_util, occupancy)], p, state) function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state) to_send = [] - for (thunk, time_util, alloc_util) in thunks + for (thunk, scope, time_util, alloc_util, occupancy) in thunks push!(state.running, thunk) state.running_on[thunk] = gproc if thunk.cache && thunk.cache_ref !== nothing @@ -881,7 +933,7 @@ function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state) end if thunk.options !== nothing && thunk.options.restore !== nothing try - result = thunk.options.restore(thunk) + result = @invokelatest thunk.options.restore(thunk) if result isa Chunk state.cache[thunk] = result state.errored[thunk] = false @@ -898,10 +950,13 @@ function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state) ids = Int[0] data = 
Any[thunk.f] hashes = Union{UInt,Nothing}[uhash(thunk.f, UInt(0))] - for (idx, x) in enumerate(thunk.inputs) + positions = Union{Symbol,Nothing}[] + for (idx, pos_x) in enumerate(thunk.inputs) + pos, x = pos_x x = unwrap_weak_checked(x) push!(ids, istask(x) ? x.id : -idx) push!(data, istask(x) ? state.cache[x] : x) + push!(positions, pos) push!(hashes, uhash(x, UInt(0))) end toptions = thunk.options !== nothing ? thunk.options : ThunkOptions() @@ -912,10 +967,11 @@ function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state) sch_handle = SchedulerHandle(ThunkID(thunk.id, nothing), state.worker_chans[gproc.pid]...) # TODO: De-dup common fields (log_sink, uid, etc.) + push!(to_send, Any[thunk.id, thunk.hash, time_util, alloc_util, chunktype(thunk.f), data, thunk.get_result, thunk.persist, thunk.cache, thunk.meta, options, - propagated, ids, hashes, + propagated, ids, positions, hashes, (log_sink=ctx.log_sink, profile=ctx.profile), sch_handle, state.uid]) end @@ -940,38 +996,333 @@ function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state) end end +@static if VERSION >= v"1.9" +const Doorbell = Base.Event +else +# We need a sticky, resetable signal +mutable struct Doorbell + waiter::Union{Task,Nothing} + @atomic sleeping::Int + Doorbell() = new(nothing, 0) +end +function Base.wait(db::Doorbell) + db.waiter = current_task() + while true + _, succ = @atomicreplace db.sleeping 0 => 1 + if succ + # No messages, wait for someone to wake us + wait() + end + _, succ = @atomicreplace db.sleeping 2 => 0 + if succ + # We had a notification + return + end + end +end +function Base.notify(db::Doorbell) + while true + if (@atomic db.sleeping) == 2 + # Doorbell already rung + return + end + + _, succ = @atomicreplace db.sleeping 0 => 2 + if succ + # Task was definitely busy, we're done + return + end + + _, succ = @atomicreplace db.sleeping 1 => 2 + if succ + # Task was sleeping, wake it and wait for it to awaken + waiter = db.waiter + @assert waiter !== nothing + waiter::Task + schedule(waiter) + while true + sleep_value = @atomic db.sleeping + if sleep_value == 0 || sleep_value == 2 + return + end + #if waiter._state === Base.task_state_runnable && t.queue === nothing + # schedule(waiter) + #else + yield() + #end + end + end + end +end +end + +struct ProcessorInternalState + ctx::Context + proc::Processor + queue::LockedObject{PriorityQueue{Vector{Any}, UInt32, Base.Order.ForwardOrdering}} + reschedule::Doorbell + tasks::Dict{Int,Task} + proc_occupancy::Base.RefValue{UInt32} + time_pressure::Base.RefValue{UInt64} +end +struct ProcessorState + state::ProcessorInternalState + runner::Task +end + +const PROCESSOR_TASK_STATE = LockedObject(Dict{UInt64,Dict{Processor,ProcessorState}}()) + +function proc_states(f::Base.Callable, uid::UInt64) + lock(PROCESSOR_TASK_STATE) do all_states + if !haskey(all_states, uid) + all_states[uid] = Dict{Processor,ProcessorState}() + end + our_states = all_states[uid] + return f(our_states) + end +end +proc_states(f::Base.Callable) = + proc_states(f, task_local_storage(:_dagger_sch_uid)::UInt64) + +task_tid_for_processor(::Processor) = nothing +task_tid_for_processor(proc::Dagger.ThreadProc) = proc.tid + +stealing_permitted(::Processor) = true +stealing_permitted(proc::Dagger.ThreadProc) = proc.owner != 1 || proc.tid != 1 + +proc_has_occupancy(proc_occupancy, task_occupancy) = + UInt64(task_occupancy) + UInt64(proc_occupancy) <= typemax(UInt32) + +function start_processor_runner!(istate::ProcessorInternalState, uid::UInt64, 
return_queue::RemoteChannel) + to_proc = istate.proc + proc_run_task = @task begin + ctx = istate.ctx + tasks = istate.tasks + proc_occupancy = istate.proc_occupancy + time_pressure = istate.time_pressure + + while isopen(return_queue) + # Wait for new tasks + @dagdebug nothing :processor "Waiting for tasks" + timespan_start(ctx, :proc_run_wait, to_proc, nothing) + wait(istate.reschedule) + @static if VERSION >= v"1.9" + reset(istate.reschedule) + end + timespan_finish(ctx, :proc_run_wait, to_proc, nothing) + + # Fetch a new task to execute + @dagdebug nothing :processor "Trying to dequeue" + timespan_start(ctx, :proc_run_fetch, to_proc, nothing) + task_and_occupancy = lock(istate.queue) do queue + # Only steal if there are multiple queued tasks, to prevent + # ping-pong of tasks between empty queues + if length(queue) == 0 + @dagdebug nothing :processor "Nothing to dequeue" + return nothing + end + _, occupancy = peek(queue) + if !proc_has_occupancy(proc_occupancy[], occupancy) + @dagdebug nothing :processor "Insufficient occupancy" proc_occupancy=proc_occupancy[] task_occupancy=occupancy + return nothing + end + return dequeue_pair!(queue) + end + if task_and_occupancy === nothing + timespan_finish(ctx, :proc_run_fetch, to_proc, nothing) + + @dagdebug nothing :processor "Failed to dequeue" + + if !stealing_permitted(to_proc) + continue + end + + if proc_occupancy[] == typemax(UInt32) + continue + end + + @dagdebug nothing :processor "Trying to steal" + + # Try to steal a task + timespan_start(ctx, :steal_local, to_proc, nothing) + + # Try to steal from local queues randomly + # TODO: Prioritize stealing from busiest processors + states = collect(proc_states(values, uid)) + # TODO: Try to pre-allocate this + P = randperm(length(states)) + for state in getindex.(Ref(states), P) + other_istate = state.state + if other_istate.proc === to_proc + continue + end + # FIXME: We need to lock two queues to compare occupancies + proc_occupancy_cached = lock(istate.queue) do _ + proc_occupancy[] + end + task_and_occupancy = lock(other_istate.queue) do queue + if length(queue) == 0 + return nothing + end + task, occupancy = peek(queue) + scope = task[5] + if !isa(constrain(scope, Dagger.ExactScope(to_proc)), + Dagger.InvalidScope) && + typemax(UInt32) - proc_occupancy_cached >= occupancy + # Compatible, steal this task + return dequeue_pair!(queue) + end + return nothing + end + if task_and_occupancy !== nothing + from_proc = other_istate.proc + thunk_id = task[1] + @dagdebug thunk_id :processor "Stolen from $from_proc by $to_proc" + timespan_finish(ctx, :steal_local, to_proc, (;from_proc, thunk_id)) + # TODO: Keep stealing until we hit full occupancy? 
+ @goto execute + end + end + timespan_finish(ctx, :steal_local, to_proc, nothing) + + # TODO: Try to steal from remote queues + + continue + end + + @label execute + task, task_occupancy = task_and_occupancy + thunk_id = task[1] + time_util = task[2] + timespan_finish(ctx, :proc_run_fetch, to_proc, (;thunk_id, proc_occupancy=proc_occupancy[], task_occupancy)) + @dagdebug thunk_id :processor "Dequeued task" + + # Execute the task and return its result + t = @task begin + result = try + do_task(to_proc, task) + catch err + bt = catch_backtrace() + (CapturedException(err, bt), nothing) + finally + lock(istate.queue) do _ + delete!(tasks, thunk_id) + proc_occupancy[] -= task_occupancy + time_pressure[] -= time_util + end + notify(istate.reschedule) + end + try + put!(return_queue, (myid(), to_proc, thunk_id, result)) + catch err + if unwrap_nested_exception(err) isa InvalidStateException || !isopen(return_queue) + @dagdebug thunk_id :execute "Return queue is closed, failing to put result" chan=return_queue exception=(err, catch_backtrace()) + else + rethrow(err) + end + end + end + lock(istate.queue) do _ + tid = task_tid_for_processor(to_proc) + if tid !== nothing + t.sticky = true + ret = ccall(:jl_set_task_tid, Cint, (Any, Cint), t, tid-1) + else + t.sticky = false + end + tasks[thunk_id] = errormonitor(schedule(t)) + proc_occupancy[] += task_occupancy + time_pressure[] += time_util + end + end + end + tid = task_tid_for_processor(to_proc) + if tid !== nothing + proc_run_task.sticky = true + ret = ccall(:jl_set_task_tid, Cint, (Any, Cint), proc_run_task, tid-1) + else + proc_run_task.sticky = false + end + return errormonitor(schedule(proc_run_task)) +end + """ - do_tasks(to_proc, chan, tasks) + do_tasks(to_proc, return_queue, tasks) -Executes a batch of tasks on `to_proc`. +Executes a batch of tasks on `to_proc`, returning their results through +`return_queue`. 
""" -function do_tasks(to_proc, chan, tasks) - for task in tasks - thunk_id = task[1] - should_launch = lock(TASK_SYNC) do - # Already running; don't try to re-launch - if !(thunk_id in TASKS_RUNNING) - push!(TASKS_RUNNING, thunk_id) - true - else - false +function do_tasks(to_proc, return_queue, tasks) + @dagdebug nothing :processor "Enqueuing task batch" batch_size=length(tasks) + + # FIXME: This is terrible + ctx_vars = first(tasks)[16] + ctx = Context(Processor[]; log_sink=ctx_vars.log_sink, profile=ctx_vars.profile) + uid = first(tasks)[18] + state = proc_states(uid) do states + get!(states, to_proc) do + queue = PriorityQueue{Vector{Any}, UInt32}() + queue_locked = LockedObject(queue) + reschedule = Doorbell() + istate = ProcessorInternalState(ctx, to_proc, + queue_locked, reschedule, + Dict{Int,Task}(), + Ref(UInt32(0)), Ref(UInt64(0))) + runner = start_processor_runner!(istate, uid, return_queue) + @static if VERSION < v"1.9" + reschedule.waiter = runner end + return ProcessorState(istate, runner) end - should_launch || continue - @async begin - try - result = do_task(to_proc, task) - put!(chan, (myid(), to_proc, thunk_id, result)) - catch ex - bt = catch_backtrace() - put!(chan, (myid(), to_proc, thunk_id, (CapturedException(ex, bt), nothing))) + end + istate = state.state + lock(istate.queue) do queue + for task in tasks + thunk_id = task[1] + occupancy = task[4] + timespan_start(ctx, :enqueue, (;to_proc, thunk_id), nothing) + should_launch = lock(TASK_SYNC) do + # Already running; don't try to re-launch + if !(thunk_id in TASKS_RUNNING) + push!(TASKS_RUNNING, thunk_id) + true + else + false + end end + should_launch || continue + enqueue!(queue, task, occupancy) + timespan_finish(ctx, :enqueue, (;to_proc, thunk_id), nothing) + @dagdebug thunk_id :processor "Enqueued task" end end + notify(istate.reschedule) + + # Kick other processors to make them steal + # TODO: Alternatively, automatically balance work instead of blindly enqueueing + states = collect(proc_states(values, uid)) + P = randperm(length(states)) + for other_state in getindex.(Ref(states), P) + other_istate = other_state.state + if other_istate.proc === to_proc + continue + end + notify(other_istate.reschedule) + end + @dagdebug nothing :processor "Kicked processors" end -"Executes a single task on `to_proc`." -function do_task(to_proc, comm) - thunk_id, task_hash, est_time_util, est_alloc_util, Tf, data, send_result, persist, cache, meta, options, propagated, ids, hashes, ctx_vars, sch_handle, sch_uid = comm + +""" + do_task(to_proc, task_desc) -> Any + +Executes a single task specified by `task_desc` on `to_proc`. 
+""" +function do_task(to_proc, task_desc) + thunk_id, task_hash, est_time_util, est_alloc_util, + Tf, data, + send_result, persist, cache, meta, + options, propagated, ids, positions, hashes, + ctx_vars, sch_handle, sch_uid = task_desc ctx = Context(Processor[]; log_sink=ctx_vars.log_sink, profile=ctx_vars.profile) from_proc = OSProc() @@ -1005,7 +1356,6 @@ function do_task(to_proc, comm) "[$(myid()), $thunk_id] $f($Tdata) $msg: $est_alloc_util | $real_alloc_util/$storage_cap" end end - lock(TASK_SYNC) do while true # Get current time utilization for the selected processor @@ -1047,6 +1397,8 @@ function do_task(to_proc, comm) end timespan_finish(ctx, :storage_wait, thunk_id, (;f, to_proc, device=typeof(to_storage))) + @dagdebug thunk_id :execute "Moving data" + # Initiate data transfers for function and arguments transfer_time = Threads.Atomic{UInt64}(0) transfer_size = Threads.Atomic{UInt64}(0) @@ -1071,7 +1423,8 @@ function do_task(to_proc, comm) # TODO: Choose "closest" processor of same type first some_proc = first(keys(CHUNK_CACHE[x])) some_x = CHUNK_CACHE[x][some_proc] - move(some_proc, to_proc, some_x) + @dagdebug thunk_id :move "Cache hit for argument $id at $some_proc: $some_x" + @invokelatest move(some_proc, to_proc, some_x) end) else nothing @@ -1083,13 +1436,16 @@ function do_task(to_proc, comm) else # Fetch it time_start = time_ns() - _x = move(to_proc, x) + from_proc = processor(x) + _x = @invokelatest move(from_proc, to_proc, x) time_finish = time_ns() if x.handle.size !== nothing Threads.atomic_add!(transfer_time, time_finish - time_start) Threads.atomic_add!(transfer_size, x.handle.size) end + @dagdebug thunk_id :move "Cache miss for argument $id at $from_proc" + # Update cache lock(TASK_SYNC) do CHUNK_CACHE[x] = Dict{Processor,Any}() @@ -1099,21 +1455,34 @@ function do_task(to_proc, comm) _x end else - move(to_proc, x) + @invokelatest move(to_proc, x) end + @dagdebug thunk_id :move "Moved argument $id to $to_proc: $x" timespan_finish(ctx, :move, (;thunk_id, id), (;f, id, data=x); tasks=[Base.current_task()]) return x end end fetched = Any[] + @debug fetch_tasks for task in fetch_tasks push!(fetched, fetch_report(task)) end if meta append!(fetched, data[2:end]) end + f = popfirst!(fetched) @assert !(f isa Chunk) "Failed to unwrap thunk function" + fetched_args = Any[] + fetched_kwargs = Pair{Symbol,Any}[] + for (idx, x) in enumerate(fetched) + pos = positions[idx] + if pos === nothing + push!(fetched_args, x) + else + push!(fetched_kwargs, pos => x) + end + end #= FIXME: If MaxUtilization, stop processors and wait if (est_time_util isa MaxUtilization) && (real_time_util > 0) @@ -1132,6 +1501,8 @@ function do_task(to_proc, comm) # FIXME #gcnum_start = Base.gc_num() + @dagdebug thunk_id :execute "Executing" + result_meta = try # Set TLS variables Dagger.set_tls!(( @@ -1139,13 +1510,12 @@ function do_task(to_proc, comm) sch_handle=sch_handle, task_hash, processor=to_proc, - time_utilization=est_time_util, - alloc_utilization=est_alloc_util, + task_spec=task_desc, )) res = Dagger.with_options(propagated) do # Execute - execute!(to_proc, f, fetched...) + execute!(to_proc, f, fetched_args...; fetched_kwargs...) end # Check if result is safe to store @@ -1167,6 +1537,7 @@ function do_task(to_proc, comm) bt = catch_backtrace() RemoteException(myid(), CapturedException(ex, bt)) end + threadtime = cputhreadtime() - threadtime_start # FIXME: This is not a realistic measure of max. 
required memory #gc_allocd = min(max(UInt64(Base.gc_num().allocd) - UInt64(gcnum_start.allocd), UInt64(0)), UInt64(1024^4)) @@ -1176,6 +1547,9 @@ function do_task(to_proc, comm) pop!(TASKS_RUNNING, thunk_id) notify(TASK_SYNC) end + + @dagdebug thunk_id :execute "Returning" + # TODO: debug_storage("Releasing $to_storage_name") metadata = ( time_pressure=real_time_util[], @@ -1187,7 +1561,7 @@ function do_task(to_proc, comm) gc_allocd=(isa(result_meta, Chunk) ? result_meta.handle.size : 0), transfer_rate=(transfer_size[] > 0 && transfer_time[] > 0) ? round(UInt64, transfer_size[] / (transfer_time[] / 10^9)) : nothing, ) - (result_meta, metadata) + return (result_meta, metadata) end end # module Sch diff --git a/src/sch/dynamic.jl b/src/sch/dynamic.jl index 80d88091b..679514b83 100644 --- a/src/sch/dynamic.jl +++ b/src/sch/dynamic.jl @@ -144,7 +144,7 @@ function _register_future!(ctx, state, task, tid, (future, id, check)::Tuple{Thu if t == target return true end - for input in t.inputs + for (_, input) in t.inputs # N.B. Skips expired tasks input = Dagger.unwrap_weak(input) istask(input) || continue @@ -195,21 +195,23 @@ function _get_dag_ids(ctx, state, task, tid, _) end "Adds a new Thunk to the DAG." -add_thunk!(f, h::SchedulerHandle, args...; future=nothing, ref=nothing, kwargs...) = - exec!(_add_thunk!, h, f, args, kwargs, future, ref) -function _add_thunk!(ctx, state, task, tid, (f, args, kwargs, future, ref)) +add_thunk!(f, h::SchedulerHandle, args...; future=nothing, ref=nothing, options...) = + exec!(_add_thunk!, h, f, args, options, future, ref) +function _add_thunk!(ctx, state, task, tid, (f, args, options, future, ref)) timespan_start(ctx, :add_thunk, tid, 0) - _args = map(arg->arg isa ThunkID ? state.thunk_dict[arg.id] : arg, args) + _args = map(pos_arg->pos_arg[1] => (pos_arg[2] isa ThunkID ? state.thunk_dict[pos_arg[2].id] : pos_arg[2]), args) GC.@preserve _args begin - thunk = Thunk(f, _args...; kwargs...) + thunk = Thunk(f, _args...; options...) # Create a `DRef` to `thunk` so that the caller can preserve it thunk_ref = poolset(thunk; size=64, device=MemPool.CPURAMDevice()) thunk_id = ThunkID(thunk.id, thunk_ref) state.thunk_dict[thunk.id] = WeakThunk(thunk) reschedule_inputs!(state, thunk) + @dagdebug thunk :submit "Added to scheduler" if future !== nothing # Ensure we attach a future before the thunk is scheduled _register_future!(ctx, state, task, tid, (future, thunk_id, false)) + @dagdebug thunk :submit "Registered future" end if ref !== nothing # Preserve the `EagerThunkFinalizer` through `thunk` diff --git a/src/sch/eager.jl b/src/sch/eager.jl index 0b15543e2..1bafc0874 100644 --- a/src/sch/eager.jl +++ b/src/sch/eager.jl @@ -18,12 +18,15 @@ function init_eager() ctx = eager_context() @async try sopts = SchedulerOptions(;allow_errors=true) - topts = ThunkOptions(;single=1) + scope = Dagger.ExactScope(Dagger.ThreadProc(1, 1)) atexit() do EAGER_FORCE_KILL[] = true close(EAGER_THUNK_CHAN) end - Dagger.compute(ctx, Dagger.delayed(eager_thunk; options=topts)(); options=sopts) + opts = Dagger.Options((;scope, + occupancy=Dict(Dagger.ThreadProc=>0))) + Dagger.compute(ctx, Dagger.delayed(eager_thunk, opts)(); + options=sopts) catch err iob = IOContext(IOBuffer(), :color=>true) println(iob, "Error in eager scheduler:") @@ -37,41 +40,47 @@ function init_eager() end end -"Adjusts the scheduler's cached pressure indicators for the specified worker by -the specified amount, and signals the scheduler to try scheduling again if -pressure decreased." 
-function adjust_pressure!(h::SchedulerHandle, proc::Processor, pressure) - uid = Dagger.get_tls().sch_uid - lock(TASK_SYNC) do - PROCESSOR_TIME_UTILIZATION[uid][proc][] += pressure - notify(TASK_SYNC) - end - exec!(_adjust_pressure!, h, myid(), proc, pressure) -end -function _adjust_pressure!(ctx, state, task, tid, (pid, proc, pressure)) - state.worker_time_pressure[pid][proc] += pressure - if pressure < 0 - put!(state.chan, RescheduleSignal()) - end - nothing -end - -"Allows a thunk to safely wait on another thunk, by temporarily reducing its -effective pressure to 0." +""" +Allows a thunk to safely wait on another thunk by temporarily reducing its +effective occupancy to 0, which allows a newly-spawned task to run. +""" function thunk_yield(f) if Dagger.in_thunk() h = sch_handle() tls = Dagger.get_tls() - proc = tls.processor - util = tls.time_utilization - adjust_pressure!(h, proc, -util) + proc = Dagger.thunk_processor() + proc_istate = proc_states(tls.sch_uid) do states + states[proc].state + end + task_occupancy = tls.task_spec[4] + + # Decrease our occupancy and inform the processor to reschedule + lock(proc_istate.queue) do _ + proc_istate.proc_occupancy[] -= task_occupancy + @assert 0 <= proc_istate.proc_occupancy[] <= typemax(UInt32) + end + notify(proc_istate.reschedule) try - f() + # Run the yielding code + return f() finally - adjust_pressure!(h, proc, util) + # Wait for processor to have occupancy to run this task + while true + ready = lock(proc_istate.queue) do _ + @assert 0 <= proc_istate.proc_occupancy[] <= typemax(UInt32) + if proc_has_occupancy(proc_istate.proc_occupancy[], task_occupancy) + proc_istate.proc_occupancy[] += task_occupancy + @assert 0 <= proc_istate.proc_occupancy[] <= typemax(UInt32) + return true + end + return false + end + ready && break + yield() + end end else - f() + return f() end end @@ -83,14 +92,21 @@ function eager_thunk() nothing end tls = Dagger.get_tls() - # Don't apply pressure from this thunk - adjust_pressure!(h, tls.processor, -tls.time_utilization) while isopen(EAGER_THUNK_CHAN) try added_future, future, uid, ref, f, args, opts = take!(EAGER_THUNK_CHAN) # preserve inputs until they enter the scheduler tid = GC.@preserve args begin - _args = map(x->x isa Dagger.EagerThunk ? ThunkID(EAGER_ID_MAP[x.uid], x.thunk_ref) : x, args) + _args = map(args) do pos_x + pos, x = pos_x + if x isa Dagger.EagerThunk + return pos => ThunkID(EAGER_ID_MAP[x.uid], x.thunk_ref) + elseif x isa Dagger.Chunk + return pos => WeakChunk(x) + else + return pos => x + end + end add_thunk!(f, h, _args...; future=future, ref=ref, opts...) end EAGER_ID_MAP[uid] = tid.id diff --git a/src/sch/fault-handler.jl b/src/sch/fault-handler.jl index 401f47096..0782af3a2 100644 --- a/src/sch/fault-handler.jl +++ b/src/sch/fault-handler.jl @@ -44,7 +44,7 @@ function handle_fault(ctx, state, deadproc) # Remove thunks from state.ready that have inputs on the deadlist for idx in length(state.ready):-1:1 rt = state.ready[idx] - if any((input in deadlist) for input in rt.inputs) + if any((input in deadlist) for input in map(last, rt.inputs)) deleteat!(state.ready, idx) end end diff --git a/src/sch/util.jl b/src/sch/util.jl index 5ac2f0eb6..a95d727f7 100644 --- a/src/sch/util.jl +++ b/src/sch/util.jl @@ -1,3 +1,37 @@ +const DAGDEBUG_CATEGORIES = Symbol[:global, :submit, :schedule, :scope, + :take, :execute, :processor] +macro dagdebug(thunk, category, msg, args...) 
+ cat_sym = category.value + @gensym id + debug_ex_id = :(@debug "[$($id)] ($($(repr(cat_sym)))) $($msg)" _module=Dagger _file=$(string(__source__.file)) _line=$(__source__.line)) + append!(debug_ex_id.args, args) + debug_ex_noid = :(@debug "($($(repr(cat_sym)))) $($msg)" _module=Dagger _file=$(string(__source__.file)) _line=$(__source__.line)) + append!(debug_ex_noid.args, args) + esc(quote + let $id = -1 + if $thunk isa Integer + $id = Int($thunk) + elseif $thunk isa Thunk + $id = $thunk.id + elseif $thunk === nothing + $id = 0 + else + @warn "Unsupported thunk argument to @dagdebug: $(typeof($thunk))" + $id = -1 + end + if $id > 0 + if $(QuoteNode(cat_sym)) in $DAGDEBUG_CATEGORIES + $debug_ex_id + end + elseif $id == 0 + if $(QuoteNode(cat_sym)) in $DAGDEBUG_CATEGORIES + $debug_ex_noid + end + end + end + end) +end + """ unwrap_nested_exception(err::Exception) -> Bool @@ -14,7 +48,7 @@ function get_propagated_options(thunk) nt = NamedTuple() for key in thunk.propagates value = if key == :scope - isa(thunk.f, Chunk) ? thunk.f.scope : AnyScope() + isa(thunk.f, Chunk) ? thunk.f.scope : DefaultScope() elseif key == :processor isa(thunk.f, Chunk) ? thunk.f.processor : OSProc() elseif key in fieldnames(Thunk) @@ -44,7 +78,7 @@ end of all chunks that can now be evicted from workers." function cleanup_inputs!(state, node) to_evict = Set{Chunk}() - for inp in node.inputs + for (_, inp) in node.inputs inp = unwrap_weak_checked(inp) if !istask(inp) && !(inp isa Chunk) continue @@ -108,12 +142,12 @@ function reschedule_inputs!(state, thunk, seen=Set{Thunk}()) continue end w = get!(()->Set{Thunk}(), state.waiting, thunk) - for input in thunk.inputs + for (_, input) in thunk.inputs input = unwrap_weak_checked(input) input in seen && continue # Unseen - if istask(input) || (input isa Chunk) + if istask(input) || isa(input, Chunk) push!(get!(()->Set{Thunk}(), state.waiting_data, input), thunk) end istask(input) || continue @@ -145,6 +179,9 @@ function set_failed!(state, origin, thunk=origin) filter!(x->x!==thunk, state.ready) state.cache[thunk] = ThunkFailedException(thunk, origin, state.cache[origin]) state.errored[thunk] = true + finish_failed!(state, thunk, origin) +end +function finish_failed!(state, thunk, origin=nothing) fill_registered_futures!(state, thunk, true) if haskey(state.waiting_data, thunk) for dep in state.waiting_data[thunk] @@ -152,7 +189,7 @@ function set_failed!(state, origin, thunk=origin) delete!(state.waiting, dep) haskey(state.errored, dep) && continue - set_failed!(state, origin, dep) + origin !== nothing && set_failed!(state, origin, dep) end delete!(state.waiting_data, thunk) end @@ -191,7 +228,7 @@ function print_sch_status(io::IO, state, thunk; offset=0, limit=5, max_inputs=3) print(io, "($(status_string(thunk))) ") end println(io, "$(thunk.id): $(thunk.f)") - for (idx,input) in enumerate(thunk.inputs) + for (idx, (_, input)) in enumerate(thunk.inputs) if input isa WeakThunk input = unwrap_weak(input) if input === nothing @@ -252,50 +289,69 @@ end chunktype(x) = typeof(x) function signature(task::Thunk, state) sig = Any[chunktype(task.f)] - append!(sig, collect_task_inputs(state, task)) - sig + for (pos, input) in collect_task_inputs(state, task) + # N.B. 
Skips kwargs + if pos === nothing + push!(sig, chunktype(input)) + end + end + return sig end function can_use_proc(task, gproc, proc, opts, scope) # Check against proclist - if opts.proclist === nothing - if !default_enabled(proc) - @debug "Rejected $proc: !default_enabled(proc)" - return false - end - elseif opts.proclist isa Function - if !Base.invokelatest(opts.proclist, proc) - @debug "Rejected $proc: proclist(proc) == false" - return false + if opts.proclist !== nothing + @warn "The `proclist` option is deprecated, please use scopes instead\nSee https://juliaparallel.org/Dagger.jl/stable/scopes/ for details" maxlog=1 + if opts.proclist isa Function + if !Base.invokelatest(opts.proclist, proc) + @dagdebug task :scope "Rejected $proc: proclist(proc) == false" + return false, scope + end + scope = constrain(scope, Dagger.ExactScope(proc)) + elseif opts.proclist isa Vector + if !(typeof(proc) in opts.proclist) + @dagdebug task :scope "Rejected $proc: !(typeof(proc) in proclist)" + return false, scope + end + scope = constrain(scope, + Dagger.UnionScope(map(Dagger.ProcessorTypeScope, opts.proclist))) + else + throw(SchedulingException("proclist must be a Function, Vector, or nothing")) end - elseif opts.proclist isa Vector - if !(typeof(proc) in opts.proclist) - @debug "Rejected $proc: !(typeof(proc) in proclist)" - return false + if scope isa Dagger.InvalidScope + @dagdebug task :scope "Rejected $proc: Not contained in task scope ($scope)" + return false, scope end - else - throw(SchedulingException("proclist must be a Function, Vector, or nothing")) end # Check against single if opts.single !== nothing + @warn "The `single` option is deprecated, please use scopes instead\nSee https://juliaparallel.org/Dagger.jl/stable/scopes/ for details" maxlog=1 if gproc.pid != opts.single - @debug "Rejected $proc: gproc.pid != single" - return false + @dagdebug task :scope "Rejected $proc: gproc.pid ($(gproc.pid)) != single ($(opts.single))" + return false, scope + end + scope = constrain(scope, Dagger.ProcessScope(opts.single)) + if scope isa Dagger.InvalidScope + @dagdebug task :scope "Rejected $proc: Not contained in task scope ($scope)" + return false, scope end end - # Check scope + # Check against scope proc_scope = Dagger.ExactScope(proc) if constrain(scope, proc_scope) isa Dagger.InvalidScope - @debug "Rejected $proc: Task scope ($scope) vs. 
processor scope ($proc_scope)" - return false + @dagdebug task :scope "Rejected $proc: Not contained in task scope ($scope)" + return false, scope end - return true + @label accept + + @dagdebug task :scope "Accepted $proc" + return true, scope end -function has_capacity(state, p, gp, time_util, alloc_util, sig) +function has_capacity(state, p, gp, time_util, alloc_util, occupancy, sig) T = typeof(p) # FIXME: MaxUtilization est_time_util = round(UInt64, if time_util !== nothing && haskey(time_util, T) @@ -306,8 +362,14 @@ function has_capacity(state, p, gp, time_util, alloc_util, sig) est_alloc_util = if alloc_util !== nothing && haskey(alloc_util, T) alloc_util[T] else - get(state.signature_alloc_cost, sig, 0) - end + get(state.signature_alloc_cost, sig, UInt64(0)) + end::UInt64 + est_occupancy = if occupancy !== nothing && haskey(occupancy, T) + # Clamp to 0-1, and scale between 0 and `typemax(UInt32)` + Base.unsafe_trunc(UInt32, clamp(occupancy[T], 0, 1) * typemax(UInt32)) + else + typemax(UInt32) + end::UInt32 #= FIXME: Estimate if cached data can be swapped to storage storage = storage_resource(p) real_alloc_util = state.worker_storage_pressure[gp][storage] @@ -316,7 +378,7 @@ function has_capacity(state, p, gp, time_util, alloc_util, sig) return false, est_time_util, est_alloc_util end =# - return true, est_time_util, est_alloc_util + return true, est_time_util, est_alloc_util, est_occupancy end function populate_processor_cache_list!(state, procs) @@ -361,12 +423,12 @@ end "Collects all arguments for `task`, converting Thunk inputs to Chunks." function collect_task_inputs(state, task) - inputs = Any[] - for input in task.inputs + inputs = Pair{Union{Symbol,Nothing},Any}[] + for (pos, input) in task.inputs input = unwrap_weak_checked(input) - push!(inputs, istask(input) ? state.cache[input] : input) + push!(inputs, pos => (istask(input) ? state.cache[input] : input)) end - inputs + return inputs end """ @@ -413,6 +475,11 @@ Walks the data contained in `x` in DFS fashion, and executes `f` at each object that hasn't yet been seen. """ function walk_data(f, @nospecialize(x)) + action = f(x) + if action !== missing + return action + end + seen = IdDict{Any,Nothing}() to_visit = Any[x] diff --git a/src/scopes.jl b/src/scopes.jl index 2b203f566..6a5af1e5f 100644 --- a/src/scopes.jl +++ b/src/scopes.jl @@ -1,18 +1,62 @@ -export AnyScope, UnionScope, NodeScope, ProcessScope, ExactScope +export AnyScope, DefaultScope, UnionScope, NodeScope, ProcessScope, ExactScope, ProcessorTypeScope abstract type AbstractScope end -"Default scope that is unconstrained." +"Widest scope that contains all processors." struct AnyScope <: AbstractScope end +abstract type AbstractScopeTaint end + +"Taints a scope for later evaluation." +struct TaintScope <: AbstractScope + scope::AbstractScope + taints::Set{AbstractScopeTaint} +end +Base.:(==)(ts1::TaintScope, ts2::TaintScope) = + ts1.scope == ts2.scope && + length(ts1.taints) == length(ts2.taints) && + all(collect(ts1.taints) .== collect(ts2.taints)) + +struct DefaultEnabledTaint <: AbstractScopeTaint end + +"Default scope that contains the set of `default_enabled` processors." +DefaultScope() = TaintScope(AnyScope(), + Set{AbstractScopeTaint}([DefaultEnabledTaint()])) + "Union of two or more scopes." 
struct UnionScope <: AbstractScope
    scopes::Tuple
+    function UnionScope(scopes::Tuple)
+        scope_set = Set{AbstractScope}()
+        for scope in scopes
+            if scope isa UnionScope
+                for subscope in scope.scopes
+                    push!(scope_set, subscope)
+                end
+            else
+                push!(scope_set, scope)
+            end
+        end
+        return new((collect(scope_set)...,))
+    end
end
UnionScope(scopes...) = UnionScope((scopes...,))
UnionScope(scopes::Vector{<:AbstractScope}) = UnionScope((scopes...,))
UnionScope(s::AbstractScope) = UnionScope((s,))
-UnionScope() = throw(ArgumentError("Cannot construct empty UnionScope"))
+UnionScope() = UnionScope(())
+
+function Base.:(==)(us1::UnionScope, us2::UnionScope)
+    if length(us1.scopes) != length(us2.scopes)
+        return false
+    end
+    scopes = Set{AbstractScope}(us2.scopes)
+    for scope in us1.scopes
+        if !(scope in scopes)
+            return false
+        end
+    end
+    return true
+end
"Scoped to the same physical node."
struct NodeScope <: AbstractScope
@@ -35,6 +79,13 @@ end
ProcessScope(p::OSProc) = ProcessScope(p.pid)
ProcessScope() = ProcessScope(myid())
+"Scoped to any processor with a given supertype."
+struct ProcessorTypeTaint{T} <: AbstractScopeTaint end
+
+ProcessorTypeScope(T) =
+    TaintScope(AnyScope(),
+               Set{AbstractScopeTaint}([ProcessorTypeTaint{T}()]))
+
"Scoped to a specific processor."
struct ExactScope <: AbstractScope
    parent::ProcessScope
@@ -48,6 +99,47 @@ struct InvalidScope <: AbstractScope
    y::AbstractScope
end
+# Show methods
+
+function Base.show(io::IO, scope::UnionScope)
+    indent = io isa IOContext ? get(io, :indent, 0) : 0
+    println(io, "UnionScope:")
+    indent += 2
+    inner_io = IOContext(io, :indent => indent)
+    if length(scope.scopes) == 0
+        print(io, " "^indent, "empty")
+        return
+    end
+    for (idx, inner_scope) in enumerate(scope.scopes)
+        print(io, " "^indent); print(inner_io, inner_scope)
+        idx < length(scope.scopes) && println(io)
+    end
+end
+function Base.show(io::IO, scope::TaintScope)
+    indent = io isa IOContext ? get(io, :indent, 0) : 0
+    println(io, "TaintScope:")
+    indent += 2
+    inner_io = IOContext(io, :indent => indent)
+    print(io, " "^indent, "scope: "); println(inner_io, scope.scope)
+    if length(scope.taints) == 0
+        print(io, " "^indent, "no taints")
+        return
+    end
+    println(io, " "^indent, "taints: ")
+    indent += 4
+    inner_io = IOContext(io, :indent => indent)
+    for (idx, taint) in enumerate(scope.taints)
+        print(io, " "^indent); print(inner_io, taint)
+        idx < length(scope.taints) && println(io)
+    end
+end
+Base.show(io::IO, scope::NodeScope) =
+    print(io, "NodeScope: node == $(scope.uuid)")
+Base.show(io::IO, scope::ProcessScope) =
+    print(io, "ProcessScope: worker == $(scope.wid)")
+Base.show(io::IO, scope::ExactScope) =
+    print(io, "ExactScope: processor == $(scope.processor)")
+
# Comparisons and constraint checking
constrain(x, y) = x < y ? constrain(y, x) : throw(MethodError(constrain, x, y))
@@ -58,7 +150,46 @@ Base.isless(x, ::AnyScope) = true
constrain(::AnyScope, ::AnyScope) = AnyScope()
constrain(::AnyScope, y) = y
+# N.B.
TaintScope taints constraining (until encountering an `ExactScope`) to +# allow lazy evaluation of the taint matching on the final processor +taint_match(::DefaultEnabledTaint, x::Processor) = default_enabled(x) +taint_match(::ProcessorTypeTaint{T}, x::Processor) where T = x isa T +Base.isless(::TaintScope, ::TaintScope) = false +Base.isless(::TaintScope, ::AnyScope) = true +Base.isless(::TaintScope, x) = false +Base.isless(x, ::TaintScope) = true +function constrain(x::TaintScope, y::TaintScope) + scope = constrain(x.scope, y.scope) + if scope isa InvalidScope + return scope + end + taints = Set{AbstractScopeTaint}() + for tx in x.taints + push!(taints, tx) + end + for ty in y.taints + push!(taints, ty) + end + return TaintScope(scope, taints) +end +function constrain(x::TaintScope, y) + scope = constrain(x.scope, y) + if scope isa InvalidScope + return scope + end + return TaintScope(scope, x.taints) +end +function constrain(x::TaintScope, y::ExactScope) + for taint in x.taints + if !taint_match(taint, y.processor) + return InvalidScope(x, y) + end + end + return constrain(x.scope, y) +end + Base.isless(::UnionScope, ::UnionScope) = false +Base.isless(::UnionScope, ::TaintScope) = true Base.isless(::UnionScope, ::AnyScope) = true function constrain(x::UnionScope, y::UnionScope) zs = Vector{AbstractScope}() @@ -75,12 +206,14 @@ end constrain(x::UnionScope, y) = constrain(x, UnionScope((y,))) Base.isless(::NodeScope, ::NodeScope) = false +Base.isless(::NodeScope, ::TaintScope) = true Base.isless(::NodeScope, ::AnyScope) = true constrain(x::NodeScope, y::NodeScope) = x == y ? y : InvalidScope(x, y) Base.isless(::ProcessScope, ::ProcessScope) = false Base.isless(::ProcessScope, ::NodeScope) = true +Base.isless(::ProcessScope, ::TaintScope) = true Base.isless(::ProcessScope, ::AnyScope) = true constrain(x::ProcessScope, y::ProcessScope) = x == y ? y : InvalidScope(x, y) @@ -90,6 +223,7 @@ constrain(x::NodeScope, y::ProcessScope) = Base.isless(::ExactScope, ::ExactScope) = false Base.isless(::ExactScope, ::ProcessScope) = true Base.isless(::ExactScope, ::NodeScope) = true +Base.isless(::ExactScope, ::TaintScope) = true Base.isless(::ExactScope, ::AnyScope) = true constrain(x::ExactScope, y::ExactScope) = x == y ? y : InvalidScope(x, y) @@ -97,3 +231,137 @@ constrain(x::ProcessScope, y::ExactScope) = x == y.parent ? y : InvalidScope(x, y) constrain(x::NodeScope, y::ExactScope) = x == y.parent.parent ? y : InvalidScope(x, y) + +### Scopes helper + +""" + scope(scs...) -> AbstractScope + scope(;scs...) -> AbstractScope + +Constructs an `AbstractScope` from a set of scope specifiers. Each element in +`scs` is a separate specifier; if `scs` is empty, an empty `UnionScope()` is +produced; if `scs` has one element, then exactly one specifier is constructed; +if `scs` has more than one element, a `UnionScope` of the scopes specified by +`scs` is constructed. A variety of specifiers can be passed to construct a +scope: +- `:any` - Constructs an `AnyScope()` +- `:default` - Constructs a `DefaultScope()` +- `(scs...,)` - Constructs a `UnionScope` of scopes, each specified by `scs` +- `thread=tid` or `threads=[tids...]` - Constructs an `ExactScope` or `UnionScope` containing all `Dagger.ThreadProc`s with thread ID `tid`/`tids` across all workers. +- `worker=wid` or `workers=[wids...]` - Constructs a `ProcessScope` or `UnionScope` containing all `Dagger.ThreadProc`s with worker ID `wid`/`wids` across all threads. 
+- `thread=tid`/`threads=tids` and `worker=wid`/`workers=wids` - Constructs an `ExactScope`, `ProcessScope`, or `UnionScope` containing all `Dagger.ThreadProc`s with worker ID `wid`/`wids` and threads `tid`/`tids`.
+
+Aside from the worker and thread specifiers, it's possible to add custom
+specifiers for scoping to other kinds of processors (like GPUs) or providing
+different ways to specify a scope. Specifier selection is determined by a
+precedence ordering: by default, all specifiers have precedence `0`, which can
+be changed by defining `scope_key_precedence(::Val{spec}) = precedence` (where
+`spec` is the specifier as a `Symbol`). The specifier with the highest
+precedence in a set of specifiers is used to determine the scope by calling
+`to_scope(::Val{spec}, sc::NamedTuple)` (where `sc` is the full set of
+specifiers), which should be overridden for each custom specifier, and which
+returns an `AbstractScope`. For example:
+
+```julia
+# Setup a GPU specifier
+Dagger.scope_key_precedence(::Val{:gpu}) = 1
+Dagger.to_scope(::Val{:gpu}, sc::NamedTuple) = ExactScope(MyGPUDevice(sc.worker, sc.gpu))
+
+# Generate an `ExactScope` for `MyGPUDevice` on worker 2, device 3
+Dagger.scope(gpu=3, worker=2)
+```
+"""
+scope(scs...) = simplified_union_scope(map(to_scope, scs))
+scope(; kwargs...) = to_scope((;kwargs...))
+
+function simplified_union_scope(scopes)
+    if length(scopes) == 1
+        return only(scopes)
+    else
+        return UnionScope(scopes)
+    end
+end
+to_scope(scope::AbstractScope) = scope
+function to_scope(sc::Symbol)
+    if sc == :any
+        return AnyScope()
+    elseif sc == :default
+        return DefaultScope()
+    else
+        throw(ArgumentError("Cannot construct scope from: $(repr(sc))"))
+    end
+end
+function to_scope(sc::NamedTuple)
+    if isempty(sc)
+        return UnionScope()
+    end
+
+    # FIXME: node and nodes
+    known_keys = (:worker, :workers, :thread, :threads)
+    unknown_keys = filter(key->!in(key, known_keys), keys(sc))
+    if length(unknown_keys) > 0
+        # Hand off construction if unknown members encountered
+        precs = map(scope_key_precedence, map(Val, unknown_keys))
+        max_prec = maximum(precs)
+        if length(findall(prec->prec==max_prec, precs)) > 1
+            throw(ArgumentError("Incompatible scope specifiers detected: $unknown_keys"))
+        end
+        max_prec_key = unknown_keys[argmax(precs)]
+        return to_scope(Val(max_prec_key), sc)
+    end
+
+    workers = if haskey(sc, :worker)
+        Int[sc.worker]
+    elseif haskey(sc, :workers)
+        Int[sc.workers...]
+    else
+        nothing
+    end
+    threads = if haskey(sc, :thread)
+        Int[sc.thread]
+    elseif haskey(sc, :threads)
+        Int[sc.threads...]
+ else + nothing + end + + # Simple cases + if workers !== nothing && threads !== nothing + subscopes = AbstractScope[] + for w in workers, t in threads + push!(subscopes, ExactScope(ThreadProc(w, t))) + end + return simplified_union_scope(subscopes) + elseif workers !== nothing && threads === nothing + subscopes = AbstractScope[ProcessScope(w) for w in workers] + return simplified_union_scope(subscopes) + end + + # More complex cases that require querying the cluster + # FIXME: Use per-field scope taint + if workers === nothing + workers = procs() + end + subscopes = AbstractScope[] + for w in workers + if threads === nothing + threads = map(c->c.tid, + filter(c->c isa ThreadProc, + collect(children(OSProc(w))))) + end + for t in threads + push!(subscopes, ExactScope(ThreadProc(w, t))) + end + end + return simplified_union_scope(subscopes) +end +to_scope(scs::Tuple) = + simplified_union_scope(map(to_scope, scs)) +to_scope(sc) = + throw(ArgumentError("Cannot construct scope from: $sc")) + +to_scope(::Val{key}, sc::NamedTuple) where key = + throw(ArgumentError("Scope construction not implemented for key: $key")) + +# Base case for all Dagger-owned keys +scope_key_precedence(::Val) = 0 diff --git a/src/thunk.jl b/src/thunk.jl index 603f53ac3..9e71ae274 100644 --- a/src/thunk.jl +++ b/src/thunk.jl @@ -1,4 +1,4 @@ -export Thunk, delayed, delayedmap +export Thunk, delayed const ID_COUNTER = Threads.Atomic{Int}(1) next_id() = Threads.atomic_add!(ID_COUNTER, 1) @@ -39,7 +39,7 @@ arguments are still passed as-is. - `processor::Processor=OSProc()` - The processor associated with `f`. Useful if `f` is a callable struct that exists on a given processor and should be transferred appropriately. -- `scope::Dagger.AbstractScope=AnyScope()` - The scope associated with `f`. +- `scope::Dagger.AbstractScope=DefaultScope()` - The scope associated with `f`. Useful if `f` is a function or callable struct that may only be transferred to, and executed within, the specified scope. @@ -50,7 +50,7 @@ If omitted, options can also be specified by passing key-value pairs as """ mutable struct Thunk f::Any # usually a Function, but could be any callable - inputs::Vector{Any} # TODO: Use `ImmutableArray` in 1.8 + inputs::Vector{Pair{Union{Symbol,Nothing},Any}} # TODO: Use `ImmutableArray` in 1.8 id::Int hash::UInt get_result::Bool # whether the worker should send the result or only the metadata @@ -82,9 +82,10 @@ mutable struct Thunk if !isa(f, Chunk) && (!isnothing(processor) || !isnothing(scope)) f = tochunk(f, something(processor, OSProc()), - something(scope, AnyScope())) + something(scope, DefaultScope())) end xs = Any[xs...] + @assert all(x->x isa Pair, xs) if options !== nothing @assert isempty(kwargs) new(f, xs, id, hash, get_result, meta, persist, cache, cache_ref, @@ -109,7 +110,7 @@ function affinity(t::Thunk) aff_vec = affinity(t.cache_ref) else aff = Dict{OSProc,Int}() - for inp in inputs(t) + for (_, inp) in t.inputs #if haskey(state.cache, inp) # as = affinity(state.cache[inp]) # for a in as @@ -134,17 +135,35 @@ function affinity(t::Thunk) #end end +struct Options + options::NamedTuple +end +Options(;options...) = Options((;options...)) +Options(options...) = Options((;options...)) + +function args_kwargs_to_pairs(args, kwargs) + args_kwargs = Pair{Union{Symbol,Nothing},Any}[] + for arg in args + push!(args_kwargs, nothing => arg) + end + for kwarg in kwargs + push!(args_kwargs, kwarg[1] => kwarg[2]) + end + return args_kwargs +end + """ - delayed(f; kwargs...)(args...) 
+ delayed(f, options=Options())(args...; kwargs...) -> Thunk + delayed(f; options...)(args...; kwargs...) -> Thunk Creates a [`Thunk`](@ref) object which can be executed later, which will call -`f` with `args`. `kwargs` controls various properties of the resulting `Thunk`. +`f` with `args` and `kwargs`. `options` controls various properties of the +resulting `Thunk`. """ -function delayed(f; kwargs...) - (args...) -> Thunk(f, args...; kwargs...) +function delayed(f, options::Options) + (args...; kwargs...) -> Thunk(f, args_kwargs_to_pairs(args, kwargs)...; options.options...) end - -delayedmap(f, xs...) = map(delayed(f), xs...) +delayed(f; kwargs...) = delayed(f, Options(;kwargs...)) "A future holding the result of a `Thunk`." struct ThunkFuture @@ -158,16 +177,16 @@ Base.wait(t::ThunkFuture) = Dagger.Sch.thunk_yield() do end function Base.fetch(t::ThunkFuture; proc=OSProc(), raw=false) error, value = Dagger.Sch.thunk_yield() do - if raw - fetch(t.future) - else - move(proc, fetch(t.future)) - end + fetch(t.future) end if error throw(value) end - value + if raw + return value + else + return move(proc, value) + end end Base.put!(t::ThunkFuture, x; error=false) = put!(t.future, (error, x)) @@ -198,15 +217,59 @@ ThunkFailedException(thunk, origin, ex::E) where E = ThunkFailedException{E}(convert(WeakThunk, thunk), convert(WeakThunk, origin), ex) function Base.showerror(io::IO, ex::ThunkFailedException) t = unwrap_weak(ex.thunk) - o = unwrap_weak(ex.origin) - t_str = t !== nothing ? "$t" : "?" - o_str = o !== nothing ? "$o" : "?" + + # Find root-cause thunk + last_tfex = ex + failed_tasks = Union{Thunk,Nothing}[] + while last_tfex.ex isa ThunkFailedException && unwrap_weak(last_tfex.ex.origin) !== nothing + push!(failed_tasks, unwrap_weak(last_tfex.thunk)) + last_tfex = last_tfex.ex + end + o = unwrap_weak(last_tfex.origin) + root_ex = last_tfex.ex + + function thunk_string(t) + if t === nothing + return "Thunk(?)" + end + Tinputs = Any[] + for (_, input) in t.inputs + input = unwrap_weak(input) + if istask(input) + push!(Tinputs, "Thunk(id=$(input.id))") + else + push!(Tinputs, input) + end + end + t_sig = if length(Tinputs) <= 4 + "$(t.f)($(join(Tinputs, ", ")))" + else + "$(t.f)($(length(Tinputs)) inputs...)" + end + return "Thunk(id=$(t.id), $t_sig)" + end + t_str = thunk_string(t) + o_str = thunk_string(o) t_id = t !== nothing ? t.id : '?' o_id = o !== nothing ? o.id : '?' - println(io, "ThunkFailedException ($t failure", - (o !== nothing && t != o) ? " due to a failure in $o)" : ")", - ":") - Base.showerror(io, ex.ex) + println(io, "ThunkFailedException:") + println(io, " Root Exception Type: $(typeof(root_ex))") + println(io, " Root Exception:") + Base.showerror(io, root_ex); println(io) + if t !== o + println(io, " Root Thunk: $o_str") + if length(failed_tasks) <= 4 + for i in failed_tasks + i_str = thunk_string(i) + println(io, " Inner Thunk: $i_str") + end + else + println(io, " ...") + println(io, " $(length(failed_tasks)) Inner Thunks...") + println(io, " ...") + end + end + print(io, " This Thunk: $t_str") end """ @@ -224,6 +287,7 @@ mutable struct EagerThunk end Base.isready(t::EagerThunk) = isready(t.future) Base.wait(t::EagerThunk) = wait(t.future) + function Base.fetch(t::EagerThunk; raw=false) if raw fetch(t.future; raw=true) @@ -240,6 +304,9 @@ function Base.fetch(t::EagerThunk; raw=false) end end end + +istask(t::EagerThunk) = true + function Base.show(io::IO, t::EagerThunk) print(io, "EagerThunk ($(isready(t) ? 
"finished" : "running"))") end @@ -259,20 +326,21 @@ end const EAGER_ID_COUNTER = Threads.Atomic{UInt64}(1) eager_next_id() = Threads.atomic_add!(EAGER_ID_COUNTER, one(UInt64)) -function _spawn(f, args...; options, kwargs...) +function _spawn(f, options::Options, args...) Dagger.Sch.init_eager() for arg in args - @assert !isa(arg, Thunk) "Cannot use Thunks in spawn" + @assert !isa(last(arg), Thunk) "Cannot use Thunks in spawn" end uid = eager_next_id() future = ThunkFuture() finalizer_ref = poolset(EagerThunkFinalizer(uid); device=MemPool.CPURAMDevice()) added_future = Future() - propagates = keys(options) - put!(Dagger.Sch.EAGER_THUNK_CHAN, (added_future, future, uid, finalizer_ref, f, (args...,), (;propagates, options..., kwargs...,))) + propagates = keys(options.options) + put!(Dagger.Sch.EAGER_THUNK_CHAN, (added_future, future, uid, finalizer_ref, f, (args...,), (;propagates, options.options...,))) thunk_ref = fetch(added_future) return (uid, future, finalizer_ref, thunk_ref) end + """ spawn(f, args...; kwargs...) -> EagerThunk @@ -282,26 +350,34 @@ an `EagerThunk`. Uses a scheduler running in the background to execute code. Note that `kwargs` are passed to the `Thunk` constructor, and are documented in its docstring. """ -function spawn(f, args...; processor=nothing, scope=nothing, kwargs...) +function spawn(f, args...; kwargs...) options = get_options() + if length(args) >= 1 && first(args) isa Options + options = merge(options, first(args).options) + args = args[2:end] + end + processor = haskey(options, :processor) ? options.processor : nothing + scope = haskey(options, :scope) ? options.scope : nothing if !isnothing(processor) || !isnothing(scope) f = tochunk(f, something(processor, get_options(:processor, OSProc())), - something(scope, get_options(:scope, AnyScope()))) + something(scope, get_options(:scope, DefaultScope()))) end + args_kwargs = args_kwargs_to_pairs(args, kwargs) uid, future, finalizer_ref, thunk_ref = if myid() == 1 - _spawn(f, args...; options, kwargs...) + _spawn(f, Options(options), args_kwargs...) else - remotecall_fetch(_spawn, 1, f, args...; options, kwargs...) + remotecall_fetch(_spawn, 1, f, Options(options), args_kwargs...) end return EagerThunk(uid, future, finalizer_ref, thunk_ref) end """ - @par [opts] f(args...) -> Thunk + @par [opts] f(args...; kwargs...) -> Thunk -Convenience macro to call `Dagger.delayed` on `f` with arguments `args`. -May also be called with a series of assignments like so: +Convenience macro to call `Dagger.delayed` on `f` with arguments `args` and +keyword arguments `kwargs`. May also be called with a series of assignments +like so: ```julia x = @par begin @@ -339,19 +415,38 @@ macro spawn(exs...) _par(ex; lazy=false, opts=opts) end +struct ExpandedBroadcast{F} end +(eb::ExpandedBroadcast{F})(args...) 
where F = + Base.materialize(Base.broadcasted(F, args...)) +replace_broadcast(ex) = ex +function replace_broadcast(fn::Symbol) + if startswith(string(fn), '.') + return :($ExpandedBroadcast{$(Symbol(string(fn)[2:end]))}()) + end + return fn +end + function _par(ex::Expr; lazy=true, recur=true, opts=()) if ex.head == :call && recur - f = ex.args[1] - args = ex.args[2:end] + f = replace_broadcast(ex.args[1]) + if length(ex.args) >= 2 && Meta.isexpr(ex.args[2], :parameters) + args = ex.args[3:end] + kwargs = ex.args[2] + else + args = ex.args[2:end] + kwargs = Expr(:parameters) + end opts = esc.(opts) + args_ex = _par.(args; lazy=lazy, recur=false) + kwargs_ex = _par.(kwargs.args; lazy=lazy, recur=false) if lazy - return :(Dagger.delayed($(esc(f)); $(opts...))($(_par.(args; lazy=lazy, recur=false)...))) + return :(Dagger.delayed($(esc(f)), $Options(;$(opts...)))($(args_ex...); $(kwargs_ex...))) else sync_var = esc(Base.sync_varname) @gensym result return quote - let args = ($(_par.(args; lazy=lazy, recur=false)...),) - $result = $spawn($(esc(f)), args...; $(opts...)) + let args = ($(args_ex...),) + $result = $spawn($(esc(f)), $Options(;$(opts...)), args...; $(kwargs_ex...)) if $(Expr(:islocal, sync_var)) put!($sync_var, schedule(Task(()->wait($result)))) end @@ -404,7 +499,15 @@ function Base.show(io::IO, z::Thunk) end print(io, "Thunk[$(z.id)]($f, ") if lvl > 0 - show(IOContext(io, :lazy_level => lvl-1), z.inputs) + inputs = Any[] + for (pos, input) in z.inputs + if pos === nothing + push!(inputs, input) + else + push!(inputs, pos => input) + end + end + show(IOContext(io, :lazy_level => lvl-1), inputs) else print(io, "...") end diff --git a/src/ui/graph.jl b/src/ui/graph.jl index d4cf0acee..033c4baff 100644 --- a/src/ui/graph.jl +++ b/src/ui/graph.jl @@ -142,8 +142,9 @@ write_edge(ctx, io, from::String, to::String) = println(io, "n_$from -> n_$to;") getargs!(d, t) = nothing function getargs!(d, t::Thunk) - d[t.id] = [filter(x->!istask(x[2]), collect(enumerate(t.inputs)))...,] - foreach(i->getargs!(d, i), t.inputs) + raw_inputs = map(last, t.inputs) + d[t.id] = [filter(x->!istask(x[2]), collect(enumerate(raw_inputs)))...,] + foreach(i->getargs!(d, i), raw_inputs) end function write_dag(io, logs::Vector, t=nothing) ctx = (proc_to_color = Dict{Processor,String}(), diff --git a/src/utils/locked-object.jl b/src/utils/locked-object.jl new file mode 100644 index 000000000..e00eaac2d --- /dev/null +++ b/src/utils/locked-object.jl @@ -0,0 +1,16 @@ +# Copied from CUDA.jl + +struct LockedObject{T} + lock::ReentrantLock + payload::T +end +LockedObject(payload) = LockedObject(Threads.ReentrantLock(), payload) + +function Base.lock(f, x::LockedObject) + lock(x.lock) + try + return f(x.payload) + finally + unlock(x.lock) + end +end diff --git a/test/array.jl b/test/array.jl index d44dd9af8..452d92f2c 100644 --- a/test/array.jl +++ b/test/array.jl @@ -1,5 +1,7 @@ using LinearAlgebra, SparseArrays, Random, SharedArrays import Dagger: chunks, DArray, domainchunks, treereduce_nd +import Distributed: myid, procs +import Statistics: mean @testset "treereduce_nd" begin xs = rand(1:10, 8,8,8) @@ -9,13 +11,13 @@ import Dagger: chunks, DArray, domainchunks, treereduce_nd end @testset "DArray constructor" begin - x = compute(rand(Blocks(2,2), 3,3)) + x = fetch(rand(Blocks(2,2), 3,3)) @test collect(x) == DArray{Float64, 2}(x.domain, x.subdomains, x.chunks) |> collect end @testset "rand" begin function test_rand(X) - X1 = compute(X) + X1 = fetch(X) X2 = collect(X1) @test isa(X, Dagger.ArrayOp) @@ -36,6 +38,7 @@ end r 
= collect(R) @test r[1:10] != r[11:20] end + @testset "sum" begin X = ones(Blocks(10, 10), 100, 100) @test sum(X) == 10000 @@ -43,19 +46,35 @@ end @test sum(Y) == 0 end +@testset "prod" begin + x = randn(100, 100) + X = distribute(x, Blocks(10, 10)) + @test prod(X) == prod(x) + Y = zeros(Blocks(10, 10), 100, 100) + @test prod(Y) == 0 +end + +@testset "mean" begin + x = randn(100, 100) + X = distribute(x, Blocks(10, 10)) + @test mean(X) ≈ mean(x) + Y = zeros(Blocks(10, 10), 100, 100) + @test mean(Y) == 0 +end + @testset "distributing an array" begin function test_dist(X) X1 = Distribute(Blocks(10, 20), X) @test X1 == X - Xc = compute(X1) + Xc = fetch(X1) @test chunks(Xc) |> size == (10, 5) @test domainchunks(Xc) |> size == (10, 5) @test map(x->size(x) == (10, 20), domainchunks(Xc)) |> all end x = [1 2; 3 4] @test Distribute(Blocks(1,1), x) == x - #test_dist(rand(100, 100)) - #test_dist(sprand(100, 100, 0.1)) + test_dist(rand(100, 100)) + test_dist(sprand(100, 100, 0.1)) x = distribute(rand(10), 2) @test collect(distribute(x, 3)) == collect(x) @@ -66,7 +85,7 @@ end x, y = size(X) X1 = Distribute(Blocks(10, 20), X) @test X1' == X' - Xc = compute(X1') + Xc = fetch(X1') @test chunks(Xc) |> size == (div(y, 20), div(x,10)) @test domainchunks(Xc) |> size == (div(y, 20), div(x, 10)) @test map(x->size(x) == (20, 10), domainchunks(Xc)) |> all @@ -81,9 +100,9 @@ end function test_mul(X) tol = 1e-12 X1 = Distribute(Blocks(10, 20), X) - @test_throws DimensionMismatch compute(X1*X1) - X2 = compute(X1'*X1) - X3 = compute(X1*X1') + @test_throws DimensionMismatch fetch(X1*X1) + X2 = fetch(X1'*X1) + X3 = fetch(X1*X1') @test norm(collect(X2) - X'X) < tol @test norm(collect(X3) - X*X') < tol @test chunks(X2) |> size == (2, 2) @@ -100,7 +119,7 @@ end end @testset "matrix powers" begin - x = compute(rand(Blocks(4,4), 16, 16)) + x = fetch(rand(Blocks(4,4), 16, 16)) @test collect(x^1) == collect(x) @test collect(x^2) == collect(x*x) @test collect(x^3) == collect(x*x*x) @@ -113,7 +132,7 @@ end @test hcat(m,m) == collect(hcat(x,x)) @test vcat(m,m) == collect(vcat(x,x)) @test hcat(m,m) == collect(hcat(x,y)) - @test_throws DimensionMismatch compute(vcat(x,y)) + @test_throws DimensionMismatch fetch(vcat(x,y)) end @testset "scale" begin @@ -137,10 +156,9 @@ end @test collect(X[[], []]) == x[[], []] @testset "dimensionality reduction" begin - # THESE NEED FIXING!! 
@test vec(collect(X[ragged_idx, 5])) == vec(x[ragged_idx, 5]) @test vec(collect(X[5, ragged_idx])) == vec(x[5, ragged_idx]) - @test collect(X[5, 5]) == x[5,5] + @test X[5, 5] == x[5,5] end end @@ -150,7 +168,7 @@ end y = rand(10, 10) xs = distribute(y, Blocks(2,2)) for i=1:10, j=1:10 - @test compute(xs[i:j, j:i]) == y[i:j, j:i] + @test fetch(xs[i:j, j:i]) == y[i:j, j:i] end end @@ -200,8 +218,8 @@ end @test collect(sort(y)) == x x = ones(10) - y = compute(Distribute(Blocks(3), x)) - #@test map(x->length(collect(x)), compute(sort(y)).chunks) == [3,3,3,1] + y = fetch(Distribute(Blocks(3), x)) + @test_broken map(x->length(collect(x)), fetch(sort(y)).chunks) == [3,3,3,1] end using MemPool @@ -219,20 +237,9 @@ using MemPool @test aff[2] == sizeof(Int)*10 end -@testset "show_plan" begin - @test !isempty(Dagger.show_plan(Dagger.Thunk(()->10))) -end - -#= -@testset "darray distributed refcount" begin - D2 = remotecall_fetch(2, compute(Distribute(Blocks(10, 20), rand(40,40)))) do D - D2 = D - end - @test size(collect(D2)) == (40,40) - GC.gc() - @test size(collect(D2)) == (40,40) -end -=# +#=@testset "show_plan" begin + @test !isempty(Dagger.show_plan(Dagger.spawn(()->10))) +end=# @testset "sharedarray" begin A = SharedArray{Int}((1024,)) diff --git a/test/fakeproc.jl b/test/fakeproc.jl index 54328697b..3f7c0b0b5 100644 --- a/test/fakeproc.jl +++ b/test/fakeproc.jl @@ -1,7 +1,7 @@ @everywhere begin using Dagger -import Dagger: ThreadProc +import Dagger: OSProc, ThreadProc struct FakeProc <: Dagger.Processor owner::Int @@ -18,6 +18,7 @@ fakesum(xs...) = FakeVal(sum(map(y->y.x, xs))) Dagger.iscompatible_func(proc::FakeProc, opts, f) = true Dagger.iscompatible_arg(proc::FakeProc, opts, ::Type{<:Integer}) = true Dagger.iscompatible_arg(proc::FakeProc, opts, ::Type{<:FakeVal}) = true +Dagger.move(from_proc::OSProc, to_proc::FakeProc, x::Integer) = FakeVal(x) Dagger.move(from_proc::ThreadProc, to_proc::FakeProc, x::Integer) = FakeVal(x) Dagger.execute!(proc::FakeProc, func, args...) 
= FakeVal(42+func(args...).x) diff --git a/test/logging.jl b/test/logging.jl index c4431bba5..bb97811d0 100644 --- a/test/logging.jl +++ b/test/logging.jl @@ -3,18 +3,17 @@ import TimespanLogging: Timespan, Event, Events, LocalEventLog, MultiEventLog @testset "Logging" begin @testset "LocalEventLog" begin - @testset "ArrayOp" begin + @testset "Basics" begin ctx = Context() log = LocalEventLog() ctx.log_sink = log - sz = 2^4 - bsz = 2^2 - X = rand(Float32, sz, sz) - XD = Distribute(Blocks(bsz, bsz), X) + X = rand(Float32, 4, 3) + a = delayed(sum)(X) + b = delayed(sum)(X) + c = delayed(+)(a,b) - dag = XD*XD - collect(ctx, dag) + compute(ctx, c) logs = TimespanLogging.get_logs!(log) @test logs isa Vector{Timespan} @test length(logs) > 0 @@ -48,10 +47,13 @@ import TimespanLogging: Timespan, Event, Events, LocalEventLog, MultiEventLog end @testset "Automatic Plan Rendering" begin - x = compute(rand(Blocks(2,2),4,4)) + X = rand(Float32, 4, 3) + a = delayed(sum)(X) + b = delayed(sum)(X) + c = delayed(+)(a,b) mktemp() do path, io ctx = Context(;log_sink=LocalEventLog(),log_file=path) - compute(ctx, x * x) + compute(ctx, c) plan = String(read(io)) @test occursin("digraph {", plan) @test occursin("Move:", plan) @@ -73,8 +75,11 @@ import TimespanLogging: Timespan, Event, Events, LocalEventLog, MultiEventLog ml[:psat] = Dagger.Events.ProcessorSaturation() ctx.log_sink = ml - A = rand(Blocks(4, 4), 16, 16) - collect(ctx, A*A) + X = rand(Float32, 4, 3) + a = delayed(sum)(X) + b = delayed(sum)(X) + c = delayed(+)(a,b) + compute(ctx, c) logs = TimespanLogging.get_logs!(ml) for w in keys(logs) diff --git a/test/mutation.jl b/test/mutation.jl index 48b192476..f076719d4 100644 --- a/test/mutation.jl +++ b/test/mutation.jl @@ -41,11 +41,12 @@ end @testset "@mutable" begin w = first(workers()) + @assert w != 1 "Not enough workers to test mutability" x = remotecall_fetch(w) do Dagger.@mutable Ref{Int}() end @test fetch(Dagger.@spawn (x->x[] = myid())(x)) == w - @test_throws_unwrap Dagger.ThunkFailedException fetch(Dagger.@spawn single=last(workers()) (x->x[] = myid())(x)) + @test_throws_unwrap Dagger.ThunkFailedException fetch(Dagger.@spawn single=1 (x->x[] = myid())(x)) end # @testset "@mutable" @testset "Shard" begin diff --git a/test/options.jl b/test/options.jl index 79c3449b8..93cf58680 100644 --- a/test/options.jl +++ b/test/options.jl @@ -70,7 +70,7 @@ end @test fetch(Dagger.@spawn sf(obj)) == 0 @test fetch(Dagger.@spawn sf(obj)) == 0 end - Dagger.with_options(;scope=Dagger.ExactScope(Dagger.ThreadProc(1,1)), processor=Dagger.ThreadProc(1,1), meta=true) do + Dagger.with_options(;scope=Dagger.ExactScope(Dagger.ThreadProc(1,1)), processor=OSProc(1), meta=true) do @test fetch(Dagger.@spawn sf(obj)) == 43 @test fetch(Dagger.@spawn sf(obj)) == 43 end diff --git a/test/processors.jl b/test/processors.jl index 75870cb4e..9efcc1a4b 100644 --- a/test/processors.jl +++ b/test/processors.jl @@ -37,9 +37,9 @@ end end @testset "Processor exhaustion" begin opts = ThunkOptions(proclist=[OptOutProc]) - @test_throws_unwrap Dagger.ThunkFailedException ex isa Dagger.Sch.SchedulingException ex.reason="No processors available, try making proclist more liberal" collect(delayed(sum; options=opts)([1,2,3])) + @test_throws_unwrap Dagger.ThunkFailedException ex isa Dagger.Sch.SchedulingException ex.reason="No processors available, try widening scope" collect(delayed(sum; options=opts)([1,2,3])) opts = ThunkOptions(proclist=(proc)->false) - @test_throws_unwrap Dagger.ThunkFailedException ex isa Dagger.Sch.SchedulingException 
ex.reason="No processors available, try making proclist more liberal" collect(delayed(sum; options=opts)([1,2,3])) + @test_throws_unwrap Dagger.ThunkFailedException ex isa Dagger.Sch.SchedulingException ex.reason="No processors available, try widening scope" collect(delayed(sum; options=opts)([1,2,3])) opts = ThunkOptions(proclist=nothing) @test collect(delayed(sum; options=opts)([1,2,3])) == 6 end diff --git a/test/scheduler.jl b/test/scheduler.jl index 96611b82d..28ddcb5c6 100644 --- a/test/scheduler.jl +++ b/test/scheduler.jl @@ -50,7 +50,7 @@ function dynamic_get_dag(x...) end function dynamic_add_thunk(x) h = sch_handle() - id = Dagger.Sch.add_thunk!(h, x) do y + id = Dagger.Sch.add_thunk!(h, nothing=>x) do y y+1 end wait(h, id) @@ -58,7 +58,7 @@ function dynamic_add_thunk(x) end function dynamic_add_thunk_self_dominated(x) h = sch_handle() - id = Dagger.Sch.add_thunk!(h, h.thunk_id, x) do y + id = Dagger.Sch.add_thunk!(h, nothing=>h.thunk_id, nothing=>x) do y y+1 end return fetch(h, id) @@ -205,7 +205,7 @@ end # until we end up on a non-blocked worker h = Dagger.Sch.sch_handle() wkrs = Dagger.Sch.exec!(_list_workers, h) - id = Dagger.Sch.add_thunk!(testfun, h, i) + id = Dagger.Sch.add_thunk!(testfun, h, nothing=>i) return fetch(h, id) end return myid() diff --git a/test/scopes.jl b/test/scopes.jl index 6b5b6252f..d5d7e0a6b 100644 --- a/test/scopes.jl +++ b/test/scopes.jl @@ -31,6 +31,17 @@ es1_ch = Dagger.tochunk(nothing, OSProc(), es1) es2_ch = Dagger.tochunk(nothing, OSProc(), es2) + os1 = ExactScope(OSProc(1)) + + @testset "Default Scope" begin + ds = DefaultScope() + for (s1, s2) in ((ds, es1), (es1, ds)) + @test Dagger.constrain(s1, s2) == es1 + end + for (s1, s2) in ((ds, os1), (os1, ds)) + @test Dagger.constrain(s1, s2) isa Dagger.InvalidScope + end + end @testset "Node Scope" begin @everywhere node_scope_test(ch...) 
= Dagger.system_uuid() @@ -113,7 +124,7 @@ @test fetch(Dagger.@spawn exact_scope_test(us_es1_multi_ch)) == es1.processor # No inner scopes - @test_throws ArgumentError UnionScope() + @test UnionScope() isa UnionScope # Same inner scope @test fetch(Dagger.@spawn exact_scope_test(us_es1_ch, us_es1_ch)) == es1.processor @@ -128,7 +139,104 @@ @test es1 in us_res.scopes @test !(es2 in us_res.scopes) end + @testset "Processor Type Scope" begin + pts_th = ProcessorTypeScope(Dagger.ThreadProc) + pts_os = ProcessorTypeScope(Dagger.OSProc) + + @test Dagger.constrain(pts_th, es1) == es1 + @test Dagger.constrain(pts_th, os1) isa Dagger.InvalidScope + + @test Dagger.constrain(pts_os, es1) isa Dagger.InvalidScope + @test Dagger.constrain(pts_os, os1) == os1 + + # Duplicate + pts_th_dup = Dagger.constrain(pts_th, pts_th) + @test Dagger.constrain(pts_th_dup, es1) == es1 + @test Dagger.constrain(pts_th_dup, os1) isa Dagger.InvalidScope + + # Empty intersection + pts_all = Dagger.constrain(pts_th, pts_os) + @test Dagger.constrain(pts_all, es1) isa Dagger.InvalidScope + @test Dagger.constrain(pts_all, os1) isa Dagger.InvalidScope + end # TODO: Test scope propagation + @testset "scope helper" begin + @test Dagger.scope(:any) isa AnyScope + @test Dagger.scope(:default) == DefaultScope() + @test_throws ArgumentError Dagger.scope(:blah) + @test Dagger.scope(()) == UnionScope() + + @test Dagger.scope(worker=wid1) == + Dagger.scope(workers=[wid1]) == + ProcessScope(wid1) + @test Dagger.scope(workers=[wid1,wid2]) == UnionScope([ProcessScope(wid1), + ProcessScope(wid2)]) + @test Dagger.scope(workers=[]) == UnionScope() + + @test Dagger.scope(thread=1) == + Dagger.scope(threads=[1]) == + UnionScope([ExactScope(Dagger.ThreadProc(w,1)) for w in procs()]) + @test Dagger.scope(threads=[1,2]) == UnionScope([ExactScope(Dagger.ThreadProc(w,t)) for t in [1,2] for w in procs()]) + @test Dagger.scope(threads=[]) == UnionScope() + + @test Dagger.scope(worker=wid1,thread=1) == + Dagger.scope(thread=1,worker=wid1) == + Dagger.scope(workers=[wid1],thread=1) == + Dagger.scope(worker=wid1,threads=[1]) == + Dagger.scope(workers=[wid1],threads=[1]) == + ExactScope(Dagger.ThreadProc(wid1,1)) + + @test_throws ArgumentError Dagger.scope(blah=1) + @test_throws ArgumentError Dagger.scope(thread=1, blah=1) + + @test Dagger.scope(worker=1,thread=1) == + Dagger.scope((worker=1,thread=1)) == + Dagger.scope(((worker=1,thread=1),)) + @test Dagger.scope((worker=1,thread=1),(worker=wid1,thread=2)) == + Dagger.scope(((worker=1,thread=1),(worker=wid1,thread=2),)) == + Dagger.scope(((worker=1,thread=1),), ((worker=wid1,thread=2),)) == + UnionScope([ExactScope(Dagger.ThreadProc(1, 1)), + ExactScope(Dagger.ThreadProc(wid1, 2))]) + @test_throws ArgumentError Dagger.scope((;blah=1)) + @test_throws ArgumentError Dagger.scope((thread=1, blah=1)) + + @testset "custom handler" begin + @eval begin + Dagger.scope_key_precedence(::Val{:gpu}) = 1 + Dagger.scope_key_precedence(::Val{:rocm}) = 2 + Dagger.scope_key_precedence(::Val{:cuda}) = 2 + + # Some fake scopes to use as sentinels + Dagger.to_scope(::Val{:gpu}, sc::NamedTuple) = ExactScope(Dagger.ThreadProc(1, sc.device)) + Dagger.to_scope(::Val{:rocm}, sc::NamedTuple) = ExactScope(Dagger.ThreadProc($wid1, sc.gpu)) + Dagger.to_scope(::Val{:cuda}, sc::NamedTuple) = ExactScope(Dagger.ThreadProc($wid2, sc.gpu)) + end + + @test Dagger.scope(gpu=1,device=2) == + Dagger.scope(device=2,gpu=1) == + Dagger.scope(gpu=1,device=2,blah=3) == + Dagger.scope((gpu=1,device=2,blah=3)) == + ExactScope(Dagger.ThreadProc(1, 2)) + 
@test Dagger.scope((gpu=1,device=2),(worker=1,thread=1)) == + Dagger.scope((worker=1,thread=1),(device=2,gpu=1)) == + UnionScope([ExactScope(Dagger.ThreadProc(1, 2)), + ExactScope(Dagger.ThreadProc(1, 1))]) + @test Dagger.scope((gpu=1,device=2),(device=3,gpu=1)) == + UnionScope([ExactScope(Dagger.ThreadProc(1, 2)), + ExactScope(Dagger.ThreadProc(1, 3))]) + + @test Dagger.scope(rocm=1,gpu=2) == + Dagger.scope(gpu=2,rocm=1) == + ExactScope(Dagger.ThreadProc(wid1, 2)) + @test Dagger.scope(cuda=1,gpu=2) == + Dagger.scope(gpu=2,cuda=1) == + ExactScope(Dagger.ThreadProc(wid2, 2)) + @test_throws ArgumentError Dagger.scope(rocm=1,cuda=1,gpu=2) + @test_throws ArgumentError Dagger.scope(gpu=2,rocm=1,cuda=1) + @test_throws ArgumentError Dagger.scope((rocm=1,cuda=1,gpu=2)) + end + end + rmprocs([wid1, wid2]) end diff --git a/test/thunk.jl b/test/thunk.jl index 1055819a2..c99a2bbfb 100644 --- a/test/thunk.jl +++ b/test/thunk.jl @@ -6,7 +6,7 @@ import Dagger: Chunk function dynamic_fib(n) n <= 1 && return n - t = Dagger.spawn(dynamic_fib, n-1) + t = Dagger.@spawn dynamic_fib(n-1) y = dynamic_fib(n-2) return (fetch(t)::Int) + y end @@ -21,6 +21,7 @@ import Dagger: Chunk end MulProc() = MulProc(myid()) Dagger.get_parent(mp::MulProc) = OSProc(mp.owner) + Dagger.move(src::MulProc, dest::Dagger.OSProc, x::Function) = Base.:* Dagger.move(src::MulProc, dest::Dagger.ThreadProc, x::Function) = Base.:* end @@ -54,13 +55,27 @@ end a = @spawn x + x @test a isa Dagger.EagerThunk b = @spawn sum([x,1,2]) - c = spawn(*, a, b) + c = @spawn a * b @test c isa Dagger.EagerThunk @test fetch(a) == 4 @test fetch(b) == 5 @test fetch(c) == 20 end @test Dagger.Sch.EAGER_CONTEXT[] isa Context + @testset "keyword arguments" begin + A = rand(4, 4) + @test fetch(@spawn sum(A; dims=1)) ≈ sum(A; dims=1) + + @test_throws_unwrap Dagger.ThunkFailedException fetch(@spawn sum(A; fakearg=2)) + + @test fetch(@spawn reduce(+, A; dims=1, init=2.0)) ≈ + reduce(+, A; dims=1, init=2.0) + end + @testset "broadcast" begin + A, B = rand(4), rand(4) + @test fetch(@spawn A .+ B) ≈ A .+ B + @test fetch(@spawn A .* B) ≈ A .* B + end @testset "waiting" begin a = @spawn sleep(1) @test !isready(a) @@ -95,9 +110,9 @@ end end ex = Dagger.Sch.unwrap_nested_exception(ex) ex_str = sprint(io->Base.showerror(io,ex)) - @test occursin(r"^ThunkFailedException \(Thunk.*failure\):", ex_str) + @test occursin(r"^ThunkFailedException:", ex_str) @test occursin("Test", ex_str) - @test !occursin("due to a failure in", ex_str) + @test !occursin("Root Thunk", ex_str) ex = try fetch(b) @@ -106,8 +121,8 @@ end end ex = Dagger.Sch.unwrap_nested_exception(ex) ex_str = sprint(io->Base.showerror(io,ex)) - @test occursin(r"Thunk.*failure due to a failure in", ex_str) @test occursin("Test", ex_str) + @test occursin("Root Thunk", ex_str) end @testset "single dependent" begin a = @spawn error("Test") @@ -155,7 +170,7 @@ end end end @testset "remote spawn" begin - a = fetch(Distributed.@spawnat 2 Dagger.spawn(+, 1, 2)) + a = fetch(Distributed.@spawnat 2 Dagger.@spawn 1+2) @test Dagger.Sch.EAGER_INIT[] @test fetch(Distributed.@spawnat 2 !(Dagger.Sch.EAGER_INIT[])) @test a isa Dagger.EagerThunk @@ -197,7 +212,7 @@ end a = delayed(+; processor=Dagger.ThreadProc(1,1))(1,2) @test a.f isa Chunk @test a.f.processor isa Dagger.ThreadProc - @test a.f.scope isa AnyScope + @test a.f.scope == DefaultScope() a = delayed(+; processor=Dagger.ThreadProc(1,1), scope=NodeScope())(1,2) @test a.f isa Chunk @@ -221,19 +236,19 @@ end end end @testset "eager API" begin - _a = Dagger.spawn(+, 1, 2; 
scope=NodeScope()) + _a = Dagger.@spawn scope=NodeScope() 1+2 a = Dagger.Sch._find_thunk(_a) @test a.f isa Chunk @test a.f.processor isa OSProc @test a.f.scope isa NodeScope - _a = Dagger.spawn(+, 1, 2; processor=Dagger.ThreadProc(1,1)) + _a = Dagger.@spawn processor=Dagger.ThreadProc(1,1) 1+2 a = Dagger.Sch._find_thunk(_a) @test a.f isa Chunk @test a.f.processor isa Dagger.ThreadProc - @test a.f.scope isa AnyScope + @test a.f.scope == DefaultScope() - _a = Dagger.spawn(+, 1, 2; processor=Dagger.ThreadProc(1,1), scope=NodeScope()) + _a = Dagger.@spawn processor=Dagger.ThreadProc(1,1) scope=NodeScope() 1+2 a = Dagger.Sch._find_thunk(_a) @test a.f isa Chunk @test a.f.processor isa Dagger.ThreadProc @@ -245,12 +260,12 @@ end s = p -> p == Dagger.ThreadProc(1, 1) f = (x) -> 10 + x - g = (x) -> fetch(Dagger.spawn(f, x; proclist=s)) - fetch(Dagger.spawn(g, 10; proclist=s)) + g = (x) -> fetch(Dagger.@spawn proclist=s f(x)) + fetch(Dagger.@spawn proclist=s g(10)) end @testset "no cross-scheduler Thunk usage" begin a = delayed(+)(1,2) - @test_throws Exception Dagger.spawn(identity, a) + @test_throws Exception Dagger.@spawn identity(a) end @testset "@sync support" begin result = Dagger.@spawn sleep(1)