From 59d31c6ac271cec661a09d01a5eb524be26490fb Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Tue, 1 Oct 2024 14:58:53 -0400
Subject: [PATCH 1/3] perf: benchmarking CI (#136)

* ci(buildkite): add benchmark runners

* perf: initial ViT benchmarking

* fix: path names

* ci: run CPU benchmarks on larger machine

* ci: try fixing CUDA bench

* fix: aggregation script

* ci: run GC to rule out allocations
---
 .buildkite/pipeline.yml                   | 71 ++++++++++++++++++
 .github/workflows/benchmark_aggregate.yml | 46 ++++++++++++
 .gitignore                                |  2 +
 benchmark/Project.toml                    | 15 ++++
 benchmark/aggregate.jl                    | 36 +++++++++
 benchmark/runbenchmarks.jl                | 57 ++++++++++++++
 benchmark/setup.jl                        | 90 +++++++++++++++++++++++
 src/XLA.jl                                |  4 +-
 8 files changed, 319 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/benchmark_aggregate.yml
 create mode 100644 benchmark/aggregate.jl
 create mode 100644 benchmark/runbenchmarks.jl
 create mode 100644 benchmark/setup.jl

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index e6b3d603e..e2451b00c 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -20,6 +20,77 @@ steps:
     if: build.message !~ /\[skip tests\]/
     timeout_in_minutes: 60
 
+  - group: ":racehorse: Benchmarks"
+    steps:
+      - label: "CPU: Run Benchmarks"
+        plugins:
+          - JuliaCI/julia#v1:
+              version: "1"
+        command: |
+          julia --project=benchmark -e 'println("--- :julia: Instantiating project")
+              using Pkg
+              Pkg.develop([PackageSpec(path=pwd())])'
+
+          julia --project=benchmark -e 'println("--- :julia: Run Benchmarks")
+              include("benchmark/runbenchmarks.jl")'
+        artifact_paths:
+          - "benchmark/results/*"
+        agents:
+          # Models are quite large so we need a decent sized machine. Don't tell Chris we
+          # are stealing SciMLBenchmarks machine :P
+          queue: "juliaecosystem"
+          sandbox_capable: true
+          exclusive: true
+          arch: "x86_64"
+        env:
+          BENCHMARK_GROUP: CPU
+          JULIA_NUM_THREADS: "auto"
+        timeout_in_minutes: 120
+
+      - label: "CUDA: Run Benchmarks"
+        plugins:
+          - JuliaCI/julia#v1:
+              version: "1"
+        command: |
+          julia --project=benchmark -e 'println("--- :julia: Instantiating project")
+              using Pkg
+              Pkg.develop([PackageSpec(path=pwd())])'
+
+          julia --project=benchmark -e 'println("--- :julia: Run Benchmarks")
+              include("benchmark/runbenchmarks.jl")'
+        artifact_paths:
+          - "benchmark/results/*"
+        agents:
+          queue: "benchmark"
+          gpu: "rtx4070"
+          cuda: "*"
+        env:
+          BENCHMARK_GROUP: CUDA
+          JULIA_NUM_THREADS: "auto"
+        timeout_in_minutes: 120
+
+      - wait: ~
+        continue_on_failure: true
+
+      - label: "Combine benchmarks"
+        plugins:
+          - JuliaCI/julia#v1:
+              version: "1"
+        command: |
+          buildkite-agent artifact download "benchmark/results/*" .
+
+          julia -e 'println("--- :julia: Instantiating project")
+              using Pkg
+              Pkg.add("BenchmarkTools")
+
+              println("--- :julia: Combining Benchmarks")
+              include("benchmark/aggregate.jl")'
+        artifact_paths:
+          - "benchmark/results/combinedbenchmarks.json"
+        agents:
+          queue: "juliagpu"
+        timeout_in_minutes: 10
+
   # - label: "AMDGPU Julia v{{matrix.version}}"
   #   matrix:
   #     setup:
diff --git a/.github/workflows/benchmark_aggregate.yml b/.github/workflows/benchmark_aggregate.yml
new file mode 100644
index 000000000..6f78ae3ae
--- /dev/null
+++ b/.github/workflows/benchmark_aggregate.yml
@@ -0,0 +1,46 @@
+name: Benchmarks
+permissions:
+  contents: write # contents permission to update benchmark contents in gh-pages branch
+  statuses: read
+  deployments: write # deployments permission to deploy GitHub pages website
+  pull-requests: write
+
+on:
+  pull_request:
+
+  push:
+    branches:
+      - main
+
+jobs:
+  benchmark:
+    if: ${{ !contains(github.event.head_commit.message, '[skip benchmarks]') }}
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: Download Buildkite Artifacts
+      id: download
+      uses: EnricoMi/download-buildkite-artifact-action@v1
+      with:
+        buildkite_token: ${{ secrets.BUILDKITE_TOKEN }}
+        output_path: artifacts
+
+    - name: Locate Benchmarks Artifact
+      id: locate
+      if: ${{ steps.download.outputs.download-state == 'success' }}
+      run: echo "path=$(find artifacts -type f -name combinedbenchmarks.json -print -quit 2>/dev/null)" >> "$GITHUB_OUTPUT"
+
+    - name: Upload Benchmark Results
+      if: ${{ steps.locate.outputs.path != '' }}
+      uses: benchmark-action/github-action-benchmark@v1
+      with:
+        name: Reactant.jl Benchmarks
+        tool: "julia"
+        output-file-path: ${{ steps.locate.outputs.path }}
+        benchmark-data-dir-path: "benchmarks"
+        github-token: ${{ secrets.GITHUB_TOKEN }}
+        comment-always: true
+        summary-always: true
+        alert-threshold: "150%"
+        fail-on-alert: false
+        auto-push: ${{ github.event_name != 'pull_request' }}
diff --git a/.gitignore b/.gitignore
index 2c84ce4d0..b3c6929fb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -273,3 +273,5 @@ deps/ReactantExtra/MODULE.bazel.lock
 external
 
 archive/
+
+benchmark/results/*
diff --git a/benchmark/Project.toml b/benchmark/Project.toml
index dcc8c521b..b684576e7 100644
--- a/benchmark/Project.toml
+++ b/benchmark/Project.toml
@@ -1,10 +1,19 @@
 [deps]
+AppleAccelerate = "13e28ba4-7ad8-5781-acae-3021b1ed3924"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 Boltz = "4544d5e4-abc5-4dea-817f-29e4c205d9c8"
+CpuId = "adafc99b-e345-5852-983c-f28acb93d879"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
+InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
+LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
+MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2"
+MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [compat]
 BenchmarkTools = "1.5"
@@ -13,3 +22,9 @@ Enzyme = "0.13"
 Lux = "1.1"
 Random = "1.10"
 julia = "1.10"
+
+[extras]
+CUDA_Driver_jll = "4ee394cb-3365-5eb0-8335-949819d2adfc"
+
+[preferences.CUDA_Driver_jll]
+compat = false
diff --git a/benchmark/aggregate.jl b/benchmark/aggregate.jl
new file mode 100644
index 000000000..36d9bb19c
--- /dev/null
+++ b/benchmark/aggregate.jl
@@ -0,0 +1,36 @@
+using BenchmarkTools
+
+const BACKENDS = ["CPU", "CUDA"]
+
+const CPU_Results = joinpath(dirname(@__FILE__), "results", "CPUbenchmarks.json")
+@assert(ispath(CPU_Results))
+
+const RESULTS = BenchmarkTools.load(CPU_Results)[1]
+@assert RESULTS isa BenchmarkTools.BenchmarkGroup
+
+for backend in BACKENDS[2:end]
+    @info "Aggregating results for $(backend)"
+    filename = string(backend, "benchmarks.json")
+    filepath = joinpath(dirname(@__FILE__), "results", filename)
+    if !ispath(filepath)
+        @warn "No file found at path: $(filepath)"
+    else
+        backend_results = BenchmarkTools.load(filepath)[1]
+        if backend_results isa BenchmarkTools.BenchmarkGroup
+            # <benchmark name>/<forward or reverse>/<backend>/<reactant or package>
+            for benchmark in keys(RESULTS)
+                for pass in keys(RESULTS[benchmark])
+                    for pkg in keys(backend_results[benchmark][pass][backend])
+                        RESULTS[benchmark][pass][backend][pkg] = backend_results[benchmark][pass][backend][pkg]
+                    end
+                end
+            end
+        else
+            @warn "Unexpected file format for file at path: $(filepath)"
+        end
+    end
+end
+
+BenchmarkTools.save(
+    joinpath(dirname(@__FILE__), "results", "combinedbenchmarks.json"), RESULTS
+)
diff --git a/benchmark/runbenchmarks.jl b/benchmark/runbenchmarks.jl
new file mode 100644
index 000000000..7dc061c56
--- /dev/null
+++ b/benchmark/runbenchmarks.jl
@@ -0,0 +1,57 @@
+# Accelerator Support for testing non-Reactant performance
+using LuxCUDA
+
+using BenchmarkTools: BenchmarkTools, BenchmarkGroup, @btime, @benchmarkable
+using CpuId: CpuId
+using InteractiveUtils: versioninfo
+using LinearAlgebra: BLAS
+using Reactant: Reactant
+using Statistics: median
+
+# To run benchmarks on a specific GPU backend, add AMDGPU / CUDA / Metal / oneAPI
+# to benchmark/Project.toml and change BENCHMARK_GROUP to the backend name
+const BENCHMARK_GROUP = get(ENV, "BENCHMARK_GROUP", "CPU")
+@info "Running benchmarks for $BENCHMARK_GROUP"
+
+BenchmarkTools.DEFAULT_PARAMETERS.seconds = 20
+
+if BENCHMARK_GROUP == "CPU"
+    if Sys.isapple() && (Sys.ARCH == :aarch64 || Sys.ARCH == :arm64)
+        @info "Running benchmarks on Apple with ARM CPUs. Using `AppleAccelerate.jl`."
+        using AppleAccelerate: AppleAccelerate
+    end
+
+    if Sys.ARCH == :x86_64 && occursin("intel", lowercase(CpuId.cpubrand()))
+        @info "Running benchmarks on Intel CPUs. Loading `MKL.jl`."
+        using MKL: MKL
+    end
+end
+
+const BENCHMARK_CPU_THREADS = Threads.nthreads()
+BLAS.set_num_threads(BENCHMARK_CPU_THREADS)
+
+@info sprint(versioninfo)
+@info "BLAS threads: $(BLAS.get_num_threads())"
+
+const SUITE = BenchmarkGroup()
+
+if BENCHMARK_GROUP == "CUDA"
+    Reactant.set_default_backend("gpu")
+    @info "Running CUDA benchmarks" maxlog = 1
+    CUDA.versioninfo()
+else
+    @info "Running CPU benchmarks with $(BENCHMARK_CPU_THREADS) thread(s)" maxlog = 1
+end
+
+# Main benchmark files
+include("setup.jl")
+setup_benchmarks!(SUITE, BENCHMARK_GROUP)
+
+results = BenchmarkTools.run(SUITE; verbose=true)
+
+filepath = joinpath(dirname(@__FILE__), "results")
+mkpath(filepath)
+filename = string(BENCHMARK_GROUP, "benchmarks.json")
+BenchmarkTools.save(joinpath(filepath, filename), median(results))
+
+@info "Saved results to $(joinpath(filepath, filename))"
diff --git a/benchmark/setup.jl b/benchmark/setup.jl
new file mode 100644
index 000000000..108554741
--- /dev/null
+++ b/benchmark/setup.jl
@@ -0,0 +1,90 @@
+using Boltz: Vision
+using Lux: Lux
+using MLDataDevices: AbstractDevice, CPUDevice, CUDADevice
+using Random: Random
+using Reactant: Reactant, @compile
+
+using Enzyme: Enzyme
+using Zygote: Zygote
+
+# Helper Functions
+@inline synchronize(::CPUDevice) = nothing
+@inline synchronize(::CUDADevice) = CUDA.synchronize()
+
+@inline reclaim(::CPUDevice) = GC.gc()
+@inline reclaim(::CUDADevice) = CUDA.reclaim()
+
+@inline sumabs2(model, x, p, st) = sum(abs2, first(Lux.apply(model, x, p, st)))
+@inline sumabs2(model, x) = sum(abs2, model(x))
+
+function benchmark_group_to_backend(benchmark_group::String)
+    benchmark_group == "CPU" && return CPUDevice()
+    benchmark_group == "CUDA" && return CUDADevice()
+    return error("Unknown backend: $(benchmark_group)")
+end
+
+function general_lux_setup(model, x_dims)
+    rng = Random.default_rng()  # don't use any other rng
+    ps, st = Lux.setup(rng, model)
+    x_dims === nothing && return ps, st
+    x = randn(rng, Float32, x_dims)
+    return x, ps, st
+end
+
+function setup_benchmarks!(suite::BenchmarkGroup, backend::String)
+    dev = benchmark_group_to_backend(backend)
+
+    setup_vit_benchmark!(suite, backend, dev)
+
+    return nothing
+end
+
+# Lux Benchmarks
+function setup_vit_benchmark!(suite::BenchmarkGroup, backend, dev::AbstractDevice)
+    for mode in (:tiny, :small, :base), bsize in (4, 16, 32)
+        benchmark_name = "ViT $(mode) (256 x 256 x 3 x $(bsize))"
+
+        setup_lux_forward_pass_benchmark!(
+            suite, benchmark_name, backend, Vision.ViT(mode), (256, 256, 3, bsize), dev
+        )
+    end
+end
+
+function setup_lux_forward_pass_benchmark!(
+    suite::BenchmarkGroup,
+    benchmark_name::String,
+    backend::String,
+    model,
+    x_dims,
+    dev::AbstractDevice,
+)
+    suite[benchmark_name]["forward"][backend]["Lux"] = @benchmarkable begin
+        Lux.apply($model, x, ps, st_test)
+        synchronize($dev)
+    end setup = begin
+        GC.gc()
+        reclaim($dev)
+        x, ps, st = $dev(general_lux_setup($model, $x_dims))
+        st_test = Lux.testmode(st)
+        GC.gc()
+        reclaim($dev)
+    end
+
+    suite[benchmark_name]["forward"][backend]["Reactant"] = @benchmarkable begin
+        y, _ = apply_compiled($model, x_ra, ps_ra, st_test_ra)
+        Reactant.synchronize(y)
+    end setup = begin
+        GC.gc()
+        reclaim($dev)
+        x, ps, st = general_lux_setup($model, $x_dims)
+        st_test = Lux.testmode(st)
+        x_ra = Reactant.to_rarray(x)
+        ps_ra = Reactant.to_rarray(ps)
+        st_test_ra = Reactant.to_rarray(st_test)
+        apply_compiled = @compile Lux.apply($model, x_ra, ps_ra, st_test_ra)
+        GC.gc()
+        reclaim($dev)
+    end
+
+    return nothing
+end
diff --git a/src/XLA.jl b/src/XLA.jl
index 684511e68..9e77dac6d 100644
--- a/src/XLA.jl
+++ b/src/XLA.jl
@@ -446,7 +446,7 @@ end
 const AsyncEmptyBuffer = AsyncBuffer(Buffer(C_NULL), nothing)
 
 @inline function await(buffer::AsyncBuffer)::Nothing
-    if buffer.future == nothing
+    if buffer.future === nothing
         return nothing
     else
         future = buffer.future
@@ -457,7 +457,7 @@ const AsyncEmptyBuffer = AsyncBuffer(Buffer(C_NULL), nothing)
 end
 
 @inline function synced_buffer(buffer::AsyncBuffer)
-    if buffer.future != nothing
+    if buffer.future !== nothing
         future = buffer.future
         buffer.future = nothing
         await(future::Future)

From 55ce2cdf40545ba9ac41836ccce2e5b787cf9ea0 Mon Sep 17 00:00:00 2001
From: William Moses <gh@wsmoses.com>
Date: Tue, 1 Oct 2024 22:10:59 -0500
Subject: [PATCH 2/3] Bump dependencies (#143)

---
 deps/ReactantExtra/BUILD                   | 10 +++++-----
 deps/ReactantExtra/WORKSPACE               |  7 +++----
 deps/ReactantExtra/tblgen/jl-generators.cc |  8 ++------
 3 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/deps/ReactantExtra/BUILD b/deps/ReactantExtra/BUILD
index 0d9f17650..a62d1bad0 100644
--- a/deps/ReactantExtra/BUILD
+++ b/deps/ReactantExtra/BUILD
@@ -342,8 +342,8 @@ cc_library(
         
         "@xla//xla/service/cpu:cpu_transfer_manager",
         "@xla//xla/pjrt/gpu:se_gpu_pjrt_client",
-        
-        "@tsl//tsl/protobuf:protos_all_cc_impl",
+       
+        "@xla//xla/tsl/protobuf:protos_all_cc_impl",
         "@xla//xla/tsl/framework:allocator_registry_impl",
 
         "@xla//xla/pjrt:status_casters",
@@ -352,9 +352,9 @@ cc_library(
         "@xla//xla/python/ifrt/hlo:hlo_program",
         "@xla//xla/ffi:call_frame",
         "@com_google_protobuf//:protobuf",
-        "@tsl//tsl/profiler/backends/cpu:annotation_stack_impl",
-        "@tsl//tsl/profiler/backends/cpu:traceme_recorder_impl",
-        "@tsl//tsl/profiler/utils:time_utils_impl",
+        "@xla//xla/tsl/profiler/backends/cpu:annotation_stack_impl",
+        "@xla//xla/tsl/profiler/backends/cpu:traceme_recorder_impl",
+        "@xla//xla/tsl/profiler/utils:time_utils_impl",
         "@tsl//tsl/platform:env_impl",
         "@xla//xla/stream_executor:stream_executor_impl",
         "@xla//xla/mlir/utils:type_util",
diff --git a/deps/ReactantExtra/WORKSPACE b/deps/ReactantExtra/WORKSPACE
index 0ffa879f6..b73cf25b6 100644
--- a/deps/ReactantExtra/WORKSPACE
+++ b/deps/ReactantExtra/WORKSPACE
@@ -9,7 +9,7 @@ http_archive(
     urls = ["https://github.com/wsmoses/nsync/archive/{commit}.tar.gz".format(commit = NSYNC_COMMIT)],
 )
 
-ENZYMEXLA_COMMIT = "27ccd5e6ace279781ad437ddd551e8a1b9a3de9e"
+ENZYMEXLA_COMMIT = "c81918c2e1625b3ed74dca4e123b2b65ab4aaf3a"
 ENZYMEXLA_SHA256 = ""
 
 http_archive(
@@ -91,12 +91,11 @@ http_archive(
     patches = ["@enzyme_ad//:patches/jax.patch"],
 )
 
-# load("@jax//third_party/xla:workspace.bzl", "XLA_COMMIT", "XLA_SHA256")
-XLA_COMMIT = "7d4f8d1e8a91e67a713ac69796a22f343d292327"
+load("@jax//third_party/xla:workspace.bzl", "XLA_COMMIT", "XLA_SHA256")
 
 http_archive(
     name = "xla",
-    # sha256 = XLA_SHA256,
+    sha256 = XLA_SHA256,
     strip_prefix = "xla-" + XLA_COMMIT,
     urls = ["https://github.com/wsmoses/xla/archive/{commit}.tar.gz".format(commit = XLA_COMMIT)],
     patch_cmds = XLA_PATCHES
diff --git a/deps/ReactantExtra/tblgen/jl-generators.cc b/deps/ReactantExtra/tblgen/jl-generators.cc
index 30a65bfc9..ba2069eed 100644
--- a/deps/ReactantExtra/tblgen/jl-generators.cc
+++ b/deps/ReactantExtra/tblgen/jl-generators.cc
@@ -118,7 +118,7 @@ namespace
     return description;
   }
 
-  std::string getDialectName(llvm::ArrayRef<llvm::Record*> op_defs) {
+  std::string getDialectName(llvm::ArrayRef<const llvm::Record*> op_defs) {
     mlir::tblgen::Operator any_op(op_defs.front());
     assert(
         std::all_of(op_defs.begin(), op_defs.end(), [&any_op](llvm::Record* op) {
@@ -163,11 +163,7 @@ extern bool disableModuleWrap;
 bool emitOpTableDefs(const llvm::RecordKeeper &recordKeeper,
                      llvm::raw_ostream &os)
 {
-#if LLVM_VERSION_MAJOR >= 16
-  std::vector<llvm::Record *> opdefs = recordKeeper.getAllDerivedDefinitionsIfDefined("Op");
-#else
-  std::vector<llvm::Record *> opdefs = recordKeeper.getAllDerivedDefinitions("Op");
-#endif
+  llvm::ArrayRef<const llvm::Record*> opdefs = recordKeeper.getAllDerivedDefinitionsIfDefined("Op");
 
   const char *moduleTemplate;
   if (disableModuleWrap)

From 4f4cb40af080a6f60a63fa3e747b95788804205c Mon Sep 17 00:00:00 2001
From: William Moses <wsmoses@cyclops.juliacomputing.io>
Date: Tue, 1 Oct 2024 23:22:15 -0400
Subject: [PATCH 3/3] don't build rocm

---
 deps/ReactantExtra/BUILD | 1 -
 1 file changed, 1 deletion(-)

diff --git a/deps/ReactantExtra/BUILD b/deps/ReactantExtra/BUILD
index a62d1bad0..6a755c2eb 100644
--- a/deps/ReactantExtra/BUILD
+++ b/deps/ReactantExtra/BUILD
@@ -367,7 +367,6 @@ cc_library(
     ] + select({
     "@xla//xla/tsl:is_cuda_enabled_and_oss":[
         "@xla//xla/stream_executor/cuda:all_runtime", 
-        "@xla//xla/stream_executor/rocm:all_runtime",
         "@xla//xla/service/gpu/model:hlo_op_profiles",
         "@xla//xla/service/gpu/model:hlo_op_profile_proto_cc_impl",
         "@xla//xla/service/gpu:nvptx_compiler",