From 59d31c6ac271cec661a09d01a5eb524be26490fb Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Tue, 1 Oct 2024 14:58:53 -0400 Subject: [PATCH 1/3] perf: benchmarking CI (#136) * ci(buildkite): add benchmark runners * perf: initial ViT benchmarking * fix: path names * ci: run CPU benchmarks on larger machine * ci: try fixing CUDA bench * fix: aggregation script * ci: run GC to rule out allocations --- .buildkite/pipeline.yml | 71 ++++++++++++++++++ .github/workflows/benchmark_aggregate.yml | 46 ++++++++++++ .gitignore | 2 + benchmark/Project.toml | 15 ++++ benchmark/aggregate.jl | 36 +++++++++ benchmark/runbenchmarks.jl | 57 ++++++++++++++ benchmark/setup.jl | 90 +++++++++++++++++++++++ src/XLA.jl | 4 +- 8 files changed, 319 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/benchmark_aggregate.yml create mode 100644 benchmark/aggregate.jl create mode 100644 benchmark/runbenchmarks.jl create mode 100644 benchmark/setup.jl diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index e6b3d603e..e2451b00c 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -20,6 +20,77 @@ steps: if: build.message !~ /\[skip tests\]/ timeout_in_minutes: 60 + - group: ":racehorse: Benchmarks" + steps: + - label: "CPU: Run Benchmarks" + plugins: + - JuliaCI/julia#v1: + version: "1" + command: | + julia --project=benchmark -e 'println("--- :julia: Instantiating project") + using Pkg + Pkg.develop([PackageSpec(path=pwd())])' + + julia --project=benchmark -e 'println("--- :julia: Run Benchmarks") + include("benchmark/runbenchmarks.jl")' + artifact_paths: + - "benchmark/results/*" + agents: + # Models are quite large so we need a decent sized machine. 
Don't tell Chris we + # are stealing SciMLBenchmarks machine :P + queue: "juliaecosystem" + sandbox_capable: true + exclusive: true + arch: "x86_64" + env: + BENCHMARK_GROUP: CPU + JULIA_NUM_THREADS: "auto" + timeout_in_minutes: 120 + + - label: "CUDA: Run Benchmarks" + plugins: + - JuliaCI/julia#v1: + version: "1" + command: | + julia --project=benchmark -e 'println("--- :julia: Instantiating project") + using Pkg + Pkg.develop([PackageSpec(path=pwd())])' + + julia --project=benchmark -e 'println("--- :julia: Run Benchmarks") + include("benchmark/runbenchmarks.jl")' + artifact_paths: + - "benchmark/results/*" + agents: + queue: "benchmark" + gpu: "rtx4070" + cuda: "*" + env: + BENCHMARK_GROUP: CUDA + JULIA_NUM_THREADS: "auto" + timeout_in_minutes: 120 + + - wait: ~ + continue_on_failure: true + + - label: "Combine benchmarks" + plugins: + - JuliaCI/julia#v1: + version: "1" + command: | + buildkite-agent artifact download "benchmark/results/*" . + + julia -e 'println("--- :julia: Instantiating project") + using Pkg + Pkg.add("BenchmarkTools") + + println("--- :julia: Combining Benchmarks") + include("benchmark/aggregate.jl")' + artifact_paths: + - "benchmark/results/combinedbenchmarks.json" + agents: + queue: "juliagpu" + timeout_in_minutes: 10 + # - label: "AMDGPU Julia v{{matrix.version}}" # matrix: # setup: diff --git a/.github/workflows/benchmark_aggregate.yml b/.github/workflows/benchmark_aggregate.yml new file mode 100644 index 000000000..6f78ae3ae --- /dev/null +++ b/.github/workflows/benchmark_aggregate.yml @@ -0,0 +1,46 @@ +name: Benchmarks +permissions: + contents: write # contents permission to update benchmark contents in gh-pages branch + statuses: read + deployments: write # deployments permission to deploy GitHub pages website + pull-requests: write + +on: + pull_request: + + push: + branches: + - main + +jobs: + benchmark: + if: ${{ !contains(github.event.head_commit.message, '[skip benchmarks]') }} + runs-on: ubuntu-latest + steps: + - uses: 
actions/checkout@v4 + - name: Download Buildkite Artifacts + id: download + uses: EnricoMi/download-buildkite-artifact-action@v1 + with: + buildkite_token: ${{ secrets.BUILDKITE_TOKEN }} + output_path: artifacts + + - name: Locate Benchmarks Artifact + id: locate + if: ${{ steps.download.outputs.download-state == 'success' }} + run: echo "path=$(find artifacts -type f -name combinedbenchmarks.json 2>/dev/null)" >> $GITHUB_OUTPUT + + - name: Upload Benchmark Results + if: ${{ steps.locate.outputs.path != '' }} + uses: benchmark-action/github-action-benchmark@v1 + with: + name: Reactant.jl Benchmarks + tool: "julia" + output-file-path: ${{ steps.locate.outputs.path }} + benchmark-data-dir-path: "benchmarks" + github-token: ${{ secrets.GITHUB_TOKEN }} + comment-always: true + summary-always: true + alert-threshold: "150%" + fail-on-alert: false + auto-push: ${{ github.event_name != 'pull_request' }} diff --git a/.gitignore b/.gitignore index 2c84ce4d0..b3c6929fb 100644 --- a/.gitignore +++ b/.gitignore @@ -273,3 +273,5 @@ deps/ReactantExtra/MODULE.bazel.lock external archive/ + +benchmark/results/* diff --git a/benchmark/Project.toml b/benchmark/Project.toml index dcc8c521b..b684576e7 100644 --- a/benchmark/Project.toml +++ b/benchmark/Project.toml @@ -1,10 +1,19 @@ [deps] +AppleAccelerate = "13e28ba4-7ad8-5781-acae-3021b1ed3924" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" Boltz = "4544d5e4-abc5-4dea-817f-29e4c205d9c8" +CpuId = "adafc99b-e345-5852-983c-f28acb93d879" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" +InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Lux = "b2108857-7c20-44ae-9111-449ecde12c47" +LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda" +MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2" +MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reactant = "3c362404-f566-11ee-1572-e11a4b42c853" +Statistics = 
"10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] BenchmarkTools = "1.5" @@ -13,3 +22,9 @@ Enzyme = "0.13" Lux = "1.1" Random = "1.10" julia = "1.10" + +[extras] +CUDA_Driver_jll = "4ee394cb-3365-5eb0-8335-949819d2adfc" + +[preferences.CUDA_Driver_jll] +compat = false diff --git a/benchmark/aggregate.jl b/benchmark/aggregate.jl new file mode 100644 index 000000000..36d9bb19c --- /dev/null +++ b/benchmark/aggregate.jl @@ -0,0 +1,36 @@ +using BenchmarkTools + +const BACKENDS = ["CPU", "CUDA"] + +const CPU_Results = joinpath(dirname(@__FILE__), "results", "CPUbenchmarks.json") +@assert(ispath(CPU_Results)) + +const RESULTS = BenchmarkTools.load(CPU_Results)[1] +@assert RESULTS isa BenchmarkTools.BenchmarkGroup + +for backend in BACKENDS[2:end] + @info "Aggregating results for $(backend)" + filename = string(backend, "benchmarks.json") + filepath = joinpath(dirname(@__FILE__), "results", filename) + if !ispath(filepath) + @warn "No file found at path: $(filepath)" + else + backend_results = BenchmarkTools.load(filepath)[1] + if backend_results isa BenchmarkTools.BenchmarkGroup + # /// + for benchmark in keys(RESULTS) + for pass in keys(RESULTS[benchmark]) + for pkg in keys(backend_results[benchmark][pass][backend]) + RESULTS[benchmark][pass][backend][pkg] = backend_results[benchmark][pass][backend][pkg] + end + end + end + else + @warn "Unexpected file format for file at path: $(filepath)" + end + end +end + +BenchmarkTools.save( + joinpath(dirname(@__FILE__), "results", "combinedbenchmarks.json"), RESULTS +) diff --git a/benchmark/runbenchmarks.jl b/benchmark/runbenchmarks.jl new file mode 100644 index 000000000..7dc061c56 --- /dev/null +++ b/benchmark/runbenchmarks.jl @@ -0,0 +1,57 @@ +# Accelerator Support for testing non-Reactant performance +using LuxCUDA + +using BenchmarkTools: BenchmarkTools, BenchmarkGroup, @btime, @benchmarkable +using CpuId: CpuId +using InteractiveUtils: versioninfo +using LinearAlgebra: BLAS 
+using Reactant: Reactant +using Statistics: median + +# To run benchmarks on a specific GPU backend, add AMDGPU / CUDA / Metal / oneAPI +# to benchmark/Project.toml and change BENCHMARK_GROUP to the backend name +const BENCHMARK_GROUP = get(ENV, "BENCHMARK_GROUP", "CPU") +@info "Running benchmarks for $BENCHMARK_GROUP" + +BenchmarkTools.DEFAULT_PARAMETERS.seconds = 20 + +if BENCHMARK_GROUP == "CPU" + if Sys.isapple() && (Sys.ARCH == :aarch64 || Sys.ARCH == :arm64) + @info "Running benchmarks on Apple with ARM CPUs. Using `AppleAccelerate.jl`." + using AppleAccelerate: AppleAccelerate + end + + if Sys.ARCH == :x86_64 && occursin("intel", lowercase(CpuId.cpubrand())) + @info "Running benchmarks on Intel CPUs. Loading `MKL.jl`." + using MKL: MKL + end +end + +const BENCHMARK_CPU_THREADS = Threads.nthreads() +BLAS.set_num_threads(BENCHMARK_CPU_THREADS) + +@info sprint(versioninfo) +@info "BLAS threads: $(BLAS.get_num_threads())" + +const SUITE = BenchmarkGroup() + +if BENCHMARK_GROUP == "CUDA" + Reactant.set_default_backend("gpu") + @info "Running CUDA benchmarks" maxlog = 1 + CUDA.versioninfo() +else + @info "Running CPU benchmarks with $(BENCHMARK_CPU_THREADS) thread(s)" maxlog = 1 +end + +# Main benchmark files +include("setup.jl") +setup_benchmarks!(SUITE, BENCHMARK_GROUP) + +results = BenchmarkTools.run(SUITE; verbose=true) + +filepath = joinpath(dirname(@__FILE__), "results") +mkpath(filepath) +filename = string(BENCHMARK_GROUP, "benchmarks.json") +BenchmarkTools.save(joinpath(filepath, filename), median(results)) + +@info "Saved results to $(joinpath(filepath, filename))" diff --git a/benchmark/setup.jl b/benchmark/setup.jl new file mode 100644 index 000000000..108554741 --- /dev/null +++ b/benchmark/setup.jl @@ -0,0 +1,90 @@ +using Boltz: Vision +using Lux: Lux +using MLDataDevices: AbstractDevice, CPUDevice, CUDADevice +using Random: Random +using Reactant: Reactant, @compile + +using Enzyme: Enzyme +using Zygote: Zygote + +# Helper Functions +@inline 
synchronize(::CPUDevice) = nothing +@inline synchronize(::CUDADevice) = CUDA.synchronize() + +@inline reclaim(::CPUDevice) = GC.gc() +@inline reclaim(::CUDADevice) = CUDA.reclaim() + +@inline sumabs2(model, x, p, st) = sum(abs2, first(Lux.apply(model, x, p, st))) +@inline sumabs2(model, x) = sum(abs2, model(x)) + +function benchmark_group_to_backend(benchmark_group::String) + benchmark_group == "CPU" && return CPUDevice() + benchmark_group == "CUDA" && return CUDADevice() + return error("Unknown backend: $(benchmark_group)") +end + +function general_lux_setup(model, x_dims) + rng = Random.default_rng() # don't use any other rng + ps, st = Lux.setup(rng, model) + x_dims === nothing && return ps, st + x = randn(rng, Float32, x_dims) + return x, ps, st +end + +function setup_benchmarks!(suite::BenchmarkGroup, backend::String) + dev = benchmark_group_to_backend(backend) + + setup_vit_benchmark!(suite, backend, dev) + + return nothing +end + +# Lux Benchmarks +function setup_vit_benchmark!(suite::BenchmarkGroup, backend, dev::AbstractDevice) + for mode in (:tiny, :small, :base), bsize in (4, 16, 32) + benchmark_name = "ViT $(mode) (256 x 256 x 3 x $(bsize))" + + setup_lux_forward_pass_benchmark!( + suite, benchmark_name, backend, Vision.ViT(mode), (256, 256, 3, bsize), dev + ) + end +end + +function setup_lux_forward_pass_benchmark!( + suite::BenchmarkGroup, + benchmark_name::String, + backend::String, + model, + x_dims, + dev::AbstractDevice, +) + suite[benchmark_name]["forward"][backend]["Lux"] = @benchmarkable begin + Lux.apply($model, x, ps, st_test) + synchronize($dev) + end setup = begin + GC.gc() + reclaim($dev) + x, ps, st = $dev(general_lux_setup($model, $x_dims)) + st_test = Lux.testmode(st) + GC.gc() + reclaim($dev) + end + + suite[benchmark_name]["forward"][backend]["Reactant"] = @benchmarkable begin + y, _ = apply_compiled($model, x_ra, ps_ra, st_test_ra) + Reactant.synchronize(y) + end setup = begin + GC.gc() + reclaim($dev) + x, ps, st = 
general_lux_setup($model, $x_dims) + st_test = Lux.testmode(st) + x_ra = Reactant.to_rarray(x) + ps_ra = Reactant.to_rarray(ps) + st_test_ra = Reactant.to_rarray(st_test) + apply_compiled = @compile Lux.apply($model, x_ra, ps_ra, st_test_ra) + GC.gc() + reclaim($dev) + end + + return nothing +end diff --git a/src/XLA.jl b/src/XLA.jl index 684511e68..9e77dac6d 100644 --- a/src/XLA.jl +++ b/src/XLA.jl @@ -446,7 +446,7 @@ end const AsyncEmptyBuffer = AsyncBuffer(Buffer(C_NULL), nothing) @inline function await(buffer::AsyncBuffer)::Nothing - if buffer.future == nothing + if buffer.future === nothing return nothing else future = buffer.future @@ -457,7 +457,7 @@ const AsyncEmptyBuffer = AsyncBuffer(Buffer(C_NULL), nothing) end @inline function synced_buffer(buffer::AsyncBuffer) - if buffer.future != nothing + if buffer.future !== nothing future = buffer.future buffer.future = nothing await(future::Future) From 55ce2cdf40545ba9ac41836ccce2e5b787cf9ea0 Mon Sep 17 00:00:00 2001 From: William Moses Date: Tue, 1 Oct 2024 22:10:59 -0500 Subject: [PATCH 2/3] Bump dependencies (#143) --- deps/ReactantExtra/BUILD | 10 +++++----- deps/ReactantExtra/WORKSPACE | 7 +++---- deps/ReactantExtra/tblgen/jl-generators.cc | 8 ++------ 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/deps/ReactantExtra/BUILD b/deps/ReactantExtra/BUILD index 0d9f17650..a62d1bad0 100644 --- a/deps/ReactantExtra/BUILD +++ b/deps/ReactantExtra/BUILD @@ -342,8 +342,8 @@ cc_library( "@xla//xla/service/cpu:cpu_transfer_manager", "@xla//xla/pjrt/gpu:se_gpu_pjrt_client", - - "@tsl//tsl/protobuf:protos_all_cc_impl", + + "@xla//xla/tsl/protobuf:protos_all_cc_impl", "@xla//xla/tsl/framework:allocator_registry_impl", "@xla//xla/pjrt:status_casters", @@ -352,9 +352,9 @@ cc_library( "@xla//xla/python/ifrt/hlo:hlo_program", "@xla//xla/ffi:call_frame", "@com_google_protobuf//:protobuf", - "@tsl//tsl/profiler/backends/cpu:annotation_stack_impl", - "@tsl//tsl/profiler/backends/cpu:traceme_recorder_impl", - 
"@tsl//tsl/profiler/utils:time_utils_impl", + "@xla//xla/tsl/profiler/backends/cpu:annotation_stack_impl", + "@xla//xla/tsl/profiler/backends/cpu:traceme_recorder_impl", + "@xla//xla/tsl/profiler/utils:time_utils_impl", "@tsl//tsl/platform:env_impl", "@xla//xla/stream_executor:stream_executor_impl", "@xla//xla/mlir/utils:type_util", diff --git a/deps/ReactantExtra/WORKSPACE b/deps/ReactantExtra/WORKSPACE index 0ffa879f6..b73cf25b6 100644 --- a/deps/ReactantExtra/WORKSPACE +++ b/deps/ReactantExtra/WORKSPACE @@ -9,7 +9,7 @@ http_archive( urls = ["https://github.com/wsmoses/nsync/archive/{commit}.tar.gz".format(commit = NSYNC_COMMIT)], ) -ENZYMEXLA_COMMIT = "27ccd5e6ace279781ad437ddd551e8a1b9a3de9e" +ENZYMEXLA_COMMIT = "c81918c2e1625b3ed74dca4e123b2b65ab4aaf3a" ENZYMEXLA_SHA256 = "" http_archive( @@ -91,12 +91,11 @@ http_archive( patches = ["@enzyme_ad//:patches/jax.patch"], ) -# load("@jax//third_party/xla:workspace.bzl", "XLA_COMMIT", "XLA_SHA256") -XLA_COMMIT = "7d4f8d1e8a91e67a713ac69796a22f343d292327" +load("@jax//third_party/xla:workspace.bzl", "XLA_COMMIT", "XLA_SHA256") http_archive( name = "xla", - # sha256 = XLA_SHA256, + sha256 = XLA_SHA256, strip_prefix = "xla-" + XLA_COMMIT, urls = ["https://github.com/wsmoses/xla/archive/{commit}.tar.gz".format(commit = XLA_COMMIT)], patch_cmds = XLA_PATCHES diff --git a/deps/ReactantExtra/tblgen/jl-generators.cc b/deps/ReactantExtra/tblgen/jl-generators.cc index 30a65bfc9..ba2069eed 100644 --- a/deps/ReactantExtra/tblgen/jl-generators.cc +++ b/deps/ReactantExtra/tblgen/jl-generators.cc @@ -118,7 +118,7 @@ namespace return description; } - std::string getDialectName(llvm::ArrayRef op_defs) { + std::string getDialectName(llvm::ArrayRef op_defs) { mlir::tblgen::Operator any_op(op_defs.front()); assert( std::all_of(op_defs.begin(), op_defs.end(), [&any_op](llvm::Record* op) { @@ -163,11 +163,7 @@ extern bool disableModuleWrap; bool emitOpTableDefs(const llvm::RecordKeeper &recordKeeper, llvm::raw_ostream &os) { -#if 
LLVM_VERSION_MAJOR >= 16 - std::vector opdefs = recordKeeper.getAllDerivedDefinitionsIfDefined("Op"); -#else - std::vector opdefs = recordKeeper.getAllDerivedDefinitions("Op"); -#endif + llvm::ArrayRef opdefs = recordKeeper.getAllDerivedDefinitionsIfDefined("Op"); const char *moduleTemplate; if (disableModuleWrap) From 4f4cb40af080a6f60a63fa3e747b95788804205c Mon Sep 17 00:00:00 2001 From: William Moses Date: Tue, 1 Oct 2024 23:22:15 -0400 Subject: [PATCH 3/3] don't build rocm --- deps/ReactantExtra/BUILD | 1 - 1 file changed, 1 deletion(-) diff --git a/deps/ReactantExtra/BUILD b/deps/ReactantExtra/BUILD index a62d1bad0..6a755c2eb 100644 --- a/deps/ReactantExtra/BUILD +++ b/deps/ReactantExtra/BUILD @@ -367,7 +367,6 @@ cc_library( ] + select({ "@xla//xla/tsl:is_cuda_enabled_and_oss":[ "@xla//xla/stream_executor/cuda:all_runtime", - "@xla//xla/stream_executor/rocm:all_runtime", "@xla//xla/service/gpu/model:hlo_op_profiles", "@xla//xla/service/gpu/model:hlo_op_profile_proto_cc_impl", "@xla//xla/service/gpu:nvptx_compiler",