Skip to content

Commit

Permalink
Merge branch 'main' into gd/allow-ci-failure
Browse files Browse the repository at this point in the history
  • Loading branch information
gdalle authored Oct 2, 2024
2 parents f94eda9 + 4f4cb40 commit 217a11b
Show file tree
Hide file tree
Showing 11 changed files with 329 additions and 18 deletions.
71 changes: 71 additions & 0 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,77 @@ steps:
if: build.message !~ /\[skip tests\]/
timeout_in_minutes: 60

# Benchmark jobs: run the CPU and CUDA suites, then merge their result files.
- group: ":racehorse: Benchmarks"
steps:
# Runs benchmark/runbenchmarks.jl with BENCHMARK_GROUP=CPU and uploads benchmark/results/*.
- label: "CPU: Run Benchmarks"
plugins:
- JuliaCI/julia#v1:
version: "1"
command: |
julia --project=benchmark -e 'println("--- :julia: Instantiating project")
using Pkg
Pkg.develop([PackageSpec(path=pwd())])'
julia --project=benchmark -e 'println("--- :julia: Run Benchmarks")
include("benchmark/runbenchmarks.jl")'
artifact_paths:
- "benchmark/results/*"
agents:
# The models are quite large, so this job needs a decent-sized machine; it borrows
# the SciMLBenchmarks agents on the `juliaecosystem` queue.
queue: "juliaecosystem"
sandbox_capable: true
exclusive: true
arch: "x86_64"
env:
BENCHMARK_GROUP: CPU
JULIA_NUM_THREADS: "auto"
timeout_in_minutes: 120

# Same commands as the CPU job, but with BENCHMARK_GROUP=CUDA on a GPU agent.
- label: "CUDA: Run Benchmarks"
plugins:
- JuliaCI/julia#v1:
version: "1"
command: |
julia --project=benchmark -e 'println("--- :julia: Instantiating project")
using Pkg
Pkg.develop([PackageSpec(path=pwd())])'
julia --project=benchmark -e 'println("--- :julia: Run Benchmarks")
include("benchmark/runbenchmarks.jl")'
artifact_paths:
- "benchmark/results/*"
agents:
queue: "benchmark"
gpu: "rtx4070"
cuda: "*"
env:
BENCHMARK_GROUP: CUDA
JULIA_NUM_THREADS: "auto"
timeout_in_minutes: 120

# Wait for the jobs above; `continue_on_failure` lets the combine step run even if
# one of them failed.
- wait: ~
continue_on_failure: true

# Downloads the per-backend result files, merges them with benchmark/aggregate.jl and
# uploads benchmark/results/combinedbenchmarks.json.
- label: "Combine benchmarks"
plugins:
- JuliaCI/julia#v1:
version: "1"
command: |
buildkite-agent artifact download "benchmark/results/*" .
julia -e 'println("--- :julia: Instantiating project")
using Pkg
Pkg.add("BenchmarkTools")
println("--- :julia: Combining Benchmarks")
include("benchmark/aggregate.jl")'
artifact_paths:
- "benchmark/results/combinedbenchmarks.json"
agents:
queue: "juliagpu"
timeout_in_minutes: 10

# - label: "AMDGPU Julia v{{matrix.version}}"
# matrix:
# setup:
Expand Down
46 changes: 46 additions & 0 deletions .github/workflows/benchmark_aggregate.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Fetches the combined benchmark results uploaded by the Buildkite pipeline and
# publishes them with github-action-benchmark.
name: Benchmarks
permissions:
contents: write # contents permission to update benchmark contents in gh-pages branch
statuses: read
deployments: write # deployments permission to deploy GitHub pages website
pull-requests: write

on:
pull_request:

push:
branches:
- main

jobs:
benchmark:
# Skip the job when the head commit message contains "[skip benchmarks]".
# NOTE(review): `github.event.head_commit` is presumably only populated for push
# events, so the marker may have no effect on pull requests — confirm.
if: ${{ !contains(github.event.head_commit.message, '[skip benchmarks]') }}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
# Download the Buildkite artifacts into ./artifacts.
- name: Download Buildkite Artifacts
id: download
uses: EnricoMi/download-buildkite-artifact-action@v1
with:
buildkite_token: ${{ secrets.BUILDKITE_TOKEN }}
output_path: artifacts

# Runs only when the download step reported success; exposes the location of
# combinedbenchmarks.json as the `path` output (empty when nothing is found).
# NOTE(review): `find` prints every match, one per line — assumes a single
# combinedbenchmarks.json is present; confirm.
- name: Locate Benchmarks Artifact
id: locate
if: ${{ steps.download.outputs.download-state == 'success' }}
run: echo "path=$(find artifacts -type f -name combinedbenchmarks.json 2>/dev/null)" >> $GITHUB_OUTPUT

# Skipped when no combined results file was located. Results are pushed only for
# events other than pull requests (see `auto-push`).
- name: Upload Benchmark Results
if: ${{ steps.locate.outputs.path != '' }}
uses: benchmark-action/github-action-benchmark@v1
with:
name: Reactant.jl Benchmarks
tool: "julia"
output-file-path: ${{ steps.locate.outputs.path }}
benchmark-data-dir-path: "benchmarks"
github-token: ${{ secrets.GITHUB_TOKEN }}
comment-always: true
summary-always: true
alert-threshold: "150%"
fail-on-alert: false
auto-push: ${{ github.event_name != 'pull_request' }}
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -273,3 +273,5 @@ deps/ReactantExtra/MODULE.bazel.lock
external

archive/

benchmark/results/*
15 changes: 15 additions & 0 deletions benchmark/Project.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
[deps]
AppleAccelerate = "13e28ba4-7ad8-5781-acae-3021b1ed3924"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
Boltz = "4544d5e4-abc5-4dea-817f-29e4c205d9c8"
CpuId = "adafc99b-e345-5852-983c-f28acb93d879"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2"
MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[compat]
BenchmarkTools = "1.5"
Expand All @@ -13,3 +22,9 @@ Enzyme = "0.13"
Lux = "1.1"
Random = "1.10"
julia = "1.10"

[extras]
CUDA_Driver_jll = "4ee394cb-3365-5eb0-8335-949819d2adfc"

[preferences.CUDA_Driver_jll]
compat = false
36 changes: 36 additions & 0 deletions benchmark/aggregate.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
using BenchmarkTools

# Backends whose result files are merged. The first entry (CPU) seeds the combined
# group; every later entry is merged into it.
const BACKENDS = ["CPU", "CUDA"]

# Directory holding the per-backend `<BACKEND>benchmarks.json` files.
const RESULTS_DIR = joinpath(dirname(@__FILE__), "results")

const CPU_Results = joinpath(RESULTS_DIR, "CPUbenchmarks.json")
# Validate inputs with explicit errors rather than `@assert`: assertions are meant for
# internal invariants and may be disabled, but a missing or malformed input file must
# always abort the aggregation.
ispath(CPU_Results) || error("No CPU benchmark results found at path: $(CPU_Results)")

const RESULTS = BenchmarkTools.load(CPU_Results)[1]
RESULTS isa BenchmarkTools.BenchmarkGroup ||
    error("Unexpected file format for file at path: $(CPU_Results)")

for backend in BACKENDS[2:end]
    @info "Aggregating results for $(backend)"
    filepath = joinpath(RESULTS_DIR, string(backend, "benchmarks.json"))
    if !ispath(filepath)
        @warn "No file found at path: $(filepath)"
        continue
    end

    backend_results = BenchmarkTools.load(filepath)[1]
    if !(backend_results isa BenchmarkTools.BenchmarkGroup)
        @warn "Unexpected file format for file at path: $(filepath)"
        continue
    end

    # Layout: <benchmark name>/<forward or reverse>/<backend>/<reactant or package>
    # Walk the backend's own results (not the CPU ones) so that benchmarks or passes
    # that exist only for this backend are still merged, and so that indexing never
    # has to create empty groups inside `backend_results`.
    for benchmark in keys(backend_results), pass in keys(backend_results[benchmark])
        pass_group = backend_results[benchmark][pass]
        haskey(pass_group, backend) || continue
        for pkg in keys(pass_group[backend])
            RESULTS[benchmark][pass][backend][pkg] = pass_group[backend][pkg]
        end
    end
end

BenchmarkTools.save(joinpath(RESULTS_DIR, "combinedbenchmarks.json"), RESULTS)
57 changes: 57 additions & 0 deletions benchmark/runbenchmarks.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Accelerator Support for testing non-Reactant performance
using LuxCUDA

using BenchmarkTools: BenchmarkTools, BenchmarkGroup, @btime, @benchmarkable
using CpuId: CpuId
using InteractiveUtils: versioninfo
using LinearAlgebra: BLAS
using Reactant: Reactant
using Statistics: median

# To run benchmarks on a specific GPU backend, add AMDGPU / CUDA / Metal / oneAPI
# to benchmark/Project.toml and change BENCHMARK_GROUP to the backend name.
# BENCHMARK_GROUP is read from the environment and defaults to "CPU".
const BENCHMARK_GROUP = get(ENV, "BENCHMARK_GROUP", "CPU")
@info "Running benchmarks for $BENCHMARK_GROUP"

# Raise the default BenchmarkTools `seconds` parameter for every benchmark in the suite.
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 20

# For CPU runs, load a platform-specific package: AppleAccelerate on Apple ARM machines
# and MKL on Intel x86_64 machines.
# NOTE(review): presumably this is so the baselines use a tuned BLAS — confirm.
if BENCHMARK_GROUP == "CPU"
if Sys.isapple() && (Sys.ARCH == :aarch64 || Sys.ARCH == :arm64)
@info "Running benchmarks on Apple with ARM CPUs. Using `AppleAccelerate.jl`."
using AppleAccelerate: AppleAccelerate
end

if Sys.ARCH == :x86_64 && occursin("intel", lowercase(CpuId.cpubrand()))
@info "Running benchmarks on Intel CPUs. Loading `MKL.jl`."
using MKL: MKL
end
end

# Use as many BLAS threads as Julia threads.
const BENCHMARK_CPU_THREADS = Threads.nthreads()
BLAS.set_num_threads(BENCHMARK_CPU_THREADS)

@info sprint(versioninfo)
@info "BLAS threads: $(BLAS.get_num_threads())"

# Root group that `setup_benchmarks!` fills in.
const SUITE = BenchmarkGroup()

# For CUDA runs, switch Reactant's default backend to "gpu" and print CUDA details.
# NOTE(review): `CUDA` is not imported by name in this file; presumably it is made
# available by `using LuxCUDA` — confirm.
if BENCHMARK_GROUP == "CUDA"
Reactant.set_default_backend("gpu")
@info "Running CUDA benchmarks" maxlog = 1
CUDA.versioninfo()
else
@info "Running CPU benchmarks with $(BENCHMARK_CPU_THREADS) thread(s)" maxlog = 1
end

# Main benchmark files
include("setup.jl")
setup_benchmarks!(SUITE, BENCHMARK_GROUP)

results = BenchmarkTools.run(SUITE; verbose=true)

# Save the median of the results as `<BENCHMARK_GROUP>benchmarks.json` in a `results`
# directory next to this file (created if it does not exist).
filepath = joinpath(dirname(@__FILE__), "results")
mkpath(filepath)
filename = string(BENCHMARK_GROUP, "benchmarks.json")
BenchmarkTools.save(joinpath(filepath, filename), median(results))

@info "Saved results to $(joinpath(filepath, filename))"
90 changes: 90 additions & 0 deletions benchmark/setup.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
using Boltz: Vision
using Lux: Lux
using MLDataDevices: AbstractDevice, CPUDevice, CUDADevice
using Random: Random
using Reactant: Reactant, @compile

using Enzyme: Enzyme
using Zygote: Zygote

# Helper Functions
# `synchronize(dev)`: does nothing for a `CPUDevice`; calls `CUDA.synchronize()` for a
# `CUDADevice`.
@inline function synchronize(::CPUDevice)
    return nothing
end
@inline function synchronize(::CUDADevice)
    return CUDA.synchronize()
end

# `reclaim(dev)`: runs the Julia garbage collector for a `CPUDevice`; calls
# `CUDA.reclaim()` for a `CUDADevice`.
@inline function reclaim(::CPUDevice)
    return GC.gc()
end
@inline function reclaim(::CUDADevice)
    return CUDA.reclaim()
end

# Sum of squared entries of the first value returned by `Lux.apply(model, x, p, st)`.
@inline function sumabs2(model, x, p, st)
    output = first(Lux.apply(model, x, p, st))
    return sum(abs2, output)
end

# Sum of squared entries of `model(x)` for a plain callable `model`.
@inline function sumabs2(model, x)
    return sum(abs2, model(x))
end

# Map a benchmark group name to a device object: "CPU" gives `CPUDevice()`, "CUDA"
# gives `CUDADevice()`. Any other name raises an error naming the unknown backend.
function benchmark_group_to_backend(benchmark_group::String)
    if benchmark_group == "CPU"
        return CPUDevice()
    elseif benchmark_group == "CUDA"
        return CUDADevice()
    end
    return error("Unknown backend: $(benchmark_group)")
end

# Initialise `model` with `Lux.setup` using the default RNG and, unless `x_dims` is
# `nothing`, draw a `Float32` normal input of size `x_dims` from the same RNG.
# Returns `(ps, st)` when `x_dims === nothing`, otherwise `(x, ps, st)`.
function general_lux_setup(model, x_dims)
    # Keep the default RNG: parameters are drawn first, then the input.
    rng = Random.default_rng() # don't use any other rng
    params, states = Lux.setup(rng, model)
    if x_dims === nothing
        return params, states
    end
    input = randn(rng, Float32, x_dims)
    return input, params, states
end

# Populate `suite` with every benchmark defined in this file for the benchmark group
# named `backend`. Returns `nothing`.
function setup_benchmarks!(suite::BenchmarkGroup, backend::String)
    # Resolve the group name to a device object once and hand it to each registrar.
    device = benchmark_group_to_backend(backend)
    setup_vit_benchmark!(suite, backend, device)
    return nothing
end

# Lux Benchmarks
# Register forward-pass benchmarks for the `:tiny`, `:small` and `:base` ViT models at
# batch sizes 4, 16 and 32, each with a 256 x 256 x 3 x batch input.
function setup_vit_benchmark!(suite::BenchmarkGroup, backend, dev::AbstractDevice)
    batch_sizes = (4, 16, 32)
    for variant in (:tiny, :small, :base)
        for batch in batch_sizes
            label = "ViT $(variant) (256 x 256 x 3 x $(batch))"
            vit = Vision.ViT(variant)
            input_dims = (256, 256, 3, batch)
            setup_lux_forward_pass_benchmark!(suite, label, backend, vit, input_dims, dev)
        end
    end
    return nothing
end

# Register two forward-pass benchmarks for `model` under
# `suite[benchmark_name]["forward"][backend]`: one keyed "Lux" and one keyed "Reactant".
#
# Arguments:
# - `suite`: benchmark group that receives the new entries.
# - `benchmark_name`: top-level key for this model/input combination.
# - `backend`: benchmark group name, used as the third-level key.
# - `model`: model passed to `general_lux_setup` and `Lux.apply`.
# - `x_dims`: input size forwarded to `general_lux_setup`.
# - `dev`: device object used to move data and to synchronize/reclaim.
#
# Returns `nothing`.
function setup_lux_forward_pass_benchmark!(
suite::BenchmarkGroup,
benchmark_name::String,
backend::String,
model,
x_dims,
dev::AbstractDevice,
)
# "Lux" entry: call `Lux.apply` on data moved with `dev`, then synchronize the device.
suite[benchmark_name]["forward"][backend]["Lux"] = @benchmarkable begin
Lux.apply($model, x, ps, st_test)
synchronize($dev)
end setup = begin
# Collect garbage and reclaim device memory before and after building the inputs.
GC.gc()
reclaim($dev)
x, ps, st = $dev(general_lux_setup($model, $x_dims))
st_test = Lux.testmode(st)
GC.gc()
reclaim($dev)
end

# "Reactant" entry: call the function compiled in `setup`, then wait on its output
# with `Reactant.synchronize`.
suite[benchmark_name]["forward"][backend]["Reactant"] = @benchmarkable begin
y, _ = apply_compiled($model, x_ra, ps_ra, st_test_ra)
Reactant.synchronize(y)
end setup = begin
GC.gc()
reclaim($dev)
# NOTE(review): unlike the "Lux" entry, the inputs are not passed through `dev`;
# presumably Reactant places them on its own default backend — confirm.
x, ps, st = general_lux_setup($model, $x_dims)
st_test = Lux.testmode(st)
x_ra = Reactant.to_rarray(x)
ps_ra = Reactant.to_rarray(ps)
st_test_ra = Reactant.to_rarray(st_test)
# Compilation happens here in `setup`, not in the benchmarked expression.
apply_compiled = @compile Lux.apply($model, x_ra, ps_ra, st_test_ra)
GC.gc()
reclaim($dev)
end

return nothing
end
11 changes: 5 additions & 6 deletions deps/ReactantExtra/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -342,8 +342,8 @@ cc_library(

"@xla//xla/service/cpu:cpu_transfer_manager",
"@xla//xla/pjrt/gpu:se_gpu_pjrt_client",
"@tsl//tsl/protobuf:protos_all_cc_impl",

"@xla//xla/tsl/protobuf:protos_all_cc_impl",
"@xla//xla/tsl/framework:allocator_registry_impl",

"@xla//xla/pjrt:status_casters",
Expand All @@ -352,9 +352,9 @@ cc_library(
"@xla//xla/python/ifrt/hlo:hlo_program",
"@xla//xla/ffi:call_frame",
"@com_google_protobuf//:protobuf",
"@tsl//tsl/profiler/backends/cpu:annotation_stack_impl",
"@tsl//tsl/profiler/backends/cpu:traceme_recorder_impl",
"@tsl//tsl/profiler/utils:time_utils_impl",
"@xla//xla/tsl/profiler/backends/cpu:annotation_stack_impl",
"@xla//xla/tsl/profiler/backends/cpu:traceme_recorder_impl",
"@xla//xla/tsl/profiler/utils:time_utils_impl",
"@tsl//tsl/platform:env_impl",
"@xla//xla/stream_executor:stream_executor_impl",
"@xla//xla/mlir/utils:type_util",
Expand All @@ -367,7 +367,6 @@ cc_library(
] + select({
"@xla//xla/tsl:is_cuda_enabled_and_oss":[
"@xla//xla/stream_executor/cuda:all_runtime",
"@xla//xla/stream_executor/rocm:all_runtime",
"@xla//xla/service/gpu/model:hlo_op_profiles",
"@xla//xla/service/gpu/model:hlo_op_profile_proto_cc_impl",
"@xla//xla/service/gpu:nvptx_compiler",
Expand Down
Loading

0 comments on commit 217a11b

Please sign in to comment.