From 6defd04b4b534747ab8a8ad5734fbc06075d95f2 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Sat, 12 Oct 2024 17:46:47 +0200 Subject: [PATCH 01/12] simplify test machinery --- prova.jl | 11 +++ test/ext_amdgpu/basic.jl | 23 +++--- test/ext_amdgpu/runtests.jl | 3 - test/ext_amdgpu/test_utils.jl | 15 ---- test/ext_cuda/layers.jl | 82 +++----------------- test/ext_cuda/losses.jl | 2 +- test/ext_cuda/runtests.jl | 3 - test/ext_cuda/test_utils.jl | 4 - test/ext_metal/basic.jl | 2 +- test/ext_metal/runtests.jl | 2 - test/ext_metal/test_utils.jl | 16 ---- test/layers/attention.jl | 7 +- test/runtests.jl | 48 ++++++------ test/test_utils.jl | 141 +++++++++++++++------------------- 14 files changed, 121 insertions(+), 238 deletions(-) create mode 100644 prova.jl delete mode 100644 test/ext_amdgpu/test_utils.jl delete mode 100644 test/ext_cuda/test_utils.jl delete mode 100644 test/ext_metal/test_utils.jl diff --git a/prova.jl b/prova.jl new file mode 100644 index 0000000000..63ebbbdfe2 --- /dev/null +++ b/prova.jl @@ -0,0 +1,11 @@ +using Flux, FiniteDifferences, Test, Zygote, Functors, Metal +include("test/test_utils.jl") + +m = Dense(3, 3) +x = rand(Float32, 3, 3) +test_gradients(m, x; rtol=1e-4, atol=1e-4) + +m = MultiHeadAttention(4, nheads=2) +x = rand(Float32, 4, 3, 2) +m(x) +test_gradients(m, x; loss = o -> sum(o[1].^2) + sum(o[2].^2)) diff --git a/test/ext_amdgpu/basic.jl b/test/ext_amdgpu/basic.jl index 831b577d48..4f93ca5145 100644 --- a/test/ext_amdgpu/basic.jl +++ b/test/ext_amdgpu/basic.jl @@ -19,9 +19,9 @@ end end @testset "Chain of Dense layers" begin - m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) |> f32 + m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) x = rand(Float32, 10, 10) - gpu_autodiff_test(m, x) + test_gradients(m, x, test_gpu=true) end @testset "Convolution" begin @@ -29,11 +29,12 @@ end m = conv_type(tuple(fill(2, nd)...), 3 => 4) |> f32 x = rand(Float32, fill(10, nd)..., 3, 5) + md, xd = Flux.gpu.((m, x)) + y = m(x) # Ensure outputs are the same. - gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false) + @test collect(md(xd)) ≈ y atol=1f-3 # Gradients are flipped as well. - md, xd = Flux.gpu.((m, x)) gs = gradient(m -> sum(m(x)), m) gsd = gradient(m -> sum(m(xd)), md) @@ -74,16 +75,18 @@ end end @testset "Chain(Conv)" begin - m = Chain(Conv((3, 3), 3 => 3)) |> f32 + m = Chain(Conv((3, 3), 3 => 3)) x = rand(Float32, 10, 10, 3, 2) - gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false) + # gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false) + test_gradients(m, x, test_gpu=true) md = m |> gpu |> cpu @test md[1].weight ≈ m[1].weight atol=1f-3 - m = Chain(ConvTranspose((3, 3), 3 => 3)) |> f32 + m = Chain(ConvTranspose((3, 3), 3 => 3)) x = rand(Float32, 10, 10, 3, 2) - gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false) + # gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false) + test_gradients(m, x, test_gpu=true) md = m |> gpu |> cpu @test md[1].weight ≈ m[1].weight atol=1f-3 @@ -92,7 +95,7 @@ end @testset "Cross-correlation" begin m = CrossCor((2, 2), 3 => 4) |> f32 x = rand(Float32, 10, 10, 3, 2) - gpu_autodiff_test(m, x; atol=1f-3) + test_gradients(m, x, test_gpu=true) end @testset "Restructure" begin @@ -132,7 +135,7 @@ end bn = BatchNorm(3, σ) for nd in 1:3 x = rand(Float32, fill(2, nd - 1)..., 3, 4) - gpu_autodiff_test(bn, x; atol=1f-3, allow_nothing=true) + test_gradients(bn, x; test_gpu=true) end end diff --git a/test/ext_amdgpu/runtests.jl b/test/ext_amdgpu/runtests.jl index 9027a31f76..ec779dedea 100644 --- a/test/ext_amdgpu/runtests.jl +++ b/test/ext_amdgpu/runtests.jl @@ -2,9 +2,6 @@ @assert AMDGPU.functional() AMDGPU.allowscalar(false) -include("../test_utils.jl") -include("test_utils.jl") - @testset "get_devices" begin include("get_devices.jl") end diff --git a/test/ext_amdgpu/test_utils.jl b/test/ext_amdgpu/test_utils.jl deleted file mode 100644 index 3c84f01048..0000000000 --- a/test/ext_amdgpu/test_utils.jl +++ /dev/null @@ -1,15 +0,0 @@ -function check_grad( - g_gpu::ROCArray{Float32}, g_cpu::Array{Float32}; - atol, rtol, allow_nothing::Bool, -) - @test g_cpu ≈ collect(g_gpu) atol=atol rtol=rtol -end - -function check_grad( - g_gpu::ROCArray{Float32}, g_cpu::Zygote.FillArrays.AbstractFill; - atol, rtol, allow_nothing::Bool, -) - @test g_cpu ≈ collect(g_gpu) atol=atol rtol=rtol -end - -check_type(x::ROCArray{Float32}) = true diff --git a/test/ext_cuda/layers.jl b/test/ext_cuda/layers.jl index 63bcc8b526..c148334dcf 100644 --- a/test/ext_cuda/layers.jl +++ b/test/ext_cuda/layers.jl @@ -10,73 +10,23 @@ @test gradient(x -> sum(cpu(x)), gpu(rand(3,3))) isa Tuple end -# TODO: These layers get into scalar indexing issues. -const BROKEN_LAYERS = Union{} const ACTIVATIONS = [identity, relu, tanh, sigmoid, exp, softplus, elu, selu] -function gpu_gradtest(name::String, layers::Vector, x_cpu = nothing, args...; test_cpu = true, test_mode = false) - isnothing(x_cpu) && error("Missing input to test the layers against.") +function gpu_gradtest(name::String, layers::Vector, x_cpu, args...; test_cpu = true, test_mode = false) @testset "$name GPU grad tests" begin for layer in layers @testset "$layer Layer GPU grad test" begin # compute output and grad of parameters l_cpu = layer(args...) - l_gpu = l_cpu |> gpu if test_mode testmode!(l_cpu) - testmode!(l_gpu) end - ps_cpu = Flux.params(l_cpu) - y_cpu, back_cpu = pullback(() -> sum(l_cpu(x_cpu)), ps_cpu) - gs_cpu = back_cpu(1f0) - - x_gpu = gpu(x_cpu) - ps_gpu = Flux.params(l_gpu) - - if typeof(l_gpu) <: BROKEN_LAYERS - @test_broken gradient(() -> sum(l_gpu(x_gpu)), ps_gpu) isa Flux.Zygote.Grads - else - y_gpu, back_gpu = pullback(() -> sum(l_gpu(x_gpu)), ps_gpu) - gs_gpu = back_gpu(1f0) # TODO many layers error out when backprop int 1, should fix - - # compute grad of input - xg_cpu = gradient(x -> sum(l_cpu(x)), x_cpu)[1] - xg_gpu = gradient(x -> sum(l_gpu(x)), x_gpu)[1] - - # test - if test_cpu - if layer === GroupedConvTranspose - @test y_gpu ≈ y_cpu rtol=1f-2 atol=1f-3 - else - @test y_gpu ≈ y_cpu rtol=1f-3 atol=1f-3 - end - if isnothing(xg_cpu) - @test isnothing(xg_gpu) - else - if layer === GroupedConvTranspose - @test Array(xg_gpu) ≈ xg_cpu rtol = 2f-2 atol = 1f-3 - else - @test Array(xg_gpu) ≈ xg_cpu rtol = 1f-3 atol = 1f-3 - end - end - end - @test gs_gpu isa Flux.Zygote.Grads - for (p_cpu, p_gpu) in zip(ps_cpu, ps_gpu) - if isnothing(gs_cpu[p_cpu]) - @test isnothing(gs_gpu[p_gpu]) - else - @test gs_gpu[p_gpu] isa CuArray - if test_cpu - @test Array(gs_gpu[p_gpu]) ≈ gs_cpu[p_cpu] rtol=1f-3 atol=1f-3 - end - end - end - end + test_gradients(l_cpu, x_cpu, test_gpu = true) end end end @@ -150,22 +100,17 @@ gpu_gradtest("Embedding OneHotMatrix repeated indices", embedding, OneHotMatrix( @testset "function layers" begin x = rand(Float32, 3,3) - gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=1)), x) - gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=2)), x) - gpu_autodiff_test(x -> sum(Flux.normalise(x)), x) + test_gradients(x -> sum(Flux.normalise(x; dims=1)), x) + test_gradients(x -> sum(Flux.normalise(x; dims=2)), x) + test_gradients(x -> sum(Flux.normalise(x)), x) end @testset "Zeros mapped for $cl" for cl in (Conv, ConvTranspose, CrossCor, DepthwiseConv) l = cl((2,2), 1=>3, bias = false) |> gpu ip = zeros(Float32, 28,28,1,1) |> gpu - if typeof(l) <: BROKEN_LAYERS - @test_broken sum(l(ip)) ≈ 0.f0 - @test_broken gradient(() -> sum(l(ip)), Flux.params(l)) isa Flux.Zygote.Grads - else - @test sum(l(ip)) ≈ 0.f0 - gs = gradient(() -> sum(l(ip)), Flux.params(l)) - @test l.bias ∉ gs.params - end + @test sum(l(ip)) ≈ 0.f0 + gs = gradient(() -> sum(l(ip)), Flux.params(l)) + @test l.bias ∉ gs.params end @testset "Dense without bias" begin @@ -366,14 +311,5 @@ end @test Array(y_gpu) ≈ y_cpu atol=1e-4 @test Array(α_gpu) ≈ α_cpu atol=1e-4 - gm_cpu, gx_cpu = gradient(mha_cpu, x_cpu) do mha, x - y, α = mha(x) - return sum(y.^2) + sum(α.^2) - end - gm_gpu, gx_gpu = gradient(mha_gpu, x_gpu) do mha, x - y, α = mha(x) - return sum(y.^2) + sum(α.^2) - end - check_grad(gm_gpu, gm_cpu) - check_grad(gx_gpu, gx_cpu) + test_gradients(mha_cpu, x_cpu, loss = o -> sum(o[1].^2) + sum(o[2].^2), test_gpu = true) end diff --git a/test/ext_cuda/losses.jl b/test/ext_cuda/losses.jl index b339b352bb..b5ba4d2f4d 100644 --- a/test/ext_cuda/losses.jl +++ b/test/ext_cuda/losses.jl @@ -31,7 +31,7 @@ y = [1 0 0 0 1 y = rand(Float32, 3,4) @test loss(x, y) ≈ loss(gpu(x), gpu(y)) - gpu_autodiff_test(loss, x, y) + test_gradients(loss, x, y, test_gpu=true, test_grad_f = false) # Float16 tests @test loss(f16(x), f16(y)) ≈ loss(gpu(f16(x)), gpu(f16(y))) diff --git a/test/ext_cuda/runtests.jl b/test/ext_cuda/runtests.jl index aa1f431fe7..6fdbde4250 100644 --- a/test/ext_cuda/runtests.jl +++ b/test/ext_cuda/runtests.jl @@ -7,9 +7,6 @@ using Random, LinearAlgebra, Statistics @assert CUDA.functional() CUDA.allowscalar(false) -# include("../test_utils.jl") -include("test_utils.jl") - @testset "get_devices" begin include("get_devices.jl") end diff --git a/test/ext_cuda/test_utils.jl b/test/ext_cuda/test_utils.jl deleted file mode 100644 index 10a8d0dfdf..0000000000 --- a/test/ext_cuda/test_utils.jl +++ /dev/null @@ -1,4 +0,0 @@ -check_grad(g_gpu::CuArray{Float32}, g_cpu::Array{Float32}; rtol=1e-4, atol=1e-4, allow_nothing::Bool=false) = - @test g_cpu ≈ collect(g_gpu) rtol=rtol atol=atol - -check_type(x::CuArray{Float32}) = true diff --git a/test/ext_metal/basic.jl b/test/ext_metal/basic.jl index 9e4a9ef9cb..76121a1d3a 100644 --- a/test/ext_metal/basic.jl +++ b/test/ext_metal/basic.jl @@ -23,5 +23,5 @@ end m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) x = rand(Float32, 10, 10) @test (m|>gpu)(x|>gpu) isa MtlArray{Float32, 2} - gpu_autodiff_test(m, x) + test_gradients(m, x, test_gpu=true) end diff --git a/test/ext_metal/runtests.jl b/test/ext_metal/runtests.jl index 8c8af7d896..cb9532390e 100644 --- a/test/ext_metal/runtests.jl +++ b/test/ext_metal/runtests.jl @@ -5,8 +5,6 @@ using Random, Statistics using Zygote Flux.gpu_backend!("Metal") # needs a restart -include("test_utils.jl") - @testset "data movement" begin metal_device = Flux.gpu_device() cdev = cpu_device() diff --git a/test/ext_metal/test_utils.jl b/test/ext_metal/test_utils.jl deleted file mode 100644 index f6ed32a8f4..0000000000 --- a/test/ext_metal/test_utils.jl +++ /dev/null @@ -1,16 +0,0 @@ - -function check_grad( - g_gpu::MtlArray{Float32}, g_cpu::Array{Float32}; - atol, rtol, allow_nothing::Bool, -) - @test g_cpu ≈ collect(g_gpu) atol=atol rtol=rtol -end - -function check_grad( - g_gpu::MtlArray{Float32}, g_cpu::Zygote.FillArrays.AbstractFill; - atol, rtol, allow_nothing::Bool, -) - @test g_cpu ≈ collect(g_gpu) atol=atol rtol=rtol -end - -check_type(x::MtlArray{Float32}) = true diff --git a/test/layers/attention.jl b/test/layers/attention.jl index a4c90b36ed..f51c4c192f 100644 --- a/test/layers/attention.jl +++ b/test/layers/attention.jl @@ -54,12 +54,7 @@ end @testset "gradient" begin - gm, gq = gradient(mha, q) do mha, q - y, α = mha(q) - return sum(y.^2) + sum(α.^2) - end - check_grad_type(gm, mha) - check_grad_type(gq, q) + test_gradients(mha, q, loss = ((y, α)) -> sum(y.^2) + sum(α.^2)) end end diff --git a/test/runtests.jl b/test/runtests.jl index ef3d67f4d7..7c5f08095b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,40 +10,40 @@ using Pkg ## Uncomment below to change the default test settings # ENV["FLUX_TEST_AMDGPU"] = "true" # ENV["FLUX_TEST_CUDA"] = "true" -# ENV["FLUX_TEST_METAL"] = "true" +ENV["FLUX_TEST_METAL"] = "true" # ENV["FLUX_TEST_CPU"] = "false" # ENV["FLUX_TEST_DISTRIBUTED_MPI"] = "true" # ENV["FLUX_TEST_DISTRIBUTED_NCCL"] = "true" ENV["FLUX_TEST_ENZYME"] = "false" # We temporarily disable Enzyme tests since they are failing -include("test_utils.jl") +include("test_utils.jl") # for test_gradients Random.seed!(0) @testset verbose=true "Flux.jl" begin if get(ENV, "FLUX_TEST_CPU", "true") == "true" - @testset "Utils" begin - include("utils.jl") - end - - @testset "Loading" begin - include("loading.jl") - end - - @testset "Optimise / Train" begin - include("optimise.jl") - include("train.jl") - include("tracker.jl") - end - - @testset "Data" begin - include("data.jl") - end - - @testset "Losses" begin - include("losses.jl") - include("ctc.jl") - end + # @testset "Utils" begin + # include("utils.jl") + # end + + # @testset "Loading" begin + # include("loading.jl") + # end + + # @testset "Optimise / Train" begin + # include("optimise.jl") + # include("train.jl") + # include("tracker.jl") + # end + + # @testset "Data" begin + # include("data.jl") + # end + + # @testset "Losses" begin + # include("losses.jl") + # include("ctc.jl") + # end @testset "Layers" begin include("layers/attention.jl") diff --git a/test/test_utils.jl b/test/test_utils.jl index 004d3035ad..4dcf0f5793 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -8,98 +8,79 @@ const ALL_LOSSES = [Flux.Losses.mse, Flux.Losses.mae, Flux.Losses.msle, Flux.Losses.dice_coeff_loss, Flux.Losses.poisson_loss, Flux.Losses.hinge_loss, Flux.Losses.squared_hinge_loss, - Flux.Losses.binary_focal_loss, Flux.Losses.focal_loss, Flux.Losses.siamese_contrastive_loss] + Flux.Losses.binary_focal_loss, Flux.Losses.focal_loss, + Flux.Losses.siamese_contrastive_loss] - -function check_grad(g_gpu, g_cpu; - rtol=1e-4, atol=1e-4, - allow_nothing::Bool=false) - allow_nothing && return - @warn "Unsupported types in `check_grad`: $(typeof(g_gpu)), $(typeof(g_cpu))" - @show g_gpu g_cpu - @test false +function finitediff_withgradient(f, x...) + y = f(x...) + fdm = central_fdm(5, 1) + return y, FiniteDifferences.grad(fdm, f, x...) end -check_grad(g_gpu::Base.RefValue, g_cpu::Base.RefValue; rtol=1e-4, atol=1e-4, allow_nothing::Bool=false) = - check_grad(g_gpu[], g_cpu[]; rtol, atol, allow_nothing) - -check_grad(g_gpu::Nothing, g_cpu::Nothing; rtol=1e-4, atol=1e-4, allow_nothing::Bool=false) = - @test true - -check_grad(g_gpu::Float32, g_cpu::Float32; rtol=1e-4, atol=1e-4, allow_nothing::Bool=false) = - @test g_cpu ≈ g_gpu rtol=rtol atol=atol - -function check_grad(g_gpu::Tuple, g_cpu::Tuple; rtol=1e-4, atol=1e-4, allow_nothing::Bool=false) - for (v1, v2) in zip(g_gpu, g_cpu) - check_grad(v1, v2; rtol, atol, allow_nothing) - end -end -function check_grad(g_gpu::NamedTuple, g_cpu::NamedTuple; rtol=1e-4, atol=1e-4, allow_nothing::Bool=false) - for ((k1,v1), (k2,v2)) in zip(pairs(g_gpu), pairs(g_cpu)) - @test k1 == k2 - check_grad(v1, v2; rtol, atol, allow_nothing) +function check_equal_leaves(a, b; rtol=1e-4, atol=1e-4, check_eltype=true) + fmapstructure_with_path(a, b) do kp, x, y + if x isa AbstractArray + if check_eltype + @test eltype(x) == eltype(y) + end + @test x ≈ y rtol=rtol atol=atol + elseif x isa Number + @test x ≈ y rtol=rtol atol=atol + end end end -check_type(x) = false -check_type(x::Float32) = true -check_type(x::Array{Float32}) = true -function gpu_autodiff_test( - f_cpu, - xs_cpu::Array{Float32}...; - test_equal=true, +function test_gradients( + f, + xs::Array...; rtol=1e-4, atol=1e-4, - checkgrad::Bool = true, - allow_nothing::Bool = false, - ) - - # Compare CPU & GPU function outputs. - f_gpu = f_cpu |> gpu - xs_gpu = gpu.(xs_cpu) - - y_cpu = f_cpu(xs_cpu...) - y_gpu = f_gpu(xs_gpu...) - @test collect(y_cpu) ≈ collect(y_gpu) atol=atol rtol=rtol - - checkgrad || return - - ### GRADIENT WITH RESPECT TO INPUT ### - - y_cpu, back_cpu = pullback((x...) -> f_cpu(x...), xs_cpu...) - @test check_type(y_cpu) - Δ_cpu = size(y_cpu) == () ? randn(Float32) : randn(Float32, size(y_cpu)) - gs_cpu = back_cpu(Δ_cpu) - - Δ_gpu = Δ_cpu |> gpu - y_gpu, back_gpu = pullback((x...) -> f_gpu(x...), xs_gpu...) - @test check_type(y_gpu) - gs_gpu = back_gpu(Δ_gpu) - - if test_equal - @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol - for (g_gpu, g_cpu) in zip(gs_gpu, gs_cpu) - check_grad(g_gpu, g_cpu; atol, rtol, allow_nothing) - end + test_gpu = false, + test_grad_f = true, + loss = sum + ) + + # Use finite differences gradient as a reference. + y_fd, g_fd = finitediff_withgradient((xs...) -> loss(f(xs...)), xs...) + + # Zygote gradient with respect to input. + y, g = Zygote.withgradient((xs...) -> loss(f(xs...)), xs...) + @test y ≈ y_fd rtol=rtol atol=atol + check_equal_leaves(g, g_fd; rtol, atol) + + if test_gpu + gpu_dev = gpu_device(force=true) + cpu_dev = cpu_device() + x_gpu = x |> gpu_dev + f_gpu = f |> gpu_dev + + # Zygote gradient with respect to input on GPU. + y_gpu, g_gpu = Zygote.withgradient(x -> loss(f_gpu(x)), x_gpu) + @test get_device(g_gpu) == gpu_dev + @test y_gpu |> cpu_dev ≈ y rtol=rtol atol=atol + check_equal_leaves(g_gpu |> cpu_dev, g; rtol, atol) end - ### GRADIENT WITH RESPECT TO f ### - - ps_cpu = Flux.params(f_cpu) - y_cpu, back_cpu = pullback(() -> f_cpu(xs_cpu...), ps_cpu) - gs_cpu = back_cpu(Δ_cpu) - - ps_gpu = Flux.params(f_gpu) - y_gpu, back_gpu = pullback(() -> f_gpu(xs_gpu...), ps_gpu) - gs_gpu = back_gpu(Δ_gpu) - - if test_equal - @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol - @assert length(ps_gpu) == length(ps_cpu) - for (p_gpu, p_cpu) in zip(ps_gpu, ps_cpu) - check_grad(gs_gpu[p_gpu], gs_cpu[p_cpu]; atol, rtol, allow_nothing) + if test_grad_f + # Use finite differences gradient as a reference. + # y_fd, g_fd = finitediff_withgradient(f -> loss(f(x)), f) + ps, re = Flux.destructure(f) + y_fd, g_fd = finitediff_withgradient(f -> loss(re(ps)(x)), ps) + g_fd = (re(g_fd[1]),) + + # Zygote gradient with respect to f. + y, g = Zygote.withgradient(f -> loss(f(x)), f) + @test y ≈ y_fd rtol=rtol atol=atol + check_equal_leaves(g, g_fd; rtol, atol) + + if test_gpu + # Zygote gradient with respect to input on GPU. + y_gpu, g_gpu = Zygote.withgradient(f -> loss(f(x_gpu)), f_gpu) + @test get_device(g_gpu) == gpu_dev + @test y_gpu |> cpu_dev ≈ y rtol=rtol atol=atol + check_equal_leaves(g_gpu |> cpu_dev, g; rtol, atol) end end end From a8af95d59d749e9eb07dc6bdcb73eb0f86e897da Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Sat, 12 Oct 2024 18:03:20 +0200 Subject: [PATCH 02/12] fix --- prova.jl | 11 ----------- test/Project.toml | 2 +- test/runtests.jl | 1 + test/test_utils.jl | 2 +- 4 files changed, 3 insertions(+), 13 deletions(-) delete mode 100644 prova.jl diff --git a/prova.jl b/prova.jl deleted file mode 100644 index 63ebbbdfe2..0000000000 --- a/prova.jl +++ /dev/null @@ -1,11 +0,0 @@ -using Flux, FiniteDifferences, Test, Zygote, Functors, Metal -include("test/test_utils.jl") - -m = Dense(3, 3) -x = rand(Float32, 3, 3) -test_gradients(m, x; rtol=1e-4, atol=1e-4) - -m = MultiHeadAttention(4, nheads=2) -x = rand(Float32, 4, 3, 2) -m(x) -test_gradients(m, x; loss = o -> sum(o[1].^2) + sum(o[2].^2)) diff --git a/test/Project.toml b/test/Project.toml index b5fcda422b..99f1d7175a 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -21,4 +21,4 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] FiniteDifferences = "0.12" Tracker = "0.2.33" -Enzyme = "0.12.4" +Enzyme = "0.13" diff --git a/test/runtests.jl b/test/runtests.jl index 7c5f08095b..7199f88649 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -6,6 +6,7 @@ using Random, Statistics, LinearAlgebra using IterTools: ncycle using Zygote using Pkg +using FiniteDifferences: FiniteDifferences ## Uncomment below to change the default test settings # ENV["FLUX_TEST_AMDGPU"] = "true" diff --git a/test/test_utils.jl b/test/test_utils.jl index 4dcf0f5793..fed79178f8 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -14,7 +14,7 @@ const ALL_LOSSES = [Flux.Losses.mse, Flux.Losses.mae, Flux.Losses.msle, function finitediff_withgradient(f, x...) y = f(x...) - fdm = central_fdm(5, 1) + fdm = FiniteDifferences.central_fdm(5, 1) return y, FiniteDifferences.grad(fdm, f, x...) end From b94cf64ae61cc7b7083c5b9185d701b1581dc430 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Sat, 12 Oct 2024 18:13:50 +0200 Subject: [PATCH 03/12] fix --- test/layers/attention.jl | 2 +- test/runtests.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/layers/attention.jl b/test/layers/attention.jl index f51c4c192f..2c6fd7d514 100644 --- a/test/layers/attention.jl +++ b/test/layers/attention.jl @@ -54,7 +54,7 @@ end @testset "gradient" begin - test_gradients(mha, q, loss = ((y, α)) -> sum(y.^2) + sum(α.^2)) + test_gradients(mha, q, loss = o -> sum(o[1].^2) + sum(o[2].^2)) end end diff --git a/test/runtests.jl b/test/runtests.jl index 7199f88649..9ee030ec95 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -11,7 +11,7 @@ using FiniteDifferences: FiniteDifferences ## Uncomment below to change the default test settings # ENV["FLUX_TEST_AMDGPU"] = "true" # ENV["FLUX_TEST_CUDA"] = "true" -ENV["FLUX_TEST_METAL"] = "true" +# ENV["FLUX_TEST_METAL"] = "true" # ENV["FLUX_TEST_CPU"] = "false" # ENV["FLUX_TEST_DISTRIBUTED_MPI"] = "true" # ENV["FLUX_TEST_DISTRIBUTED_NCCL"] = "true" From 77a9611fab8538f15c19e668f9496b4e0f929c3e Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Sat, 12 Oct 2024 18:56:41 +0200 Subject: [PATCH 04/12] fix --- test/runtests.jl | 1 + test/test_utils.jl | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 9ee030ec95..0ce93c59c0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -7,6 +7,7 @@ using IterTools: ncycle using Zygote using Pkg using FiniteDifferences: FiniteDifferences +using Functors: fmapstructure_with_path ## Uncomment below to change the default test settings # ENV["FLUX_TEST_AMDGPU"] = "true" diff --git a/test/test_utils.jl b/test/test_utils.jl index fed79178f8..1bdd5cb90b 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -53,11 +53,11 @@ function test_gradients( if test_gpu gpu_dev = gpu_device(force=true) cpu_dev = cpu_device() - x_gpu = x |> gpu_dev + xs_gpu = xs |> gpu_dev f_gpu = f |> gpu_dev # Zygote gradient with respect to input on GPU. - y_gpu, g_gpu = Zygote.withgradient(x -> loss(f_gpu(x)), x_gpu) + y_gpu, g_gpu = Zygote.withgradient((xs...) -> loss(f_gpu(xs...)), xs_gpu...) @test get_device(g_gpu) == gpu_dev @test y_gpu |> cpu_dev ≈ y rtol=rtol atol=atol check_equal_leaves(g_gpu |> cpu_dev, g; rtol, atol) @@ -67,17 +67,17 @@ function test_gradients( # Use finite differences gradient as a reference. # y_fd, g_fd = finitediff_withgradient(f -> loss(f(x)), f) ps, re = Flux.destructure(f) - y_fd, g_fd = finitediff_withgradient(f -> loss(re(ps)(x)), ps) + y_fd, g_fd = finitediff_withgradient(ps -> loss(re(ps)(xs...)), ps) g_fd = (re(g_fd[1]),) # Zygote gradient with respect to f. - y, g = Zygote.withgradient(f -> loss(f(x)), f) + y, g = Zygote.withgradient(f -> loss(f(xs...)), f) @test y ≈ y_fd rtol=rtol atol=atol check_equal_leaves(g, g_fd; rtol, atol) if test_gpu # Zygote gradient with respect to input on GPU. - y_gpu, g_gpu = Zygote.withgradient(f -> loss(f(x_gpu)), f_gpu) + y_gpu, g_gpu = Zygote.withgradient(f -> loss(f(xs_gpu...)), f_gpu) @test get_device(g_gpu) == gpu_dev @test y_gpu |> cpu_dev ≈ y rtol=rtol atol=atol check_equal_leaves(g_gpu |> cpu_dev, g; rtol, atol) From 18ab9b198adb517f6a38e4c7c39609bef4a75c23 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Sat, 12 Oct 2024 19:13:11 +0200 Subject: [PATCH 05/12] f64 --- test/ext_amdgpu/basic.jl | 6 +++--- test/runtests.jl | 44 ++++++++++++++++++++-------------------- test/test_utils.jl | 14 ++++++------- 3 files changed, 32 insertions(+), 32 deletions(-) diff --git a/test/ext_amdgpu/basic.jl b/test/ext_amdgpu/basic.jl index 4f93ca5145..a17d542d2d 100644 --- a/test/ext_amdgpu/basic.jl +++ b/test/ext_amdgpu/basic.jl @@ -76,7 +76,7 @@ end @testset "Chain(Conv)" begin m = Chain(Conv((3, 3), 3 => 3)) - x = rand(Float32, 10, 10, 3, 2) + x = rand(Float32, 5, 5, 3, 2) # gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false) test_gradients(m, x, test_gpu=true) @@ -84,7 +84,7 @@ end @test md[1].weight ≈ m[1].weight atol=1f-3 m = Chain(ConvTranspose((3, 3), 3 => 3)) - x = rand(Float32, 10, 10, 3, 2) + x = rand(Float32, 5, 5, 3, 2) # gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false) test_gradients(m, x, test_gpu=true) @@ -94,7 +94,7 @@ end @testset "Cross-correlation" begin m = CrossCor((2, 2), 3 => 4) |> f32 - x = rand(Float32, 10, 10, 3, 2) + x = rand(Float32, 5, 5, 3, 2) test_gradients(m, x, test_gpu=true) end diff --git a/test/runtests.jl b/test/runtests.jl index 0ce93c59c0..ff6660be14 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -24,28 +24,28 @@ Random.seed!(0) @testset verbose=true "Flux.jl" begin if get(ENV, "FLUX_TEST_CPU", "true") == "true" - # @testset "Utils" begin - # include("utils.jl") - # end - - # @testset "Loading" begin - # include("loading.jl") - # end - - # @testset "Optimise / Train" begin - # include("optimise.jl") - # include("train.jl") - # include("tracker.jl") - # end - - # @testset "Data" begin - # include("data.jl") - # end - - # @testset "Losses" begin - # include("losses.jl") - # include("ctc.jl") - # end + @testset "Utils" begin + include("utils.jl") + end + + @testset "Loading" begin + include("loading.jl") + end + + @testset "Optimise / Train" begin + include("optimise.jl") + include("train.jl") + include("tracker.jl") + end + + @testset "Data" begin + include("data.jl") + end + + @testset "Losses" begin + include("losses.jl") + include("ctc.jl") + end @testset "Layers" begin include("layers/attention.jl") diff --git a/test/test_utils.jl b/test/test_utils.jl index 1bdd5cb90b..522def9f96 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -19,12 +19,9 @@ function finitediff_withgradient(f, x...) end -function check_equal_leaves(a, b; rtol=1e-4, atol=1e-4, check_eltype=true) +function check_equal_leaves(a, b; rtol=1e-4, atol=1e-4) fmapstructure_with_path(a, b) do kp, x, y if x isa AbstractArray - if check_eltype - @test eltype(x) == eltype(y) - end @test x ≈ y rtol=rtol atol=atol elseif x isa Number @test x ≈ y rtol=rtol atol=atol @@ -43,7 +40,10 @@ function test_gradients( ) # Use finite differences gradient as a reference. - y_fd, g_fd = finitediff_withgradient((xs...) -> loss(f(xs...)), xs...) + # Cast to Float64 to avoid precision issues. + f64 = f |> f64 + xs64 = xs .|> f64 + y_fd, g_fd = finitediff_withgradient((xs...) -> loss(f64(xs...)), xs64...) # Zygote gradient with respect to input. y, g = Zygote.withgradient((xs...) -> loss(f(xs...)), xs...) @@ -66,8 +66,8 @@ function test_gradients( if test_grad_f # Use finite differences gradient as a reference. # y_fd, g_fd = finitediff_withgradient(f -> loss(f(x)), f) - ps, re = Flux.destructure(f) - y_fd, g_fd = finitediff_withgradient(ps -> loss(re(ps)(xs...)), ps) + ps, re = Flux.destructure(f64) + y_fd, g_fd = finitediff_withgradient(ps -> loss(re(ps)(xs64...)), ps) g_fd = (re(g_fd[1]),) # Zygote gradient with respect to f. From 9367b9577168caa1e766b096db204c754d3128b3 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Sat, 12 Oct 2024 19:27:06 +0200 Subject: [PATCH 06/12] fixes --- test/ext_cuda/layers.jl | 4 +--- test/ext_cuda/losses.jl | 5 +++-- test/test_utils.jl | 3 ++- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/test/ext_cuda/layers.jl b/test/ext_cuda/layers.jl index c148334dcf..0d985e59d1 100644 --- a/test/ext_cuda/layers.jl +++ b/test/ext_cuda/layers.jl @@ -11,9 +11,7 @@ end -const ACTIVATIONS = [identity, relu, tanh, - sigmoid, exp, softplus, - elu, selu] +const ACTIVATIONS = [identity, tanh, softplus, elu] function gpu_gradtest(name::String, layers::Vector, x_cpu, args...; test_cpu = true, test_mode = false) @testset "$name GPU grad tests" begin diff --git a/test/ext_cuda/losses.jl b/test/ext_cuda/losses.jl index b5ba4d2f4d..e787e64881 100644 --- a/test/ext_cuda/losses.jl +++ b/test/ext_cuda/losses.jl @@ -27,8 +27,9 @@ y = [1 0 0 0 1 @test focal_loss(x, y) ≈ focal_loss(gpu(x), gpu(y)) @testset "GPU: $loss" for loss in ALL_LOSSES - x = rand(Float32, 3,4) - y = rand(Float32, 3,4) + # let's stay far from the boundaries to avoid problems with finite differences gradients + x = 0.1f0 + 0.8f0 .* rand(Float32, 3, 4) + y = 0.1f0 + 0.8f0 .* rand(Float32, 3, 4) @test loss(x, y) ≈ loss(gpu(x), gpu(y)) test_gradients(loss, x, y, test_gpu=true, test_grad_f = false) diff --git a/test/test_utils.jl b/test/test_utils.jl index 522def9f96..a12d5fed66 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -14,7 +14,8 @@ const ALL_LOSSES = [Flux.Losses.mse, Flux.Losses.mae, Flux.Losses.msle, function finitediff_withgradient(f, x...) y = f(x...) - fdm = FiniteDifferences.central_fdm(5, 1) + # We set a range to avoid domain errors + fdm = FiniteDifferences.central_fdm(5, 1, max_range=1e-2) return y, FiniteDifferences.grad(fdm, f, x...) end From 088565f3af1e8199557f40175cc907f488be42d5 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Sat, 12 Oct 2024 19:42:21 +0200 Subject: [PATCH 07/12] fix --- test/test_utils.jl | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/test/test_utils.jl b/test/test_utils.jl index a12d5fed66..543d0603dc 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -42,8 +42,8 @@ function test_gradients( # Use finite differences gradient as a reference. # Cast to Float64 to avoid precision issues. - f64 = f |> f64 - xs64 = xs .|> f64 + f64 = f |> Flux.f64 + xs64 = xs .|> Flux.f64 y_fd, g_fd = finitediff_withgradient((xs...) -> loss(f64(xs...)), xs64...) # Zygote gradient with respect to input. @@ -85,18 +85,3 @@ function test_gradients( end end end - -# check_grad_type checks that the gradient type matches the primal type. - -check_grad_type(g::Nothing, x) = nothing - -function check_grad_type(g::AbstractArray{T1}, x::AbstractArray{T2}) where {T1, T2} - @test T1 == T2 - @test size(g) == size(x) -end - -function check_grad_type(g::NamedTuple, x::T) where T - for f in fieldnames(T) - check_grad_type(g[f], getfield(x, f)) - end -end From fde966493245d55d82c90901163ca354724f1c13 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Sun, 13 Oct 2024 09:49:13 +0200 Subject: [PATCH 08/12] fix cuda test --- src/distributed/public_api.jl | 2 +- test/ext_amdgpu/get_devices.jl | 4 ++-- test/ext_cuda/get_devices.jl | 3 --- test/ext_cuda/losses.jl | 4 ++-- test/functors.jl | 5 +---- test/runtests.jl | 4 ++-- 6 files changed, 8 insertions(+), 14 deletions(-) diff --git a/src/distributed/public_api.jl b/src/distributed/public_api.jl index 26d321814d..d5d10e42a4 100644 --- a/src/distributed/public_api.jl +++ b/src/distributed/public_api.jl @@ -132,7 +132,7 @@ Backend Agnostic API to perform an allreduce operation on the given buffer `send workers. """ function allreduce!(backend::AbstractFluxDistributedBackend, sendrecvbuf, op::F) where {F} - return __allreduce!(backend, sendrecvbuf, op, get_device()) + return __allreduce!(backend, sendrecvbuf, op, gpu_device()) end function allreduce!( diff --git a/test/ext_amdgpu/get_devices.jl b/test/ext_amdgpu/get_devices.jl index 7f4d8ccd7a..24b1d71a38 100644 --- a/test/ext_amdgpu/get_devices.jl +++ b/test/ext_amdgpu/get_devices.jl @@ -17,9 +17,9 @@ x = randn(Float32, 5, 5) cx = x |> amdgpu_device @test cx isa AMDGPU.ROCArray -# moving models to specific NVIDIA devices +# moving models to specific AMDGPU devices for id in 0:(length(AMDGPU.devices()) - 1) - current_amdgpu_device = Flux.get_device("AMDGPU", id) + current_amdgpu_device = gpu_device(id+1) global dense_model = dense_model |> current_amdgpu_device @test dense_model.weight isa AMDGPU.ROCArray diff --git a/test/ext_cuda/get_devices.jl b/test/ext_cuda/get_devices.jl index 2f4ea3bd98..e66f500650 100644 --- a/test/ext_cuda/get_devices.jl +++ b/test/ext_cuda/get_devices.jl @@ -8,9 +8,6 @@ dense_model = Dense(2 => 3) # initially lives on CPU weight = copy(dense_model.weight) # store the weight bias = copy(dense_model.bias) # store the bias -cuda_device = Flux.get_device() - -@test typeof(cuda_device) <: Flux.CUDADevice # correctness of data transfer x = randn(5, 5) diff --git a/test/ext_cuda/losses.jl b/test/ext_cuda/losses.jl index e787e64881..ecf235fe0a 100644 --- a/test/ext_cuda/losses.jl +++ b/test/ext_cuda/losses.jl @@ -28,8 +28,8 @@ y = [1 0 0 0 1 @testset "GPU: $loss" for loss in ALL_LOSSES # let's stay far from the boundaries to avoid problems with finite differences gradients - x = 0.1f0 + 0.8f0 .* rand(Float32, 3, 4) - y = 0.1f0 + 0.8f0 .* rand(Float32, 3, 4) + x = 0.1f0 .+ 0.8f0 .* rand(Float32, 3, 4) + y = 0.1f0 .+ 0.8f0 .* rand(Float32, 3, 4) @test loss(x, y) ≈ loss(gpu(x), gpu(y)) test_gradients(loss, x, y, test_gpu=true, test_grad_f = false) diff --git a/test/functors.jl b/test/functors.jl index 280b76d6f0..734eadc574 100644 --- a/test/functors.jl +++ b/test/functors.jl @@ -3,10 +3,7 @@ if !(Flux.CUDA_LOADED[] || Flux.AMDGPU_LOADED[] || Flux.METAL_LOADED[]) @test x === gpu(x) end -dev = Flux.get_device() +dev = Flux.cpu_device() @test typeof(dev) <: Flux.CPUDevice @test dev(x) == x -# specifically getting CPU device -dev = Flux.get_device("CPU") -@test typeof(dev) <: Flux.CPUDevice diff --git a/test/runtests.jl b/test/runtests.jl index ff6660be14..f44c4b7758 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -11,9 +11,9 @@ using Functors: fmapstructure_with_path ## Uncomment below to change the default test settings # ENV["FLUX_TEST_AMDGPU"] = "true" -# ENV["FLUX_TEST_CUDA"] = "true" +ENV["FLUX_TEST_CUDA"] = "true" # ENV["FLUX_TEST_METAL"] = "true" -# ENV["FLUX_TEST_CPU"] = "false" +ENV["FLUX_TEST_CPU"] = "false" # ENV["FLUX_TEST_DISTRIBUTED_MPI"] = "true" # ENV["FLUX_TEST_DISTRIBUTED_NCCL"] = "true" ENV["FLUX_TEST_ENZYME"] = "false" # We temporarily disable Enzyme tests since they are failing From 13ddabdf800508af321ba2275948f49e01e5f211 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Sun, 13 Oct 2024 10:54:34 +0200 Subject: [PATCH 09/12] tweaks --- .gitignore | 1 + test/ext_amdgpu/basic.jl | 13 ++++++------- test/runtests.jl | 4 ++-- test/test_utils.jl | 10 +++++----- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index 1e8a6b3b8a..21bd9e6e68 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ Manifest.toml LocalPreferences.toml .DS_Store docs/mymodel.bson +prova.jl diff --git a/test/ext_amdgpu/basic.jl b/test/ext_amdgpu/basic.jl index a17d542d2d..057051f23a 100644 --- a/test/ext_amdgpu/basic.jl +++ b/test/ext_amdgpu/basic.jl @@ -35,11 +35,11 @@ end @test collect(md(xd)) ≈ y atol=1f-3 # Gradients are flipped as well. - gs = gradient(m -> sum(m(x)), m) - gsd = gradient(m -> sum(m(xd)), md) + gs = gradient(m -> sum(m(x)), m)[1] + gsd = gradient(m -> sum(m(xd)), md)[1] dims = ntuple(i -> i, ndims(m.weight) - 2) - @test reverse(gs[1].weight; dims) ≈ Array(gsd[1].weight) atol=1f-2 + @test reverse(gs.weight; dims) ≈ Array(gsd.weight) atol=1f-2 # Movement back to CPU flips weights back. mh = Flux.cpu(md) @@ -77,16 +77,15 @@ end @testset "Chain(Conv)" begin m = Chain(Conv((3, 3), 3 => 3)) x = rand(Float32, 5, 5, 3, 2) - # gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false) - test_gradients(m, x, test_gpu=true) + + @test Array((m |> gpu)(x |> gpu)) ≈ m(x) atol=1f-3 md = m |> gpu |> cpu @test md[1].weight ≈ m[1].weight atol=1f-3 m = Chain(ConvTranspose((3, 3), 3 => 3)) x = rand(Float32, 5, 5, 3, 2) - # gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false) - test_gradients(m, x, test_gpu=true) + @test Array((m |> gpu)(x |> gpu)) ≈ m(x) atol=1f-3 md = m |> gpu |> cpu @test md[1].weight ≈ m[1].weight atol=1f-3 diff --git a/test/runtests.jl b/test/runtests.jl index f44c4b7758..ff6660be14 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -11,9 +11,9 @@ using Functors: fmapstructure_with_path ## Uncomment below to change the default test settings # ENV["FLUX_TEST_AMDGPU"] = "true" -ENV["FLUX_TEST_CUDA"] = "true" +# ENV["FLUX_TEST_CUDA"] = "true" # ENV["FLUX_TEST_METAL"] = "true" -ENV["FLUX_TEST_CPU"] = "false" +# ENV["FLUX_TEST_CPU"] = "false" # ENV["FLUX_TEST_DISTRIBUTED_MPI"] = "true" # ENV["FLUX_TEST_DISTRIBUTED_NCCL"] = "true" ENV["FLUX_TEST_ENZYME"] = "false" # We temporarily disable Enzyme tests since they are failing diff --git a/test/test_utils.jl b/test/test_utils.jl index 543d0603dc..a7d234b55f 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -59,8 +59,8 @@ function test_gradients( # Zygote gradient with respect to input on GPU. y_gpu, g_gpu = Zygote.withgradient((xs...) -> loss(f_gpu(xs...)), xs_gpu...) - @test get_device(g_gpu) == gpu_dev - @test y_gpu |> cpu_dev ≈ y rtol=rtol atol=atol + @test get_device(g_gpu) == get_device(xs_gpu) + @test y_gpu ≈ y rtol=rtol atol=atol check_equal_leaves(g_gpu |> cpu_dev, g; rtol, atol) end @@ -77,10 +77,10 @@ function test_gradients( check_equal_leaves(g, g_fd; rtol, atol) if test_gpu - # Zygote gradient with respect to input on GPU. + # Zygote gradient with respect to f on GPU. y_gpu, g_gpu = Zygote.withgradient(f -> loss(f(xs_gpu...)), f_gpu) - @test get_device(g_gpu) == gpu_dev - @test y_gpu |> cpu_dev ≈ y rtol=rtol atol=atol + # @test get_device(g_gpu) == get_device(xs_gpu) + @test y_gpu ≈ y rtol=rtol atol=atol check_equal_leaves(g_gpu |> cpu_dev, g; rtol, atol) end end From a2c92ba1ea1a413c3fa6bfb2351c37b57fd9e213 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Sun, 13 Oct 2024 12:53:23 +0200 Subject: [PATCH 10/12] fix cuda device --- test/ext_amdgpu/basic.jl | 8 ++++---- test/ext_cuda/get_devices.jl | 6 ++++++ test/ext_cuda/runtests.jl | 1 - 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/test/ext_amdgpu/basic.jl b/test/ext_amdgpu/basic.jl index 057051f23a..2f057a22f5 100644 --- a/test/ext_amdgpu/basic.jl +++ b/test/ext_amdgpu/basic.jl @@ -26,7 +26,7 @@ end @testset "Convolution" begin for conv_type in (Conv, ConvTranspose), nd in 1:3 - m = conv_type(tuple(fill(2, nd)...), 3 => 4) |> f32 + m = conv_type(tuple(fill(2, nd)...), 3 => 4) x = rand(Float32, fill(10, nd)..., 3, 5) md, xd = Flux.gpu.((m, x)) @@ -53,10 +53,10 @@ end x = rand(Float32, fill(10, nd)..., 3, 5) |> gpu pad = ntuple(i -> i, nd) - m = conv_type(kernel, 3 => 4, pad=pad) |> f32 |> gpu + m = conv_type(kernel, 3 => 4, pad=pad) |> gpu expanded_pad = ntuple(i -> pad[(i - 1) ÷ 2 + 1], 2 * nd) - m_expanded = conv_type(kernel, 3 => 4, pad=expanded_pad) |> f32 |> gpu + m_expanded = conv_type(kernel, 3 => 4, pad=expanded_pad) |> gpu @test size(m(x)) == size(m_expanded(x)) end @@ -92,7 +92,7 @@ end end @testset "Cross-correlation" begin - m = CrossCor((2, 2), 3 => 4) |> f32 + m = CrossCor((2, 2), 3 => 4) x = rand(Float32, 5, 5, 3, 2) test_gradients(m, x, test_gpu=true) end diff --git a/test/ext_cuda/get_devices.jl b/test/ext_cuda/get_devices.jl index e66f500650..ae722319a5 100644 --- a/test/ext_cuda/get_devices.jl +++ b/test/ext_cuda/get_devices.jl @@ -27,6 +27,12 @@ for id in 0:(length(CUDA.devices()) - 1) @test isequal(Flux.cpu(dense_model.weight), weight) @test isequal(Flux.cpu(dense_model.bias), bias) end + +# gpu_device remembers the last device selected +# Therefore, we need to reset it to the current cuda device +@test gpu_device().device.handle == length(CUDA.devices()) - 1 +gpu_device(CUDA.device().handle + 1) + # finally move to CPU, and see if things work cdev = cpu_device() dense_model = cdev(dense_model) diff --git a/test/ext_cuda/runtests.jl b/test/ext_cuda/runtests.jl index 6fdbde4250..012a62d41a 100644 --- a/test/ext_cuda/runtests.jl +++ b/test/ext_cuda/runtests.jl @@ -10,7 +10,6 @@ CUDA.allowscalar(false) @testset "get_devices" begin include("get_devices.jl") end - @testset "cuda" begin include("cuda.jl") end From fbf66c42142e6b7e2726147119e90adef731d832 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Sun, 13 Oct 2024 13:57:58 +0200 Subject: [PATCH 11/12] test less --- test/ext_amdgpu/basic.jl | 6 +++--- test/ext_cuda/layers.jl | 13 +++++++------ test/ext_cuda/losses.jl | 2 +- test/ext_metal/basic.jl | 2 +- test/test_utils.jl | 39 ++++++++++++++++++++++++--------------- 5 files changed, 36 insertions(+), 26 deletions(-) diff --git a/test/ext_amdgpu/basic.jl b/test/ext_amdgpu/basic.jl index 2f057a22f5..62a7b065f3 100644 --- a/test/ext_amdgpu/basic.jl +++ b/test/ext_amdgpu/basic.jl @@ -21,7 +21,7 @@ end @testset "Chain of Dense layers" begin m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) x = rand(Float32, 10, 10) - test_gradients(m, x, test_gpu=true) + test_gradients(m, x, test_gpu=true, compare_finite_diff=false) end @testset "Convolution" begin @@ -94,7 +94,7 @@ end @testset "Cross-correlation" begin m = CrossCor((2, 2), 3 => 4) x = rand(Float32, 5, 5, 3, 2) - test_gradients(m, x, test_gpu=true) + test_gradients(m, x, test_gpu=true, compare_finite_diff=false) end @testset "Restructure" begin @@ -134,7 +134,7 @@ end bn = BatchNorm(3, σ) for nd in 1:3 x = rand(Float32, fill(2, nd - 1)..., 3, 4) - test_gradients(bn, x; test_gpu=true) + test_gradients(bn, x; test_gpu=true, compare_finite_diff=false) end end diff --git a/test/ext_cuda/layers.jl b/test/ext_cuda/layers.jl index 0d985e59d1..cbe5c06704 100644 --- a/test/ext_cuda/layers.jl +++ b/test/ext_cuda/layers.jl @@ -24,7 +24,7 @@ function gpu_gradtest(name::String, layers::Vector, x_cpu, args...; test_cpu = t testmode!(l_cpu) end - test_gradients(l_cpu, x_cpu, test_gpu = true) + test_gradients(l_cpu, x_cpu, test_gpu=true, compare_finite_diff=false) end end end @@ -97,10 +97,10 @@ gpu_gradtest("Embedding OneHotMatrix index", embedding, OneHotMatrix([1,2,3], 5 gpu_gradtest("Embedding OneHotMatrix repeated indices", embedding, OneHotMatrix([1,2,2], 5), 5, 2) @testset "function layers" begin - x = rand(Float32, 3,3) - test_gradients(x -> sum(Flux.normalise(x; dims=1)), x) - test_gradients(x -> sum(Flux.normalise(x; dims=2)), x) - test_gradients(x -> sum(Flux.normalise(x)), x) + x = rand(Float32, 3, 3) + test_gradients(x -> sum(Flux.normalise(x; dims=1)), x, test_gpu=true, compare_finite_diff=false) + test_gradients(x -> sum(Flux.normalise(x; dims=2)), x, test_gpu=true, compare_finite_diff=false) + test_gradients(x -> sum(Flux.normalise(x)), x, test_gpu=true, compare_finite_diff=false) end @testset "Zeros mapped for $cl" for cl in (Conv, ConvTranspose, CrossCor, DepthwiseConv) @@ -309,5 +309,6 @@ end @test Array(y_gpu) ≈ y_cpu atol=1e-4 @test Array(α_gpu) ≈ α_cpu atol=1e-4 - test_gradients(mha_cpu, x_cpu, loss = o -> sum(o[1].^2) + sum(o[2].^2), test_gpu = true) + test_gradients(mha_cpu, x_cpu, loss = o -> sum(o[1].^2) + sum(o[2].^2), + test_gpu = true, compare_finite_diff = false) end diff --git a/test/ext_cuda/losses.jl b/test/ext_cuda/losses.jl index ecf235fe0a..cf56c7119d 100644 --- a/test/ext_cuda/losses.jl +++ b/test/ext_cuda/losses.jl @@ -32,7 +32,7 @@ y = [1 0 0 0 1 y = 0.1f0 .+ 0.8f0 .* rand(Float32, 3, 4) @test loss(x, y) ≈ loss(gpu(x), gpu(y)) - test_gradients(loss, x, y, test_gpu=true, test_grad_f = false) + test_gradients(loss, x, y, test_gpu=true, test_grad_f=false, compare_finite_diff=false) # Float16 tests @test loss(f16(x), f16(y)) ≈ loss(gpu(f16(x)), gpu(f16(y))) diff --git a/test/ext_metal/basic.jl b/test/ext_metal/basic.jl index 76121a1d3a..97ba8066a3 100644 --- a/test/ext_metal/basic.jl +++ b/test/ext_metal/basic.jl @@ -23,5 +23,5 @@ end m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) x = rand(Float32, 10, 10) @test (m|>gpu)(x|>gpu) isa MtlArray{Float32, 2} - test_gradients(m, x, test_gpu=true) + test_gradients(m, x, test_gpu=true, compare_finite_diff=false) end diff --git a/test/test_utils.jl b/test/test_utils.jl index a7d234b55f..c18eae84b9 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -37,19 +37,26 @@ function test_gradients( rtol=1e-4, atol=1e-4, test_gpu = false, test_grad_f = true, + compare_finite_diff = true, loss = sum ) - # Use finite differences gradient as a reference. - # Cast to Float64 to avoid precision issues. - f64 = f |> Flux.f64 - xs64 = xs .|> Flux.f64 - y_fd, g_fd = finitediff_withgradient((xs...) -> loss(f64(xs...)), xs64...) + if !test_gpu && !compare_finite_diff + error("You should either compare finite diff vs CPU AD \ + or CPU AD vs GPU AD.") + end # Zygote gradient with respect to input. y, g = Zygote.withgradient((xs...) -> loss(f(xs...)), xs...) - @test y ≈ y_fd rtol=rtol atol=atol - check_equal_leaves(g, g_fd; rtol, atol) + + if compare_finite_diff + # Cast to Float64 to avoid precision issues. + f64 = f |> Flux.f64 + xs64 = xs .|> Flux.f64 + y_fd, g_fd = finitediff_withgradient((xs...) -> loss(f64(xs...)), xs64...) + @test y ≈ y_fd rtol=rtol atol=atol + check_equal_leaves(g, g_fd; rtol, atol) + end if test_gpu gpu_dev = gpu_device(force=true) @@ -65,16 +72,18 @@ function test_gradients( end if test_grad_f - # Use finite differences gradient as a reference. - # y_fd, g_fd = finitediff_withgradient(f -> loss(f(x)), f) - ps, re = Flux.destructure(f64) - y_fd, g_fd = finitediff_withgradient(ps -> loss(re(ps)(xs64...)), ps) - g_fd = (re(g_fd[1]),) - # Zygote gradient with respect to f. y, g = Zygote.withgradient(f -> loss(f(xs...)), f) - @test y ≈ y_fd rtol=rtol atol=atol - check_equal_leaves(g, g_fd; rtol, atol) + + if compare_finite_diff + # Use finite differences gradient as a reference. + # y_fd, g_fd = finitediff_withgradient(f -> loss(f(x)), f) + ps, re = Flux.destructure(f64) + y_fd, g_fd = finitediff_withgradient(ps -> loss(re(ps)(xs64...)), ps) + g_fd = (re(g_fd[1]),) + @test y ≈ y_fd rtol=rtol atol=atol + check_equal_leaves(g, g_fd; rtol, atol) + end if test_gpu # Zygote gradient with respect to f on GPU. From 18a8a1131b0651f8355fa931336456887cbb718b Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Sun, 13 Oct 2024 16:11:04 +0200 Subject: [PATCH 12/12] fix cuda tests --- test/ext_amdgpu/basic.jl | 5 ++-- test/ext_cuda/layers.jl | 45 ++++++++++++++++--------------- test/runtests.jl | 4 +-- test/test_utils.jl | 58 +++++++++++++++++++++++----------------- 4 files changed, 62 insertions(+), 50 deletions(-) diff --git a/test/ext_amdgpu/basic.jl b/test/ext_amdgpu/basic.jl index 62a7b065f3..163064c072 100644 --- a/test/ext_amdgpu/basic.jl +++ b/test/ext_amdgpu/basic.jl @@ -77,15 +77,14 @@ end @testset "Chain(Conv)" begin m = Chain(Conv((3, 3), 3 => 3)) x = rand(Float32, 5, 5, 3, 2) - - @test Array((m |> gpu)(x |> gpu)) ≈ m(x) atol=1f-3 + test_gradients(m, x, test_gpu=true, compare_finite_diff=false, test_grad_f=false) md = m |> gpu |> cpu @test md[1].weight ≈ m[1].weight atol=1f-3 m = Chain(ConvTranspose((3, 3), 3 => 3)) x = rand(Float32, 5, 5, 3, 2) - @test Array((m |> gpu)(x |> gpu)) ≈ m(x) atol=1f-3 + test_gradients(m, x, test_gpu=true, compare_finite_diff=false, test_grad_f=false) md = m |> gpu |> cpu @test md[1].weight ≈ m[1].weight atol=1f-3 diff --git a/test/ext_cuda/layers.jl b/test/ext_cuda/layers.jl index cbe5c06704..cba95cee75 100644 --- a/test/ext_cuda/layers.jl +++ b/test/ext_cuda/layers.jl @@ -11,9 +11,11 @@ end -const ACTIVATIONS = [identity, tanh, softplus, elu] +const ACTIVATIONS = [identity, tanh] -function gpu_gradtest(name::String, layers::Vector, x_cpu, args...; test_cpu = true, test_mode = false) +function gpu_gradtest(name::String, layers::Vector, x_cpu, args...; + test_mode=false, test_grad_x=true, + atol=1e-4, rtol=1e-4) @testset "$name GPU grad tests" begin for layer in layers @testset "$layer Layer GPU grad test" begin @@ -24,7 +26,7 @@ function gpu_gradtest(name::String, layers::Vector, x_cpu, args...; test_cpu = t testmode!(l_cpu) end - test_gradients(l_cpu, x_cpu, test_gpu=true, compare_finite_diff=false) + test_gradients(l_cpu, x_cpu; test_gpu=true, compare_finite_diff=false, test_grad_x, atol, rtol) end end end @@ -45,23 +47,24 @@ for act in ACTIVATIONS ConvTranspose, ConvTransposeNoBias, CrossCor, CrossCorNoBias, DepthwiseConv, DepthwiseConvNoBias] - gpu_gradtest("Convolution with $act", conv_layers, r, (2,2), 1=>3, act, test_cpu = false) + gpu_gradtest("Convolution with $act", conv_layers, r, (2,2), 1=>3, act) groupedconv = [GroupedConv, GroupedConvTranspose] - gpu_gradtest("GroupedConvolution with $act", groupedconv, rand(Float32, 28, 28, 100, 2), (3,3), 100 => 25, act, test_cpu = true) + gpu_gradtest("GroupedConvolution with $act", groupedconv, rand(Float32, 28, 28, 100, 2), (3,3), 100 => 25, act) batch_norm = [BatchNorm, BatchNormNoTrackStats] - gpu_gradtest("BatchNorm 1 with $act", batch_norm, rand(Float32, 28,28,3,4), 3, act, test_cpu = false) #TODO fix errors - gpu_gradtest("BatchNorm 2 with $act", batch_norm, rand(Float32, 5,4), 5, act, test_cpu = true) + gpu_gradtest("BatchNorm 1 with $act", batch_norm, rand(Float32, 28,28,3,4), 3, act, atol=1e-3) + gpu_gradtest("BatchNorm 2 with $act", batch_norm, rand(Float32, 5,4), 5, act, atol=1e-3) batch_norm = [BatchNormNoTrackStats] - gpu_gradtest("BatchNorm 3 with $act (test mode)", batch_norm, rand(Float32, 5,4), 5, act, test_cpu = true, test_mode = true) + gpu_gradtest("BatchNorm 3 with $act (test mode)", batch_norm, rand(Float32, 5,4), 5, act, + test_mode=true, atol=1e-3) instancenorm = [InstanceNorm] - gpu_gradtest("InstanceNorm with $act", instancenorm, r, 1, act, test_cpu = false) + gpu_gradtest("InstanceNorm with $act", instancenorm, r, 1, act) groupnorm = [GroupNorm] - gpu_gradtest("GroupNorm with $act", groupnorm, rand(Float32, 28,28,3,1), 3, 1, act, test_cpu = false) + gpu_gradtest("GroupNorm with $act", groupnorm, rand(Float32, 28,28,3,1), 3, 1, act) end r = rand(Float32, 28, 28, 1, 1) @@ -70,13 +73,13 @@ pooling_layers = [MaxPool, MeanPool] gpu_gradtest("Pooling", pooling_layers, r, (2,2)) adaptive_pooling_layers = [AdaptiveMaxPool, AdaptiveMeanPool] -gpu_gradtest("AdaptivePooling", adaptive_pooling_layers, r, (7,7), test_cpu = false) +gpu_gradtest("AdaptivePooling", adaptive_pooling_layers, r, (7,7)) dropout_layers = [Dropout, AlphaDropout] -gpu_gradtest("Dropout", dropout_layers, r, 0.5f0; test_cpu = false) # dropout is not deterministic +gpu_gradtest("Dropout", dropout_layers, r, 1e-6) # dropout is not deterministic layer_norm = [LayerNorm] -gpu_gradtest("LayerNorm 1", layer_norm, rand(Float32, 28,28,3,4), 28, test_cpu = false) #TODO fix errors +gpu_gradtest("LayerNorm 1", layer_norm, rand(Float32, 28,28,3,4), 28) gpu_gradtest("LayerNorm 2", layer_norm, rand(Float32, 5,4), 5) upsample = [x -> Upsample(scale=x)] @@ -88,13 +91,13 @@ gpu_gradtest("PixelShuffle 2d", pixelshuffle, rand(Float32, 3, 4, 18, 3), 3) gpu_gradtest("PixelShuffle 1d", pixelshuffle, rand(Float32, 3, 18, 3), 3) embedding = [Flux.Embedding] -gpu_gradtest("Embedding", embedding, [1,3,5], 5, 2) -gpu_gradtest("Embedding repeated indices", embedding, [1,3,5,3], 5, 2) -gpu_gradtest("Embedding integer index", embedding, 1, 5, 2) -gpu_gradtest("Embedding 2d index", embedding, [1 2; 3 4], 5, 2) -gpu_gradtest("Embedding OneHotVec index", embedding, OneHotVector(1, 5), 5, 2) -gpu_gradtest("Embedding OneHotMatrix index", embedding, OneHotMatrix([1,2,3], 5), 5, 2) -gpu_gradtest("Embedding OneHotMatrix repeated indices", embedding, OneHotMatrix([1,2,2], 5), 5, 2) +gpu_gradtest("Embedding", embedding, [1,3,5], 5, 2, test_grad_x=false) +gpu_gradtest("Embedding repeated indices", embedding, [1,3,5,3], 5, 2, test_grad_x=false) +gpu_gradtest("Embedding integer index", embedding, 1, 5, 2, test_grad_x=false) +gpu_gradtest("Embedding 2d index", embedding, [1 2; 3 4], 5, 2, test_grad_x=false) +gpu_gradtest("Embedding OneHotVec index", embedding, OneHotVector(1, 5), 5, 2, test_grad_x=false) +gpu_gradtest("Embedding OneHotMatrix index", embedding, OneHotMatrix([1,2,3], 5), 5, 2, test_grad_x=false) +gpu_gradtest("Embedding OneHotMatrix repeated indices", embedding, OneHotMatrix([1,2,2], 5), 5, 2, test_grad_x=false) @testset "function layers" begin x = rand(Float32, 3, 3) @@ -310,5 +313,5 @@ end @test Array(α_gpu) ≈ α_cpu atol=1e-4 test_gradients(mha_cpu, x_cpu, loss = o -> sum(o[1].^2) + sum(o[2].^2), - test_gpu = true, compare_finite_diff = false) + test_gpu=true, compare_finite_diff=false) end diff --git a/test/runtests.jl b/test/runtests.jl index ff6660be14..f44c4b7758 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -11,9 +11,9 @@ using Functors: fmapstructure_with_path ## Uncomment below to change the default test settings # ENV["FLUX_TEST_AMDGPU"] = "true" -# ENV["FLUX_TEST_CUDA"] = "true" +ENV["FLUX_TEST_CUDA"] = "true" # ENV["FLUX_TEST_METAL"] = "true" -# ENV["FLUX_TEST_CPU"] = "false" +ENV["FLUX_TEST_CPU"] = "false" # ENV["FLUX_TEST_DISTRIBUTED_MPI"] = "true" # ENV["FLUX_TEST_DISTRIBUTED_NCCL"] = "true" ENV["FLUX_TEST_ENZYME"] = "false" # We temporarily disable Enzyme tests since they are failing diff --git a/test/test_utils.jl b/test/test_utils.jl index c18eae84b9..f9a6b6655f 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -33,12 +33,13 @@ end function test_gradients( f, - xs::Array...; + xs...; rtol=1e-4, atol=1e-4, test_gpu = false, test_grad_f = true, + test_grad_x = true, compare_finite_diff = true, - loss = sum + loss = mean, ) if !test_gpu && !compare_finite_diff @@ -46,29 +47,31 @@ function test_gradients( or CPU AD vs GPU AD.") end - # Zygote gradient with respect to input. - y, g = Zygote.withgradient((xs...) -> loss(f(xs...)), xs...) - - if compare_finite_diff - # Cast to Float64 to avoid precision issues. - f64 = f |> Flux.f64 - xs64 = xs .|> Flux.f64 - y_fd, g_fd = finitediff_withgradient((xs...) -> loss(f64(xs...)), xs64...) - @test y ≈ y_fd rtol=rtol atol=atol - check_equal_leaves(g, g_fd; rtol, atol) - end + if test_grad_x + # Zygote gradient with respect to input. + y, g = Zygote.withgradient((xs...) -> loss(f(xs...)), xs...) + + if compare_finite_diff + # Cast to Float64 to avoid precision issues. + f64 = f |> Flux.f64 + xs64 = xs .|> Flux.f64 + y_fd, g_fd = finitediff_withgradient((xs...) -> loss(f64(xs...)), xs64...) + @test y ≈ y_fd rtol=rtol atol=atol + check_equal_leaves(g, g_fd; rtol, atol) + end - if test_gpu - gpu_dev = gpu_device(force=true) - cpu_dev = cpu_device() - xs_gpu = xs |> gpu_dev - f_gpu = f |> gpu_dev + if test_gpu + gpu_dev = gpu_device(force=true) + cpu_dev = cpu_device() + xs_gpu = xs |> gpu_dev + f_gpu = f |> gpu_dev - # Zygote gradient with respect to input on GPU. - y_gpu, g_gpu = Zygote.withgradient((xs...) -> loss(f_gpu(xs...)), xs_gpu...) - @test get_device(g_gpu) == get_device(xs_gpu) - @test y_gpu ≈ y rtol=rtol atol=atol - check_equal_leaves(g_gpu |> cpu_dev, g; rtol, atol) + # Zygote gradient with respect to input on GPU. + y_gpu, g_gpu = Zygote.withgradient((xs...) -> loss(f_gpu(xs...)), xs_gpu...) + @test get_device(g_gpu) == get_device(xs_gpu) + @test y_gpu ≈ y rtol=rtol atol=atol + check_equal_leaves(g_gpu |> cpu_dev, g; rtol, atol) + end end if test_grad_f @@ -78,14 +81,21 @@ function test_gradients( if compare_finite_diff # Use finite differences gradient as a reference. # y_fd, g_fd = finitediff_withgradient(f -> loss(f(x)), f) + # Cast to Float64 to avoid precision issues. + f64 = f |> Flux.f64 ps, re = Flux.destructure(f64) - y_fd, g_fd = finitediff_withgradient(ps -> loss(re(ps)(xs64...)), ps) + y_fd, g_fd = finitediff_withgradient(ps -> loss(re(ps)(xs...)), ps) g_fd = (re(g_fd[1]),) @test y ≈ y_fd rtol=rtol atol=atol check_equal_leaves(g, g_fd; rtol, atol) end if test_gpu + gpu_dev = gpu_device(force=true) + cpu_dev = cpu_device() + xs_gpu = xs |> gpu_dev + f_gpu = f |> gpu_dev + # Zygote gradient with respect to f on GPU. y_gpu, g_gpu = Zygote.withgradient(f -> loss(f(xs_gpu...)), f_gpu) # @test get_device(g_gpu) == get_device(xs_gpu)