From 6defd04b4b534747ab8a8ad5734fbc06075d95f2 Mon Sep 17 00:00:00 2001
From: Carlo Lucibello <carlo.lucibello@gmail.com>
Date: Sat, 12 Oct 2024 17:46:47 +0200
Subject: [PATCH 01/12] simplify test machinery

---
 prova.jl                      |  11 +++
 test/ext_amdgpu/basic.jl      |  23 +++---
 test/ext_amdgpu/runtests.jl   |   3 -
 test/ext_amdgpu/test_utils.jl |  15 ----
 test/ext_cuda/layers.jl       |  82 +++-----------------
 test/ext_cuda/losses.jl       |   2 +-
 test/ext_cuda/runtests.jl     |   3 -
 test/ext_cuda/test_utils.jl   |   4 -
 test/ext_metal/basic.jl       |   2 +-
 test/ext_metal/runtests.jl    |   2 -
 test/ext_metal/test_utils.jl  |  16 ----
 test/layers/attention.jl      |   7 +-
 test/runtests.jl              |  48 ++++++------
 test/test_utils.jl            | 141 +++++++++++++++-------------------
 14 files changed, 121 insertions(+), 238 deletions(-)
 create mode 100644 prova.jl
 delete mode 100644 test/ext_amdgpu/test_utils.jl
 delete mode 100644 test/ext_cuda/test_utils.jl
 delete mode 100644 test/ext_metal/test_utils.jl

diff --git a/prova.jl b/prova.jl
new file mode 100644
index 0000000000..63ebbbdfe2
--- /dev/null
+++ b/prova.jl
@@ -0,0 +1,11 @@
+using Flux, FiniteDifferences, Test, Zygote, Functors, Metal
+include("test/test_utils.jl")
+
+m = Dense(3, 3)
+x = rand(Float32, 3, 3)
+test_gradients(m, x; rtol=1e-4, atol=1e-4)
+
+m = MultiHeadAttention(4, nheads=2)
+x = rand(Float32, 4, 3, 2)
+m(x)
+test_gradients(m, x; loss = o -> sum(o[1].^2) + sum(o[2].^2))
diff --git a/test/ext_amdgpu/basic.jl b/test/ext_amdgpu/basic.jl
index 831b577d48..4f93ca5145 100644
--- a/test/ext_amdgpu/basic.jl
+++ b/test/ext_amdgpu/basic.jl
@@ -19,9 +19,9 @@ end
 end
 
 @testset "Chain of Dense layers" begin
-    m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) |> f32
+    m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax)
     x = rand(Float32, 10, 10)
-    gpu_autodiff_test(m, x)
+    test_gradients(m, x, test_gpu=true)
 end
 
 @testset "Convolution" begin
@@ -29,11 +29,12 @@ end
         m = conv_type(tuple(fill(2, nd)...), 3 => 4) |> f32
         x = rand(Float32, fill(10, nd)..., 3, 5)
 
+        md, xd = Flux.gpu.((m, x))
+        y = m(x)
         # Ensure outputs are the same.
-        gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false)
+        @test collect(md(xd)) ≈ y  atol=1f-3
 
         # Gradients are flipped as well.
-        md, xd = Flux.gpu.((m, x))
         gs = gradient(m -> sum(m(x)), m)
         gsd = gradient(m -> sum(m(xd)), md)
 
@@ -74,16 +75,18 @@ end
 end
 
 @testset "Chain(Conv)" begin
-    m = Chain(Conv((3, 3), 3 => 3)) |> f32
+    m = Chain(Conv((3, 3), 3 => 3))
     x = rand(Float32, 10, 10, 3, 2)
-    gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false)
+    # gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false)
+    test_gradients(m, x, test_gpu=true)
 
     md = m |> gpu |> cpu
     @test md[1].weight ≈ m[1].weight atol=1f-3
 
-    m = Chain(ConvTranspose((3, 3), 3 => 3)) |> f32
+    m = Chain(ConvTranspose((3, 3), 3 => 3))
     x = rand(Float32, 10, 10, 3, 2)
-    gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false)
+    # gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false)
+    test_gradients(m, x, test_gpu=true)
 
     md = m |> gpu |> cpu
     @test md[1].weight ≈ m[1].weight atol=1f-3
@@ -92,7 +95,7 @@ end
 @testset "Cross-correlation" begin
     m = CrossCor((2, 2), 3 => 4) |> f32
     x = rand(Float32, 10, 10, 3, 2)
-    gpu_autodiff_test(m, x; atol=1f-3)
+    test_gradients(m, x, test_gpu=true)
 end
 
 @testset "Restructure" begin
@@ -132,7 +135,7 @@ end
     bn = BatchNorm(3, σ)
     for nd in 1:3
         x = rand(Float32, fill(2, nd - 1)..., 3, 4)
-        gpu_autodiff_test(bn, x; atol=1f-3, allow_nothing=true)
+        test_gradients(bn, x; test_gpu=true)
     end
 end
 
diff --git a/test/ext_amdgpu/runtests.jl b/test/ext_amdgpu/runtests.jl
index 9027a31f76..ec779dedea 100644
--- a/test/ext_amdgpu/runtests.jl
+++ b/test/ext_amdgpu/runtests.jl
@@ -2,9 +2,6 @@
 @assert AMDGPU.functional()
 AMDGPU.allowscalar(false)
 
-include("../test_utils.jl")
-include("test_utils.jl")
-
 @testset "get_devices" begin
   include("get_devices.jl")
 end
diff --git a/test/ext_amdgpu/test_utils.jl b/test/ext_amdgpu/test_utils.jl
deleted file mode 100644
index 3c84f01048..0000000000
--- a/test/ext_amdgpu/test_utils.jl
+++ /dev/null
@@ -1,15 +0,0 @@
-function check_grad(
-    g_gpu::ROCArray{Float32}, g_cpu::Array{Float32};
-    atol, rtol, allow_nothing::Bool,
-)
-    @test g_cpu ≈ collect(g_gpu) atol=atol rtol=rtol
-end
-
-function check_grad(
-    g_gpu::ROCArray{Float32}, g_cpu::Zygote.FillArrays.AbstractFill;
-    atol, rtol, allow_nothing::Bool,
-)
-    @test g_cpu ≈ collect(g_gpu) atol=atol rtol=rtol
-end
-
-check_type(x::ROCArray{Float32}) = true
diff --git a/test/ext_cuda/layers.jl b/test/ext_cuda/layers.jl
index 63bcc8b526..c148334dcf 100644
--- a/test/ext_cuda/layers.jl
+++ b/test/ext_cuda/layers.jl
@@ -10,73 +10,23 @@
   @test gradient(x -> sum(cpu(x)), gpu(rand(3,3))) isa Tuple
 end
 
-# TODO: These layers get into scalar indexing issues.
-const BROKEN_LAYERS = Union{}
 
 const ACTIVATIONS = [identity, relu, tanh,
                      sigmoid, exp, softplus,
                      elu, selu]
 
-function gpu_gradtest(name::String, layers::Vector, x_cpu = nothing, args...; test_cpu = true, test_mode = false)
-  isnothing(x_cpu) && error("Missing input to test the layers against.")
+function gpu_gradtest(name::String, layers::Vector, x_cpu, args...; test_cpu = true, test_mode = false)
   @testset "$name GPU grad tests" begin
     for layer in layers
       @testset "$layer Layer GPU grad test" begin
 
         # compute output and grad of parameters
         l_cpu = layer(args...)
-        l_gpu = l_cpu |> gpu
         if test_mode
           testmode!(l_cpu)
-          testmode!(l_gpu)
         end
 
-        ps_cpu = Flux.params(l_cpu)
-        y_cpu, back_cpu = pullback(() -> sum(l_cpu(x_cpu)), ps_cpu)
-        gs_cpu = back_cpu(1f0)
-
-        x_gpu = gpu(x_cpu)
-        ps_gpu = Flux.params(l_gpu)
-
-        if typeof(l_gpu) <: BROKEN_LAYERS
-          @test_broken gradient(() -> sum(l_gpu(x_gpu)), ps_gpu) isa Flux.Zygote.Grads
-        else
-          y_gpu, back_gpu = pullback(() -> sum(l_gpu(x_gpu)), ps_gpu)
-          gs_gpu = back_gpu(1f0) # TODO many layers error out when backprop int 1, should fix
-
-          # compute grad of input
-          xg_cpu = gradient(x -> sum(l_cpu(x)), x_cpu)[1]
-          xg_gpu = gradient(x -> sum(l_gpu(x)), x_gpu)[1]
-
-          # test
-          if test_cpu
-            if layer === GroupedConvTranspose
-              @test y_gpu ≈ y_cpu rtol=1f-2 atol=1f-3
-            else
-              @test y_gpu ≈ y_cpu rtol=1f-3 atol=1f-3
-            end
-            if isnothing(xg_cpu)
-              @test isnothing(xg_gpu)
-            else
-              if layer === GroupedConvTranspose
-                @test Array(xg_gpu) ≈ xg_cpu rtol = 2f-2 atol = 1f-3
-              else
-                @test Array(xg_gpu) ≈ xg_cpu rtol = 1f-3 atol = 1f-3
-              end
-            end
-          end
-          @test gs_gpu isa Flux.Zygote.Grads
-          for (p_cpu, p_gpu) in zip(ps_cpu, ps_gpu)
-            if isnothing(gs_cpu[p_cpu])
-              @test isnothing(gs_gpu[p_gpu])
-            else
-              @test gs_gpu[p_gpu] isa CuArray
-              if test_cpu
-                @test Array(gs_gpu[p_gpu]) ≈ gs_cpu[p_cpu] rtol=1f-3 atol=1f-3
-              end
-            end
-          end
-        end
+        test_gradients(l_cpu, x_cpu, test_gpu = true)
       end
     end
   end
@@ -150,22 +100,17 @@ gpu_gradtest("Embedding OneHotMatrix repeated indices", embedding, OneHotMatrix(
 
 @testset "function layers" begin
   x = rand(Float32, 3,3)
-  gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=1)), x)
-  gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=2)), x)
-  gpu_autodiff_test(x -> sum(Flux.normalise(x)), x)
+  test_gradients(x -> sum(Flux.normalise(x; dims=1)), x)
+  test_gradients(x -> sum(Flux.normalise(x; dims=2)), x)
+  test_gradients(x -> sum(Flux.normalise(x)), x)
 end
 
 @testset "Zeros mapped for $cl" for cl in (Conv, ConvTranspose, CrossCor, DepthwiseConv)
   l = cl((2,2), 1=>3, bias = false) |> gpu
   ip = zeros(Float32, 28,28,1,1) |> gpu
-  if typeof(l) <: BROKEN_LAYERS
-    @test_broken sum(l(ip)) ≈ 0.f0
-    @test_broken gradient(() -> sum(l(ip)), Flux.params(l)) isa Flux.Zygote.Grads
-  else
-    @test sum(l(ip)) ≈ 0.f0
-    gs = gradient(() -> sum(l(ip)), Flux.params(l))
-    @test l.bias ∉ gs.params
-  end
+  @test sum(l(ip)) ≈ 0.f0
+  gs = gradient(() -> sum(l(ip)), Flux.params(l))
+  @test l.bias ∉ gs.params
 end
 
 @testset "Dense without bias" begin
@@ -366,14 +311,5 @@ end
   @test Array(y_gpu) ≈ y_cpu atol=1e-4
   @test Array(α_gpu) ≈ α_cpu atol=1e-4
 
-  gm_cpu, gx_cpu = gradient(mha_cpu, x_cpu) do mha, x
-    y, α = mha(x)
-    return sum(y.^2) + sum(α.^2)
-  end
-  gm_gpu, gx_gpu = gradient(mha_gpu, x_gpu) do mha, x
-    y, α = mha(x)
-    return sum(y.^2) + sum(α.^2)
-  end
-  check_grad(gm_gpu, gm_cpu)
-  check_grad(gx_gpu, gx_cpu)
+  test_gradients(mha_cpu, x_cpu, loss = o -> sum(o[1].^2) + sum(o[2].^2), test_gpu = true)
 end
diff --git a/test/ext_cuda/losses.jl b/test/ext_cuda/losses.jl
index b339b352bb..b5ba4d2f4d 100644
--- a/test/ext_cuda/losses.jl
+++ b/test/ext_cuda/losses.jl
@@ -31,7 +31,7 @@ y = [1  0  0  0  1
   y = rand(Float32, 3,4)
   @test loss(x, y) ≈ loss(gpu(x), gpu(y))
 
-  gpu_autodiff_test(loss, x, y)
+  test_gradients(loss, x, y, test_gpu=true, test_grad_f = false)
 
   # Float16 tests
   @test loss(f16(x), f16(y)) ≈ loss(gpu(f16(x)), gpu(f16(y)))
diff --git a/test/ext_cuda/runtests.jl b/test/ext_cuda/runtests.jl
index aa1f431fe7..6fdbde4250 100644
--- a/test/ext_cuda/runtests.jl
+++ b/test/ext_cuda/runtests.jl
@@ -7,9 +7,6 @@ using Random, LinearAlgebra, Statistics
 @assert CUDA.functional()
 CUDA.allowscalar(false)
 
-# include("../test_utils.jl")
-include("test_utils.jl")
-
 @testset "get_devices" begin
   include("get_devices.jl")
 end
diff --git a/test/ext_cuda/test_utils.jl b/test/ext_cuda/test_utils.jl
deleted file mode 100644
index 10a8d0dfdf..0000000000
--- a/test/ext_cuda/test_utils.jl
+++ /dev/null
@@ -1,4 +0,0 @@
-check_grad(g_gpu::CuArray{Float32}, g_cpu::Array{Float32}; rtol=1e-4, atol=1e-4, allow_nothing::Bool=false) =
-    @test g_cpu ≈ collect(g_gpu) rtol=rtol atol=atol
-
-check_type(x::CuArray{Float32}) = true
diff --git a/test/ext_metal/basic.jl b/test/ext_metal/basic.jl
index 9e4a9ef9cb..76121a1d3a 100644
--- a/test/ext_metal/basic.jl
+++ b/test/ext_metal/basic.jl
@@ -23,5 +23,5 @@ end
     m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax)
     x = rand(Float32, 10, 10) 
     @test (m|>gpu)(x|>gpu) isa MtlArray{Float32, 2}
-    gpu_autodiff_test(m, x)
+    test_gradients(m, x, test_gpu=true)
 end
diff --git a/test/ext_metal/runtests.jl b/test/ext_metal/runtests.jl
index 8c8af7d896..cb9532390e 100644
--- a/test/ext_metal/runtests.jl
+++ b/test/ext_metal/runtests.jl
@@ -5,8 +5,6 @@ using Random, Statistics
 using Zygote
 Flux.gpu_backend!("Metal") # needs a restart
 
-include("test_utils.jl")
-
 @testset "data movement" begin
     metal_device = Flux.gpu_device()
     cdev = cpu_device()
diff --git a/test/ext_metal/test_utils.jl b/test/ext_metal/test_utils.jl
deleted file mode 100644
index f6ed32a8f4..0000000000
--- a/test/ext_metal/test_utils.jl
+++ /dev/null
@@ -1,16 +0,0 @@
-
-function check_grad(
-    g_gpu::MtlArray{Float32}, g_cpu::Array{Float32};
-    atol, rtol, allow_nothing::Bool,
-)
-    @test g_cpu ≈ collect(g_gpu) atol=atol rtol=rtol
-end
-
-function check_grad(
-    g_gpu::MtlArray{Float32}, g_cpu::Zygote.FillArrays.AbstractFill;
-    atol, rtol, allow_nothing::Bool,
-)
-    @test g_cpu ≈ collect(g_gpu) atol=atol rtol=rtol
-end
-
-check_type(x::MtlArray{Float32}) = true
diff --git a/test/layers/attention.jl b/test/layers/attention.jl
index a4c90b36ed..f51c4c192f 100644
--- a/test/layers/attention.jl
+++ b/test/layers/attention.jl
@@ -54,12 +54,7 @@
   end
 
   @testset "gradient" begin
-    gm, gq = gradient(mha, q) do mha, q
-      y, α = mha(q)
-      return sum(y.^2) + sum(α.^2)
-    end
-    check_grad_type(gm, mha)
-    check_grad_type(gq, q)
+    test_gradients(mha, q, loss = ((y, α)) -> sum(y.^2) + sum(α.^2))
   end
 end
 
diff --git a/test/runtests.jl b/test/runtests.jl
index ef3d67f4d7..7c5f08095b 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -10,40 +10,40 @@ using Pkg
 ## Uncomment below to change the default test settings
 # ENV["FLUX_TEST_AMDGPU"] = "true"
 # ENV["FLUX_TEST_CUDA"] = "true"
-# ENV["FLUX_TEST_METAL"] = "true"
+ENV["FLUX_TEST_METAL"] = "true"
 # ENV["FLUX_TEST_CPU"] = "false"
 # ENV["FLUX_TEST_DISTRIBUTED_MPI"] = "true"
 # ENV["FLUX_TEST_DISTRIBUTED_NCCL"] = "true"
 ENV["FLUX_TEST_ENZYME"] = "false" # We temporarily disable Enzyme tests since they are failing
 
-include("test_utils.jl")
+include("test_utils.jl") # for test_gradients
 
 Random.seed!(0)
 
 @testset verbose=true "Flux.jl" begin
   if get(ENV, "FLUX_TEST_CPU", "true") == "true"
-    @testset "Utils" begin
-      include("utils.jl")
-    end
-
-    @testset "Loading" begin
-      include("loading.jl")
-    end
-
-    @testset "Optimise / Train" begin
-      include("optimise.jl")
-      include("train.jl")
-      include("tracker.jl")
-    end
-
-    @testset "Data" begin
-      include("data.jl")
-    end
-
-    @testset "Losses" begin
-      include("losses.jl")
-      include("ctc.jl")
-    end
+    # @testset "Utils" begin
+    #   include("utils.jl")
+    # end
+
+    # @testset "Loading" begin
+    #   include("loading.jl")
+    # end
+
+    # @testset "Optimise / Train" begin
+    #   include("optimise.jl")
+    #   include("train.jl")
+    #   include("tracker.jl")
+    # end
+
+    # @testset "Data" begin
+    #   include("data.jl")
+    # end
+
+    # @testset "Losses" begin
+    #   include("losses.jl")
+    #   include("ctc.jl")
+    # end
 
     @testset "Layers" begin
       include("layers/attention.jl")
diff --git a/test/test_utils.jl b/test/test_utils.jl
index 004d3035ad..4dcf0f5793 100644
--- a/test/test_utils.jl
+++ b/test/test_utils.jl
@@ -8,98 +8,79 @@ const ALL_LOSSES = [Flux.Losses.mse, Flux.Losses.mae, Flux.Losses.msle,
                     Flux.Losses.dice_coeff_loss,
                     Flux.Losses.poisson_loss,
                     Flux.Losses.hinge_loss, Flux.Losses.squared_hinge_loss,
-                    Flux.Losses.binary_focal_loss, Flux.Losses.focal_loss, Flux.Losses.siamese_contrastive_loss]
+                    Flux.Losses.binary_focal_loss, Flux.Losses.focal_loss,
+                    Flux.Losses.siamese_contrastive_loss]
 
 
-
-function check_grad(g_gpu, g_cpu;
-            rtol=1e-4, atol=1e-4,
-            allow_nothing::Bool=false)
-    allow_nothing && return
-    @warn "Unsupported types in `check_grad`: $(typeof(g_gpu)), $(typeof(g_cpu))"
-    @show g_gpu g_cpu
-    @test false
+function finitediff_withgradient(f, x...)
+    y = f(x...)
+    fdm = central_fdm(5, 1)
+    return y, FiniteDifferences.grad(fdm, f, x...)
 end
 
-check_grad(g_gpu::Base.RefValue, g_cpu::Base.RefValue; rtol=1e-4, atol=1e-4, allow_nothing::Bool=false) =
-    check_grad(g_gpu[], g_cpu[]; rtol, atol, allow_nothing)
-
-check_grad(g_gpu::Nothing, g_cpu::Nothing; rtol=1e-4, atol=1e-4, allow_nothing::Bool=false) =
-    @test true
-
-check_grad(g_gpu::Float32, g_cpu::Float32; rtol=1e-4, atol=1e-4, allow_nothing::Bool=false) =
-    @test g_cpu ≈ g_gpu rtol=rtol atol=atol
-
-function check_grad(g_gpu::Tuple, g_cpu::Tuple; rtol=1e-4, atol=1e-4, allow_nothing::Bool=false)
-    for (v1, v2) in zip(g_gpu, g_cpu)
-        check_grad(v1, v2; rtol, atol, allow_nothing)
-    end
-end
 
-function check_grad(g_gpu::NamedTuple, g_cpu::NamedTuple; rtol=1e-4, atol=1e-4, allow_nothing::Bool=false)
-    for ((k1,v1), (k2,v2)) in zip(pairs(g_gpu), pairs(g_cpu))
-        @test k1 == k2
-        check_grad(v1, v2; rtol, atol, allow_nothing)
+function check_equal_leaves(a, b; rtol=1e-4, atol=1e-4, check_eltype=true)
+    fmapstructure_with_path(a, b) do kp, x, y
+        if x isa AbstractArray
+            if check_eltype
+                @test eltype(x) == eltype(y)
+            end
+            @test x ≈ y rtol=rtol atol=atol
+        elseif x isa Number
+            @test x ≈ y rtol=rtol atol=atol
+        end
     end
 end
 
-check_type(x) = false
-check_type(x::Float32) = true
-check_type(x::Array{Float32}) = true
 
-function gpu_autodiff_test(
-            f_cpu, 
-            xs_cpu::Array{Float32}...;
-            test_equal=true, 
+function test_gradients(
+            f, 
+            xs::Array...;
             rtol=1e-4, atol=1e-4,
-            checkgrad::Bool = true, 
-            allow_nothing::Bool = false,
-        )
-
-    # Compare CPU & GPU function outputs.
-    f_gpu = f_cpu |> gpu
-    xs_gpu = gpu.(xs_cpu)
-
-    y_cpu = f_cpu(xs_cpu...)
-    y_gpu = f_gpu(xs_gpu...)
-    @test collect(y_cpu) ≈ collect(y_gpu) atol=atol rtol=rtol
-
-    checkgrad || return
-
-    ### GRADIENT WITH RESPECT TO INPUT ###
-
-    y_cpu, back_cpu = pullback((x...) -> f_cpu(x...), xs_cpu...)
-    @test check_type(y_cpu)
-    Δ_cpu = size(y_cpu) == () ? randn(Float32) : randn(Float32, size(y_cpu))
-    gs_cpu = back_cpu(Δ_cpu)
-
-    Δ_gpu = Δ_cpu |> gpu
-    y_gpu, back_gpu = pullback((x...) -> f_gpu(x...), xs_gpu...)
-    @test check_type(y_gpu)
-    gs_gpu = back_gpu(Δ_gpu)
-
-    if test_equal
-        @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol
-        for (g_gpu, g_cpu) in zip(gs_gpu, gs_cpu)
-            check_grad(g_gpu, g_cpu; atol, rtol, allow_nothing)
-        end
+            test_gpu = false,
+            test_grad_f = true,
+            loss = sum
+            )
+
+    # Use finite differences gradient as a reference.
+    y_fd, g_fd = finitediff_withgradient((xs...) -> loss(f(xs...)), xs...)
+
+    # Zygote gradient with respect to input.
+    y, g = Zygote.withgradient((xs...) -> loss(f(xs...)), xs...)
+    @test y ≈ y_fd rtol=rtol atol=atol
+    check_equal_leaves(g, g_fd; rtol, atol)
+
+    if test_gpu
+        gpu_dev = gpu_device(force=true)
+        cpu_dev = cpu_device()
+        x_gpu = x |> gpu_dev
+        f_gpu = f |> gpu_dev
+
+        # Zygote gradient with respect to input on GPU.
+        y_gpu, g_gpu = Zygote.withgradient(x -> loss(f_gpu(x)), x_gpu)
+        @test get_device(g_gpu) == gpu_dev
+        @test y_gpu |> cpu_dev ≈ y rtol=rtol atol=atol
+        check_equal_leaves(g_gpu |> cpu_dev, g; rtol, atol)
     end
 
-    ### GRADIENT WITH RESPECT TO f ###
-
-    ps_cpu = Flux.params(f_cpu)
-    y_cpu, back_cpu = pullback(() -> f_cpu(xs_cpu...), ps_cpu)
-    gs_cpu = back_cpu(Δ_cpu)
-
-    ps_gpu = Flux.params(f_gpu)
-    y_gpu, back_gpu = pullback(() -> f_gpu(xs_gpu...), ps_gpu)
-    gs_gpu = back_gpu(Δ_gpu)
-
-    if test_equal
-        @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol
-        @assert length(ps_gpu) == length(ps_cpu)
-        for (p_gpu, p_cpu) in zip(ps_gpu, ps_cpu)
-            check_grad(gs_gpu[p_gpu], gs_cpu[p_cpu]; atol, rtol, allow_nothing)
+    if test_grad_f
+        # Use finite differences gradient as a reference.
+        # y_fd, g_fd = finitediff_withgradient(f -> loss(f(x)), f)
+        ps, re = Flux.destructure(f)
+        y_fd, g_fd = finitediff_withgradient(f -> loss(re(ps)(x)), ps)
+        g_fd = (re(g_fd[1]),)
+
+        # Zygote gradient with respect to f.
+        y, g = Zygote.withgradient(f -> loss(f(x)), f)
+        @test y ≈ y_fd rtol=rtol atol=atol
+        check_equal_leaves(g, g_fd; rtol, atol)
+
+        if test_gpu
+            # Zygote gradient with respect to input on GPU.
+            y_gpu, g_gpu = Zygote.withgradient(f -> loss(f(x_gpu)), f_gpu)
+            @test get_device(g_gpu) == gpu_dev
+            @test y_gpu |> cpu_dev ≈ y rtol=rtol atol=atol
+            check_equal_leaves(g_gpu |> cpu_dev, g; rtol, atol)
         end
     end
 end

From a8af95d59d749e9eb07dc6bdcb73eb0f86e897da Mon Sep 17 00:00:00 2001
From: Carlo Lucibello <carlo.lucibello@gmail.com>
Date: Sat, 12 Oct 2024 18:03:20 +0200
Subject: [PATCH 02/12] fix

---
 prova.jl           | 11 -----------
 test/Project.toml  |  2 +-
 test/runtests.jl   |  1 +
 test/test_utils.jl |  2 +-
 4 files changed, 3 insertions(+), 13 deletions(-)
 delete mode 100644 prova.jl

diff --git a/prova.jl b/prova.jl
deleted file mode 100644
index 63ebbbdfe2..0000000000
--- a/prova.jl
+++ /dev/null
@@ -1,11 +0,0 @@
-using Flux, FiniteDifferences, Test, Zygote, Functors, Metal
-include("test/test_utils.jl")
-
-m = Dense(3, 3)
-x = rand(Float32, 3, 3)
-test_gradients(m, x; rtol=1e-4, atol=1e-4)
-
-m = MultiHeadAttention(4, nheads=2)
-x = rand(Float32, 4, 3, 2)
-m(x)
-test_gradients(m, x; loss = o -> sum(o[1].^2) + sum(o[2].^2))
diff --git a/test/Project.toml b/test/Project.toml
index b5fcda422b..99f1d7175a 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -21,4 +21,4 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 [compat]
 FiniteDifferences = "0.12"
 Tracker = "0.2.33"
-Enzyme = "0.12.4"
+Enzyme = "0.13"
diff --git a/test/runtests.jl b/test/runtests.jl
index 7c5f08095b..7199f88649 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -6,6 +6,7 @@ using Random, Statistics, LinearAlgebra
 using IterTools: ncycle
 using Zygote
 using Pkg
+using FiniteDifferences: FiniteDifferences
 
 ## Uncomment below to change the default test settings
 # ENV["FLUX_TEST_AMDGPU"] = "true"
diff --git a/test/test_utils.jl b/test/test_utils.jl
index 4dcf0f5793..fed79178f8 100644
--- a/test/test_utils.jl
+++ b/test/test_utils.jl
@@ -14,7 +14,7 @@ const ALL_LOSSES = [Flux.Losses.mse, Flux.Losses.mae, Flux.Losses.msle,
 
 function finitediff_withgradient(f, x...)
     y = f(x...)
-    fdm = central_fdm(5, 1)
+    fdm = FiniteDifferences.central_fdm(5, 1)
     return y, FiniteDifferences.grad(fdm, f, x...)
 end
 

From b94cf64ae61cc7b7083c5b9185d701b1581dc430 Mon Sep 17 00:00:00 2001
From: Carlo Lucibello <carlo.lucibello@gmail.com>
Date: Sat, 12 Oct 2024 18:13:50 +0200
Subject: [PATCH 03/12] fix

---
 test/layers/attention.jl | 2 +-
 test/runtests.jl         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/layers/attention.jl b/test/layers/attention.jl
index f51c4c192f..2c6fd7d514 100644
--- a/test/layers/attention.jl
+++ b/test/layers/attention.jl
@@ -54,7 +54,7 @@
   end
 
   @testset "gradient" begin
-    test_gradients(mha, q, loss = ((y, α)) -> sum(y.^2) + sum(α.^2))
+    test_gradients(mha, q, loss = o -> sum(o[1].^2) + sum(o[2].^2))
   end
 end
 
diff --git a/test/runtests.jl b/test/runtests.jl
index 7199f88649..9ee030ec95 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -11,7 +11,7 @@ using FiniteDifferences: FiniteDifferences
 ## Uncomment below to change the default test settings
 # ENV["FLUX_TEST_AMDGPU"] = "true"
 # ENV["FLUX_TEST_CUDA"] = "true"
-ENV["FLUX_TEST_METAL"] = "true"
+# ENV["FLUX_TEST_METAL"] = "true"
 # ENV["FLUX_TEST_CPU"] = "false"
 # ENV["FLUX_TEST_DISTRIBUTED_MPI"] = "true"
 # ENV["FLUX_TEST_DISTRIBUTED_NCCL"] = "true"

From 77a9611fab8538f15c19e668f9496b4e0f929c3e Mon Sep 17 00:00:00 2001
From: Carlo Lucibello <carlo.lucibello@gmail.com>
Date: Sat, 12 Oct 2024 18:56:41 +0200
Subject: [PATCH 04/12] fix

---
 test/runtests.jl   |  1 +
 test/test_utils.jl | 10 +++++-----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 9ee030ec95..0ce93c59c0 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -7,6 +7,7 @@ using IterTools: ncycle
 using Zygote
 using Pkg
 using FiniteDifferences: FiniteDifferences
+using Functors: fmapstructure_with_path
 
 ## Uncomment below to change the default test settings
 # ENV["FLUX_TEST_AMDGPU"] = "true"
diff --git a/test/test_utils.jl b/test/test_utils.jl
index fed79178f8..1bdd5cb90b 100644
--- a/test/test_utils.jl
+++ b/test/test_utils.jl
@@ -53,11 +53,11 @@ function test_gradients(
     if test_gpu
         gpu_dev = gpu_device(force=true)
         cpu_dev = cpu_device()
-        x_gpu = x |> gpu_dev
+        xs_gpu = xs |> gpu_dev
         f_gpu = f |> gpu_dev
 
         # Zygote gradient with respect to input on GPU.
-        y_gpu, g_gpu = Zygote.withgradient(x -> loss(f_gpu(x)), x_gpu)
+        y_gpu, g_gpu = Zygote.withgradient((xs...) -> loss(f_gpu(xs...)), xs_gpu...)
         @test get_device(g_gpu) == gpu_dev
         @test y_gpu |> cpu_dev ≈ y rtol=rtol atol=atol
         check_equal_leaves(g_gpu |> cpu_dev, g; rtol, atol)
@@ -67,17 +67,17 @@ function test_gradients(
         # Use finite differences gradient as a reference.
         # y_fd, g_fd = finitediff_withgradient(f -> loss(f(x)), f)
         ps, re = Flux.destructure(f)
-        y_fd, g_fd = finitediff_withgradient(f -> loss(re(ps)(x)), ps)
+        y_fd, g_fd = finitediff_withgradient(ps -> loss(re(ps)(xs...)), ps)
         g_fd = (re(g_fd[1]),)
 
         # Zygote gradient with respect to f.
-        y, g = Zygote.withgradient(f -> loss(f(x)), f)
+        y, g = Zygote.withgradient(f -> loss(f(xs...)), f)
         @test y ≈ y_fd rtol=rtol atol=atol
         check_equal_leaves(g, g_fd; rtol, atol)
 
         if test_gpu
             # Zygote gradient with respect to input on GPU.
-            y_gpu, g_gpu = Zygote.withgradient(f -> loss(f(x_gpu)), f_gpu)
+            y_gpu, g_gpu = Zygote.withgradient(f -> loss(f(xs_gpu...)), f_gpu)
             @test get_device(g_gpu) == gpu_dev
             @test y_gpu |> cpu_dev ≈ y rtol=rtol atol=atol
             check_equal_leaves(g_gpu |> cpu_dev, g; rtol, atol)

From 18ab9b198adb517f6a38e4c7c39609bef4a75c23 Mon Sep 17 00:00:00 2001
From: Carlo Lucibello <carlo.lucibello@gmail.com>
Date: Sat, 12 Oct 2024 19:13:11 +0200
Subject: [PATCH 05/12] f64

---
 test/ext_amdgpu/basic.jl |  6 +++---
 test/runtests.jl         | 44 ++++++++++++++++++++--------------------
 test/test_utils.jl       | 14 ++++++-------
 3 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/test/ext_amdgpu/basic.jl b/test/ext_amdgpu/basic.jl
index 4f93ca5145..a17d542d2d 100644
--- a/test/ext_amdgpu/basic.jl
+++ b/test/ext_amdgpu/basic.jl
@@ -76,7 +76,7 @@ end
 
 @testset "Chain(Conv)" begin
     m = Chain(Conv((3, 3), 3 => 3))
-    x = rand(Float32, 10, 10, 3, 2)
+    x = rand(Float32, 5, 5, 3, 2)
     # gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false)
     test_gradients(m, x, test_gpu=true)
 
@@ -84,7 +84,7 @@ end
     @test md[1].weight ≈ m[1].weight atol=1f-3
 
     m = Chain(ConvTranspose((3, 3), 3 => 3))
-    x = rand(Float32, 10, 10, 3, 2)
+    x = rand(Float32, 5, 5, 3, 2)
     # gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false)
     test_gradients(m, x, test_gpu=true)
 
@@ -94,7 +94,7 @@ end
 
 @testset "Cross-correlation" begin
     m = CrossCor((2, 2), 3 => 4) |> f32
-    x = rand(Float32, 10, 10, 3, 2)
+    x = rand(Float32, 5, 5, 3, 2)
     test_gradients(m, x, test_gpu=true)
 end
 
diff --git a/test/runtests.jl b/test/runtests.jl
index 0ce93c59c0..ff6660be14 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -24,28 +24,28 @@ Random.seed!(0)
 
 @testset verbose=true "Flux.jl" begin
   if get(ENV, "FLUX_TEST_CPU", "true") == "true"
-    # @testset "Utils" begin
-    #   include("utils.jl")
-    # end
-
-    # @testset "Loading" begin
-    #   include("loading.jl")
-    # end
-
-    # @testset "Optimise / Train" begin
-    #   include("optimise.jl")
-    #   include("train.jl")
-    #   include("tracker.jl")
-    # end
-
-    # @testset "Data" begin
-    #   include("data.jl")
-    # end
-
-    # @testset "Losses" begin
-    #   include("losses.jl")
-    #   include("ctc.jl")
-    # end
+    @testset "Utils" begin
+      include("utils.jl")
+    end
+
+    @testset "Loading" begin
+      include("loading.jl")
+    end
+
+    @testset "Optimise / Train" begin
+      include("optimise.jl")
+      include("train.jl")
+      include("tracker.jl")
+    end
+
+    @testset "Data" begin
+      include("data.jl")
+    end
+
+    @testset "Losses" begin
+      include("losses.jl")
+      include("ctc.jl")
+    end
 
     @testset "Layers" begin
       include("layers/attention.jl")
diff --git a/test/test_utils.jl b/test/test_utils.jl
index 1bdd5cb90b..522def9f96 100644
--- a/test/test_utils.jl
+++ b/test/test_utils.jl
@@ -19,12 +19,9 @@ function finitediff_withgradient(f, x...)
 end
 
 
-function check_equal_leaves(a, b; rtol=1e-4, atol=1e-4, check_eltype=true)
+function check_equal_leaves(a, b; rtol=1e-4, atol=1e-4)
     fmapstructure_with_path(a, b) do kp, x, y
         if x isa AbstractArray
-            if check_eltype
-                @test eltype(x) == eltype(y)
-            end
             @test x ≈ y rtol=rtol atol=atol
         elseif x isa Number
             @test x ≈ y rtol=rtol atol=atol
@@ -43,7 +40,10 @@ function test_gradients(
             )
 
     # Use finite differences gradient as a reference.
-    y_fd, g_fd = finitediff_withgradient((xs...) -> loss(f(xs...)), xs...)
+    # Cast to Float64 to avoid precision issues.
+    f64 = f |> f64
+    xs64 = xs .|> f64
+    y_fd, g_fd = finitediff_withgradient((xs...) -> loss(f64(xs...)), xs64...)
 
     # Zygote gradient with respect to input.
     y, g = Zygote.withgradient((xs...) -> loss(f(xs...)), xs...)
@@ -66,8 +66,8 @@ function test_gradients(
     if test_grad_f
         # Use finite differences gradient as a reference.
         # y_fd, g_fd = finitediff_withgradient(f -> loss(f(x)), f)
-        ps, re = Flux.destructure(f)
-        y_fd, g_fd = finitediff_withgradient(ps -> loss(re(ps)(xs...)), ps)
+        ps, re = Flux.destructure(f64)
+        y_fd, g_fd = finitediff_withgradient(ps -> loss(re(ps)(xs64...)), ps)
         g_fd = (re(g_fd[1]),)
 
         # Zygote gradient with respect to f.

From 9367b9577168caa1e766b096db204c754d3128b3 Mon Sep 17 00:00:00 2001
From: Carlo Lucibello <carlo.lucibello@gmail.com>
Date: Sat, 12 Oct 2024 19:27:06 +0200
Subject: [PATCH 06/12] fixes

---
 test/ext_cuda/layers.jl | 4 +---
 test/ext_cuda/losses.jl | 5 +++--
 test/test_utils.jl      | 3 ++-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/test/ext_cuda/layers.jl b/test/ext_cuda/layers.jl
index c148334dcf..0d985e59d1 100644
--- a/test/ext_cuda/layers.jl
+++ b/test/ext_cuda/layers.jl
@@ -11,9 +11,7 @@
 end
 
 
-const ACTIVATIONS = [identity, relu, tanh,
-                     sigmoid, exp, softplus,
-                     elu, selu]
+const ACTIVATIONS = [identity, tanh, softplus, elu]
 
 function gpu_gradtest(name::String, layers::Vector, x_cpu, args...; test_cpu = true, test_mode = false)
   @testset "$name GPU grad tests" begin
diff --git a/test/ext_cuda/losses.jl b/test/ext_cuda/losses.jl
index b5ba4d2f4d..e787e64881 100644
--- a/test/ext_cuda/losses.jl
+++ b/test/ext_cuda/losses.jl
@@ -27,8 +27,9 @@ y = [1  0  0  0  1
 @test focal_loss(x, y) ≈ focal_loss(gpu(x), gpu(y))
 
 @testset "GPU: $loss" for loss in ALL_LOSSES
-  x = rand(Float32, 3,4)
-  y = rand(Float32, 3,4)
+  # let's stay far from the boundaries to avoid problems with finite differences gradients
+  x = 0.1f0 + 0.8f0 .* rand(Float32, 3, 4)
+  y = 0.1f0 + 0.8f0 .* rand(Float32, 3, 4)
   @test loss(x, y) ≈ loss(gpu(x), gpu(y))
 
   test_gradients(loss, x, y, test_gpu=true, test_grad_f = false)
diff --git a/test/test_utils.jl b/test/test_utils.jl
index 522def9f96..a12d5fed66 100644
--- a/test/test_utils.jl
+++ b/test/test_utils.jl
@@ -14,7 +14,8 @@ const ALL_LOSSES = [Flux.Losses.mse, Flux.Losses.mae, Flux.Losses.msle,
 
 function finitediff_withgradient(f, x...)
     y = f(x...)
-    fdm = FiniteDifferences.central_fdm(5, 1)
+    # We set a range to avoid domain errors
+    fdm = FiniteDifferences.central_fdm(5, 1, max_range=1e-2)
     return y, FiniteDifferences.grad(fdm, f, x...)
 end
 

From 088565f3af1e8199557f40175cc907f488be42d5 Mon Sep 17 00:00:00 2001
From: Carlo Lucibello <carlo.lucibello@gmail.com>
Date: Sat, 12 Oct 2024 19:42:21 +0200
Subject: [PATCH 07/12] fix

---
 test/test_utils.jl | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/test/test_utils.jl b/test/test_utils.jl
index a12d5fed66..543d0603dc 100644
--- a/test/test_utils.jl
+++ b/test/test_utils.jl
@@ -42,8 +42,8 @@ function test_gradients(
 
     # Use finite differences gradient as a reference.
     # Cast to Float64 to avoid precision issues.
-    f64 = f |> f64
-    xs64 = xs .|> f64
+    f64 = f |> Flux.f64
+    xs64 = xs .|> Flux.f64
     y_fd, g_fd = finitediff_withgradient((xs...) -> loss(f64(xs...)), xs64...)
 
     # Zygote gradient with respect to input.
@@ -85,18 +85,3 @@ function test_gradients(
         end
     end
 end
-
-# check_grad_type checks that the gradient type matches the primal type.
-
-check_grad_type(g::Nothing, x) = nothing
-
-function check_grad_type(g::AbstractArray{T1}, x::AbstractArray{T2}) where {T1, T2}
-    @test T1 == T2
-    @test size(g) == size(x)
-end
-
-function check_grad_type(g::NamedTuple, x::T) where T
-    for f in fieldnames(T)
-        check_grad_type(g[f], getfield(x, f))
-    end
-end

From fde966493245d55d82c90901163ca354724f1c13 Mon Sep 17 00:00:00 2001
From: CarloLucibello <carlo.lucibello@gmail.com>
Date: Sun, 13 Oct 2024 09:49:13 +0200
Subject: [PATCH 08/12] fix cuda test

---
 src/distributed/public_api.jl  | 2 +-
 test/ext_amdgpu/get_devices.jl | 4 ++--
 test/ext_cuda/get_devices.jl   | 3 ---
 test/ext_cuda/losses.jl        | 4 ++--
 test/functors.jl               | 5 +----
 test/runtests.jl               | 4 ++--
 6 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/src/distributed/public_api.jl b/src/distributed/public_api.jl
index 26d321814d..d5d10e42a4 100644
--- a/src/distributed/public_api.jl
+++ b/src/distributed/public_api.jl
@@ -132,7 +132,7 @@ Backend Agnostic API to perform an allreduce operation on the given buffer `send
 workers.
 """
 function allreduce!(backend::AbstractFluxDistributedBackend, sendrecvbuf, op::F) where {F}
-    return __allreduce!(backend, sendrecvbuf, op, get_device())
+    return __allreduce!(backend, sendrecvbuf, op, gpu_device())
 end
 
 function allreduce!(
diff --git a/test/ext_amdgpu/get_devices.jl b/test/ext_amdgpu/get_devices.jl
index 7f4d8ccd7a..24b1d71a38 100644
--- a/test/ext_amdgpu/get_devices.jl
+++ b/test/ext_amdgpu/get_devices.jl
@@ -17,9 +17,9 @@ x = randn(Float32, 5, 5)
 cx = x |> amdgpu_device
 @test cx isa AMDGPU.ROCArray
 
-# moving models to specific NVIDIA devices
+# moving models to specific AMDGPU devices
 for id in 0:(length(AMDGPU.devices()) - 1)
-  current_amdgpu_device = Flux.get_device("AMDGPU", id)
+  current_amdgpu_device = gpu_device(id+1)
 
   global dense_model = dense_model |> current_amdgpu_device
   @test dense_model.weight isa AMDGPU.ROCArray
diff --git a/test/ext_cuda/get_devices.jl b/test/ext_cuda/get_devices.jl
index 2f4ea3bd98..e66f500650 100644
--- a/test/ext_cuda/get_devices.jl
+++ b/test/ext_cuda/get_devices.jl
@@ -8,9 +8,6 @@ dense_model = Dense(2 => 3)                 # initially lives on CPU
 weight = copy(dense_model.weight)           # store the weight
 bias = copy(dense_model.bias)               # store the bias
 
-cuda_device = Flux.get_device()
-
-@test typeof(cuda_device) <: Flux.CUDADevice
 
 # correctness of data transfer
 x = randn(5, 5)
diff --git a/test/ext_cuda/losses.jl b/test/ext_cuda/losses.jl
index e787e64881..ecf235fe0a 100644
--- a/test/ext_cuda/losses.jl
+++ b/test/ext_cuda/losses.jl
@@ -28,8 +28,8 @@ y = [1  0  0  0  1
 
 @testset "GPU: $loss" for loss in ALL_LOSSES
   # let's stay far from the boundaries to avoid problems with finite differences gradients
-  x = 0.1f0 + 0.8f0 .* rand(Float32, 3, 4)
-  y = 0.1f0 + 0.8f0 .* rand(Float32, 3, 4)
+  x = 0.1f0 .+ 0.8f0 .* rand(Float32, 3, 4)
+  y = 0.1f0 .+ 0.8f0 .* rand(Float32, 3, 4)
   @test loss(x, y) ≈ loss(gpu(x), gpu(y))
 
   test_gradients(loss, x, y, test_gpu=true, test_grad_f = false)
diff --git a/test/functors.jl b/test/functors.jl
index 280b76d6f0..734eadc574 100644
--- a/test/functors.jl
+++ b/test/functors.jl
@@ -3,10 +3,7 @@ if !(Flux.CUDA_LOADED[] || Flux.AMDGPU_LOADED[] || Flux.METAL_LOADED[])
     @test x === gpu(x)
 end
 
-dev = Flux.get_device()
+dev = Flux.cpu_device()
 @test typeof(dev) <: Flux.CPUDevice
 @test dev(x) == x
 
-# specifically getting CPU device
-dev = Flux.get_device("CPU")
-@test typeof(dev) <: Flux.CPUDevice
diff --git a/test/runtests.jl b/test/runtests.jl
index ff6660be14..f44c4b7758 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -11,9 +11,9 @@ using Functors: fmapstructure_with_path
 
 ## Uncomment below to change the default test settings
 # ENV["FLUX_TEST_AMDGPU"] = "true"
-# ENV["FLUX_TEST_CUDA"] = "true"
+ENV["FLUX_TEST_CUDA"] = "true"
 # ENV["FLUX_TEST_METAL"] = "true"
-# ENV["FLUX_TEST_CPU"] = "false"
+ENV["FLUX_TEST_CPU"] = "false"
 # ENV["FLUX_TEST_DISTRIBUTED_MPI"] = "true"
 # ENV["FLUX_TEST_DISTRIBUTED_NCCL"] = "true"
 ENV["FLUX_TEST_ENZYME"] = "false" # We temporarily disable Enzyme tests since they are failing

From 13ddabdf800508af321ba2275948f49e01e5f211 Mon Sep 17 00:00:00 2001
From: CarloLucibello <carlo.lucibello@gmail.com>
Date: Sun, 13 Oct 2024 10:54:34 +0200
Subject: [PATCH 09/12] tweaks

---
 .gitignore               |  1 +
 test/ext_amdgpu/basic.jl | 13 ++++++-------
 test/runtests.jl         |  4 ++--
 test/test_utils.jl       | 10 +++++-----
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/.gitignore b/.gitignore
index 1e8a6b3b8a..21bd9e6e68 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,4 @@ Manifest.toml
 LocalPreferences.toml
 .DS_Store
 docs/mymodel.bson
+prova.jl
diff --git a/test/ext_amdgpu/basic.jl b/test/ext_amdgpu/basic.jl
index a17d542d2d..057051f23a 100644
--- a/test/ext_amdgpu/basic.jl
+++ b/test/ext_amdgpu/basic.jl
@@ -35,11 +35,11 @@ end
         @test collect(md(xd)) ≈ y  atol=1f-3
 
         # Gradients are flipped as well.
-        gs = gradient(m -> sum(m(x)), m)
-        gsd = gradient(m -> sum(m(xd)), md)
+        gs = gradient(m -> sum(m(x)), m)[1]
+        gsd = gradient(m -> sum(m(xd)), md)[1]
 
         dims = ntuple(i -> i, ndims(m.weight) - 2)
-        @test reverse(gs[1].weight; dims) ≈ Array(gsd[1].weight) atol=1f-2
+        @test reverse(gs.weight; dims) ≈ Array(gsd.weight) atol=1f-2
 
         # Movement back to CPU flips weights back.
         mh = Flux.cpu(md)
@@ -77,16 +77,15 @@ end
 @testset "Chain(Conv)" begin
     m = Chain(Conv((3, 3), 3 => 3))
     x = rand(Float32, 5, 5, 3, 2)
-    # gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false)
-    test_gradients(m, x, test_gpu=true)
+    
+    @test Array((m |> gpu)(x |> gpu)) ≈ m(x) atol=1f-3
 
     md = m |> gpu |> cpu
     @test md[1].weight ≈ m[1].weight atol=1f-3
 
     m = Chain(ConvTranspose((3, 3), 3 => 3))
     x = rand(Float32, 5, 5, 3, 2)
-    # gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false)
-    test_gradients(m, x, test_gpu=true)
+    @test Array((m |> gpu)(x |> gpu)) ≈ m(x) atol=1f-3
 
     md = m |> gpu |> cpu
     @test md[1].weight ≈ m[1].weight atol=1f-3
diff --git a/test/runtests.jl b/test/runtests.jl
index f44c4b7758..ff6660be14 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -11,9 +11,9 @@ using Functors: fmapstructure_with_path
 
 ## Uncomment below to change the default test settings
 # ENV["FLUX_TEST_AMDGPU"] = "true"
-ENV["FLUX_TEST_CUDA"] = "true"
+# ENV["FLUX_TEST_CUDA"] = "true"
 # ENV["FLUX_TEST_METAL"] = "true"
-ENV["FLUX_TEST_CPU"] = "false"
+# ENV["FLUX_TEST_CPU"] = "false"
 # ENV["FLUX_TEST_DISTRIBUTED_MPI"] = "true"
 # ENV["FLUX_TEST_DISTRIBUTED_NCCL"] = "true"
 ENV["FLUX_TEST_ENZYME"] = "false" # We temporarily disable Enzyme tests since they are failing
diff --git a/test/test_utils.jl b/test/test_utils.jl
index 543d0603dc..a7d234b55f 100644
--- a/test/test_utils.jl
+++ b/test/test_utils.jl
@@ -59,8 +59,8 @@ function test_gradients(
 
         # Zygote gradient with respect to input on GPU.
         y_gpu, g_gpu = Zygote.withgradient((xs...) -> loss(f_gpu(xs...)), xs_gpu...)
-        @test get_device(g_gpu) == gpu_dev
-        @test y_gpu |> cpu_dev ≈ y rtol=rtol atol=atol
+        @test get_device(g_gpu) == get_device(xs_gpu)
+        @test y_gpu ≈ y rtol=rtol atol=atol
         check_equal_leaves(g_gpu |> cpu_dev, g; rtol, atol)
     end
 
@@ -77,10 +77,10 @@ function test_gradients(
         check_equal_leaves(g, g_fd; rtol, atol)
 
         if test_gpu
-            # Zygote gradient with respect to input on GPU.
+            # Zygote gradient with respect to f on GPU.
             y_gpu, g_gpu = Zygote.withgradient(f -> loss(f(xs_gpu...)), f_gpu)
-            @test get_device(g_gpu) == gpu_dev
-            @test y_gpu |> cpu_dev ≈ y rtol=rtol atol=atol
+            # @test get_device(g_gpu) == get_device(xs_gpu)
+            @test y_gpu ≈ y rtol=rtol atol=atol
             check_equal_leaves(g_gpu |> cpu_dev, g; rtol, atol)
         end
     end

From a2c92ba1ea1a413c3fa6bfb2351c37b57fd9e213 Mon Sep 17 00:00:00 2001
From: CarloLucibello <carlo.lucibello@gmail.com>
Date: Sun, 13 Oct 2024 12:53:23 +0200
Subject: [PATCH 10/12] fix cuda device

---
 test/ext_amdgpu/basic.jl     | 8 ++++----
 test/ext_cuda/get_devices.jl | 6 ++++++
 test/ext_cuda/runtests.jl    | 1 -
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/test/ext_amdgpu/basic.jl b/test/ext_amdgpu/basic.jl
index 057051f23a..2f057a22f5 100644
--- a/test/ext_amdgpu/basic.jl
+++ b/test/ext_amdgpu/basic.jl
@@ -26,7 +26,7 @@ end
 
 @testset "Convolution" begin
     for conv_type in (Conv, ConvTranspose), nd in 1:3
-        m = conv_type(tuple(fill(2, nd)...), 3 => 4) |> f32
+        m = conv_type(tuple(fill(2, nd)...), 3 => 4)
         x = rand(Float32, fill(10, nd)..., 3, 5)
 
         md, xd = Flux.gpu.((m, x))
@@ -53,10 +53,10 @@ end
         x = rand(Float32, fill(10, nd)..., 3, 5) |> gpu
 
         pad = ntuple(i -> i, nd)
-        m = conv_type(kernel, 3 => 4, pad=pad) |> f32 |> gpu
+        m = conv_type(kernel, 3 => 4, pad=pad) |> gpu
 
         expanded_pad = ntuple(i -> pad[(i - 1) ÷ 2 + 1], 2 * nd)
-        m_expanded = conv_type(kernel, 3 => 4, pad=expanded_pad) |> f32 |> gpu
+        m_expanded = conv_type(kernel, 3 => 4, pad=expanded_pad) |> gpu
 
         @test size(m(x)) == size(m_expanded(x))
     end
@@ -92,7 +92,7 @@ end
 end
 
 @testset "Cross-correlation" begin
-    m = CrossCor((2, 2), 3 => 4) |> f32
+    m = CrossCor((2, 2), 3 => 4)
     x = rand(Float32, 5, 5, 3, 2)
     test_gradients(m, x, test_gpu=true)
 end
diff --git a/test/ext_cuda/get_devices.jl b/test/ext_cuda/get_devices.jl
index e66f500650..ae722319a5 100644
--- a/test/ext_cuda/get_devices.jl
+++ b/test/ext_cuda/get_devices.jl
@@ -27,6 +27,12 @@ for id in 0:(length(CUDA.devices()) - 1)
   @test isequal(Flux.cpu(dense_model.weight), weight)
   @test isequal(Flux.cpu(dense_model.bias), bias)
 end
+
+# gpu_device remembers the last device selected
+# Therefore, we need to reset it to the current cuda device
+@test gpu_device().device.handle == length(CUDA.devices()) - 1
+gpu_device(CUDA.device().handle + 1)
+
 # finally move to CPU, and see if things work
 cdev = cpu_device()
 dense_model = cdev(dense_model)
diff --git a/test/ext_cuda/runtests.jl b/test/ext_cuda/runtests.jl
index 6fdbde4250..012a62d41a 100644
--- a/test/ext_cuda/runtests.jl
+++ b/test/ext_cuda/runtests.jl
@@ -10,7 +10,6 @@ CUDA.allowscalar(false)
 @testset "get_devices" begin
   include("get_devices.jl")
 end
-
 @testset "cuda" begin
   include("cuda.jl")
 end

From fbf66c42142e6b7e2726147119e90adef731d832 Mon Sep 17 00:00:00 2001
From: CarloLucibello <carlo.lucibello@gmail.com>
Date: Sun, 13 Oct 2024 13:57:58 +0200
Subject: [PATCH 11/12] test less

---
 test/ext_amdgpu/basic.jl |  6 +++---
 test/ext_cuda/layers.jl  | 13 +++++++------
 test/ext_cuda/losses.jl  |  2 +-
 test/ext_metal/basic.jl  |  2 +-
 test/test_utils.jl       | 39 ++++++++++++++++++++++++---------------
 5 files changed, 36 insertions(+), 26 deletions(-)

diff --git a/test/ext_amdgpu/basic.jl b/test/ext_amdgpu/basic.jl
index 2f057a22f5..62a7b065f3 100644
--- a/test/ext_amdgpu/basic.jl
+++ b/test/ext_amdgpu/basic.jl
@@ -21,7 +21,7 @@ end
 @testset "Chain of Dense layers" begin
     m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax)
     x = rand(Float32, 10, 10)
-    test_gradients(m, x, test_gpu=true)
+    test_gradients(m, x, test_gpu=true, compare_finite_diff=false)
 end
 
 @testset "Convolution" begin
@@ -94,7 +94,7 @@ end
 @testset "Cross-correlation" begin
     m = CrossCor((2, 2), 3 => 4)
     x = rand(Float32, 5, 5, 3, 2)
-    test_gradients(m, x, test_gpu=true)
+    test_gradients(m, x, test_gpu=true, compare_finite_diff=false)
 end
 
 @testset "Restructure" begin
@@ -134,7 +134,7 @@ end
     bn = BatchNorm(3, σ)
     for nd in 1:3
         x = rand(Float32, fill(2, nd - 1)..., 3, 4)
-        test_gradients(bn, x; test_gpu=true)
+        test_gradients(bn, x; test_gpu=true, compare_finite_diff=false)
     end
 end
 
diff --git a/test/ext_cuda/layers.jl b/test/ext_cuda/layers.jl
index 0d985e59d1..cbe5c06704 100644
--- a/test/ext_cuda/layers.jl
+++ b/test/ext_cuda/layers.jl
@@ -24,7 +24,7 @@ function gpu_gradtest(name::String, layers::Vector, x_cpu, args...; test_cpu = t
           testmode!(l_cpu)
         end
 
-        test_gradients(l_cpu, x_cpu, test_gpu = true)
+        test_gradients(l_cpu, x_cpu, test_gpu=true, compare_finite_diff=false)
       end
     end
   end
@@ -97,10 +97,10 @@ gpu_gradtest("Embedding OneHotMatrix index", embedding,  OneHotMatrix([1,2,3], 5
 gpu_gradtest("Embedding OneHotMatrix repeated indices", embedding, OneHotMatrix([1,2,2], 5), 5, 2)
 
 @testset "function layers" begin
-  x = rand(Float32, 3,3)
-  test_gradients(x -> sum(Flux.normalise(x; dims=1)), x)
-  test_gradients(x -> sum(Flux.normalise(x; dims=2)), x)
-  test_gradients(x -> sum(Flux.normalise(x)), x)
+  x = rand(Float32, 3, 3)
+  test_gradients(x -> sum(Flux.normalise(x; dims=1)), x, test_gpu=true, compare_finite_diff=false)
+  test_gradients(x -> sum(Flux.normalise(x; dims=2)), x, test_gpu=true, compare_finite_diff=false)
+  test_gradients(x -> sum(Flux.normalise(x)), x, test_gpu=true, compare_finite_diff=false)
 end
 
 @testset "Zeros mapped for $cl" for cl in (Conv, ConvTranspose, CrossCor, DepthwiseConv)
@@ -309,5 +309,6 @@ end
   @test Array(y_gpu) ≈ y_cpu atol=1e-4
   @test Array(α_gpu) ≈ α_cpu atol=1e-4
 
-  test_gradients(mha_cpu, x_cpu, loss = o -> sum(o[1].^2) + sum(o[2].^2), test_gpu = true)
+  test_gradients(mha_cpu, x_cpu, loss = o -> sum(o[1].^2) + sum(o[2].^2), 
+    test_gpu = true, compare_finite_diff = false)
 end
diff --git a/test/ext_cuda/losses.jl b/test/ext_cuda/losses.jl
index ecf235fe0a..cf56c7119d 100644
--- a/test/ext_cuda/losses.jl
+++ b/test/ext_cuda/losses.jl
@@ -32,7 +32,7 @@ y = [1  0  0  0  1
   y = 0.1f0 .+ 0.8f0 .* rand(Float32, 3, 4)
   @test loss(x, y) ≈ loss(gpu(x), gpu(y))
 
-  test_gradients(loss, x, y, test_gpu=true, test_grad_f = false)
+  test_gradients(loss, x, y, test_gpu=true, test_grad_f=false, compare_finite_diff=false)
 
   # Float16 tests
   @test loss(f16(x), f16(y)) ≈ loss(gpu(f16(x)), gpu(f16(y)))
diff --git a/test/ext_metal/basic.jl b/test/ext_metal/basic.jl
index 76121a1d3a..97ba8066a3 100644
--- a/test/ext_metal/basic.jl
+++ b/test/ext_metal/basic.jl
@@ -23,5 +23,5 @@ end
     m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax)
     x = rand(Float32, 10, 10) 
     @test (m|>gpu)(x|>gpu) isa MtlArray{Float32, 2}
-    test_gradients(m, x, test_gpu=true)
+    test_gradients(m, x, test_gpu=true, compare_finite_diff=false)
 end
diff --git a/test/test_utils.jl b/test/test_utils.jl
index a7d234b55f..c18eae84b9 100644
--- a/test/test_utils.jl
+++ b/test/test_utils.jl
@@ -37,19 +37,26 @@ function test_gradients(
             rtol=1e-4, atol=1e-4,
             test_gpu = false,
             test_grad_f = true,
+            compare_finite_diff = true,
             loss = sum
             )
 
-    # Use finite differences gradient as a reference.
-    # Cast to Float64 to avoid precision issues.
-    f64 = f |> Flux.f64
-    xs64 = xs .|> Flux.f64
-    y_fd, g_fd = finitediff_withgradient((xs...) -> loss(f64(xs...)), xs64...)
+    if !test_gpu && !compare_finite_diff
+        error("You should either compare finite diff vs CPU AD \
+               or CPU AD vs GPU AD.")
+    end
 
     # Zygote gradient with respect to input.
     y, g = Zygote.withgradient((xs...) -> loss(f(xs...)), xs...)
-    @test y ≈ y_fd rtol=rtol atol=atol
-    check_equal_leaves(g, g_fd; rtol, atol)
+    
+    if compare_finite_diff
+        # Cast to Float64 to avoid precision issues.
+        f64 = f |> Flux.f64
+        xs64 = xs .|> Flux.f64
+        y_fd, g_fd = finitediff_withgradient((xs...) -> loss(f64(xs...)), xs64...)
+        @test y ≈ y_fd rtol=rtol atol=atol
+        check_equal_leaves(g, g_fd; rtol, atol)
+    end
 
     if test_gpu
         gpu_dev = gpu_device(force=true)
@@ -65,16 +72,18 @@ function test_gradients(
     end
 
     if test_grad_f
-        # Use finite differences gradient as a reference.
-        # y_fd, g_fd = finitediff_withgradient(f -> loss(f(x)), f)
-        ps, re = Flux.destructure(f64)
-        y_fd, g_fd = finitediff_withgradient(ps -> loss(re(ps)(xs64...)), ps)
-        g_fd = (re(g_fd[1]),)
-
         # Zygote gradient with respect to f.
         y, g = Zygote.withgradient(f -> loss(f(xs...)), f)
-        @test y ≈ y_fd rtol=rtol atol=atol
-        check_equal_leaves(g, g_fd; rtol, atol)
+
+        if compare_finite_diff
+            # Use finite differences gradient as a reference.
+            # y_fd, g_fd = finitediff_withgradient(f -> loss(f(x)), f)
+            ps, re = Flux.destructure(f64)
+            y_fd, g_fd = finitediff_withgradient(ps -> loss(re(ps)(xs64...)), ps)
+            g_fd = (re(g_fd[1]),)
+            @test y ≈ y_fd rtol=rtol atol=atol
+            check_equal_leaves(g, g_fd; rtol, atol)
+        end
 
         if test_gpu
             # Zygote gradient with respect to f on GPU.

From 18a8a1131b0651f8355fa931336456887cbb718b Mon Sep 17 00:00:00 2001
From: CarloLucibello <carlo.lucibello@gmail.com>
Date: Sun, 13 Oct 2024 16:11:04 +0200
Subject: [PATCH 12/12] fix cuda tests

---
 test/ext_amdgpu/basic.jl |  5 ++--
 test/ext_cuda/layers.jl  | 45 ++++++++++++++++---------------
 test/runtests.jl         |  4 +--
 test/test_utils.jl       | 58 +++++++++++++++++++++++-----------------
 4 files changed, 62 insertions(+), 50 deletions(-)

diff --git a/test/ext_amdgpu/basic.jl b/test/ext_amdgpu/basic.jl
index 62a7b065f3..163064c072 100644
--- a/test/ext_amdgpu/basic.jl
+++ b/test/ext_amdgpu/basic.jl
@@ -77,15 +77,14 @@ end
 @testset "Chain(Conv)" begin
     m = Chain(Conv((3, 3), 3 => 3))
     x = rand(Float32, 5, 5, 3, 2)
-    
-    @test Array((m |> gpu)(x |> gpu)) ≈ m(x) atol=1f-3
+    test_gradients(m, x, test_gpu=true, compare_finite_diff=false, test_grad_f=false)
 
     md = m |> gpu |> cpu
     @test md[1].weight ≈ m[1].weight atol=1f-3
 
     m = Chain(ConvTranspose((3, 3), 3 => 3))
     x = rand(Float32, 5, 5, 3, 2)
-    @test Array((m |> gpu)(x |> gpu)) ≈ m(x) atol=1f-3
+    test_gradients(m, x, test_gpu=true, compare_finite_diff=false, test_grad_f=false)
 
     md = m |> gpu |> cpu
     @test md[1].weight ≈ m[1].weight atol=1f-3
diff --git a/test/ext_cuda/layers.jl b/test/ext_cuda/layers.jl
index cbe5c06704..cba95cee75 100644
--- a/test/ext_cuda/layers.jl
+++ b/test/ext_cuda/layers.jl
@@ -11,9 +11,11 @@
 end
 
 
-const ACTIVATIONS = [identity, tanh, softplus, elu]
+const ACTIVATIONS = [identity, tanh]
 
-function gpu_gradtest(name::String, layers::Vector, x_cpu, args...; test_cpu = true, test_mode = false)
+function gpu_gradtest(name::String, layers::Vector, x_cpu, args...; 
+    test_mode=false, test_grad_x=true, 
+    atol=1e-4, rtol=1e-4)
   @testset "$name GPU grad tests" begin
     for layer in layers
       @testset "$layer Layer GPU grad test" begin
@@ -24,7 +26,7 @@ function gpu_gradtest(name::String, layers::Vector, x_cpu, args...; test_cpu = t
           testmode!(l_cpu)
         end
 
-        test_gradients(l_cpu, x_cpu, test_gpu=true, compare_finite_diff=false)
+        test_gradients(l_cpu, x_cpu; test_gpu=true, compare_finite_diff=false, test_grad_x, atol, rtol)
       end
     end
   end
@@ -45,23 +47,24 @@ for act in ACTIVATIONS
                  ConvTranspose, ConvTransposeNoBias,
                  CrossCor, CrossCorNoBias,
                  DepthwiseConv, DepthwiseConvNoBias]
-  gpu_gradtest("Convolution with $act", conv_layers, r, (2,2), 1=>3, act, test_cpu = false)
+  gpu_gradtest("Convolution with $act", conv_layers, r, (2,2), 1=>3, act)
 
   groupedconv = [GroupedConv, GroupedConvTranspose]
-  gpu_gradtest("GroupedConvolution with $act", groupedconv, rand(Float32, 28, 28, 100, 2), (3,3), 100 => 25, act, test_cpu = true)
+  gpu_gradtest("GroupedConvolution with $act", groupedconv, rand(Float32, 28, 28, 100, 2), (3,3), 100 => 25, act)
 
   batch_norm = [BatchNorm, BatchNormNoTrackStats]
-  gpu_gradtest("BatchNorm 1 with $act", batch_norm, rand(Float32, 28,28,3,4), 3, act, test_cpu = false) #TODO fix errors
-  gpu_gradtest("BatchNorm 2 with $act", batch_norm, rand(Float32, 5,4), 5, act, test_cpu = true)
+  gpu_gradtest("BatchNorm 1 with $act", batch_norm, rand(Float32, 28,28,3,4), 3, act, atol=1e-3)
+  gpu_gradtest("BatchNorm 2 with $act", batch_norm, rand(Float32, 5,4), 5, act, atol=1e-3)
 
   batch_norm = [BatchNormNoTrackStats]
-  gpu_gradtest("BatchNorm 3 with $act (test mode)", batch_norm, rand(Float32, 5,4), 5, act, test_cpu = true, test_mode = true)
+  gpu_gradtest("BatchNorm 3 with $act (test mode)", batch_norm, rand(Float32, 5,4), 5, act, 
+    test_mode=true, atol=1e-3)
 
   instancenorm = [InstanceNorm]
-  gpu_gradtest("InstanceNorm with $act", instancenorm, r, 1, act, test_cpu = false)
+  gpu_gradtest("InstanceNorm with $act", instancenorm, r, 1, act)
 
   groupnorm = [GroupNorm]
-  gpu_gradtest("GroupNorm with $act", groupnorm, rand(Float32, 28,28,3,1), 3, 1, act, test_cpu = false)
+  gpu_gradtest("GroupNorm with $act", groupnorm, rand(Float32, 28,28,3,1), 3, 1, act)
 end
 
 r = rand(Float32, 28, 28, 1, 1)
@@ -70,13 +73,13 @@ pooling_layers = [MaxPool, MeanPool]
 gpu_gradtest("Pooling", pooling_layers, r, (2,2))
 
 adaptive_pooling_layers = [AdaptiveMaxPool, AdaptiveMeanPool]
-gpu_gradtest("AdaptivePooling", adaptive_pooling_layers, r, (7,7), test_cpu = false)
+gpu_gradtest("AdaptivePooling", adaptive_pooling_layers, r, (7,7))
 
 dropout_layers = [Dropout, AlphaDropout]
-gpu_gradtest("Dropout", dropout_layers, r, 0.5f0; test_cpu = false) # dropout is not deterministic
+gpu_gradtest("Dropout", dropout_layers, r, 1e-6) # dropout is not deterministic
 
 layer_norm = [LayerNorm]
-gpu_gradtest("LayerNorm 1", layer_norm, rand(Float32, 28,28,3,4), 28, test_cpu = false) #TODO fix errors
+gpu_gradtest("LayerNorm 1", layer_norm, rand(Float32, 28,28,3,4), 28)
 gpu_gradtest("LayerNorm 2", layer_norm, rand(Float32, 5,4), 5)
 
 upsample = [x -> Upsample(scale=x)]
@@ -88,13 +91,13 @@ gpu_gradtest("PixelShuffle 2d", pixelshuffle, rand(Float32, 3, 4, 18, 3), 3)
 gpu_gradtest("PixelShuffle 1d", pixelshuffle, rand(Float32, 3, 18, 3), 3)
 
 embedding = [Flux.Embedding]
-gpu_gradtest("Embedding", embedding, [1,3,5], 5, 2)
-gpu_gradtest("Embedding repeated indices", embedding, [1,3,5,3], 5, 2)
-gpu_gradtest("Embedding integer index", embedding, 1, 5, 2)
-gpu_gradtest("Embedding 2d index", embedding, [1 2; 3 4], 5, 2)
-gpu_gradtest("Embedding OneHotVec index", embedding, OneHotVector(1, 5), 5, 2)
-gpu_gradtest("Embedding OneHotMatrix index", embedding,  OneHotMatrix([1,2,3], 5), 5, 2)
-gpu_gradtest("Embedding OneHotMatrix repeated indices", embedding, OneHotMatrix([1,2,2], 5), 5, 2)
+gpu_gradtest("Embedding", embedding, [1,3,5], 5, 2, test_grad_x=false)
+gpu_gradtest("Embedding repeated indices", embedding, [1,3,5,3], 5, 2, test_grad_x=false)
+gpu_gradtest("Embedding integer index", embedding, 1, 5, 2, test_grad_x=false)
+gpu_gradtest("Embedding 2d index", embedding, [1 2; 3 4], 5, 2, test_grad_x=false)
+gpu_gradtest("Embedding OneHotVec index", embedding, OneHotVector(1, 5), 5, 2, test_grad_x=false)
+gpu_gradtest("Embedding OneHotMatrix index", embedding,  OneHotMatrix([1,2,3], 5), 5, 2, test_grad_x=false)
+gpu_gradtest("Embedding OneHotMatrix repeated indices", embedding, OneHotMatrix([1,2,2], 5), 5, 2, test_grad_x=false)
 
 @testset "function layers" begin
   x = rand(Float32, 3, 3)
@@ -310,5 +313,5 @@ end
   @test Array(α_gpu) ≈ α_cpu atol=1e-4
 
   test_gradients(mha_cpu, x_cpu, loss = o -> sum(o[1].^2) + sum(o[2].^2), 
-    test_gpu = true, compare_finite_diff = false)
+    test_gpu=true, compare_finite_diff=false)
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index ff6660be14..f44c4b7758 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -11,9 +11,9 @@ using Functors: fmapstructure_with_path
 
 ## Uncomment below to change the default test settings
 # ENV["FLUX_TEST_AMDGPU"] = "true"
-# ENV["FLUX_TEST_CUDA"] = "true"
+ENV["FLUX_TEST_CUDA"] = "true"
 # ENV["FLUX_TEST_METAL"] = "true"
-# ENV["FLUX_TEST_CPU"] = "false"
+ENV["FLUX_TEST_CPU"] = "false"
 # ENV["FLUX_TEST_DISTRIBUTED_MPI"] = "true"
 # ENV["FLUX_TEST_DISTRIBUTED_NCCL"] = "true"
 ENV["FLUX_TEST_ENZYME"] = "false" # We temporarily disable Enzyme tests since they are failing
diff --git a/test/test_utils.jl b/test/test_utils.jl
index c18eae84b9..f9a6b6655f 100644
--- a/test/test_utils.jl
+++ b/test/test_utils.jl
@@ -33,12 +33,13 @@ end
 
 function test_gradients(
             f, 
-            xs::Array...;
+            xs...;
             rtol=1e-4, atol=1e-4,
             test_gpu = false,
             test_grad_f = true,
+            test_grad_x = true,
             compare_finite_diff = true,
-            loss = sum
+            loss = mean,
             )
 
     if !test_gpu && !compare_finite_diff
@@ -46,29 +47,31 @@ function test_gradients(
                or CPU AD vs GPU AD.")
     end
 
-    # Zygote gradient with respect to input.
-    y, g = Zygote.withgradient((xs...) -> loss(f(xs...)), xs...)
-    
-    if compare_finite_diff
-        # Cast to Float64 to avoid precision issues.
-        f64 = f |> Flux.f64
-        xs64 = xs .|> Flux.f64
-        y_fd, g_fd = finitediff_withgradient((xs...) -> loss(f64(xs...)), xs64...)
-        @test y ≈ y_fd rtol=rtol atol=atol
-        check_equal_leaves(g, g_fd; rtol, atol)
-    end
+    if test_grad_x
+        # Zygote gradient with respect to input.
+        y, g = Zygote.withgradient((xs...) -> loss(f(xs...)), xs...)
+        
+        if compare_finite_diff
+            # Cast to Float64 to avoid precision issues.
+            f64 = f |> Flux.f64
+            xs64 = xs .|> Flux.f64
+            y_fd, g_fd = finitediff_withgradient((xs...) -> loss(f64(xs...)), xs64...)
+            @test y ≈ y_fd rtol=rtol atol=atol
+            check_equal_leaves(g, g_fd; rtol, atol)
+        end
 
-    if test_gpu
-        gpu_dev = gpu_device(force=true)
-        cpu_dev = cpu_device()
-        xs_gpu = xs |> gpu_dev
-        f_gpu = f |> gpu_dev
+        if test_gpu
+            gpu_dev = gpu_device(force=true)
+            cpu_dev = cpu_device()
+            xs_gpu = xs |> gpu_dev
+            f_gpu = f |> gpu_dev
 
-        # Zygote gradient with respect to input on GPU.
-        y_gpu, g_gpu = Zygote.withgradient((xs...) -> loss(f_gpu(xs...)), xs_gpu...)
-        @test get_device(g_gpu) == get_device(xs_gpu)
-        @test y_gpu ≈ y rtol=rtol atol=atol
-        check_equal_leaves(g_gpu |> cpu_dev, g; rtol, atol)
+            # Zygote gradient with respect to input on GPU.
+            y_gpu, g_gpu = Zygote.withgradient((xs...) -> loss(f_gpu(xs...)), xs_gpu...)
+            @test get_device(g_gpu) == get_device(xs_gpu)
+            @test y_gpu ≈ y rtol=rtol atol=atol
+            check_equal_leaves(g_gpu |> cpu_dev, g; rtol, atol)
+        end
     end
 
     if test_grad_f
@@ -78,14 +81,21 @@ function test_gradients(
         if compare_finite_diff
             # Use finite differences gradient as a reference.
             # y_fd, g_fd = finitediff_withgradient(f -> loss(f(x)), f)
+            # Cast to Float64 to avoid precision issues.
+            f64 = f |> Flux.f64
             ps, re = Flux.destructure(f64)
-            y_fd, g_fd = finitediff_withgradient(ps -> loss(re(ps)(xs64...)), ps)
+            y_fd, g_fd = finitediff_withgradient(ps -> loss(re(ps)(xs...)), ps)
             g_fd = (re(g_fd[1]),)
             @test y ≈ y_fd rtol=rtol atol=atol
             check_equal_leaves(g, g_fd; rtol, atol)
         end
 
         if test_gpu
+            gpu_dev = gpu_device(force=true)
+            cpu_dev = cpu_device()
+            xs_gpu = xs |> gpu_dev
+            f_gpu = f |> gpu_dev
+
             # Zygote gradient with respect to f on GPU.
             y_gpu, g_gpu = Zygote.withgradient(f -> loss(f(xs_gpu...)), f_gpu)
             # @test get_device(g_gpu) == get_device(xs_gpu)