simplify test machinery #2498

Merged: 12 commits, Oct 13, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -10,3 +10,4 @@ Manifest.toml
LocalPreferences.toml
.DS_Store
docs/mymodel.bson
prova.jl
2 changes: 1 addition & 1 deletion src/distributed/public_api.jl
@@ -132,7 +132,7 @@ Backend Agnostic API to perform an allreduce operation on the given buffer `sendrecvbuf` across
workers.
"""
function allreduce!(backend::AbstractFluxDistributedBackend, sendrecvbuf, op::F) where {F}
return __allreduce!(backend, sendrecvbuf, op, get_device())
return __allreduce!(backend, sendrecvbuf, op, gpu_device())
end

function allreduce!(
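The distributed API now resolves the accelerator through `gpu_device()` (the MLDataDevices-style device API that Flux re-exports) instead of the removed `Flux.get_device()`. A minimal sketch of how callers obtain and use a device under the new API; the variable names are illustrative:

```julia
# Minimal sketch (not part of the diff): device selection with the new API.
# `gpu_device()` returns a functional GPU device if a backend such as CUDA or
# AMDGPU is loaded, and falls back to the CPU device otherwise.
using Flux

dev  = gpu_device()            # e.g. CUDADevice, AMDGPUDevice, or CPUDevice
cdev = cpu_device()

x = rand(Float32, 4) |> dev    # move data to the selected device
x = x |> cdev                  # and back to the CPU
```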
2 changes: 1 addition & 1 deletion test/Project.toml
@@ -21,4 +21,4 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
[compat]
FiniteDifferences = "0.12"
Tracker = "0.2.33"
Enzyme = "0.12.4"
Enzyme = "0.13"
41 changes: 21 additions & 20 deletions test/ext_amdgpu/basic.jl
@@ -19,26 +19,27 @@ end
end

@testset "Chain of Dense layers" begin
m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) |> f32
m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax)
x = rand(Float32, 10, 10)
gpu_autodiff_test(m, x)
test_gradients(m, x, test_gpu=true, compare_finite_diff=false)
end

@testset "Convolution" begin
for conv_type in (Conv, ConvTranspose), nd in 1:3
m = conv_type(tuple(fill(2, nd)...), 3 => 4) |> f32
m = conv_type(tuple(fill(2, nd)...), 3 => 4)
x = rand(Float32, fill(10, nd)..., 3, 5)

md, xd = Flux.gpu.((m, x))
y = m(x)
# Ensure outputs are the same.
gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false)
@test collect(md(xd)) ≈ y atol=1f-3

# Gradients are flipped as well.
md, xd = Flux.gpu.((m, x))
gs = gradient(m -> sum(m(x)), m)
gsd = gradient(m -> sum(m(xd)), md)
gs = gradient(m -> sum(m(x)), m)[1]
gsd = gradient(m -> sum(m(xd)), md)[1]

dims = ntuple(i -> i, ndims(m.weight) - 2)
@test reverse(gs[1].weight; dims) ≈ Array(gsd[1].weight) atol=1f-2
@test reverse(gs.weight; dims) ≈ Array(gsd.weight) atol=1f-2

# Movement back to CPU flips weights back.
mh = Flux.cpu(md)
@@ -52,10 +53,10 @@ end
x = rand(Float32, fill(10, nd)..., 3, 5) |> gpu

pad = ntuple(i -> i, nd)
m = conv_type(kernel, 3 => 4, pad=pad) |> f32 |> gpu
m = conv_type(kernel, 3 => 4, pad=pad) |> gpu

expanded_pad = ntuple(i -> pad[(i - 1) ÷ 2 + 1], 2 * nd)
m_expanded = conv_type(kernel, 3 => 4, pad=expanded_pad) |> f32 |> gpu
m_expanded = conv_type(kernel, 3 => 4, pad=expanded_pad) |> gpu

@test size(m(x)) == size(m_expanded(x))
end
@@ -74,25 +75,25 @@ end
end

@testset "Chain(Conv)" begin
m = Chain(Conv((3, 3), 3 => 3)) |> f32
x = rand(Float32, 10, 10, 3, 2)
gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false)
m = Chain(Conv((3, 3), 3 => 3))
x = rand(Float32, 5, 5, 3, 2)
test_gradients(m, x, test_gpu=true, compare_finite_diff=false, test_grad_f=false)

md = m |> gpu |> cpu
@test md[1].weight ≈ m[1].weight atol=1f-3

m = Chain(ConvTranspose((3, 3), 3 => 3)) |> f32
x = rand(Float32, 10, 10, 3, 2)
gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false)
m = Chain(ConvTranspose((3, 3), 3 => 3))
x = rand(Float32, 5, 5, 3, 2)
test_gradients(m, x, test_gpu=true, compare_finite_diff=false, test_grad_f=false)

md = m |> gpu |> cpu
@test md[1].weight ≈ m[1].weight atol=1f-3
end

@testset "Cross-correlation" begin
m = CrossCor((2, 2), 3 => 4) |> f32
x = rand(Float32, 10, 10, 3, 2)
gpu_autodiff_test(m, x; atol=1f-3)
m = CrossCor((2, 2), 3 => 4)
x = rand(Float32, 5, 5, 3, 2)
test_gradients(m, x, test_gpu=true, compare_finite_diff=false)
end

@testset "Restructure" begin
Expand Down Expand Up @@ -132,7 +133,7 @@ end
bn = BatchNorm(3, σ)
for nd in 1:3
x = rand(Float32, fill(2, nd - 1)..., 3, 4)
gpu_autodiff_test(bn, x; atol=1f-3, allow_nothing=true)
test_gradients(bn, x; test_gpu=true, compare_finite_diff=false)
end
end

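Throughout this file, the old `gpu_autodiff_test` calls become `test_gradients`, the helper defined in the repository's test utilities. A hedged sketch of how it is called; the keyword meanings below are inferred from the calls in this diff, not from the helper's definition:

```julia
# Illustrative only: `test_gradients` lives in Flux's test utilities
# (test/test_utils.jl in this repository), so this assumes those files
# have been included.
using Flux

m = Chain(Dense(10 => 5, tanh), Dense(5 => 2), softmax)
x = rand(Float32, 10, 10)

test_gradients(m, x;
    test_gpu = true,              # also run on the GPU and compare against CPU gradients
    compare_finite_diff = false,  # skip the finite-difference cross-check
    test_grad_f = true,           # check gradients w.r.t. the model's parameters
    test_grad_x = true,           # check gradients w.r.t. the input
    atol = 1e-4, rtol = 1e-4)     # tolerances used for the comparisons
```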
4 changes: 2 additions & 2 deletions test/ext_amdgpu/get_devices.jl
@@ -17,9 +17,9 @@ x = randn(Float32, 5, 5)
cx = x |> amdgpu_device
@test cx isa AMDGPU.ROCArray

# moving models to specific NVIDIA devices
# moving models to specific AMDGPU devices
for id in 0:(length(AMDGPU.devices()) - 1)
current_amdgpu_device = Flux.get_device("AMDGPU", id)
current_amdgpu_device = gpu_device(id+1)

global dense_model = dense_model |> current_amdgpu_device
@test dense_model.weight isa AMDGPU.ROCArray
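The device ids reported by `AMDGPU.devices()` are 0-based, while `gpu_device` takes a 1-based index, hence the `gpu_device(id + 1)` call above. A short sketch of the pattern, assuming a machine with a functional AMDGPU backend:

```julia
# Sketch, assuming AMDGPU.jl is loaded and functional on the host.
using Flux, AMDGPU

for id in 0:(length(AMDGPU.devices()) - 1)
    dev   = gpu_device(id + 1)        # gpu_device uses 1-based device indices
    model = Dense(2 => 3) |> dev
    @assert model.weight isa AMDGPU.ROCArray
end
```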
3 changes: 0 additions & 3 deletions test/ext_amdgpu/runtests.jl
@@ -2,9 +2,6 @@
@assert AMDGPU.functional()
AMDGPU.allowscalar(false)

include("../test_utils.jl")
include("test_utils.jl")

@testset "get_devices" begin
include("get_devices.jl")
end
15 changes: 0 additions & 15 deletions test/ext_amdgpu/test_utils.jl

This file was deleted.

9 changes: 6 additions & 3 deletions test/ext_cuda/get_devices.jl
@@ -8,9 +8,6 @@ dense_model = Dense(2 => 3) # initially lives on CPU
weight = copy(dense_model.weight) # store the weight
bias = copy(dense_model.bias) # store the bias

cuda_device = Flux.get_device()

@test typeof(cuda_device) <: Flux.CUDADevice

# correctness of data transfer
x = randn(5, 5)
@@ -30,6 +27,12 @@ for id in 0:(length(CUDA.devices()) - 1)
@test isequal(Flux.cpu(dense_model.weight), weight)
@test isequal(Flux.cpu(dense_model.bias), bias)
end

# gpu_device remembers the last device selected
# Therefore, we need to reset it to the current cuda device
@test gpu_device().device.handle == length(CUDA.devices()) - 1
gpu_device(CUDA.device().handle + 1)

# finally move to CPU, and see if things work
cdev = cpu_device()
dense_model = cdev(dense_model)
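The added lines document that `gpu_device()` caches the last device it was asked for, so after iterating over all CUDA devices the test resets the cache to the currently active one. A sketch of that behaviour, as described by the test's own comment; it assumes at least one functional CUDA device:

```julia
# Sketch of the caching behaviour described by the comment in the test above.
# Assumes at least one functional CUDA device. Device indices are 1-based,
# CUDA handles are 0-based.
using Flux, CUDA

gpu_device(length(CUDA.devices()))        # select the last device; the selection is cached
@assert gpu_device().device.handle == length(CUDA.devices()) - 1

gpu_device(CUDA.device().handle + 1)      # reset the cache to the currently active device
```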
126 changes: 32 additions & 94 deletions test/ext_cuda/layers.jl
@@ -10,73 +10,23 @@
@test gradient(x -> sum(cpu(x)), gpu(rand(3,3))) isa Tuple
end

# TODO: These layers get into scalar indexing issues.
const BROKEN_LAYERS = Union{}

const ACTIVATIONS = [identity, relu, tanh,
sigmoid, exp, softplus,
elu, selu]
const ACTIVATIONS = [identity, tanh]

function gpu_gradtest(name::String, layers::Vector, x_cpu = nothing, args...; test_cpu = true, test_mode = false)
isnothing(x_cpu) && error("Missing input to test the layers against.")
function gpu_gradtest(name::String, layers::Vector, x_cpu, args...;
test_mode=false, test_grad_x=true,
atol=1e-4, rtol=1e-4)
@testset "$name GPU grad tests" begin
for layer in layers
@testset "$layer Layer GPU grad test" begin

# compute output and grad of parameters
l_cpu = layer(args...)
l_gpu = l_cpu |> gpu
if test_mode
testmode!(l_cpu)
testmode!(l_gpu)
end

ps_cpu = Flux.params(l_cpu)
y_cpu, back_cpu = pullback(() -> sum(l_cpu(x_cpu)), ps_cpu)
gs_cpu = back_cpu(1f0)

x_gpu = gpu(x_cpu)
ps_gpu = Flux.params(l_gpu)

if typeof(l_gpu) <: BROKEN_LAYERS
@test_broken gradient(() -> sum(l_gpu(x_gpu)), ps_gpu) isa Flux.Zygote.Grads
else
y_gpu, back_gpu = pullback(() -> sum(l_gpu(x_gpu)), ps_gpu)
gs_gpu = back_gpu(1f0) # TODO many layers error out when backprop int 1, should fix

# compute grad of input
xg_cpu = gradient(x -> sum(l_cpu(x)), x_cpu)[1]
xg_gpu = gradient(x -> sum(l_gpu(x)), x_gpu)[1]

# test
if test_cpu
if layer === GroupedConvTranspose
@test y_gpu ≈ y_cpu rtol=1f-2 atol=1f-3
else
@test y_gpu ≈ y_cpu rtol=1f-3 atol=1f-3
end
if isnothing(xg_cpu)
@test isnothing(xg_gpu)
else
if layer === GroupedConvTranspose
@test Array(xg_gpu) ≈ xg_cpu rtol = 2f-2 atol = 1f-3
else
@test Array(xg_gpu) ≈ xg_cpu rtol = 1f-3 atol = 1f-3
end
end
end
@test gs_gpu isa Flux.Zygote.Grads
for (p_cpu, p_gpu) in zip(ps_cpu, ps_gpu)
if isnothing(gs_cpu[p_cpu])
@test isnothing(gs_gpu[p_gpu])
else
@test gs_gpu[p_gpu] isa CuArray
if test_cpu
@test Array(gs_gpu[p_gpu]) ≈ gs_cpu[p_cpu] rtol=1f-3 atol=1f-3
end
end
end
end
test_gradients(l_cpu, x_cpu; test_gpu=true, compare_finite_diff=false, test_grad_x, atol, rtol)
end
end
end
@@ -97,23 +47,24 @@ for act in ACTIVATIONS
ConvTranspose, ConvTransposeNoBias,
CrossCor, CrossCorNoBias,
DepthwiseConv, DepthwiseConvNoBias]
gpu_gradtest("Convolution with $act", conv_layers, r, (2,2), 1=>3, act, test_cpu = false)
gpu_gradtest("Convolution with $act", conv_layers, r, (2,2), 1=>3, act)

groupedconv = [GroupedConv, GroupedConvTranspose]
gpu_gradtest("GroupedConvolution with $act", groupedconv, rand(Float32, 28, 28, 100, 2), (3,3), 100 => 25, act, test_cpu = true)
gpu_gradtest("GroupedConvolution with $act", groupedconv, rand(Float32, 28, 28, 100, 2), (3,3), 100 => 25, act)

batch_norm = [BatchNorm, BatchNormNoTrackStats]
gpu_gradtest("BatchNorm 1 with $act", batch_norm, rand(Float32, 28,28,3,4), 3, act, test_cpu = false) #TODO fix errors
gpu_gradtest("BatchNorm 2 with $act", batch_norm, rand(Float32, 5,4), 5, act, test_cpu = true)
gpu_gradtest("BatchNorm 1 with $act", batch_norm, rand(Float32, 28,28,3,4), 3, act, atol=1e-3)
gpu_gradtest("BatchNorm 2 with $act", batch_norm, rand(Float32, 5,4), 5, act, atol=1e-3)

batch_norm = [BatchNormNoTrackStats]
gpu_gradtest("BatchNorm 3 with $act (test mode)", batch_norm, rand(Float32, 5,4), 5, act, test_cpu = true, test_mode = true)
gpu_gradtest("BatchNorm 3 with $act (test mode)", batch_norm, rand(Float32, 5,4), 5, act,
test_mode=true, atol=1e-3)

instancenorm = [InstanceNorm]
gpu_gradtest("InstanceNorm with $act", instancenorm, r, 1, act, test_cpu = false)
gpu_gradtest("InstanceNorm with $act", instancenorm, r, 1, act)

groupnorm = [GroupNorm]
gpu_gradtest("GroupNorm with $act", groupnorm, rand(Float32, 28,28,3,1), 3, 1, act, test_cpu = false)
gpu_gradtest("GroupNorm with $act", groupnorm, rand(Float32, 28,28,3,1), 3, 1, act)
end

r = rand(Float32, 28, 28, 1, 1)
@@ -122,13 +73,13 @@ pooling_layers = [MaxPool, MeanPool]
gpu_gradtest("Pooling", pooling_layers, r, (2,2))

adaptive_pooling_layers = [AdaptiveMaxPool, AdaptiveMeanPool]
gpu_gradtest("AdaptivePooling", adaptive_pooling_layers, r, (7,7), test_cpu = false)
gpu_gradtest("AdaptivePooling", adaptive_pooling_layers, r, (7,7))

dropout_layers = [Dropout, AlphaDropout]
gpu_gradtest("Dropout", dropout_layers, r, 0.5f0; test_cpu = false) # dropout is not deterministic
gpu_gradtest("Dropout", dropout_layers, r, 1e-6) # dropout is not deterministic

layer_norm = [LayerNorm]
gpu_gradtest("LayerNorm 1", layer_norm, rand(Float32, 28,28,3,4), 28, test_cpu = false) #TODO fix errors
gpu_gradtest("LayerNorm 1", layer_norm, rand(Float32, 28,28,3,4), 28)
gpu_gradtest("LayerNorm 2", layer_norm, rand(Float32, 5,4), 5)

upsample = [x -> Upsample(scale=x)]
@@ -140,32 +91,27 @@ gpu_gradtest("PixelShuffle 2d", pixelshuffle, rand(Float32, 3, 4, 18, 3), 3)
gpu_gradtest("PixelShuffle 1d", pixelshuffle, rand(Float32, 3, 18, 3), 3)

embedding = [Flux.Embedding]
gpu_gradtest("Embedding", embedding, [1,3,5], 5, 2)
gpu_gradtest("Embedding repeated indices", embedding, [1,3,5,3], 5, 2)
gpu_gradtest("Embedding integer index", embedding, 1, 5, 2)
gpu_gradtest("Embedding 2d index", embedding, [1 2; 3 4], 5, 2)
gpu_gradtest("Embedding OneHotVec index", embedding, OneHotVector(1, 5), 5, 2)
gpu_gradtest("Embedding OneHotMatrix index", embedding, OneHotMatrix([1,2,3], 5), 5, 2)
gpu_gradtest("Embedding OneHotMatrix repeated indices", embedding, OneHotMatrix([1,2,2], 5), 5, 2)
gpu_gradtest("Embedding", embedding, [1,3,5], 5, 2, test_grad_x=false)
gpu_gradtest("Embedding repeated indices", embedding, [1,3,5,3], 5, 2, test_grad_x=false)
gpu_gradtest("Embedding integer index", embedding, 1, 5, 2, test_grad_x=false)
gpu_gradtest("Embedding 2d index", embedding, [1 2; 3 4], 5, 2, test_grad_x=false)
gpu_gradtest("Embedding OneHotVec index", embedding, OneHotVector(1, 5), 5, 2, test_grad_x=false)
gpu_gradtest("Embedding OneHotMatrix index", embedding, OneHotMatrix([1,2,3], 5), 5, 2, test_grad_x=false)
gpu_gradtest("Embedding OneHotMatrix repeated indices", embedding, OneHotMatrix([1,2,2], 5), 5, 2, test_grad_x=false)

@testset "function layers" begin
x = rand(Float32, 3,3)
gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=1)), x)
gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=2)), x)
gpu_autodiff_test(x -> sum(Flux.normalise(x)), x)
x = rand(Float32, 3, 3)
test_gradients(x -> sum(Flux.normalise(x; dims=1)), x, test_gpu=true, compare_finite_diff=false)
test_gradients(x -> sum(Flux.normalise(x; dims=2)), x, test_gpu=true, compare_finite_diff=false)
test_gradients(x -> sum(Flux.normalise(x)), x, test_gpu=true, compare_finite_diff=false)
end

@testset "Zeros mapped for $cl" for cl in (Conv, ConvTranspose, CrossCor, DepthwiseConv)
l = cl((2,2), 1=>3, bias = false) |> gpu
ip = zeros(Float32, 28,28,1,1) |> gpu
if typeof(l) <: BROKEN_LAYERS
@test_broken sum(l(ip)) ≈ 0.f0
@test_broken gradient(() -> sum(l(ip)), Flux.params(l)) isa Flux.Zygote.Grads
else
@test sum(l(ip)) ≈ 0.f0
gs = gradient(() -> sum(l(ip)), Flux.params(l))
@test l.bias ∉ gs.params
end
@test sum(l(ip)) ≈ 0.f0
gs = gradient(() -> sum(l(ip)), Flux.params(l))
@test l.bias ∉ gs.params
end

@testset "Dense without bias" begin
@@ -366,14 +312,6 @@ end
@test Array(y_gpu) ≈ y_cpu atol=1e-4
@test Array(α_gpu) ≈ α_cpu atol=1e-4

gm_cpu, gx_cpu = gradient(mha_cpu, x_cpu) do mha, x
y, α = mha(x)
return sum(y.^2) + sum(α.^2)
end
gm_gpu, gx_gpu = gradient(mha_gpu, x_gpu) do mha, x
y, α = mha(x)
return sum(y.^2) + sum(α.^2)
end
check_grad(gm_gpu, gm_cpu)
check_grad(gx_gpu, gx_cpu)
test_gradients(mha_cpu, x_cpu, loss = o -> sum(o[1].^2) + sum(o[2].^2),
test_gpu=true, compare_finite_diff=false)
end
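For layers whose forward pass returns a tuple, such as `MultiHeadAttention` here, the manual `pullback`-and-compare block is replaced by one `test_gradients` call with a custom `loss` that reduces the tuple to a scalar. A sketch of the pattern; the layer sizes are made up for illustration, and the helper is assumed to be in scope from the test utilities:

```julia
# Sketch: `loss` maps the (output, attention scores) tuple to a scalar before
# the CPU and GPU gradients are compared. Sizes here are illustrative.
using Flux

mha = MultiHeadAttention(8; nheads = 2)
x   = rand(Float32, 8, 10, 2)             # (embed_dim, seq_len, batch)

test_gradients(mha, x;
    loss = o -> sum(o[1] .^ 2) + sum(o[2] .^ 2),
    test_gpu = true, compare_finite_diff = false)
```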
7 changes: 4 additions & 3 deletions test/ext_cuda/losses.jl
@@ -27,11 +27,12 @@ y = [1 0 0 0 1
@test focal_loss(x, y) ≈ focal_loss(gpu(x), gpu(y))

@testset "GPU: $loss" for loss in ALL_LOSSES
x = rand(Float32, 3,4)
y = rand(Float32, 3,4)
# let's stay far from the boundaries to avoid problems with finite differences gradients
x = 0.1f0 .+ 0.8f0 .* rand(Float32, 3, 4)
y = 0.1f0 .+ 0.8f0 .* rand(Float32, 3, 4)
@test loss(x, y) ≈ loss(gpu(x), gpu(y))

gpu_autodiff_test(loss, x, y)
test_gradients(loss, x, y, test_gpu=true, test_grad_f=false, compare_finite_diff=false)

# Float16 tests
@test loss(f16(x), f16(y)) ≈ loss(gpu(f16(x)), gpu(f16(y)))
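The loss inputs are now drawn from (0.1, 0.9) rather than (0, 1), keeping them away from domain boundaries where losses such as crossentropy involve `log(0)`. The rescaling is just an affine map of uniform samples:

```julia
# Rescale uniform samples from [0, 1) into [0.1, 0.9) so that boundary-sensitive
# losses stay finite and numerically stable.
x = 0.1f0 .+ 0.8f0 .* rand(Float32, 3, 4)
y = 0.1f0 .+ 0.8f0 .* rand(Float32, 3, 4)
@assert all(0.1f0 .<= x .< 0.9f0) && all(0.1f0 .<= y .< 0.9f0)
```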
4 changes: 0 additions & 4 deletions test/ext_cuda/runtests.jl
@@ -7,13 +7,9 @@ using Random, LinearAlgebra, Statistics
@assert CUDA.functional()
CUDA.allowscalar(false)

# include("../test_utils.jl")
include("test_utils.jl")

@testset "get_devices" begin
include("get_devices.jl")
end

@testset "cuda" begin
include("cuda.jl")
end