From 72cda4aecb2bcb019b14a8c98163165347753cff Mon Sep 17 00:00:00 2001
From: Brian Chen
Date: Wed, 22 Jun 2022 22:14:17 -0700
Subject: [PATCH 1/3] Improve type stability of LayerNorm and Dropout

---
 src/Flux.jl                  |  4 ++-
 src/layers/normalise.jl      | 58 +++++++++++++++++++++++++-----------
 src/layers/stateless.jl      | 38 +++++++++++++++++++----
 test/layers/normalisation.jl | 13 +++++++-
 4 files changed, 88 insertions(+), 25 deletions(-)

diff --git a/src/Flux.jl b/src/Flux.jl
index 0cacbd419a..f9599569c9 100644
--- a/src/Flux.jl
+++ b/src/Flux.jl
@@ -9,7 +9,9 @@ using MacroTools: @forward
 using MLUtils
 import Optimisers: Optimisers, trainable, destructure # before v0.13, Flux owned these functions
 
-using Zygote, ChainRulesCore
+using ChainRulesCore
+
+using Zygote
 using Zygote: Params, @adjoint, gradient, pullback, @nograd
 export gradient
 
diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index f1f6c22033..94ee42b187 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -2,7 +2,9 @@ istraining() = false
 
 ChainRulesCore.rrule(::typeof(istraining)) = true, _ -> (NoTangent(),)
 
-_isactive(m) = isnothing(m.active) ? istraining() : m.active
+_isactive(m) = isnothing(m.active) ? istraining() : Bool(m.active)
+
+ChainRulesCore.@non_differentiable _isactive(::Any)
 
 _dropout_shape(s, ::Colon) = size(s)
 _dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(size(s)))...)
@@ -29,18 +31,43 @@ automatically managed using the [`Dropout`](@ref) layer instead of the
 
 The [`Dropout`](@ref) layer is what you should use in most scenarios.
 """
-function dropout(rng, x, p; dims=:, active::Bool=true)
-  active || return x
-  y = dropout_mask(rng, x, p, dims=dims)
-  return x .* y
-end
+dropout(rng, x, p; dims=:, active::Bool=true) = _dropout(rng, x, p, dims, active)
 dropout(x, p; kwargs...) = dropout(rng_from_array(x), x, p; kwargs...)
 
-dropout_mask(rng::CUDA.RNG, x::CuArray, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...)
-dropout_mask(rng, x::CuArray, p; kwargs...) =
-  throw(ArgumentError("x isa CuArray, but rng isa $(typeof(rng)). dropout_mask only support CUDA.RNG for CuArrays."))
-dropout_mask(rng, x, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...)
-function _dropout_mask(rng, x, p; dims=:)
+# Internal function without kwargs to keep Zygote generated code type stable
+function _dropout(rng, x, p, dims, active)
+  mask = active ? dropout_mask(rng, x, p, dims) : nothing
+  return _apply_mask(x, mask)
+end
+
+function ChainRulesCore.rrule(::typeof(_dropout), rng, x, p, dims, active)
+  mask = active ? dropout_mask(rng, x, p, dims) : nothing
+  # Required because we don't always call dropout_mask
+  MT = Core.Compiler.return_type(dropout_mask, Tuple{typeof(rng),typeof(x),typeof(p),typeof(dims)})
+  project_x = ProjectTo(x)
+  return _apply_mask(x, mask), DropoutPullback{MT,typeof(project_x)}(mask, project_x)
+end
+
+# Also needed for type stability. Otherwise inference lifts the Union into a
+# Union{pullback{Nothing}, pullback{AbstractArray}}
+struct DropoutPullback{M<:AbstractArray,P<:ProjectTo{AbstractArray}}
+  mask::Union{Nothing,M}
+  project::P
+end
+
+function (pb::DropoutPullback)(dy)
+  dx = pb.project(_apply_mask(dy, pb.mask))
+  return (NoTangent(), NoTangent(), dx, NoTangent())
+end
+
+_apply_mask(x, ::Nothing) = x
+_apply_mask(x, mask) = x .* mask
+
+dropout_mask(rng::CUDA.RNG, x::CuArray, p, dims) = _dropout_mask(rng, x, p, dims)
+dropout_mask(rng, x::CuArray, p, dims) =
+  throw(ArgumentError("x isa CuArray, but rng isa $(typeof(rng)). dropout_mask only supports CUDA.RNG for CuArrays."))
+dropout_mask(rng, x, p, dims) = _dropout_mask(rng, x, p, dims)
+function _dropout_mask(rng, x, p, dims)
   realfptype = float(real(eltype(x)))
   y = rand!(rng, similar(x, realfptype, _dropout_shape(x, dims)))
   y .= _dropout_kernel.(y, p, 1 - p)
@@ -48,7 +75,7 @@ function _dropout_mask(rng, x, p; dims=:)
 end
 
 # TODO move this to NNlib
-ChainRulesCore.@non_differentiable dropout_mask(::Any, ::Any, ::Any)
+ChainRulesCore.@non_differentiable dropout_mask(::Any, ::Any, ::Any, ::Any)
 
 """
     Dropout(p; dims=:, rng = rng_from_array())
@@ -106,10 +133,7 @@ end
 @functor Dropout
 trainable(a::Dropout) = (;)
 
-function (a::Dropout)(x)
-  _isactive(a) || return x
-  return dropout(a.rng, x, a.p; dims=a.dims, active=true)
-end
+(a::Dropout)(x) = _dropout(a.rng, x, a.p, a.dims, _isactive(a))
 
 testmode!(m::Dropout, mode=true) =
   (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)
@@ -226,7 +250,7 @@ LayerNorm(size_act...; kw...) = LayerNorm(Int.(size_act[1:end-1]), size_act[end]
 
 @functor LayerNorm
 
-(a::LayerNorm)(x) = a.diag(normalise(x, dims=1:length(a.size), ϵ=a.ϵ))
+(a::LayerNorm)(x) = a.diag(_normalize(x, 1:length(a.size), a.ϵ))
 
 function Base.show(io::IO, l::LayerNorm)
   print(io, "LayerNorm(", join(l.size, ", "))
diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl
index 34e365ae9d..d7f8758394 100644
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@@ -26,15 +26,41 @@ function flatten(x::AbstractArray)
   return reshape(x, :, size(x)[end])
 end
 
+# Utils for LayerNorm internals.
+# Most of these are required for better performance and type stability under AD.
+# In an ideal world, we'd just have normalise.
+
+function _mean_std(x::AbstractArray, dims)
+  μ = mean(x, dims=dims)
+  σ = std(x, dims=dims, mean=μ, corrected=false)
+  return μ, σ
+end
+
+function ChainRulesCore.rrule(::typeof(_mean_std), x::AbstractArray, dims)
+  μ, mean_pullback = ChainRulesCore.rrule(mean, x, dims=dims)
+  σ, std_pullback = ChainRulesCore.rrule(std, x, dims=dims, mean=μ, corrected=false)
+  function _mean_std_pullback((dμ, dσ))
+    dx = ChainRulesCore.add!!(std_pullback(dσ)[2], mean_pullback(dμ)[2])
+    return (NoTangent(), dx, NoTangent())
+  end
+
+  return (μ, σ), _mean_std_pullback
+end
+
+_zscore(x, μ, σ, ϵ) = (x - μ) / (σ + ϵ)
+
+# We don't define a rrule for the whole function because we want
+# AD to figure out the _zscore broadcast for us.
+function _normalize(x::AbstractArray, dims, ϵ)
+  μ, σ = _mean_std(x, dims)
+  return _zscore.(x, μ, σ, ϵ)
+end
+
 """
     normalise(x; dims=ndims(x), ϵ=1e-5)
 
 Normalise `x` to mean 0 and standard deviation 1 across the dimension(s) given by `dims`.
-Per default, `dims` is the last dimension. 
+Per default, `dims` is the last dimension. `ϵ` is a small additive factor added to the denominator for numerical stability.
 """
-@inline function normalise(x::AbstractArray; dims=ndims(x), ϵ=ofeltype(x, 1e-5))
-  μ = mean(x, dims=dims)
-  σ = std(x, dims=dims, mean=μ, corrected=false)
-  return @. (x - μ) / (σ + ϵ)
-end
+@inline normalise(x::AbstractArray; dims=ndims(x), ϵ=ofeltype(x, 1e-5)) = _normalize(x, dims, ϵ)
diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl
index 7ae15aeff9..cda9bab972 100644
--- a/test/layers/normalisation.jl
+++ b/test/layers/normalisation.jl
@@ -73,6 +73,12 @@ evalwgrad(f, x...) = pullback(f, x...)[1]
       @test cpu(m).rng === only(values(rng_kwargs))
     end
   end
+  
+  for active in (true, false)
+    m = Dropout(0.5, :, active)
+    @inferred _, back = pullback(m, rand(10)) # _, DropoutPullback{Array{Float64}}
+    @inferred back(ones(10)) # Array{Float64}
+  end
 end
 
 @testset "AlphaDropout" begin
@@ -343,8 +349,13 @@
   @test LayerNorm(2)(x) ≈ Flux.normalise(x, dims=1)
   x = rand(2,3,4,5)
   @test LayerNorm(2)(x) ≈ Flux.normalise(x, dims=1)
+
   x = rand(2)
-  @test LayerNorm(2, tanh)(x) ≈ tanh.(Flux.normalise(x, dims=1))
+  m = LayerNorm(2, tanh)
+  @test m(x) ≈ tanh.(Flux.normalise(x, dims=1))
+  @inferred _, back = pullback(sum∘m, x)
+  @inferred back(1.0)
+
   x = rand(2,3,4,5)
   @test LayerNorm((2,3))(x) ≈ Flux.normalise(x, dims=(1,2))

From 689d16be3e9c7bf57ba29b544c834970cc576892 Mon Sep 17 00:00:00 2001
From: Brian Chen
Date: Fri, 24 Jun 2022 12:35:03 -0700
Subject: [PATCH 2/3] fix and add tests

---
 src/layers/normalise.jl      | 11 ++++++++---
 test/layers/normalisation.jl | 10 +++++-----
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 94ee42b187..b0d7152577 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -2,9 +2,14 @@ istraining() = false
 
 ChainRulesCore.rrule(::typeof(istraining)) = true, _ -> (NoTangent(),)
 
-_isactive(m) = isnothing(m.active) ? istraining() : Bool(m.active)
+_isactive(m) = Bool(something(m.active, istraining()))
 
-ChainRulesCore.@non_differentiable _isactive(::Any)
+# Avoids instabilities from differentiating through getproperty(m, :active)
+function ChainRulesCore.rrule(::typeof(_isactive), m)
+  training, _ = rrule(istraining)
+  _isactive_pullback(_) = (NoTangent(), NoTangent())
+  return Bool(something(m.active, training)), _isactive_pullback
+end
 
 _dropout_shape(s, ::Colon) = size(s)
 _dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(size(s)))...)
@@ -57,7 +62,7 @@ end
 
 function (pb::DropoutPullback)(dy)
   dx = pb.project(_apply_mask(dy, pb.mask))
-  return (NoTangent(), NoTangent(), dx, NoTangent())
+  return (NoTangent(), NoTangent(), dx, NoTangent(), NoTangent(), NoTangent())
 end
 
 _apply_mask(x, ::Nothing) = x
diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl
index cda9bab972..f21d9d0e44 100644
--- a/test/layers/normalisation.jl
+++ b/test/layers/normalisation.jl
@@ -73,10 +73,10 @@ evalwgrad(f, x...) = pullback(f, x...)[1]
       @test cpu(m).rng === only(values(rng_kwargs))
     end
   end
-  
+
   for active in (true, false)
     m = Dropout(0.5, :, active)
-    @inferred _, back = pullback(m, rand(10)) # _, DropoutPullback{Array{Float64}}
+    _, back = @inferred pullback(m, rand(10)) # _, DropoutPullback{Array{Float64}}
     @inferred back(ones(10)) # Array{Float64}
   end
 end
@@ -353,9 +353,9 @@ end
   x = rand(2)
   m = LayerNorm(2, tanh)
   @test m(x) ≈ tanh.(Flux.normalise(x, dims=1))
-  @inferred _, back = pullback(sum∘m, x)
-  @inferred back(1.0)
-
+  _, back = @inferred pullback(|>, x, m)
+  # TODO needs https://github.com/FluxML/Zygote.jl/pull/1248
+  # @inferred back(1.0)
   x = rand(2,3,4,5)
   @test LayerNorm((2,3))(x) ≈ Flux.normalise(x, dims=(1,2))

From 29ef2ff119bfbde3948f9d06a01353e3652a7138 Mon Sep 17 00:00:00 2001
From: Brian Chen
Date: Sun, 31 Jul 2022 22:06:36 -0700
Subject: [PATCH 3/3] revert to unfused broadcast

---
 src/layers/stateless.jl | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl
index d7f8758394..03c07e82b3 100644
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@@ -47,13 +47,11 @@ function ChainRulesCore.rrule(::typeof(_mean_std), x::AbstractArray, dims)
   return (μ, σ), _mean_std_pullback
 end
 
-_zscore(x, μ, σ, ϵ) = (x - μ) / (σ + ϵ)
-
 # We don't define a rrule for the whole function because we want
-# AD to figure out the _zscore broadcast for us.
+# AD to figure out the broadcast for us.
 function _normalize(x::AbstractArray, dims, ϵ)
   μ, σ = _mean_std(x, dims)
-  return _zscore.(x, μ, σ, ϵ)
+  return @. (x - μ) / (σ + ϵ)
 end
 
 """
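
A usage sketch, not part of the patch series above: the type-stability goal of these
commits can be exercised from user code with `Test.@inferred`, mirroring the new tests.
The `Dropout(0.5, :, active)` constructor and the `pullback(|>, x, m)` pattern come
straight from the tests; the variable names and Float64 inputs are illustrative only,
and the snippet assumes a Flux build with this series applied.

    using Flux, Zygote, Test

    # Dropout: pullback construction and the backward call should both infer
    # to concrete types, whether or not the layer is active (see DropoutPullback
    # in patch 1); @inferred throws if inference returns an abstract type.
    for active in (true, false)
      m = Dropout(0.5, :, active)
      x = rand(10)
      _, back = @inferred Zygote.pullback(m, x)
      @inferred back(ones(10))
    end

    # LayerNorm: the forward pass and pullback construction infer; inferring the
    # backward call itself still waits on the upstream Zygote change noted in the tests.
    ln = LayerNorm(2, tanh)
    x = rand(2)
    _, back = @inferred Zygote.pullback(|>, x, ln)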