From d0dd7b12a8846f2528928d571fff91ac610490d0 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Tue, 15 Aug 2023 01:04:34 +0200 Subject: [PATCH 01/11] Add KernelDensity as dependency --- Project.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Project.toml b/Project.toml index 4c442dd..20f2802 100644 --- a/Project.toml +++ b/Project.toml @@ -9,6 +9,7 @@ DataInterpolations = "82cc6244-b520-54b8-b5a6-8a565e85f1d0" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" IteratorInterfaceExtensions = "82899510-4779-5014-852e-03e436cf321d" +KernelDensity = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688" MCMCDiagnosticTools = "be115224-59cd-429b-ad48-344e309966f0" @@ -33,6 +34,7 @@ DocStringExtensions = "0.8, 0.9" FiniteDifferences = "0.12" GLM = "1" IteratorInterfaceExtensions = "0.1.1, 1" +KernelDensity = "0.6" LogExpFunctions = "0.2.0, 0.3" MCMCDiagnosticTools = "0.3.4" OffsetArrays = "1" From 0d6336f459a495d96b690b27d3c72a306500fdde Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Tue, 15 Aug 2023 01:07:11 +0200 Subject: [PATCH 02/11] Add kde and ckde --- src/PosteriorStats.jl | 5 ++ src/kde.jl | 117 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 src/kde.jl diff --git a/src/PosteriorStats.jl b/src/PosteriorStats.jl index f4bab41..ac1bd28 100644 --- a/src/PosteriorStats.jl +++ b/src/PosteriorStats.jl @@ -5,6 +5,7 @@ using DataInterpolations: DataInterpolations using Distributions: Distributions using DocStringExtensions: FIELDS, FUNCTIONNAME, TYPEDEF, TYPEDFIELDS, SIGNATURES using IteratorInterfaceExtensions: IteratorInterfaceExtensions +using KernelDensity: KernelDensity using LinearAlgebra: mul!, norm using LogExpFunctions: LogExpFunctions using Markdown: @doc_str @@ -20,6 +21,9 @@ using StatsBase: StatsBase using Tables: Tables using TableTraits: TableTraits +# Density +export bandwidth_silverman, ckde, kde + # PSIS export PSIS, PSISResult, psis, psis! @@ -43,6 +47,7 @@ const INFORMATION_CRITERION_SCALES = (deviance=-2, log=1, negative_log=-1) include("utils.jl") include("hdi.jl") +include("kde.jl") include("elpdresult.jl") include("loo.jl") include("waic.jl") diff --git a/src/kde.jl b/src/kde.jl new file mode 100644 index 0000000..9998a8a --- /dev/null +++ b/src/kde.jl @@ -0,0 +1,117 @@ +const STD_NORM_IQR = rationalize(1.34) + +""" + bandwidth_silverman(x; kwargs...) -> Real +""" +function bandwidth_silverman( + x::AbstractVector{<:Real}; alpha::Real=9//10, std::Real=Statistics.std(x) +) + n = length(x) + iqr = StatsBase.iqr(x) + quantile_width = iqr / STD_NORM_IQR + width = min(std, quantile_width) + T = typeof(one(width)) + return alpha * width * T(n)^(-1//5) +end + +function _padding_factor( + kernel::Distributions.ContinuousUnivariateDistribution, prob_tail::Real +) + return Int(cld(Distributions.cquantile(kernel, prob_tail / 2), Statistics.std(kernel))) +end + +function _kernel_with_bandwidth( + T::Type{<:Distributions.ContinuousUnivariateDistribution}, bw +) + d = T() + dcentered = (d - Statistics.median(d)) * bw / Statistics.std(d) + StatsBase.skewness(dcentered) ≈ 0 || throw(ArgumentError("Kernel must be symmetric.")) + return dcentered +end + +""" + kde(x; kwargs...) -> KernelDensity.UnivariateKDE + +Compute the univariate kernel density estimate of data `x`. + +# Arguments +- `x`: data array + +# Keyword arguments +- `bandwidth::Real`: bandwidth of the kernel. Defaults to [`bandwidth_silverman(x)`](@ref). +- `kernel::Type{<:Distributions.ContinuousUnivariateDistribution}`: type of kernel to build. + Defaults to `Normal`. +- `bound_correction::Bool`: whether to perform boundary correction. Defaults to `true`. +- `grid_length::Int`: number of grid points to use. Defaults to `512`. +""" +function kde( + x::AbstractVector; + bandwidth::Real=bandwidth_silverman(x), + kernel=Distributions.Normal, + bound_correction::Bool=true, + grid_length::Int=512, +) + x_min, x_max = extrema(x) + + grid_length = max(grid_length, 100) + grid_min = x_min + grid_max = x_max + bin_width = (grid_max - grid_min) / grid_length + + # work out how much padding to add to guarantee that extra density due to wraparound + # is negligible + prob_tail = 1e-3 + _kernel = _kernel_with_bandwidth(kernel, bandwidth) + npad_bw = _padding_factor(_kernel, prob_tail) + npad_bin = Int(cld(npad_bw * bandwidth, bin_width)) + + # add extra padding if performing boundary correction + if bound_correction + nbin_reflect = npad_bin + npad_bin += nbin_reflect + end + + # pad to avoid wraparound at the boundary + grid_min -= bin_width * npad_bin + grid_max += bin_width * npad_bin + grid_length += 2npad_bin + + # compute density + k = KernelDensity.kde( + x; npoints=grid_length, boundary=(grid_min, grid_max), bandwidth, kernel + ) + grid = k.x + pdf = k.density + + if bound_correction + # reflect density at the boundary + il = firstindex(pdf) + npad_bin + ir = lastindex(pdf) - npad_bin + pdf[il:(il + nbin_reflect)] .+= @view pdf[il:-1:(il - nbin_reflect)] + pdf[(ir - nbin_reflect):ir] .+= @view pdf[(ir + nbin_reflect):-1:ir] + end + + # remove padding + grid_unpad = grid[(begin + npad_bin):(end - npad_bin)] + pdf_unpad = pdf[(begin + npad_bin):(end - npad_bin)] + + return KernelDensity.UnivariateKDE(grid_unpad, pdf_unpad) +end + +struct UnivariateCKDE{X,P} + x::X + probability::P +end + +""" + ckde(x; kwargs...) -> UnivariateCKDE + +Compute the CDF of the kernel density estimate of data `x`. + +For details about arguments and keywords, see [`kde`](@ref) +""" +function ckde(x; kwargs...) + grid, pdf = kde(x; kwargs...) + prob = cumsum(pdf) .* step(grid) + return UnivariateCKDE(grid, prob ./ prob[end]) +end From 94c7411e9e4162fc6d1b2f92c603f2875b71e9db Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Tue, 15 Aug 2023 01:07:48 +0200 Subject: [PATCH 03/11] Update API docs --- docs/src/api.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/src/api.md b/docs/src/api.md index c1b20c0..e9c3187 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -56,7 +56,15 @@ Stacking loo_pit ``` -### Utilities +## Density utilities + +```@docs +kde +ckde +bandwidth_silverman +``` + +## Utilities ```@docs PosteriorStats.smooth_data From 5a862986559ab36ac80dfef6d3bbdc5b9c59e5c1 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Tue, 15 Aug 2023 01:34:39 +0200 Subject: [PATCH 04/11] Fix ckde --- src/kde.jl | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/kde.jl b/src/kde.jl index 9998a8a..366f46d 100644 --- a/src/kde.jl +++ b/src/kde.jl @@ -110,8 +110,14 @@ Compute the CDF of the kernel density estimate of data `x`. For details about arguments and keywords, see [`kde`](@ref) """ -function ckde(x; kwargs...) - grid, pdf = kde(x; kwargs...) - prob = cumsum(pdf) .* step(grid) - return UnivariateCKDE(grid, prob ./ prob[end]) +ckde(x; kwargs...) = ckde(kde(x; kwargs...)) + +""" + ckde(k::KernelDensity.UnivariateKDE) -> UnivariateCKDE + +Compute the CDF of the provided kernel density estimate. +""" +function ckde(k::KernelDensity.UnivariateKDE) + prob = cumsum(k.density) .* step(k.x) + return UnivariateCKDE(k.x, prob ./ prob[end]) end From ba1b2e792145912ede98fc13c7e5e67535584d44 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Tue, 15 Aug 2023 19:21:05 +0200 Subject: [PATCH 05/11] Update kde implementation --- src/kde.jl | 57 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/src/kde.jl b/src/kde.jl index 366f46d..bc9ab8f 100644 --- a/src/kde.jl +++ b/src/kde.jl @@ -49,53 +49,64 @@ function kde( bandwidth::Real=bandwidth_silverman(x), kernel=Distributions.Normal, bound_correction::Bool=true, - grid_length::Int=512, + npoints::Int=512, + pad_factor::Union{Real,Nothing}=nothing, ) x_min, x_max = extrema(x) - grid_length = max(grid_length, 100) + grid_length = max(npoints, 100) grid_min = x_min grid_max = x_max bin_width = (grid_max - grid_min) / grid_length - # work out how much padding to add to guarantee that extra density due to wraparound - # is negligible - prob_tail = 1e-3 - _kernel = _kernel_with_bandwidth(kernel, bandwidth) - npad_bw = _padding_factor(_kernel, prob_tail) - npad_bin = Int(cld(npad_bw * bandwidth, bin_width)) + if pad_factor === nothing + # work out how much padding to add to guarantee that extra density due to wraparound + # is negligible + prob_tail = 1e-3 + _kernel = _kernel_with_bandwidth(kernel, bandwidth) + _pad_factor = _padding_factor(_kernel, prob_tail) + elseif pad_factor ≥ 0 + throw(ArgumentError("Padding factor must be non-negative.")) + else + _pad_factor = pad_factor + end + npoints_pad = 2 * max(1, Int(cld(_pad_factor * bandwidth, bin_width))) + npad_left = npad_right = npoints_pad ÷ 2 # add extra padding if performing boundary correction - if bound_correction - nbin_reflect = npad_bin - npad_bin += nbin_reflect - end + nbin_reflect = bound_correction ? npoints_pad ÷ 2 : 0 # pad to avoid wraparound at the boundary - grid_min -= bin_width * npad_bin - grid_max += bin_width * npad_bin - grid_length += 2npad_bin + grid_min -= bin_width * npad_left + grid_max += bin_width * npad_right + grid_length += npoints_pad # compute density + center_min = grid_min + bin_width / 2 + center_max = grid_max - bin_width / 2 k = KernelDensity.kde( - x; npoints=grid_length, boundary=(grid_min, grid_max), bandwidth, kernel + x; npoints=grid_length, boundary=(center_min, center_max), bandwidth, kernel ) grid = k.x pdf = k.density if bound_correction # reflect density at the boundary - il = firstindex(pdf) + npad_bin - ir = lastindex(pdf) - npad_bin - pdf[il:(il + nbin_reflect)] .+= @view pdf[il:-1:(il - nbin_reflect)] - pdf[(ir - nbin_reflect):ir] .+= @view pdf[(ir + nbin_reflect):-1:ir] + il = firstindex(pdf) + npad_left + ir = lastindex(pdf) - npad_right + pdf[range(il; length=nbin_reflect)] .+= @view pdf[range( + il - 1; step=-1, length=nbin_reflect + )] + pdf[range(ir; step=-1, length=nbin_reflect)] .+= @view pdf[range( + ir + 1; length=nbin_reflect + )] end # remove padding - grid_unpad = grid[(begin + npad_bin):(end - npad_bin)] - pdf_unpad = pdf[(begin + npad_bin):(end - npad_bin)] + grid = grid[(begin + npad_left):(end - npad_right)] + pdf = pdf[(begin + npad_left):(end - npad_right)] - return KernelDensity.UnivariateKDE(grid_unpad, pdf_unpad) + return KernelDensity.UnivariateKDE(grid, pdf) end struct UnivariateCKDE{X,P} From 7880f57697a1c8101c49f4f82c203a4e96ed0a3a Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Tue, 15 Aug 2023 19:22:18 +0200 Subject: [PATCH 06/11] Update docstring --- src/kde.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/kde.jl b/src/kde.jl index bc9ab8f..034b21f 100644 --- a/src/kde.jl +++ b/src/kde.jl @@ -42,7 +42,9 @@ Compute the univariate kernel density estimate of data `x`. - `kernel::Type{<:Distributions.ContinuousUnivariateDistribution}`: type of kernel to build. Defaults to `Normal`. - `bound_correction::Bool`: whether to perform boundary correction. Defaults to `true`. -- `grid_length::Int`: number of grid points to use. Defaults to `512`. + If `false`, the resulting truncated KDE is not normalized to 1. +- `npoints::Int`: number of points at which the resulting KDE is evaluated. Defaults to + `512`. """ function kde( x::AbstractVector; From 94f2e49a0c37e8dc6b8930c84ec65d00d3b174d1 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Wed, 16 Aug 2023 01:11:20 +0200 Subject: [PATCH 07/11] Rename variables --- src/kde.jl | 46 +++++++++++++++++++--------------------------- 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/src/kde.jl b/src/kde.jl index 034b21f..1d1dbe4 100644 --- a/src/kde.jl +++ b/src/kde.jl @@ -54,12 +54,9 @@ function kde( npoints::Int=512, pad_factor::Union{Real,Nothing}=nothing, ) - x_min, x_max = extrema(x) - - grid_length = max(npoints, 100) - grid_min = x_min - grid_max = x_max - bin_width = (grid_max - grid_min) / grid_length + grid_size = max(npoints, 100) + grid_min, grid_max = extrema(x) + bin_width = (grid_max - grid_min) / grid_size if pad_factor === nothing # work out how much padding to add to guarantee that extra density due to wraparound @@ -72,43 +69,38 @@ function kde( else _pad_factor = pad_factor end - npoints_pad = 2 * max(1, Int(cld(_pad_factor * bandwidth, bin_width))) - npad_left = npad_right = npoints_pad ÷ 2 - - # add extra padding if performing boundary correction - nbin_reflect = bound_correction ? npoints_pad ÷ 2 : 0 + grid_pad_size = 2 * max(1, Int(cld(_pad_factor * bandwidth, bin_width))) + npad_left = npad_right = grid_pad_size ÷ 2 # pad to avoid wraparound at the boundary grid_min -= bin_width * npad_left grid_max += bin_width * npad_right - grid_length += npoints_pad + grid_size += grid_pad_size # compute density - center_min = grid_min + bin_width / 2 - center_max = grid_max - bin_width / 2 - k = KernelDensity.kde( - x; npoints=grid_length, boundary=(center_min, center_max), bandwidth, kernel - ) - grid = k.x - pdf = k.density + boundary = (grid_min + bin_width / 2, grid_max - bin_width / 2) + k = KernelDensity.kde(x; npoints=grid_size, boundary, bandwidth, kernel) + midpoints = k.x + density = k.density if bound_correction - # reflect density at the boundary - il = firstindex(pdf) + npad_left - ir = lastindex(pdf) - npad_right - pdf[range(il; length=nbin_reflect)] .+= @view pdf[range( + nbin_reflect = min(npad_left, npad_right) + il = firstindex(density) + npad_left + ir = lastindex(density) - npad_right + # reflect density at the boundary (x_min, x_max) + density[range(il; length=nbin_reflect)] .+= @view density[range( il - 1; step=-1, length=nbin_reflect )] - pdf[range(ir; step=-1, length=nbin_reflect)] .+= @view pdf[range( + density[range(ir; step=-1, length=nbin_reflect)] .+= @view density[range( ir + 1; length=nbin_reflect )] end # remove padding - grid = grid[(begin + npad_left):(end - npad_right)] - pdf = pdf[(begin + npad_left):(end - npad_right)] + midpoints = midpoints[(begin + npad_left):(end - npad_right)] + density = density[(begin + npad_left):(end - npad_right)] - return KernelDensity.UnivariateKDE(grid, pdf) + return KernelDensity.UnivariateKDE(midpoints, density) end struct UnivariateCKDE{X,P} From a009e84195eb785069a62ab81dd5f8373e52a128 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Wed, 16 Aug 2023 01:13:17 +0200 Subject: [PATCH 08/11] Add comment --- src/kde.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/kde.jl b/src/kde.jl index 1d1dbe4..b2a6e6d 100644 --- a/src/kde.jl +++ b/src/kde.jl @@ -69,6 +69,8 @@ function kde( else _pad_factor = pad_factor end + # always pad by at least 1 bin on each side to ensure that the boundary passed to kde + # contains all data points (otherwise, they will be ignored) grid_pad_size = 2 * max(1, Int(cld(_pad_factor * bandwidth, bin_width))) npad_left = npad_right = grid_pad_size ÷ 2 From 37da6d9414c18b0d8f948e310485d89e74cf5dd5 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Wed, 16 Aug 2023 01:14:28 +0200 Subject: [PATCH 09/11] Fix check --- src/kde.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kde.jl b/src/kde.jl index b2a6e6d..b7b4b07 100644 --- a/src/kde.jl +++ b/src/kde.jl @@ -64,8 +64,8 @@ function kde( prob_tail = 1e-3 _kernel = _kernel_with_bandwidth(kernel, bandwidth) _pad_factor = _padding_factor(_kernel, prob_tail) - elseif pad_factor ≥ 0 - throw(ArgumentError("Padding factor must be non-negative.")) + elseif pad_factor < 0 + throw(DomainError(pad_factor, "Padding factor must be non-negative.")) else _pad_factor = pad_factor end From bcea763ae6785ea8c8bcd49363cd15996d32804d Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Wed, 16 Aug 2023 10:42:16 +0200 Subject: [PATCH 10/11] Remove ckde functionality --- docs/src/api.md | 1 - src/PosteriorStats.jl | 2 +- src/kde.jl | 24 ------------------------ 3 files changed, 1 insertion(+), 26 deletions(-) diff --git a/docs/src/api.md b/docs/src/api.md index e9c3187..3f9cec2 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -60,7 +60,6 @@ loo_pit ```@docs kde -ckde bandwidth_silverman ``` diff --git a/src/PosteriorStats.jl b/src/PosteriorStats.jl index ac1bd28..4bd1060 100644 --- a/src/PosteriorStats.jl +++ b/src/PosteriorStats.jl @@ -22,7 +22,7 @@ using Tables: Tables using TableTraits: TableTraits # Density -export bandwidth_silverman, ckde, kde +export bandwidth_silverman, kde # PSIS export PSIS, PSISResult, psis, psis! diff --git a/src/kde.jl b/src/kde.jl index b7b4b07..325d2bf 100644 --- a/src/kde.jl +++ b/src/kde.jl @@ -104,27 +104,3 @@ function kde( return KernelDensity.UnivariateKDE(midpoints, density) end - -struct UnivariateCKDE{X,P} - x::X - probability::P -end - -""" - ckde(x; kwargs...) -> UnivariateCKDE - -Compute the CDF of the kernel density estimate of data `x`. - -For details about arguments and keywords, see [`kde`](@ref) -""" -ckde(x; kwargs...) = ckde(kde(x; kwargs...)) - -""" - ckde(k::KernelDensity.UnivariateKDE) -> UnivariateCKDE - -Compute the CDF of the provided kernel density estimate. -""" -function ckde(k::KernelDensity.UnivariateKDE) - prob = cumsum(k.density) .* step(k.x) - return UnivariateCKDE(k.x, prob ./ prob[end]) -end From 1c89650acc2531c32d3d896106ddd50b03130229 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Wed, 16 Aug 2023 11:52:19 +0200 Subject: [PATCH 11/11] Don't overwrite LD_LIBRARY_PATH --- .github/workflows/CI.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 06e7b20..c990804 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -31,7 +31,7 @@ jobs: if: matrix.os == 'ubuntu-latest' - name: Set R lib path for RCall.jl if: matrix.os == 'ubuntu-latest' - run: echo "LD_LIBRARY_PATH=$(R RHOME)/lib" >> $GITHUB_ENV + run: echo "LD_LIBRARY_PATH=$(R RHOME)/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV - name: Install R packages if: matrix.os == 'ubuntu-latest' run: |