Merge pull request #198 from birdflow-science/plot-loss

birdflow-science · Sep 25, 2024 · 78e966a · 78e966a
2 parents 80b4a25 + 584c3ae
commit 78e966a
Show file tree

Hide file tree

Showing 21 changed files with 932 additions and 11 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: BirdFlowR
 Title: Predict and Visualize Bird Movement
-Version: 0.1.0.9065
+Version: 0.1.0.9066
 Authors@R: 
     c(person("Ethan", "Plunkett",  email = "[email protected]", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0003-4405-2251")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -40,6 +40,7 @@ export(get_countries)
 export(get_dates)
 export(get_distr)
 export(get_dynamic_mask)
+export(get_loss)
 export(get_marginal)
 export(get_mask)
 export(get_metadata)
@@ -79,6 +80,7 @@ export(n_transitions)
 export(pad_timestep)
 export(plot_distr)
 export(plot_flux)
+export(plot_loss)
 export(plot_movement_vectors)
 export(plot_routes)
 export(preprocess_species)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,30 @@
+# BirdFlowR 0.1.0.9067
+2024-07-24
+
+## Probabilistic flux
+
+**Note** this version adds the machinery for calculating probabilistic flux but
+the parameterization is not yet right and it will likely need some optimization
+to run on non-trivial models. 
+
+* Add `weight_betweeness()` which returns betweenness weights in a manner similar
+to `is_between()`.
+* Add `calc_distance_weights()`,`calc_martern_variance()` and some related 
+helper functions that support `weight_betweeness()`
+* Update `calc_flux()` with argument `weighted` which switches between
+ the old binary betweenness with linear paths and the new probabilistic flux 
+ with weighted betweenness in which the probability spreads in the middle of the 
+ transition path and spreads more for longer paths.  
+* Add `data-raw/martern_parameters.R` which helps visualize how parameters and
+functional forms affect weights.
+
+# BirdFlowR 0.1.0.9066
+2024-07-11
+
+Add `plot_loss()` to plot the change in the total loss and three loss 
+components during fitting. It relies on metadata added to the HDF5 file by
+BirdFlowPy.  
+
 # BirdFlowR 0.1.0.9065
 2024-06-14
 

diff --git a/R/calc_distance_weights.R b/R/calc_distance_weights.R
@@ -0,0 +1,85 @@
+
+# Brownian bridge based weighting. This is a placeholder for now to allow
+# implementing weight_betweeness.
+
+#'  calculate the weights of transitions for flux points
+#'
+#'  `calc_dist_weights()` is an internal function that takes summary stats
+#'  on the relationship between points and a transition line and returns
+#'  the weight that should be used for that transition.
+#'
+#'  The first three arguments can all be vectors in which case the calculations
+#'  will be vectorized over the corresponding elements.
+#'
+#'  This is a preliminary version of the function and will likely change.
+#'
+#' @param dist_to_line How far is the point from the line (m)
+#' @param dist_along_line How far along the line is the point, after
+#' projecting it  onto the line (m)
+#' @param line_lengths How long is the line (m)
+#' @param radius_m The radius of the transect at the flux points - used to
+#' determine the band of brobability desnity that will be added to form
+#' the weight.
+#' @param res_m The resolution of the associated bird flow model, used to
+#' determine the nugget added to the variance to represent the uncertainty in
+#' the starting and ending location of the transition.
+#'
+#' @param method  The method used for calculating the standard deviation
+#' in the probability distribution.  Currently `"m3"`, Martern 3/2; and
+#' `"bb"`, brownian bridge are supported.
+#'
+#' @return A vector of weights of the same length as the first three arguments.
+#' @keywords internal
+calc_dist_weights <- function(dist_to_line, dist_along_line, line_lengths,
+                              radius_m, res_m, method = "m3") {
+
+
+
+  # Spatial information
+  len <- line_lengths / 1000  # length of great circle in km
+  t <- dist_along_line / 1000 # Distance from start of great circle to where
+  # the point projects onto the great circle in km
+  d <- dist_to_line / 1000 # Distance from reference point to the great circle
+
+  res_km <- res_m / 1000
+
+  r <- radius_m / 1000  # Radius of transact in KM.
+
+  # s2 is the standard deviation of the nugget
+  s2 <- res_km / 4 # This initial setting of 1/4 the cell width converted
+  # to KM means the cell boundary (orthagonally) is at 2
+  #  standard deviations from the cell center, the corners
+  # will be at 2.8 SD.
+
+  valid_methods <- c("bb", # Brownian Bridge
+                     "m3") # Martern 3/2
+
+  if (!method %in% valid_methods)
+    stop("Method shold be one of ", paste(valid_methods, collapse = ", "))
+
+  if (method == "bb") {
+
+    # Hyperparameters
+    s1 <-  10 # Tune this visually looking at a few resulting distributions?
+
+
+    # Standard deviation
+    sd <-  sqrt(s1^2 * t * (len - t) / len + s2^2)  # SD in KM for this point.
+
+  }
+
+  if (method == "m3") {
+    sd <- sqrt(calc_martern_variance(t, len, k_m3, gamma = 40000, kl = 2000) +
+                 s2^2)
+  }
+
+
+  # Calculate weight
+  weight <- rep(0, length(sd))
+  in_range <- (d - r) < 1.96 * sd
+  low_cum_prob <- stats::pnorm(d[in_range] - r, sd = sd[in_range])
+  high_cum_prob <- stats::pnorm(d[in_range] + r, sd = sd[in_range])
+  weight[in_range] <- high_cum_prob - low_cum_prob
+
+  return(weight)
+}
diff --git a/R/calc_flux.R b/R/calc_flux.R
@@ -41,6 +41,7 @@
 #' standardizing the units based on the entire cell area that that point
 #' represents.
 #'
+#'
 #' @param bf A BirdFlow model
 #' @param points A set of points to calculate movement through. If `points` is
 #' `NULL` they will default to the BirdFlow model cells that are either active
@@ -70,6 +71,13 @@
 #' transition.}
 #'}
 #' @inheritParams is_between
+#' @param weighted If `FALSE` use the original and quicker version of flux
+#' that sums all the marginal probability for transitions that pass within a
+#' fixed distance of the point.  If `TRUE` assign a weight to the point and
+#' transition combo that then is multiplied by the marginal probability before
+#' summing.  This argument is experimental but the default value is identical
+#' to the old version. The argument name and behaviour when set to `TRUE` may
+#' change.
 #'
 #' @return See `format` argument.
 #' @export
@@ -86,7 +94,9 @@
 #' }
 #'
 calc_flux <- function(bf, points = NULL, radius = NULL, n_directions = 1,
-                      format = NULL, batch_size = 5e5, check_radius = TRUE) {
+                      format = NULL, batch_size = 5e5, check_radius = TRUE,
+                      weighted = FALSE) {
+
 
   if (!requireNamespace("SparseArray", quietly = TRUE)) {
     stop("The SparseArray package is required to use calc_flux(). ",
@@ -107,8 +117,16 @@ calc_flux <- function(bf, points = NULL, radius = NULL, n_directions = 1,
   format <- tolower(format)
   stopifnot(format %in% c("points", "spatraster", "dataframe"))
 
+  # The only difference between is_between() and weight_between() return formats
+  # is in the "between" component.  In the first it's logical (TRUE is between)
+  # in the second it is a weight between 0 and 1 (non zero indicates some
+  # level of betweeness".
+  if (weighted) {
+    result <- weight_between(bf, points, radius, n_directions)
+  } else {
+    result <- is_between(bf, points, radius, n_directions)
+  }
 
-  result <- is_between(bf, points, radius, n_directions)
   between <- result$between
   points <- result$points
   radius_km <- result$radius / 1000
@@ -151,8 +169,8 @@ calc_flux <- function(bf, points = NULL, radius = NULL, n_directions = 1,
       # Matrix does. The solution is to convert the selection
       # to a standard matrix or a sparse Matrix. The first is empirically a
       # lot faster
-      sel <- as.matrix(sb[, , j]) #, sparse = TRUE)
-      net_movement[j, i] <- sum(mar[sel])
+      sel <- as.matrix(sb[, , j]) # either logical or weights
+      net_movement[j, i] <- sum(mar * sel)
     }
   }
 
@@ -167,7 +185,7 @@ calc_flux <- function(bf, points = NULL, radius = NULL, n_directions = 1,
   if (format == "spatraster") {
     raster <- array(data = NA, dim = c(nrow(bf), ncol(bf), ncol(net_movement)))
     dimnames(raster) <- list(row = NULL, col = NULL,
-                            transition = colnames(net_movement))
+                             transition = colnames(net_movement))
     rc <- cbind(y_to_row(points$y, bf), x_to_col(points$x, bf))
     colnames(rc) <- c("row", "col")
     for (i in seq_along(marginals)) {
@@ -185,7 +203,7 @@ calc_flux <- function(bf, points = NULL, radius = NULL, n_directions = 1,
 
   if (format == "dataframe") {
     wide <- cbind(as.data.frame(points)[, c("x", "y")],
-                net_movement)
+                  net_movement)
     long <- tidyr::pivot_longer(wide, cols = setdiff(names(wide), c("x", "y")),
                                 names_to = "transition", values_to = "flux")
 

diff --git a/R/calc_martern_variance.R b/R/calc_martern_variance.R
@@ -0,0 +1,29 @@
+# Define kernel functions
+# Original version used "l" instead of "kl" but that led to a
+# warning about partial matching of "l" to "len" when invoked via ...
+# in calc_dist_weights(). kl makes it unambiguous.
+k_m1 <- function(d, gamma, kl) {
+  gamma * (1 + sqrt(3) * d / kl)
+}
+
+k_m3 <- function(d, gamma = 40, kl = 40) {
+  gamma * (1 + sqrt(3) * d / kl) * exp(-sqrt(3) * d / kl)
+}
+
+k_m5 <- function(d, gamma = 40, kl = 40) {
+  gamma * (1 + sqrt(5) * d / kl +  5 * d^2 / (3 * kl^2)) * exp(-sqrt(5) * d / kl)
+}
+
+k_sq <- function(d, gamma, kl) {
+  gamma * exp(-0.5 * (d / kl)^2)
+}
+
+# Note the argument k is a kernel function to use, presumably one of the above.
+# ...  is used to flexibly pass additional parameters (gamma and l through to k)
+calc_martern_variance <- function(t, len, k, ...) {
+  variance <- k(0, ...) -
+    (k(t, ...)^2 + k(len - t, ...)^2 - 2 *
+       k(t, ...) * k(len - t, ...) *  k(len, ...) / k(0, ...)) /
+    (k(0, ...) * (1 - k(len, ...)^2 / k(0, ...)^2))
+  return(pmax(variance, 0))
+}
diff --git a/R/plot_loss.R b/R/plot_loss.R
@@ -0,0 +1,84 @@
+
+#' Get loss values for each step in the model fitting process
+#'
+#' `get_loss()` returns a data frame of loss values. Each row corresponds to
+#' a step in the fitting process.
+#' @param bf A BirdFlow object.
+#'
+#' @return A data frame with columns:
+#' \item{dist}{The distance loss}
+#' \item{ent}{The entropy loss}
+#' \item{obs}{The observation loss}
+#' \item{total}{The total weighted loss}
+#' @export
+#'
+#' @examples
+#' bf <- BirdFlowModels::amewoo
+#' get_loss(bf)
+#' @seealso [plot_loss()]
+get_loss <- function(bf) {
+  if (!"loss_values" %in% names(bf$metadata))
+    stop("Missing loss values in the BirdFlow metadata. ",
+         "Is this a fitted model?")
+  bf$metadata$loss_values
+}
+
+
+
+
+#' Plot changes in component and total loss during model fitting
+#'
+#' Model fitting  - in [BirdFlowPy]() - attempts to minimize the total weighted
+#' loss.  This plot shows four lines:
+#' *  **Total loss** is the weighted sum of the three loss components. The
+#' weighting may cause it to be lower than some of the components.
+#' * **Observation loss** captures how well the model predicts the Status and
+#' Trend distributions it was trained on.  Its weight is always set to 1 and its
+#' relative weight is changed by adjusting the other to weights which are
+#' usually much less than 1.
+#' * **Distance loss** is lower when the routes encoded in the model are
+#' shorter.
+#' * **Entropy loss** is lower when the entropy in the model is higher.
+#'
+#'
+#' @param bf A fitted Bird Flow model
+#' @param transform Passed to [ggplot2::scale_y_continuous()] to set the y-axis
+#' transformation. Reasonable values for this function include
+#' "identity", "log", "log10", "log2", and "sqrt".
+#' @return a **ggplot2** plot object.
+#' @export
+#' @examples
+#'  bf <- BirdFlowModels::amewoo
+#'  plot_loss(bf)
+plot_loss <- function(bf, transform = "log10") {
+
+  if (!all(c("loss_values", "hyperparameters") %in% names(bf$metadata)))
+    stop("Missing loss or hyperparmers from metadata. Is this a fitted model?")
+  loss <- get_loss(bf) |>
+    dplyr::rename(Distance = "dist",
+                  Entropy = "ent",
+                  Observation = "obs",
+                  Total = "total")
+  loss$Step <- seq_len(nrow(loss))
+
+
+  # Convert to long format
+  d <- tidyr::pivot_longer(loss, cols = -ncol(loss), values_to = "Loss",
+                           names_to = "Type")
+
+  hp <-  bf$metadata$hyperparameters
+
+  subtitle <- paste0("Weights: Observation:", hp$obs_weight,
+                     " Distance: ", hp$dist_weight,
+                     " Entropy: ", hp$ent_weight)
+
+  ggplot2::ggplot(data = d,
+                  ggplot2::aes(x = .data$Step,
+                               y = .data$Loss,
+                               color = .data$Type)) +
+    ggplot2::geom_line(linewidth = .8) +
+    ggplot2::scale_y_continuous(transform = "log10") +
+    ggplot2::ggtitle(paste0(species(bf), " Loss"), subtitle = subtitle)
+
+
+}
diff --git a/R/plot_routes.R b/R/plot_routes.R
@@ -137,7 +137,7 @@ plot_routes <- function(routes, bf, facet = FALSE, max_stay_len = NULL,
   # Check for full output from route() and select just point component
   # This is to ease the transition in return format for route()
   # (from a list with $points and $lines to just the $points component)
-  if (is.list(routes) && all(names(routes) == c("points", "lines")) &&
+  if (is.list(routes) && all(names(routes) %in% c("points", "lines")) &&
       is.data.frame(routes$points)) {
     routes <- routes$points
   }

diff --git a/R/process_rasters.R b/R/process_rasters.R
@@ -253,7 +253,7 @@ process_rasters <- function(res,
       trim_quantile <- rep(trim_quantile, nt)
 
     truncate_and_renormalize <- function(x, p = 0.95) {
-      q <- quantile(x[!x == 0], p = p)
+      q <- stats::quantile(x[!x == 0], p = p)
       x[x > q] <- q
       x <- x / sum(x, na.rm = TRUE)
       x