From b5425555a02674b3e5a185fd91e2eae0a9ef9540 Mon Sep 17 00:00:00 2001
From: Rodrigo de Salvo Braz
Date: Tue, 6 Sep 2022 10:11:49 -0700
Subject: [PATCH] Small clarifications and improvements (#1611)

Summary:
Pull Request resolved: https://github.com/facebookresearch/beanmachine/pull/1611

Some small improvements and clarifications (mostly to documentation) while in
the process of working on something else; committing them separately.

Reviewed By: gafter

Differential Revision: D36923195

fbshipit-source-id: 515951ca5b9f33626bd5d135ed45dcf7d8d5002f
---
 .../graph/distribution/distribution.h      | 22 +++++++---------
 src/beanmachine/graph/graph.cpp            |  3 +++
 src/beanmachine/graph/graph.h              | 20 +++++++++++---
 .../marginalization/marginalized_graph.cpp | 21 ++++++++-------
 .../graph/operator/stochasticop.cpp        |  3 ++-
 src/beanmachine/graph/util.cpp             | 26 ++++++++++++++-----
 src/beanmachine/graph/util.h               | 12 ++++++++-
 7 files changed, 73 insertions(+), 34 deletions(-)

diff --git a/src/beanmachine/graph/distribution/distribution.h b/src/beanmachine/graph/distribution/distribution.h
index d1d0cedcc6..a3037d904a 100644
--- a/src/beanmachine/graph/distribution/distribution.h
+++ b/src/beanmachine/graph/distribution/distribution.h
@@ -112,11 +112,13 @@ class Distribution : public graph::Node {
   the log prob of the distribution w.r.t. the sampled value.
   :param value: value of the child Sample operator, a single draw from the
   distribution
-  :param back_grad: back_grad1 of the child Sample operator, to be incremented
-  :param adjunct: a multiplier that represents the gradient of the target
-  function w.r.t the log prob of this distribution. It uses the default value
-  1.0 if the direct child is a StochasticOperator, but requires input if the
-  direct child is a mixture distribution.
+  :param back_grad: variable to which the gradient will be added.
+  :param adjunct: if we are interested in df(log_prob)/dvalue, then
+  adjunct must be df(log_prob)/dlog_prob.
+  If we are interested in dlog_prob/dvalue, then the adjunct is 1,
+  which is the default.
+  For other cases (such as this distribution being a component of a
+  mixture distribution), the appropriate adjunct must be provided.
   */
   virtual void backward_value(
       const graph::NodeValue& /* value */,
@@ -134,14 +136,8 @@ class Distribution : public graph::Node {
       graph::DoubleMatrix& /* back_grad */,
       Eigen::MatrixXd& /* adjunct */) const {}
   /*
-  In backward gradient propagation, increments the back_grad1 of each parent
-  node w.r.t. the log prob of the distribution, evaluated at the given value.
-  :param value: value of the child Sample operator, a single draw from the
-  distribution
-  :param adjunct: a multiplier that represents the gradient of the
-  target function w.r.t the log prob of this distribution. It uses the default
-  value 1.0 if the direct child is a StochasticOperator, but requires input if
-  the direct child is a mixture distribution.
+  Analogous to backward_value, but computes the gradient
+  w.r.t. each parameter, and adds the results to their back_grad1 fields.
   */
   virtual void backward_param(
       const graph::NodeValue& /* value */,
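For readers new to the adjunct convention documented above, here is a minimal
standalone sketch (hypothetical names, not beanmachine's API) of the
backward_value contract, using the closed-form derivative of a
Normal(mu, sigma) log prob:

    #include <iostream>

    void backward_value_sketch(
        double x,
        double mu,
        double sigma,
        double& back_grad,
        double adjunct = 1.0) {
      // d log Normal(x; mu, sigma) / dx = -(x - mu) / sigma^2
      double dlogprob_dx = -(x - mu) / (sigma * sigma);
      // adjunct is df(log_prob)/dlog_prob; with the default adjunct = 1
      // this accumulates dlog_prob/dvalue itself
      back_grad += adjunct * dlogprob_dx;
    }

    int main() {
      double back_grad = 0.0;
      backward_value_sketch(/*x=*/2.0, /*mu=*/0.0, /*sigma=*/1.0, back_grad);
      std::cout << back_grad << std::endl;  // prints -2
    }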
diff --git a/src/beanmachine/graph/graph.cpp b/src/beanmachine/graph/graph.cpp
index bfae4d11f3..052b321fbc 100644
--- a/src/beanmachine/graph/graph.cpp
+++ b/src/beanmachine/graph/graph.cpp
@@ -23,6 +23,9 @@
 namespace beanmachine {
 namespace graph {
 
+NATURAL_TYPE NATURAL_ZERO = 0ull;
+NATURAL_TYPE NATURAL_ONE = 1ull;
+
 std::string ValueType::to_string() const {
   std::string vtype;
   std::string atype;
diff --git a/src/beanmachine/graph/graph.h b/src/beanmachine/graph/graph.h
index 81cb0f66f9..21f2f59808 100644
--- a/src/beanmachine/graph/graph.h
+++ b/src/beanmachine/graph/graph.h
@@ -124,6 +124,9 @@ struct ValueType {
 
 typedef NATURAL_TYPE natural_t;
 
+extern NATURAL_TYPE NATURAL_ZERO;
+extern NATURAL_TYPE NATURAL_ONE;
+
 class NodeValue {
  public:
   ValueType type;
@@ -374,6 +377,9 @@ enum class DistributionType {
   LKJ_CHOLESKY
 };
 
+// TODO: do we really need DistributionType? Can't we know the type of a
+// Distribution from its class alone?
+
 enum class FactorType {
   UNKNOWN,
   EXP_PRODUCT,
@@ -449,9 +455,17 @@ class Node {
   virtual bool needs_gradient() const {
     return true;
   }
-  // gradient_log_prob is also only valid for stochastic nodes
-  // TODO: shouldn't we then restrict them to those classes? See above.
-  // this function adds the gradients to the passed in gradients
+  // gradient_log_prob is also only valid for stochastic nodes.
+  // (TODO: shouldn't we then restrict them to those classes? See above.)
+  // It computes the first and second gradients of the log prob
+  // of this node with respect to a given target node and
+  // adds them to the passed-in gradient parameters.
+  // Note that for this computation to be correct,
+  // gradients (the grad1 and grad2 properties of nodes)
+  // must have been updated all the way from the
+  // target node to this node.
+  // This is because this method only performs a local computation
+  // and relies on the grad1 and grad2 attributes of nodes.
   virtual void gradient_log_prob(
       const graph::Node* target_node,
       double& /* grad1 */,
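A toy illustration of the propagation requirement documented above (standalone
code, not the library API): for x ~ Normal(mu, 1) with deterministic mu = 2*t
and target node t, the local computation at x is only correct if mu's grad1
and grad2 w.r.t. t have already been updated:

    #include <iostream>

    int main() {
      double t = 1.0, x = 3.0;
      double mu = 2.0 * t;    // deterministic node mu = 2*t
      double mu_grad1 = 2.0;  // d mu / d t, already propagated from target t
      double mu_grad2 = 0.0;  // d^2 mu / d t^2

      // local chain rule at the stochastic node x ~ Normal(mu, 1),
      // where log_prob = -0.5*(x - mu)^2 + const:
      double dlogp_dmu = x - mu;            // first derivative w.r.t. mu
      double d2logp_dmu2 = -1.0;            // second derivative w.r.t. mu
      double grad1 = dlogp_dmu * mu_grad1;  // d log_prob / d t
      double grad2 =
          d2logp_dmu2 * mu_grad1 * mu_grad1 + dlogp_dmu * mu_grad2;
      std::cout << grad1 << " " << grad2 << std::endl;  // prints 2 -4
    }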
However, a "copy" of these children also needs to be added to the graph. This "copy" node is a SAMPLE node of MarginalDistribution whose value @@ -104,11 +105,13 @@ void marginalize_graph(Graph& graph, uint discrete_sample_node_id) { std::vector sto_node_ids; std::tie(det_node_ids, sto_node_ids) = compute_children(graph, discrete_sample->index); + // TODO: do we need to rename the above compute_affected_nodes, + // or even use Graph's methods for that instead of computing it ourselves? // create MarginalDistribution std::unique_ptr marginal_distribution_ptr = std::make_unique(std::move(subgraph_ptr)); - // TODO: support the correct sample type for multiple children + // TODO: support multiple children if (sto_node_ids.size() > 0) { // @lint-ignore marginal_distribution_ptr->sample_type = @@ -119,7 +122,6 @@ void marginalize_graph(Graph& graph, uint discrete_sample_node_id) { marginal_distribution_ptr.get(); SubGraph* subgraph = marginal_distribution->subgraph_ptr.get(); - // add nodes to subgraph add_nodes_to_subgraph( subgraph, discrete_distribution, @@ -127,9 +129,8 @@ void marginalize_graph(Graph& graph, uint discrete_sample_node_id) { det_node_ids, sto_node_ids); - // connect parents to MarginalDistribution in graph connect_parents_to_marginal_distribution(graph, marginal_distribution); - // add copy of parents to subgraph + add_copy_of_parent_nodes_to_subgraph(subgraph, marginal_distribution); // create and connect children to MarginalDistribution @@ -139,6 +140,7 @@ void marginalize_graph(Graph& graph, uint discrete_sample_node_id) { // list of all created nodes to add to `nodes` of current graph std::vector> created_graph_nodes; + // add MarginalDistribution to list of created nodes created_graph_nodes.push_back(std::move(marginal_distribution_ptr)); // add created nodes to list of created_graph_nodes @@ -154,6 +156,7 @@ void marginalize_graph(Graph& graph, uint discrete_sample_node_id) { // the created nodes should be inserted right after the largest parent index uint marginal_distribution_index = compute_largest_parent_index(marginal_distribution) + 1; + // insert created nodes into graph at "marginalized_node_index" for (uint i = 0; i < created_graph_nodes.size(); i++) { graph.nodes.insert( @@ -164,7 +167,7 @@ void marginalize_graph(Graph& graph, uint discrete_sample_node_id) { } /* -returns +returns 1. deterministic_node_ids are all of the deterministic nodes up until the 2. stochastic_node_ids children are reached */ diff --git a/src/beanmachine/graph/operator/stochasticop.cpp b/src/beanmachine/graph/operator/stochasticop.cpp index 9ae4a2a2e5..af87588374 100644 --- a/src/beanmachine/graph/operator/stochasticop.cpp +++ b/src/beanmachine/graph/operator/stochasticop.cpp @@ -42,7 +42,8 @@ void StochasticOperator::gradient_log_prob( // but it is not represented as an in-node. // // This makes the computation of this derivative less uniform - // and less directly corresponding to the typical use of the chain rule. + // and less directly corresponding to the typical use of the chain rule + // for the operations explicitly represented as nodes. 
   //
   // Still, it should be possible to simply apply the chain rule
   // and find an expression involving the gradient of this stochastic node's
diff --git a/src/beanmachine/graph/util.cpp b/src/beanmachine/graph/util.cpp
index 0403c992e2..23b235abb4 100644
--- a/src/beanmachine/graph/util.cpp
+++ b/src/beanmachine/graph/util.cpp
@@ -76,13 +76,10 @@ double Phi_approx_inv(double z) {
 }
 
 double log_sum_exp(const std::vector<double>& values) {
-  // find the max and subtract it out
-  double max = values[0];
-  for (std::vector<double>::size_type idx = 1; idx < values.size(); idx++) {
-    if (values[idx] > max) {
-      max = values[idx];
-    }
-  }
+  // See "log-sum-exp trick for log-domain calculations" in
+  // https://en.wikipedia.org/wiki/LogSumExp
+  assert(values.size() != 0);
+  double max = *std::max_element(values.begin(), values.end());
   double sum = 0;
   for (auto value : values) {
     sum += std::exp(value - max);
@@ -96,6 +93,21 @@ double log_sum_exp(double a, double b) {
   return std::log(sum) + max_val;
 }
 
+std::vector<double> probs_given_log_potentials(std::vector<double> log_pot) {
+  // p_i = pot_i/Z
+  //     where Z is the normalization constant sum_i exp(log pot_i).
+  //     = exp(log(pot_i/Z))
+  //     = exp(log pot_i - logZ)
+  // logZ is log(sum_i exp(log pot_i))
+  auto logZ = log_sum_exp(log_pot);
+  std::vector<double> probs;
+  probs.reserve(log_pot.size());
+  for (size_t i = 0; i != log_pot.size(); i++) {
+    probs.push_back(std::exp(log_pot[i] - logZ));
+  }
+  return probs;
+}
+
 double polygamma(int n, double x) {
   return boost::math::polygamma(n, x);
 }
diff --git a/src/beanmachine/graph/util.h b/src/beanmachine/graph/util.h
index bbf1683ecb..1e7c84dbc5 100644
--- a/src/beanmachine/graph/util.h
+++ b/src/beanmachine/graph/util.h
@@ -89,13 +89,23 @@ std::vector<T> percentiles(
 }
 
 /*
-Compute log of the sum of the exponentiation of all the values in the vector
+Equivalent to log of sum of exponentiations of values,
+but more numerically stable.
 :param values: vector of log values
 :returns: log sum exp of values
 */
 double log_sum_exp(const std::vector<double>& values);
 double log_sum_exp(double a, double b);
 
+/*
+  Given log potentials log pot_i
+  where potentials pot_i are an unnormalized probability distribution,
+  return the normalized probability distribution p_i.
+  p_i = pot_i/Z
+  where Z is the normalization constant sum_i exp(log pot_i).
+*/
+std::vector<double> probs_given_log_potentials(std::vector<double> log_pot);
+
 struct BinaryLogSumExp {
   double operator()(double a, double b) const {
     return log_sum_exp(a, b);
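As a quick sanity check of the log-sum-exp trick and the normalization it
enables, a standalone sketch (mirroring, not importing, the two functions
above) showing why the max is subtracted out:

    #include <algorithm>
    #include <cmath>
    #include <iostream>
    #include <vector>

    // same max-subtraction trick as util.cpp's log_sum_exp
    double log_sum_exp(const std::vector<double>& values) {
      double mx = *std::max_element(values.begin(), values.end());
      double sum = 0;
      for (auto v : values) {
        sum += std::exp(v - mx);
      }
      return std::log(sum) + mx;
    }

    int main() {
      // naively, exp(1000) overflows to inf and the result would be inf/nan
      std::vector<double> log_pot = {1000.0, 1000.0};
      double logZ = log_sum_exp(log_pot);  // 1000 + log(2), finite
      // normalization as in probs_given_log_potentials:
      //   p_i = exp(log pot_i - logZ)
      std::vector<double> probs;
      for (auto lp : log_pot) {
        probs.push_back(std::exp(lp - logZ));
      }
      std::cout << logZ << " " << probs[0] << " " << probs[1] << std::endl;
      // prints 1000.69 0.5 0.5
    }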