From f223a93ca55aa17225f2c4ebedba9940d8ce5362 Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sat, 30 Mar 2024 13:57:39 -0400
Subject: [PATCH 01/29] add join selectivity skeleton

---
 optd-core/src/cascades/optimizer.rs        |  4 +
 optd-datafusion-repr/src/cost/base_cost.rs | 87 ++++++++++++++++++----
 2 files changed, 75 insertions(+), 16 deletions(-)
diff --git a/optd-core/src/cascades/optimizer.rs b/optd-core/src/cascades/optimizer.rs
index a2e4ea7b..d24eec70 100644
--- a/optd-core/src/cascades/optimizer.rs
+++ b/optd-core/src/cascades/optimizer.rs
@@ -317,6 +317,10 @@ impl<T: RelNodeTyp> CascadesOptimizer<T> {
         self.memo.merge_group(group_a, group_b);
     }
 
+    /// Get the properties of a Cascades group
+    /// P is the type of the property you expect
+    /// idx is the idx of the property you want. The order of properties is defined
+    ///   by the property_builders parameter in CascadesOptimizer::new()
     pub fn get_property_by_group<P: PropertyBuilder<T>>(
         &self,
         group_id: GroupId,
diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index 55a789ee..973d1f49 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -5,7 +5,7 @@ use crate::plan_nodes::{
 };
 use crate::properties::column_ref::{ColumnRefPropertyBuilder, GroupColumnRefs};
 use crate::{
-    plan_nodes::{OptRelNodeRef, OptRelNodeTyp},
+    plan_nodes::{OptRelNodeRef, OptRelNodeTyp, JoinType},
     properties::column_ref::ColumnRef,
 };
 use arrow_schema::{ArrowError, DataType};
@@ -323,8 +323,11 @@ const DEFAULT_EQ_SEL: f64 = 0.005;
 const DEFAULT_INEQ_SEL: f64 = 0.3333333333333333;
 // Default selectivity estimate for pattern-match operators such as LIKE
 const DEFAULT_MATCH_SEL: f64 = 0.005;
+// Default selectivity if we have no information
+const DEFAULT_UNK_SEL: f64 = 0.005;
 
-const INVALID_SEL: f64 = 0.01;
+// A placeholder for todo!() for codepaths which are accessed by plannertest
+const TODO_SEL: f64 = 0.01;
 
 impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
     pub fn row_cnt(Cost(cost): &Cost) -> f64 {
@@ -428,10 +431,10 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
                             row_cnt.min(fetch as f64)
                         }
                     } else {
-                        panic!("compute_cost() should not be called if optimizer is None")
+                        (row_cnt * DEFAULT_UNK_SEL).max(1.0)
                     }
                 } else {
-                    panic!("compute_cost() should not be called if context is None")
+                    (row_cnt * DEFAULT_UNK_SEL).max(1.0)
                 };
                 Self::cost(row_cnt, compute_cost, 0.0)
             }
@@ -456,10 +459,10 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
                                 panic!("encountered a PhysicalFilter without an expression")
                             }
                         } else {
-                            panic!("compute_cost() should not be called if optimizer is None")
+                            DEFAULT_UNK_SEL
                         }
                     }
-                    None => panic!("compute_cost() should not be called if context is None"),
+                    None => DEFAULT_UNK_SEL,
                 };
 
                 Self::cost(
@@ -468,11 +471,32 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
                     0.0,
                 )
             }
-            OptRelNodeTyp::PhysicalNestedLoopJoin(_) => {
+            OptRelNodeTyp::PhysicalNestedLoopJoin(join_typ) => {
                 let (row_cnt_1, _, _) = Self::cost_tuple(&children[0]);
                 let (row_cnt_2, _, _) = Self::cost_tuple(&children[1]);
                 let (_, compute_cost, _) = Self::cost_tuple(&children[2]);
-                let selectivity = 0.01;
+                let selectivity = match context {
+                    Some(context) => {
+                        if let Some(optimizer) = optimizer {
+                            let column_refs = optimizer
+                            .get_property_by_group::<ColumnRefPropertyBuilder>(
+                                context.group_id,
+                                1,
+                            );
+                            let expr_group_id = context.children_group_ids[2];
+                            let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false);
+                            // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information
+                            if let Some(expr_tree) = expr_trees.first() {
+                                self.get_join_selectivity(*join_typ, Arc::clone(expr_tree), &column_refs)
+                            } else {
+                                panic!("encountered a join without an expression")
+                            }
+                        } else {
+                            DEFAULT_UNK_SEL
+                        }
+                    }
+                    None => DEFAULT_UNK_SEL,
+                };
                 Self::cost(
                     (row_cnt_1 * row_cnt_2 * selectivity).max(1.0),
                     row_cnt_1 * row_cnt_2 * compute_cost + row_cnt_1,
@@ -580,7 +604,7 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
                 let right_child = expr_tree.child(1);
 
                 if bin_op_typ.is_comparison() {
-                    self.get_comparison_op_selectivity(
+                    self.get_filter_comp_op_selectivity(
                         *bin_op_typ,
                         left_child,
                         right_child,
@@ -595,19 +619,50 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
                 }
             }
             OptRelNodeTyp::LogOp(log_op_typ) => {
-                self.get_log_op_selectivity(*log_op_typ, &expr_tree.children, column_refs)
+                self.get_filter_log_op_selectivity(*log_op_typ, &expr_tree.children, column_refs)
             }
             OptRelNodeTyp::Func(_) => todo!("check bool type or else panic"),
             OptRelNodeTyp::SortOrder(_) => {
                 panic!("the selectivity of sort order expressions is undefined")
             }
-            OptRelNodeTyp::Between => INVALID_SEL,
+            OptRelNodeTyp::Between => TODO_SEL,
             OptRelNodeTyp::Cast => todo!("check bool type or else panic"),
             OptRelNodeTyp::Like => DEFAULT_MATCH_SEL,
             OptRelNodeTyp::DataType(_) => {
                 panic!("the selectivity of a data type is not defined")
             }
-            OptRelNodeTyp::InList => INVALID_SEL,
+            OptRelNodeTyp::InList => TODO_SEL,
+            _ => unreachable!(
+                "all expression OptRelNodeTyp were enumerated. this should be unreachable"
+            ),
+        }
+    }
+
+    /// The expr_tree input must be a "mixed expression tree", just like with get_filter_selectivity()
+    fn get_join_selectivity(
+        &self,
+        join_typ: JoinType,
+        expr_tree: OptRelNodeRef,
+        column_refs: &GroupColumnRefs,
+    ) -> f64 {
+        assert!(expr_tree.typ.is_expression());
+        match &expr_tree.typ {
+            OptRelNodeTyp::Constant(_) => TODO_SEL,
+            OptRelNodeTyp::ColumnRef => todo!("check bool type or else panic"),
+            OptRelNodeTyp::UnOp(_) => todo!(),
+            OptRelNodeTyp::BinOp(_) => TODO_SEL,
+            OptRelNodeTyp::LogOp(_) => TODO_SEL,
+            OptRelNodeTyp::Func(_) => todo!("check bool type or else panic"),
+            OptRelNodeTyp::SortOrder(_) => {
+                panic!("the selectivity of sort order expressions is undefined")
+            }
+            OptRelNodeTyp::Between => todo!(),
+            OptRelNodeTyp::Cast => todo!("check bool type or else panic"),
+            OptRelNodeTyp::Like => todo!(),
+            OptRelNodeTyp::DataType(_) => {
+                panic!("the selectivity of a data type is not defined")
+            }
+            OptRelNodeTyp::InList => todo!(),
             _ => unreachable!(
                 "all expression OptRelNodeTyp were enumerated. this should be unreachable"
             ),
@@ -615,7 +670,7 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
     }
 
     /// Comparison operators are the base case for recursion in get_filter_selectivity()
-    fn get_comparison_op_selectivity(
+    fn get_filter_comp_op_selectivity(
         &self,
         comp_bin_op_typ: BinOpType,
         left: OptRelNodeRef,
@@ -652,7 +707,7 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
 
         // handle the different cases of column nodes
         if col_ref_nodes.is_empty() {
-            INVALID_SEL
+            TODO_SEL
         } else if col_ref_nodes.len() == 1 {
             let col_ref_node = col_ref_nodes
                 .pop()
@@ -712,7 +767,7 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
                     OptRelNodeTyp::BinOp(_) => {
                         Self::get_default_comparison_op_selectivity(comp_bin_op_typ)
                     }
-                    OptRelNodeTyp::Cast => INVALID_SEL,
+                    OptRelNodeTyp::Cast => TODO_SEL,
                     _ => unimplemented!(
                         "unhandled case of comparing a column ref node to {}",
                         non_col_ref_node.as_ref().typ
@@ -852,7 +907,7 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         }
     }
 
-    fn get_log_op_selectivity(
+    fn get_filter_log_op_selectivity(
         &self,
         log_op_typ: LogOpType,
         children: &[OptRelNodeRef],

From a20b8f5a017db945af5c460b96bfbae5469f32f4 Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sat, 30 Mar 2024 14:16:42 -0400
Subject: [PATCH 02/29] add constant selectivity for filter and join predicates

---
 optd-datafusion-repr/src/cost/base_cost.rs | 98 ++++++++++++++++------
 1 file changed, 74 insertions(+), 24 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index 973d1f49..161a8440 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -584,7 +584,7 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
     ) -> f64 {
         assert!(expr_tree.typ.is_expression());
         match &expr_tree.typ {
-            OptRelNodeTyp::Constant(_) => todo!("check bool type or else panic"),
+            OptRelNodeTyp::Constant(_) => Self::get_constant_selectivity(expr_tree),
             OptRelNodeTyp::ColumnRef => todo!("check bool type or else panic"),
             OptRelNodeTyp::UnOp(un_op_typ) => {
                 assert!(expr_tree.children.len() == 1);
@@ -647,7 +647,7 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
     ) -> f64 {
         assert!(expr_tree.typ.is_expression());
         match &expr_tree.typ {
-            OptRelNodeTyp::Constant(_) => TODO_SEL,
+            OptRelNodeTyp::Constant(_) => Self::get_constant_selectivity(expr_tree),
             OptRelNodeTyp::ColumnRef => todo!("check bool type or else panic"),
             OptRelNodeTyp::UnOp(_) => todo!(),
             OptRelNodeTyp::BinOp(_) => TODO_SEL,
@@ -798,6 +798,31 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         }
     }
 
+    fn get_constant_selectivity(const_node: OptRelNodeRef) -> f64 {
+        if let OptRelNodeTyp::Constant(const_typ) = const_node.typ {
+            if matches!(const_typ, ConstantType::Bool) {
+                let value = const_node
+                    .as_ref()
+                    .data
+                    .as_ref()
+                    .expect("constants should have data");
+                if let Value::Bool(bool_value) = value {
+                    if *bool_value {
+                        1.0
+                    } else {
+                        0.0
+                    }
+                } else {
+                    unreachable!("if the typ is ConstantType::Bool, the value should be a Value::Bool")
+                }
+            } else {
+                panic!("selectivity is not defined on constants which are not bools")
+            }
+        } else {
+            panic!("get_constant_selectivity must be called on a constant")
+        }
+    }
+
     /// Get the selectivity of an expression of the form "column equals value" (or "value equals column")
     /// Will handle the case of statistics missing
     /// Equality predicates are handled entirely differently from range predicates so this is its own function
@@ -950,8 +975,7 @@ mod tests {
 
     use crate::{
         plan_nodes::{
-            BinOpExpr, BinOpType, ColumnRefExpr, ConstantExpr, Expr, ExprList, LogOpExpr,
-            LogOpType, OptRelNode, OptRelNodeRef, UnOpExpr, UnOpType,
+            BinOpExpr, BinOpType, ColumnRefExpr, ConstantExpr, Expr, ExprList, JoinType, LogOpExpr, LogOpType, OptRelNode, OptRelNodeRef, UnOpExpr, UnOpType
         },
         properties::column_ref::ColumnRef,
     };
@@ -1077,8 +1101,27 @@ mod tests {
         .into_rel_node()
     }
 
+    /// The reason this isn't an associated function of PerColumnStats is because that would require
+    ///   adding an empty() function to the trait definitions of MostCommonValues and Distribution,
+    ///   which I wanted to avoid
+    fn get_empty_per_col_stats() -> TestPerColumnStats {
+        TestPerColumnStats::new(
+            TestMostCommonValues::empty(),
+            0,
+            0.0,
+            TestDistribution::empty(),
+        )
+    }
+
     #[test]
-    fn test_colref_eq_constint_in_mcv() {
+    fn test_filtersel_const() {
+        let cost_model = create_one_column_cost_model(get_empty_per_col_stats());
+        assert_approx_eq::assert_approx_eq!(cost_model.get_filter_selectivity(cnst(Value::Bool(true)), &vec![]), 1.0);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_filter_selectivity(cnst(Value::Bool(false)), &vec![]), 0.0);
+    }
+
+    #[test]
+    fn test_filtersel_colref_eq_constint_in_mcv() {
         let cost_model = create_one_column_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::new(vec![(Value::Int32(1), 0.3)]),
             0,
@@ -1102,7 +1145,7 @@ mod tests {
     }
 
     #[test]
-    fn test_colref_eq_constint_not_in_mcv_no_nulls() {
+    fn test_filtersel_colref_eq_constint_not_in_mcv_no_nulls() {
         let cost_model = create_one_column_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::new(vec![(Value::Int32(1), 0.2), (Value::Int32(3), 0.44)]),
             5,
@@ -1126,7 +1169,7 @@ mod tests {
     }
 
     #[test]
-    fn test_colref_eq_constint_not_in_mcv_with_nulls() {
+    fn test_filtersel_colref_eq_constint_not_in_mcv_with_nulls() {
         let cost_model = create_one_column_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::new(vec![(Value::Int32(1), 0.2), (Value::Int32(3), 0.44)]),
             5,
@@ -1151,7 +1194,7 @@ mod tests {
 
     /// I only have one test for NEQ since I'll assume that it uses the same underlying logic as EQ
     #[test]
-    fn test_colref_neq_constint_in_mcv() {
+    fn test_filtersel_colref_neq_constint_in_mcv() {
         let cost_model = create_one_column_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::new(vec![(Value::Int32(1), 0.3)]),
             0,
@@ -1175,7 +1218,7 @@ mod tests {
     }
 
     #[test]
-    fn test_colref_leq_constint_no_mcvs_in_range() {
+    fn test_filtersel_colref_leq_constint_no_mcvs_in_range() {
         let cost_model = create_one_column_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::empty(),
             10,
@@ -1199,7 +1242,7 @@ mod tests {
     }
 
     #[test]
-    fn test_colref_leq_constint_no_mcvs_in_range_with_nulls() {
+    fn test_filtersel_colref_leq_constint_no_mcvs_in_range_with_nulls() {
         let cost_model = create_one_column_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::empty(),
             10,
@@ -1223,7 +1266,7 @@ mod tests {
     }
 
     #[test]
-    fn test_colref_leq_constint_with_mcvs_in_range_not_at_border() {
+    fn test_filtersel_colref_leq_constint_with_mcvs_in_range_not_at_border() {
         let cost_model = create_one_column_cost_model(TestPerColumnStats::new(
             TestMostCommonValues {
                 mcvs: vec![
@@ -1256,7 +1299,7 @@ mod tests {
     }
 
     #[test]
-    fn test_colref_leq_constint_with_mcv_at_border() {
+    fn test_filtersel_colref_leq_constint_with_mcv_at_border() {
         let cost_model = create_one_column_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::new(vec![
                 (Value::Int32(6), 0.05),
@@ -1285,7 +1328,7 @@ mod tests {
     }
 
     #[test]
-    fn test_colref_lt_constint_no_mcvs_in_range() {
+    fn test_filtersel_colref_lt_constint_no_mcvs_in_range() {
         let cost_model = create_one_column_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::empty(),
             10,
@@ -1309,7 +1352,7 @@ mod tests {
     }
 
     #[test]
-    fn test_colref_lt_constint_no_mcvs_in_range_with_nulls() {
+    fn test_filtersel_colref_lt_constint_no_mcvs_in_range_with_nulls() {
         let cost_model = create_one_column_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::empty(),
             9, // 90% of the values aren't nulls since null_frac = 0.1. if there are 9 distinct non-null values, each will have 0.1 frequency
@@ -1333,7 +1376,7 @@ mod tests {
     }
 
     #[test]
-    fn test_colref_lt_constint_with_mcvs_in_range_not_at_border() {
+    fn test_filtersel_colref_lt_constint_with_mcvs_in_range_not_at_border() {
         let cost_model = create_one_column_cost_model(TestPerColumnStats::new(
             TestMostCommonValues {
                 mcvs: vec![
@@ -1366,7 +1409,7 @@ mod tests {
     }
 
     #[test]
-    fn test_colref_lt_constint_with_mcv_at_border() {
+    fn test_filtersel_colref_lt_constint_with_mcv_at_border() {
         let cost_model = create_one_column_cost_model(TestPerColumnStats::new(
             TestMostCommonValues {
                 mcvs: vec![
@@ -1401,7 +1444,7 @@ mod tests {
     /// I have fewer tests for GT since I'll assume that it uses the same underlying logic as LEQ
     /// The only interesting thing to test is that if there are nulls, those aren't included in GT
     #[test]
-    fn test_colref_gt_constint_no_nulls() {
+    fn test_filtersel_colref_gt_constint_no_nulls() {
         let cost_model = create_one_column_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::empty(),
             10,
@@ -1425,7 +1468,7 @@ mod tests {
     }
 
     #[test]
-    fn test_colref_gt_constint_with_nulls() {
+    fn test_filtersel_colref_gt_constint_with_nulls() {
         let cost_model = create_one_column_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::empty(),
             10,
@@ -1451,7 +1494,7 @@ mod tests {
 
     /// As with above, I have one test without nulls and one test with nulls
     #[test]
-    fn test_colref_geq_constint_no_nulls() {
+    fn test_filtersel_colref_geq_constint_no_nulls() {
         let cost_model = create_one_column_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::empty(),
             10,
@@ -1475,7 +1518,7 @@ mod tests {
     }
 
     #[test]
-    fn test_colref_geq_constint_with_nulls() {
+    fn test_filtersel_colref_geq_constint_with_nulls() {
         let cost_model = create_one_column_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::empty(),
             9, // 90% of the values aren't nulls since null_frac = 0.1. if there are 9 distinct non-null values, each will have 0.1 frequency
@@ -1500,7 +1543,7 @@ mod tests {
     }
 
     #[test]
-    fn test_and() {
+    fn test_filtersel_and() {
         let cost_model = create_one_column_cost_model(TestPerColumnStats::new(
             TestMostCommonValues {
                 mcvs: vec![
@@ -1540,7 +1583,7 @@ mod tests {
     }
 
     #[test]
-    fn test_or() {
+    fn test_filtersel_or() {
         let cost_model = create_one_column_cost_model(TestPerColumnStats::new(
             TestMostCommonValues {
                 mcvs: vec![
@@ -1580,7 +1623,7 @@ mod tests {
     }
 
     #[test]
-    fn test_not_no_nulls() {
+    fn test_filtersel_not_no_nulls() {
         let cost_model = create_one_column_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::new(vec![(Value::Int32(1), 0.3)]),
             0,
@@ -1602,7 +1645,7 @@ mod tests {
     }
 
     #[test]
-    fn test_not_with_nulls() {
+    fn test_filtersel_not_with_nulls() {
         let cost_model = create_one_column_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::new(vec![(Value::Int32(1), 0.3)]),
             0,
@@ -1624,4 +1667,11 @@ mod tests {
             0.7
         );
     }
+
+    #[test]
+    fn test_joinsel_const() {
+        let cost_model = create_one_column_cost_model(get_empty_per_col_stats());
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(true)), &vec![]), 1.0);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(false)), &vec![]), 0.0);
+    }
 }

From ccf068dd06c55b9f2bb1625dfd399f1da7f7d131 Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sat, 30 Mar 2024 19:21:07 -0400
Subject: [PATCH 03/29] extract get_semantic_nodes() into its own function

---
 optd-datafusion-repr/src/cost/base_cost.rs | 238 +++++++++++++++++----
 1 file changed, 193 insertions(+), 45 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index 161a8440..fa71828e 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -323,11 +323,12 @@ const DEFAULT_EQ_SEL: f64 = 0.005;
 const DEFAULT_INEQ_SEL: f64 = 0.3333333333333333;
 // Default selectivity estimate for pattern-match operators such as LIKE
 const DEFAULT_MATCH_SEL: f64 = 0.005;
+const DEFAULT_NUM_DISTINCT: u64 = 200;
 // Default selectivity if we have no information
 const DEFAULT_UNK_SEL: f64 = 0.005;
 
-// A placeholder for todo!() for codepaths which are accessed by plannertest
-const TODO_SEL: f64 = 0.01;
+// A placeholder for unimplemented!() for codepaths which are accessed by plannertest
+const UNIMPLEMENTED_SEL: f64 = 0.01;
 
 impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
     pub fn row_cnt(Cost(cost): &Cost) -> f64 {
@@ -585,7 +586,7 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         assert!(expr_tree.typ.is_expression());
         match &expr_tree.typ {
             OptRelNodeTyp::Constant(_) => Self::get_constant_selectivity(expr_tree),
-            OptRelNodeTyp::ColumnRef => todo!("check bool type or else panic"),
+            OptRelNodeTyp::ColumnRef => unimplemented!("check bool type or else panic"),
             OptRelNodeTyp::UnOp(un_op_typ) => {
                 assert!(expr_tree.children.len() == 1);
                 let child = expr_tree.child(0);
@@ -621,17 +622,17 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
             OptRelNodeTyp::LogOp(log_op_typ) => {
                 self.get_filter_log_op_selectivity(*log_op_typ, &expr_tree.children, column_refs)
             }
-            OptRelNodeTyp::Func(_) => todo!("check bool type or else panic"),
+            OptRelNodeTyp::Func(_) => unimplemented!("check bool type or else panic"),
             OptRelNodeTyp::SortOrder(_) => {
                 panic!("the selectivity of sort order expressions is undefined")
             }
-            OptRelNodeTyp::Between => TODO_SEL,
-            OptRelNodeTyp::Cast => todo!("check bool type or else panic"),
+            OptRelNodeTyp::Between => UNIMPLEMENTED_SEL,
+            OptRelNodeTyp::Cast => unimplemented!("check bool type or else panic"),
             OptRelNodeTyp::Like => DEFAULT_MATCH_SEL,
             OptRelNodeTyp::DataType(_) => {
                 panic!("the selectivity of a data type is not defined")
             }
-            OptRelNodeTyp::InList => TODO_SEL,
+            OptRelNodeTyp::InList => UNIMPLEMENTED_SEL,
             _ => unreachable!(
                 "all expression OptRelNodeTyp were enumerated. this should be unreachable"
             ),
@@ -648,21 +649,43 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         assert!(expr_tree.typ.is_expression());
         match &expr_tree.typ {
             OptRelNodeTyp::Constant(_) => Self::get_constant_selectivity(expr_tree),
-            OptRelNodeTyp::ColumnRef => todo!("check bool type or else panic"),
-            OptRelNodeTyp::UnOp(_) => todo!(),
-            OptRelNodeTyp::BinOp(_) => TODO_SEL,
-            OptRelNodeTyp::LogOp(_) => TODO_SEL,
-            OptRelNodeTyp::Func(_) => todo!("check bool type or else panic"),
+            OptRelNodeTyp::ColumnRef => unimplemented!("check bool type or else panic"),
+            OptRelNodeTyp::UnOp(_) => unimplemented!(),
+            OptRelNodeTyp::BinOp(bin_op_typ) => {
+                assert!(expr_tree.children.len() == 2);
+                let left_child = expr_tree.child(0);
+                let right_child = expr_tree.child(1);
+
+                if bin_op_typ.is_comparison() {
+                    self.get_join_comp_op_selectivity(
+                        join_typ,
+                        *bin_op_typ,
+                        left_child,
+                        right_child,
+                        column_refs,
+                    )
+                } else if bin_op_typ.is_numerical() {
+                    panic!(
+                        "the selectivity of operations that return numerical values is undefined"
+                    )
+                } else {
+                    unreachable!("all BinOpTypes should be true for at least one is_*() function")
+                }
+            },
+            OptRelNodeTyp::LogOp(log_op_typ) => {
+                self.get_join_log_op_selectivity(join_typ, *log_op_typ, &expr_tree.children, column_refs)
+            },
+            OptRelNodeTyp::Func(_) => unimplemented!("check bool type or else panic"),
             OptRelNodeTyp::SortOrder(_) => {
                 panic!("the selectivity of sort order expressions is undefined")
             }
-            OptRelNodeTyp::Between => todo!(),
-            OptRelNodeTyp::Cast => todo!("check bool type or else panic"),
-            OptRelNodeTyp::Like => todo!(),
+            OptRelNodeTyp::Between => unimplemented!(),
+            OptRelNodeTyp::Cast => unimplemented!("check bool type or else panic"),
+            OptRelNodeTyp::Like => unimplemented!(),
             OptRelNodeTyp::DataType(_) => {
                 panic!("the selectivity of a data type is not defined")
             }
-            OptRelNodeTyp::InList => todo!(),
+            OptRelNodeTyp::InList => unimplemented!(),
             _ => unreachable!(
                 "all expression OptRelNodeTyp were enumerated. this should be unreachable"
             ),
@@ -679,44 +702,21 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
     ) -> f64 {
         assert!(comp_bin_op_typ.is_comparison());
 
-        // it's more convenient to refer to the children based on whether they're column nodes or not
-        // rather than by left/right
-        let mut col_ref_nodes = vec![];
-        let mut non_col_ref_nodes = vec![];
-        let is_left_col_ref;
         // I intentionally performed moves on left and right. This way, we don't accidentally use them after this block
-        // We always want to use "col_ref_node" and "non_col_ref_node" instead of "left" or "right"
-        if left.as_ref().typ == OptRelNodeTyp::ColumnRef {
-            is_left_col_ref = true;
-            col_ref_nodes.push(
-                ColumnRefExpr::from_rel_node(left)
-                    .expect("we already checked that the type is ColumnRef"),
-            );
-        } else {
-            is_left_col_ref = false;
-            non_col_ref_nodes.push(left);
-        }
-        if right.as_ref().typ == OptRelNodeTyp::ColumnRef {
-            col_ref_nodes.push(
-                ColumnRefExpr::from_rel_node(right)
-                    .expect("we already checked that the type is ColumnRef"),
-            );
-        } else {
-            non_col_ref_nodes.push(right);
-        }
+        let (col_ref_nodes, non_col_ref_nodes, is_left_col_ref) = Self::get_semantic_nodes(left, right);
 
         // handle the different cases of column nodes
         if col_ref_nodes.is_empty() {
-            TODO_SEL
+            UNIMPLEMENTED_SEL
         } else if col_ref_nodes.len() == 1 {
             let col_ref_node = col_ref_nodes
-                .pop()
+                .first()
                 .expect("we just checked that col_ref_nodes.len() == 1");
             let col_ref_idx = col_ref_node.index();
 
             if let ColumnRef::BaseTableColumnRef { table, col_idx } = &column_refs[col_ref_idx] {
                 let non_col_ref_node = non_col_ref_nodes
-                    .pop()
+                    .first()
                     .expect("non_col_ref_nodes should have a value since col_ref_nodes.len() == 1");
 
                 match non_col_ref_node.as_ref().typ {
@@ -767,7 +767,7 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
                     OptRelNodeTyp::BinOp(_) => {
                         Self::get_default_comparison_op_selectivity(comp_bin_op_typ)
                     }
-                    OptRelNodeTyp::Cast => TODO_SEL,
+                    OptRelNodeTyp::Cast => UNIMPLEMENTED_SEL,
                     _ => unimplemented!(
                         "unhandled case of comparing a column ref node to {}",
                         non_col_ref_node.as_ref().typ
@@ -783,6 +783,94 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         }
     }
 
+    /// Comparison operators are the base case for recursion in get_join_selectivity()
+    fn get_join_comp_op_selectivity(
+        &self,
+        join_typ: JoinType,
+        comp_bin_op_typ: BinOpType,
+        left: OptRelNodeRef,
+        right: OptRelNodeRef,
+        column_refs: &GroupColumnRefs,
+    ) -> f64 {
+        assert!(comp_bin_op_typ.is_comparison());
+
+        // I intentionally performed moves on left and right. This way, we don't accidentally use them after this block
+        let (col_ref_nodes, _, _) = Self::get_semantic_nodes(left, right);
+
+        // handle the different cases of column nodes
+        if col_ref_nodes.is_empty() {
+            unimplemented!()
+        } else if col_ref_nodes.len() == 1 {
+            unimplemented!()
+        } else if col_ref_nodes.len() == 2 {
+            match join_typ {
+                JoinType::Inner => {
+                    // the statistics objects of the referenced columns
+                    let col_ref_stats_list = col_ref_nodes.iter().map(|col_ref_node| {
+                        let col_ref_idx = col_ref_node.index();
+                        if let ColumnRef::BaseTableColumnRef { table, col_idx } = &column_refs[col_ref_idx] {
+                            if let Some(per_table_stats) = self.per_table_stats_map.get(table) {
+                                if let Some(Some(per_column_stats)) = per_table_stats.per_column_stats_vec.get(*col_idx)
+                                {
+                                    Some(per_column_stats)
+                                } else {
+                                    None
+                                }
+                            } else {
+                                None
+                            }
+                        } else {
+                            None
+                        }
+                    });
+                    let ndistincts = col_ref_stats_list.map(|col_ref_stats| {
+                        if let Some(col_ref_stats) = col_ref_stats {
+                            col_ref_stats.ndistinct
+                        } else {
+                            DEFAULT_NUM_DISTINCT
+                        }
+                    });
+                    // using reduce(f64::min) is the idiomatic workaround to the fact that f64 does not implement Ord due to NaN
+                    let selectivity = ndistincts.map(|ndistinct| 1.0 / ndistinct as f64).reduce(f64::min).expect("reduce() only returns None if the iterator is empty, which is impossible since col_ref_nodes.len() == 2");
+                    assert!(!selectivity.is_nan(), "it should be impossible for selectivity to be NaN since n-distinct is never 0");
+                    selectivity
+                }
+                _ => unimplemented!()
+            }
+        } else {
+            unreachable!("we could have at most pushed left and right into col_ref_nodes")
+        }
+    }
+
+    /// Convert the left and right child nodes of some operation to what they semantically are
+    /// This is convenient to avoid repeating the same logic just with "left" and "right" swapped
+    fn get_semantic_nodes(left: OptRelNodeRef, right: OptRelNodeRef) -> (Vec<ColumnRefExpr>, Vec<OptRelNodeRef>, bool) {
+        let mut col_ref_nodes = vec![];
+        let mut non_col_ref_nodes = vec![];
+        let is_left_col_ref;
+        // I intentionally performed moves on left and right. This way, we don't accidentally use them after this block
+        // We always want to use "col_ref_node" and "non_col_ref_node" instead of "left" or "right"
+        if left.as_ref().typ == OptRelNodeTyp::ColumnRef {
+            is_left_col_ref = true;
+            col_ref_nodes.push(
+                ColumnRefExpr::from_rel_node(left)
+                    .expect("we already checked that the type is ColumnRef"),
+            );
+        } else {
+            is_left_col_ref = false;
+            non_col_ref_nodes.push(left);
+        }
+        if right.as_ref().typ == OptRelNodeTyp::ColumnRef {
+            col_ref_nodes.push(
+                ColumnRefExpr::from_rel_node(right)
+                    .expect("we already checked that the type is ColumnRef"),
+            );
+        } else {
+            non_col_ref_nodes.push(right);
+        }
+        (col_ref_nodes, non_col_ref_nodes, is_left_col_ref)
+    }
+
     /// The default selectivity of a comparison expression
     /// Used when one side of the comparison is a column while the other side is something too
     ///   complex/impossible to evaluate (subquery, UDF, another column, we have no stats, etc.)
@@ -949,6 +1037,24 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         }
     }
 
+    fn get_join_log_op_selectivity(
+        &self,
+        join_typ: JoinType,
+        log_op_typ: LogOpType,
+        children: &[OptRelNodeRef],
+        column_refs: &GroupColumnRefs,
+    ) -> f64 {
+        let children_sel = children
+            .iter()
+            .map(|expr| self.get_join_selectivity(join_typ, expr.clone(), column_refs));
+
+        match log_op_typ {
+            LogOpType::And => children_sel.product(),
+            // the formula is 1.0 - the probability of _none_ of the events happening
+            LogOpType::Or => 1.0 - children_sel.fold(1.0, |acc, sel| acc * (1.0 - sel)),
+        }
+    }
+
     pub fn get_row_cnt(&self, table: &str) -> Option<usize> {
         self.per_table_stats_map
             .get(table)
@@ -1045,7 +1151,7 @@ mod tests {
 
     const TABLE1_NAME: &str = "t1";
 
-    // one column is sufficient for all filter selectivity predicates
+    // one column is sufficient for all filter selectivity tests
     fn create_one_column_cost_model(
         per_column_stats: TestPerColumnStats,
     ) -> OptCostModel<TestMostCommonValues, TestDistribution> {
@@ -1059,6 +1165,21 @@ mod tests {
         )
     }
 
+    // two columns is sufficient for all join selectivity tests
+    fn create_two_column_cost_model(
+        per_column_stats1: TestPerColumnStats,
+        per_column_stats2: TestPerColumnStats,
+    ) -> OptCostModel<TestMostCommonValues, TestDistribution> {
+        OptCostModel::new(
+            vec![(
+                String::from(TABLE1_NAME),
+                PerTableStats::new(100, vec![Some(per_column_stats1), Some(per_column_stats2)]),
+            )]
+            .into_iter()
+            .collect(),
+        )
+    }
+
     fn col_ref(idx: u64) -> OptRelNodeRef {
         // this conversion is always safe because idx was originally a usize
         let idx_as_usize = idx as usize;
@@ -1394,6 +1515,7 @@ mod tests {
         ));
         let expr_tree = bin_op(BinOpType::Lt, col_ref(0), cnst(Value::Int32(15)));
         let expr_tree_rev = bin_op(BinOpType::Gt, cnst(Value::Int32(15)), col_ref(0));
+        // TODO(phw2): make column_refs a function
         let column_refs = vec![ColumnRef::BaseTableColumnRef {
             table: String::from(TABLE1_NAME),
             col_idx: 0,
@@ -1674,4 +1796,30 @@ mod tests {
         assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(true)), &vec![]), 1.0);
         assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(false)), &vec![]), 0.0);
     }
+
+    #[test]
+    fn test_joinsel_colref_eq_colref_no_mcvs_no_nulls() {
+        let cost_model = create_two_column_cost_model(TestPerColumnStats::new(
+            TestMostCommonValues::empty(),
+            5,
+            0.0,
+            TestDistribution::empty(),
+        ), TestPerColumnStats::new(
+            TestMostCommonValues::empty(),
+            3,
+            0.0,
+            TestDistribution::empty(),
+        ));
+        let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1));
+        let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0));
+        let column_refs = vec![ColumnRef::BaseTableColumnRef {
+            table: String::from(TABLE1_NAME),
+            col_idx: 0,
+        }, ColumnRef::BaseTableColumnRef {
+            table: String::from(TABLE1_NAME),
+            col_idx: 1,
+        }];
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.2);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.2);
+    }
 }

From 88a15e94190205f0a27d876699a6c27978b90ec5 Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sat, 30 Mar 2024 19:27:10 -0400
Subject: [PATCH 04/29] added tests for log op join sel

---
 optd-datafusion-repr/src/cost/base_cost.rs | 56 ++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index fa71828e..ba4a8136 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -1822,4 +1822,60 @@ mod tests {
         assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.2);
         assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.2);
     }
+
+    #[test]
+    fn test_joinsel_and() {
+        let cost_model = create_two_column_cost_model(TestPerColumnStats::new(
+            TestMostCommonValues::empty(),
+            5,
+            0.0,
+            TestDistribution::empty(),
+        ), TestPerColumnStats::new(
+            TestMostCommonValues::empty(),
+            3,
+            0.0,
+            TestDistribution::empty(),
+        ));
+        let eq0and1 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); // col0 = col1
+        let eq1and0 = bin_op(BinOpType::Eq, col_ref(1), col_ref(0)); // col1 = col0: same predicate with the operands swapped
+        let expr_tree = log_op(LogOpType::And, vec![eq0and1.clone(), eq1and0.clone()]);
+        let expr_tree_rev = log_op(LogOpType::And, vec![eq1and0.clone(), eq0and1.clone()]);
+        let column_refs = vec![ColumnRef::BaseTableColumnRef {
+            table: String::from(TABLE1_NAME),
+            col_idx: 0,
+        }, ColumnRef::BaseTableColumnRef {
+            table: String::from(TABLE1_NAME),
+            col_idx: 1,
+        }];
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.04); // 0.2 * 0.2
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.04); // child order must not matter
+    }
+
+    #[test]
+    fn test_joinsel_or() {
+        let cost_model = create_two_column_cost_model(TestPerColumnStats::new(
+            TestMostCommonValues::empty(),
+            5,
+            0.0,
+            TestDistribution::empty(),
+        ), TestPerColumnStats::new(
+            TestMostCommonValues::empty(),
+            3,
+            0.0,
+            TestDistribution::empty(),
+        ));
+        let eq0and1 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); // col0 = col1
+        let eq1and0 = bin_op(BinOpType::Eq, col_ref(1), col_ref(0)); // col1 = col0: same predicate with the operands swapped
+        let expr_tree = log_op(LogOpType::Or, vec![eq0and1.clone(), eq1and0.clone()]);
+        let expr_tree_rev = log_op(LogOpType::Or, vec![eq1and0.clone(), eq0and1.clone()]);
+        let column_refs = vec![ColumnRef::BaseTableColumnRef {
+            table: String::from(TABLE1_NAME),
+            col_idx: 0,
+        }, ColumnRef::BaseTableColumnRef {
+            table: String::from(TABLE1_NAME),
+            col_idx: 1,
+        }];
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.36); // 1 - (1 - 0.2) * (1 - 0.2)
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.36); // child order must not matter
+    }
 }

From c0155625a52e08e79c528fd810eb4fa0d5fafd4d Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sat, 30 Mar 2024 21:13:14 -0400
Subject: [PATCH 05/29] refactored per_col_vec to per_col_map to avoid double
 options

---
 optd-datafusion-repr/src/cost/base_cost.rs | 137 ++++++++++-----------
 optd-datafusion-repr/src/plan_nodes.rs     |   2 +
 optd-perftest/src/datafusion_dbms.rs       |   8 +-
 3 files changed, 70 insertions(+), 77 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index ba4a8136..07e0660a 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -94,7 +94,7 @@ impl MostCommonValues for MockMostCommonValues {
 #[derive(Serialize, Deserialize)]
 pub struct PerTableStats<M: MostCommonValues, D: Distribution> {
     row_cnt: usize,
-    per_column_stats_vec: Vec<Option<PerColumnStats<M, D>>>,
+    per_column_stats_map: HashMap<usize, PerColumnStats<M, D>>,
 }
 
 impl DataFusionPerTableStats {
@@ -150,22 +150,20 @@ impl DataFusionPerTableStats {
         }
 
         // Assemble the per-column stats.
-        let mut per_column_stats_vec = Vec::with_capacity(col_cnt);
+        let mut per_column_stats_map = HashMap::with_capacity(col_cnt);
         for i in 0..col_cnt {
-            per_column_stats_vec.push(if Self::is_type_supported(&col_types[i]) {
-                Some(PerColumnStats::new(
+            if Self::is_type_supported(&col_types[i]) {
+                per_column_stats_map.insert(i, PerColumnStats::new(
                     mcvs[i].take().unwrap(),
                     hlls[i].n_distinct(),
                     null_cnt[i] as f64 / row_cnt as f64,
                     distr[i].take().unwrap(),
-                ))
-            } else {
-                None
-            });
+                ));
+            }
         }
         Ok(Self {
             row_cnt,
-            per_column_stats_vec,
+            per_column_stats_map,
         })
     }
 
@@ -640,12 +638,15 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
     }
 
     /// The expr_tree input must be a "mixed expression tree", just like with get_filter_selectivity()
-    fn get_join_selectivity(
+    /// The "wrapper" is here to separate the equality conditions from the filter conditions before calling
+    ///   the "main" get_join_selectivity() function.
+    fn get_join_selectivity_wrapper(
         &self,
         join_typ: JoinType,
         expr_tree: OptRelNodeRef,
         column_refs: &GroupColumnRefs,
     ) -> f64 {
+        println!("get_join_selectivity(): called on expr_tree={}", expr_tree);
         assert!(expr_tree.typ.is_expression());
         match &expr_tree.typ {
             OptRelNodeTyp::Constant(_) => Self::get_constant_selectivity(expr_tree),
@@ -692,6 +693,55 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         }
     }
 
+    fn get_join_selectivity(
+        &self,
+        join_typ: JoinType,
+        on_col_ref_pairs: Vec<(ColumnRefExpr, ColumnRefExpr)>,
+        filter_expr_tree: Option<OptRelNodeRef>,
+        column_refs: &GroupColumnRefs,
+    ) -> f64 {
+        let join_on_selectivity = self.get_join_on_selectivity(join_typ, on_col_ref_pairs, column_refs);
+        // Currently, there is no difference in how we handle a join filter and a select filter, so we use the same function
+        // One difference (that we *don't* care about right now) is that join filters can contain expressions from multiple
+        //   different tables. Currently, this doesn't affect the get_filter_selectivity() function, but this may change in
+        //   the future
+        let join_filter_selectivity = match filter_expr_tree {
+            Some(filter_expr_tree) => self.get_filter_selectivity(filter_expr_tree, column_refs),
+            None => 1.0,
+        };
+        join_on_selectivity * join_filter_selectivity
+    }
+
+    fn get_per_col_stats(&self, col_ref: &ColumnRef) -> Option<&PerColumnStats<M, D>> {
+        if let ColumnRef::BaseTableColumnRef { table, col_idx } = col_ref {
+            self.per_table_stats_map.get(table).and_then(|per_table_stats| per_table_stats.per_column_stats_map.get(col_idx))
+        } else {
+            None
+        }
+    }
+
+    /// Selectivity of the join "on" conditions: the product, over all column ref pairs, of
+    ///   min(1 / ndistinct_left, 1 / ndistinct_right). With no pairs the product is 1.0.
+    /// A column for which get_per_col_stats() returns None falls back to DEFAULT_NUM_DISTINCT.
+    fn get_join_on_selectivity(
+        &self,
+        join_typ: JoinType, // not consulted yet: the same formula is applied to every join type
+        on_col_ref_pairs: Vec<(ColumnRefExpr, ColumnRefExpr)>,
+        column_refs: &GroupColumnRefs
+    ) -> f64 {
+        // multiply the selectivities of all individual conditions together
+        on_col_ref_pairs.into_iter().map(|on_col_ref_pair| {
+            // the formula for each pair is min(1 / ndistinct1, 1 / ndistinct2) (see https://postgrespro.com/blog/pgsql/5969618)
+            let ndistincts = vec![on_col_ref_pair.0, on_col_ref_pair.1].into_iter().map(|on_col_ref| {
+                self.get_per_col_stats(&column_refs[on_col_ref.index()]).map_or(DEFAULT_NUM_DISTINCT, |per_col_stats| per_col_stats.ndistinct)
+            });
+            // using reduce(f64::min) is the idiomatic workaround to the fact that f64 does not implement Ord due to NaN
+            let selectivity = ndistincts.map(|ndistinct| 1.0 / ndistinct as f64).reduce(f64::min).expect("reduce() only returns None if the iterator is empty, which is impossible since each pair always yields exactly two column refs");
+            assert!(!selectivity.is_nan(), "it should be impossible for selectivity to be NaN since n-distinct is never 0");
+            selectivity
+        }).product()
+    }
+
     /// Comparison operators are the base case for recursion in get_filter_selectivity()
     fn get_filter_comp_op_selectivity(
         &self,
@@ -783,65 +833,6 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         }
     }
 
-    /// Comparison operators are the base case for recursion in get_join_selectivity()
-    fn get_join_comp_op_selectivity(
-        &self,
-        join_typ: JoinType,
-        comp_bin_op_typ: BinOpType,
-        left: OptRelNodeRef,
-        right: OptRelNodeRef,
-        column_refs: &GroupColumnRefs,
-    ) -> f64 {
-        assert!(comp_bin_op_typ.is_comparison());
-
-        // I intentionally performed moves on left and right. This way, we don't accidentally use them after this block
-        let (col_ref_nodes, _, _) = Self::get_semantic_nodes(left, right);
-
-        // handle the different cases of column nodes
-        if col_ref_nodes.is_empty() {
-            unimplemented!()
-        } else if col_ref_nodes.len() == 1 {
-            unimplemented!()
-        } else if col_ref_nodes.len() == 2 {
-            match join_typ {
-                JoinType::Inner => {
-                    // the statistics objects of the referenced columns
-                    let col_ref_stats_list = col_ref_nodes.iter().map(|col_ref_node| {
-                        let col_ref_idx = col_ref_node.index();
-                        if let ColumnRef::BaseTableColumnRef { table, col_idx } = &column_refs[col_ref_idx] {
-                            if let Some(per_table_stats) = self.per_table_stats_map.get(table) {
-                                if let Some(Some(per_column_stats)) = per_table_stats.per_column_stats_vec.get(*col_idx)
-                                {
-                                    Some(per_column_stats)
-                                } else {
-                                    None
-                                }
-                            } else {
-                                None
-                            }
-                        } else {
-                            None
-                        }
-                    });
-                    let ndistincts = col_ref_stats_list.map(|col_ref_stats| {
-                        if let Some(col_ref_stats) = col_ref_stats {
-                            col_ref_stats.ndistinct
-                        } else {
-                            DEFAULT_NUM_DISTINCT
-                        }
-                    });
-                    // using reduce(f64::min) is the idiomatic workaround to the fact that f64 does not implement Ord due to NaN
-                    let selectivity = ndistincts.map(|ndistinct| 1.0 / ndistinct as f64).reduce(f64::min).expect("reduce() only returns None if the iterator is empty, which is impossible since col_ref_nodes.len() == 2");
-                    assert!(!selectivity.is_nan(), "it should be impossible for selectivity to be NaN since n-distinct is never 0");
-                    selectivity
-                }
-                _ => unimplemented!()
-            }
-        } else {
-            unreachable!("we could have at most pushed left and right into col_ref_nodes")
-        }
-    }
-
     /// Convert the left and right child nodes of some operation to what they semantically are
     /// This is convenient to avoid repeating the same logic just with "left" and "right" swapped
     fn get_semantic_nodes(left: OptRelNodeRef, right: OptRelNodeRef) -> (Vec<ColumnRefExpr>, Vec<OptRelNodeRef>, bool) {
@@ -925,7 +916,7 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         is_eq: bool,
     ) -> f64 {
         if let Some(per_table_stats) = self.per_table_stats_map.get(table) {
-            if let Some(Some(per_column_stats)) = per_table_stats.per_column_stats_vec.get(col_idx)
+            if let Some(per_column_stats) = per_table_stats.per_column_stats_map.get(&col_idx)
             {
                 let eq_freq = if let Some(freq) = per_column_stats.mcvs.freq(value) {
                     freq
@@ -975,7 +966,7 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         is_col_eq_val: bool,
     ) -> f64 {
         if let Some(per_table_stats) = self.per_table_stats_map.get(table) {
-            if let Some(Some(per_column_stats)) = per_table_stats.per_column_stats_vec.get(col_idx)
+            if let Some(per_column_stats) = per_table_stats.per_column_stats_map.get(&col_idx)
             {
                 // because distr does not include the values in MCVs, we need to compute the CDFs there as well
                 // because nulls return false in any comparison, they are never included when computing range selectivity
@@ -1063,10 +1054,10 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
 }
 
 impl<M: MostCommonValues, D: Distribution> PerTableStats<M, D> {
-    pub fn new(row_cnt: usize, per_column_stats_vec: Vec<Option<PerColumnStats<M, D>>>) -> Self {
+    pub fn new(row_cnt: usize, per_column_stats_map: HashMap<usize, PerColumnStats<M, D>>) -> Self {
         Self {
             row_cnt,
-            per_column_stats_vec,
+            per_column_stats_map,
         }
     }
 }
diff --git a/optd-datafusion-repr/src/plan_nodes.rs b/optd-datafusion-repr/src/plan_nodes.rs
index faf27e3a..e872b3a9 100644
--- a/optd-datafusion-repr/src/plan_nodes.rs
+++ b/optd-datafusion-repr/src/plan_nodes.rs
@@ -39,6 +39,8 @@ pub use sort::{LogicalSort, PhysicalSort};
 
 use crate::properties::schema::{Schema, SchemaPropertyBuilder};
 
+/// OptRelNodeTyp FAQ:
+///   - The define_plan_node!() macro defines what the children of each join node are
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub enum OptRelNodeTyp {
     Placeholder(GroupId),
diff --git a/optd-perftest/src/datafusion_dbms.rs b/optd-perftest/src/datafusion_dbms.rs
index e98d93e6..204b85e2 100644
--- a/optd-perftest/src/datafusion_dbms.rs
+++ b/optd-perftest/src/datafusion_dbms.rs
@@ -145,13 +145,13 @@ impl DatafusionDBMS {
 
         let mut estcards = vec![];
         for (query_id, sql_fpath) in tpch_kit.get_sql_fpath_ordered_iter(tpch_config)? {
-            let sql = fs::read_to_string(sql_fpath)?;
-            let estcard = self.eval_query_estcard(&sql).await?;
-            estcards.push(estcard);
             println!(
-                "done evaluating datafusion's estcard for TPC-H Q{}",
+                "about to evaluate datafusion's estcard for TPC-H Q{}",
                 query_id
             );
+            let sql = fs::read_to_string(sql_fpath)?;
+            let estcard = self.eval_query_estcard(&sql).await?;
+            estcards.push(estcard);
         }
 
         Ok(estcards)

From fdb1c8c7c3f517d38f48072daa248937c4a34f1e Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sat, 30 Mar 2024 22:07:39 -0400
Subject: [PATCH 06/29] wrote wrapper to extract join on condition

---
 optd-datafusion-repr/src/bin/test_optimize.rs |   4 +-
 optd-datafusion-repr/src/cost/base_cost.rs    | 135 +++++++++---------
 2 files changed, 71 insertions(+), 68 deletions(-)

diff --git a/optd-datafusion-repr/src/bin/test_optimize.rs b/optd-datafusion-repr/src/bin/test_optimize.rs
index eb7a80a1..c9a1feb8 100644
--- a/optd-datafusion-repr/src/bin/test_optimize.rs
+++ b/optd-datafusion-repr/src/bin/test_optimize.rs
@@ -1,4 +1,4 @@
-use std::sync::Arc;
+use std::{collections::HashMap, sync::Arc};
 
 use optd_core::{
     cascades::CascadesOptimizer,
@@ -45,7 +45,7 @@ pub fn main() {
         Box::new(OptCostModel::new(
             [("t1", 1000), ("t2", 100), ("t3", 10000)]
                 .into_iter()
-                .map(|(x, y)| (x.to_string(), DataFusionPerTableStats::new(y, vec![])))
+                .map(|(x, y)| (x.to_string(), DataFusionPerTableStats::new(y, HashMap::new())))
                 .collect(),
         )),
         vec![],
diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index 07e0660a..62517b89 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -1,7 +1,7 @@
 use std::{collections::HashMap, sync::Arc};
 
 use crate::plan_nodes::{
-    BinOpType, ColumnRefExpr, ConstantExpr, ConstantType, LogOpType, OptRelNode, UnOpType,
+    BinOpType, ColumnRefExpr, ConstantExpr, ConstantType, Expr, ExprList, LogOpExpr, LogOpType, OptRelNode, UnOpType
 };
 use crate::properties::column_ref::{ColumnRefPropertyBuilder, GroupColumnRefs};
 use crate::{
@@ -637,63 +637,84 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         }
     }
 
+    /// Check if an expr_tree is a join on-condition (an equality between two column refs that are not known to come from the same table), returning Some(pair of column refs) if it is and None otherwise
+    /// The check and the extraction live in the same function because their code is almost identical
+    fn get_on_col_ref_pair(expr_tree: OptRelNodeRef, column_refs: &GroupColumnRefs) -> Option<(ColumnRefExpr, ColumnRefExpr)> {
+        // We perform three checks to see if expr_tree is an on_col_ref_pair
+        // 1. Check that it's equality
+        if expr_tree.typ == OptRelNodeTyp::BinOp(BinOpType::Eq) {
+            let left_child = expr_tree.child(0);
+            let right_child = expr_tree.child(1);
+            // 2. Check that both sides are column refs
+            if left_child.typ == OptRelNodeTyp::ColumnRef && right_child.typ == OptRelNodeTyp::ColumnRef {
+                // 3. Check that both sides don't belong to the same table (if either side is not a BaseTableColumnRef we can't tell, so we treat them as not belonging to the same table)
+                let left_col_ref_expr = ColumnRefExpr::from_rel_node(left_child).expect("we already checked that the type is ColumnRef");
+                let right_col_ref_expr = ColumnRefExpr::from_rel_node(right_child).expect("we already checked that the type is ColumnRef");
+                let left_col_ref = &column_refs[left_col_ref_expr.index()];
+                let right_col_ref = &column_refs[right_col_ref_expr.index()];
+                let is_same_table = if let ColumnRef::BaseTableColumnRef { table: left_table, .. } = left_col_ref {
+                    if let ColumnRef::BaseTableColumnRef { table: right_table, .. } = right_col_ref {
+                        left_table == right_table
+                    } else {
+                        false
+                    }
+                } else {
+                    false
+                };
+                if !is_same_table {
+                    Some((left_col_ref_expr, right_col_ref_expr))
+                } else {
+                    None
+                }
+            } else {
+                None
+            }
+        } else {
+            None
+        }
+    }
+
     /// The expr_tree input must be a "mixed expression tree", just like with get_filter_selectivity()
-    /// The "wrapper" is here to separate the equality conditions from the filter conditions before calling
-    ///   the "main" get_join_selectivity() function.
-    fn get_join_selectivity_wrapper(
+    /// This is a "wrapper" to separate the equality conditions from the filter conditions before calling
+    ///   the "main" get_join_selectivity_core() function.
+    fn get_join_selectivity(
         &self,
         join_typ: JoinType,
         expr_tree: OptRelNodeRef,
         column_refs: &GroupColumnRefs,
     ) -> f64 {
-        println!("get_join_selectivity(): called on expr_tree={}", expr_tree);
         assert!(expr_tree.typ.is_expression());
-        match &expr_tree.typ {
-            OptRelNodeTyp::Constant(_) => Self::get_constant_selectivity(expr_tree),
-            OptRelNodeTyp::ColumnRef => unimplemented!("check bool type or else panic"),
-            OptRelNodeTyp::UnOp(_) => unimplemented!(),
-            OptRelNodeTyp::BinOp(bin_op_typ) => {
-                assert!(expr_tree.children.len() == 2);
-                let left_child = expr_tree.child(0);
-                let right_child = expr_tree.child(1);
-
-                if bin_op_typ.is_comparison() {
-                    self.get_join_comp_op_selectivity(
-                        join_typ,
-                        *bin_op_typ,
-                        left_child,
-                        right_child,
-                        column_refs,
-                    )
-                } else if bin_op_typ.is_numerical() {
-                    panic!(
-                        "the selectivity of operations that return numerical values is undefined"
-                    )
+        if expr_tree.typ == OptRelNodeTyp::LogOp(LogOpType::And) {
+            let mut on_col_ref_pairs = vec![];
+            let mut filter_expr_trees = vec![];
+            for child_expr_tree in &expr_tree.children {
+                if let Some(on_col_ref_pair) = Self::get_on_col_ref_pair(child_expr_tree.clone(), column_refs) {
+                    on_col_ref_pairs.push(on_col_ref_pair)
                 } else {
-                    unreachable!("all BinOpTypes should be true for at least one is_*() function")
+                    let child_expr = Expr::from_rel_node(child_expr_tree.clone()).expect("everything that is a direct child of an And node must be an expression");
+                    filter_expr_trees.push(child_expr);
                 }
-            },
-            OptRelNodeTyp::LogOp(log_op_typ) => {
-                self.get_join_log_op_selectivity(join_typ, *log_op_typ, &expr_tree.children, column_refs)
-            },
-            OptRelNodeTyp::Func(_) => unimplemented!("check bool type or else panic"),
-            OptRelNodeTyp::SortOrder(_) => {
-                panic!("the selectivity of sort order expressions is undefined")
             }
-            OptRelNodeTyp::Between => unimplemented!(),
-            OptRelNodeTyp::Cast => unimplemented!("check bool type or else panic"),
-            OptRelNodeTyp::Like => unimplemented!(),
-            OptRelNodeTyp::DataType(_) => {
-                panic!("the selectivity of a data type is not defined")
+            assert!(on_col_ref_pairs.len() + filter_expr_trees.len() == expr_tree.children.len());
+            let filter_expr_tree = if filter_expr_trees.is_empty() {
+                None
+            } else {
+                Some(LogOpExpr::new(
+                    LogOpType::And,
+                    ExprList::new(filter_expr_trees),
+                ).into_rel_node())
+            };
+            self.get_join_selectivity_core(join_typ, on_col_ref_pairs, filter_expr_tree, column_refs)
+        } else {
+            if let Some(on_col_ref_pair) = Self::get_on_col_ref_pair(expr_tree.clone(), column_refs) {
+                self.get_join_selectivity_core(join_typ, vec![on_col_ref_pair], None, column_refs)
+            } else {
+                self.get_join_selectivity_core(join_typ, vec![], Some(expr_tree), column_refs)
             }
-            OptRelNodeTyp::InList => unimplemented!(),
-            _ => unreachable!(
-                "all expression OptRelNodeTyp were enumerated. this should be unreachable"
-            ),
         }
     }
 
-    fn get_join_selectivity(
+    fn get_join_selectivity_core(
         &self,
         join_typ: JoinType,
         on_col_ref_pairs: Vec<(ColumnRefExpr, ColumnRefExpr)>,
@@ -1028,24 +1049,6 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         }
     }
 
-    fn get_join_log_op_selectivity(
-        &self,
-        join_typ: JoinType,
-        log_op_typ: LogOpType,
-        children: &[OptRelNodeRef],
-        column_refs: &GroupColumnRefs,
-    ) -> f64 {
-        let children_sel = children
-            .iter()
-            .map(|expr| self.get_join_selectivity(join_typ, expr.clone(), column_refs));
-
-        match log_op_typ {
-            LogOpType::And => children_sel.product(),
-            // the formula is 1.0 - the probability of _none_ of the events happening
-            LogOpType::Or => 1.0 - children_sel.fold(1.0, |acc, sel| acc * (1.0 - sel)),
-        }
-    }
-
     pub fn get_row_cnt(&self, table: &str) -> Option<usize> {
         self.per_table_stats_map
             .get(table)
@@ -1149,7 +1152,7 @@ mod tests {
         OptCostModel::new(
             vec![(
                 String::from(TABLE1_NAME),
-                PerTableStats::new(100, vec![Some(per_column_stats)]),
+                PerTableStats::new(100, vec![(0, per_column_stats)].into_iter().collect()),
             )]
             .into_iter()
             .collect(),
@@ -1158,13 +1161,13 @@ mod tests {
 
     // two columns is sufficient for all join selectivity tests
     fn create_two_column_cost_model(
+        per_column_stats0: TestPerColumnStats,
         per_column_stats1: TestPerColumnStats,
-        per_column_stats2: TestPerColumnStats,
     ) -> OptCostModel<TestMostCommonValues, TestDistribution> {
         OptCostModel::new(
             vec![(
                 String::from(TABLE1_NAME),
-                PerTableStats::new(100, vec![Some(per_column_stats1), Some(per_column_stats2)]),
+                PerTableStats::new(100, vec![(0, per_column_stats0), (1, per_column_stats1)].into_iter().collect()),
             )]
             .into_iter()
             .collect(),
@@ -1789,7 +1792,7 @@ mod tests {
     }
 
     #[test]
-    fn test_joinsel_colref_eq_colref_no_mcvs_no_nulls() {
+    fn test_joinsel_colref_eq_colref_no_nulls() {
         let cost_model = create_two_column_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::empty(),
             5,

From 25c2b75faf7775dca3e425afeaa47cc374e51fa3 Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 09:07:26 -0400
Subject: [PATCH 07/29] no cache -> rebuild cache

---
 optd-perftest/src/cardtest.rs               |  4 ++--
 optd-perftest/src/datafusion_dbms.rs        | 13 ++++++-------
 optd-perftest/src/main.rs                   |  8 ++++----
 optd-perftest/tests/cardtest_integration.rs |  2 +-
 4 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/optd-perftest/src/cardtest.rs b/optd-perftest/src/cardtest.rs
index a7de677a..0b9158cb 100644
--- a/optd-perftest/src/cardtest.rs
+++ b/optd-perftest/src/cardtest.rs
@@ -103,14 +103,14 @@ pub trait CardtestRunnerDBMSHelper {
 
 pub async fn cardtest<P: AsRef<Path>>(
     workspace_dpath: P,
-    no_cached_optd_stats: bool,
+    rebuild_cached_optd_stats: bool,
     pguser: &str,
     pgpassword: &str,
     tpch_config: TpchConfig,
 ) -> anyhow::Result<HashMap<String, Vec<Cardinfo>>> {
     let pg_dbms = Box::new(PostgresDBMS::build(&workspace_dpath, pguser, pgpassword)?);
     let truecard_getter = pg_dbms.clone();
-    let df_dbms = Box::new(DatafusionDBMS::new(&workspace_dpath, no_cached_optd_stats).await?);
+    let df_dbms = Box::new(DatafusionDBMS::new(&workspace_dpath, rebuild_cached_optd_stats).await?);
     let dbmss: Vec<Box<dyn CardtestRunnerDBMSHelper>> = vec![pg_dbms, df_dbms];
 
     let tpch_benchmark = Benchmark::Tpch(tpch_config.clone());
diff --git a/optd-perftest/src/datafusion_dbms.rs b/optd-perftest/src/datafusion_dbms.rs
index 204b85e2..25f76b34 100644
--- a/optd-perftest/src/datafusion_dbms.rs
+++ b/optd-perftest/src/datafusion_dbms.rs
@@ -34,7 +34,7 @@ use regex::Regex;
 
 pub struct DatafusionDBMS {
     workspace_dpath: PathBuf,
-    no_cached_stats: bool,
+    rebuild_cached_stats: bool,
     ctx: SessionContext,
 }
 
@@ -63,11 +63,11 @@ impl CardtestRunnerDBMSHelper for DatafusionDBMS {
 impl DatafusionDBMS {
     pub async fn new<P: AsRef<Path>>(
         workspace_dpath: P,
-        no_cached_stats: bool,
+        rebuild_cached_stats: bool,
     ) -> anyhow::Result<Self> {
         Ok(DatafusionDBMS {
             workspace_dpath: workspace_dpath.as_ref().to_path_buf(),
-            no_cached_stats,
+            rebuild_cached_stats,
             ctx: Self::new_session_ctx(None).await?,
         })
     }
@@ -213,7 +213,7 @@ impl DatafusionDBMS {
             .workspace_dpath
             .join("datafusion_stats_caches")
             .join(format!("{}.json", benchmark_fname));
-        if !self.no_cached_stats && stats_cache_fpath.exists() {
+        if !self.rebuild_cached_stats && stats_cache_fpath.exists() {
             let file = File::open(&stats_cache_fpath)?;
             Ok(serde_json::from_reader(file)?)
         } else {
@@ -222,9 +222,8 @@ impl DatafusionDBMS {
                 _ => unimplemented!(),
             };
 
-            // regardless of whether self.no_cached_stats is true or false, we want to update the cache
-            // this way, even if we choose not to read from the cache, the cache still always has the
-            // most up to date version of the stats
+            // When self.rebuild_cached_stats is true, we *don't read* from the cache but we still
+            //   *do write* to the cache.
             fs::create_dir_all(stats_cache_fpath.parent().unwrap())?;
             let file = File::create(&stats_cache_fpath)?;
             serde_json::to_writer(file, &base_table_stats)?;
diff --git a/optd-perftest/src/main.rs b/optd-perftest/src/main.rs
index 0611b746..6a28cfd0 100644
--- a/optd-perftest/src/main.rs
+++ b/optd-perftest/src/main.rs
@@ -39,11 +39,11 @@ enum Commands {
         #[clap(long)]
         #[clap(action)]
         #[clap(help = "Whether to use the cached optd stats/cache generated stats")]
-        // this is an option because you want to make it false whenever you update the
+        // this is an option because you want to make it true whenever you update the
         //   code for how stats are generated in optd, in order to not use cached stats
         // I found that I almost always want to use the cache though, which is why the
         //   system will use the cache by default
-        no_cached_optd_stats: bool,
+        rebuild_cached_optd_stats: bool,
 
         #[clap(long)]
         #[clap(default_value = "default_user")]
@@ -77,7 +77,7 @@ async fn main() -> anyhow::Result<()> {
             scale_factor,
             seed,
             query_ids,
-            no_cached_optd_stats,
+            rebuild_cached_optd_stats,
             pguser,
             pgpassword,
         } => {
@@ -89,7 +89,7 @@ async fn main() -> anyhow::Result<()> {
             };
             let cardinfo_alldbs = cardtest::cardtest(
                 &workspace_dpath,
-                no_cached_optd_stats,
+                rebuild_cached_optd_stats,
                 &pguser,
                 &pgpassword,
                 tpch_config,
diff --git a/optd-perftest/tests/cardtest_integration.rs b/optd-perftest/tests/cardtest_integration.rs
index 8b5c242d..327d4fa7 100644
--- a/optd-perftest/tests/cardtest_integration.rs
+++ b/optd-perftest/tests/cardtest_integration.rs
@@ -44,7 +44,7 @@ mod tests {
             // make sure scale factor is low so the test runs fast
             "--scale-factor",
             "0.01",
-            "--no-cached-optd-stats",
+            "--rebuild-cached-optd-stats",
             "--pguser",
             "test_user",
             "--pgpassword",

From 273aa0d07130d72e851bd25d2763a52219374c1a Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 09:20:37 -0400
Subject: [PATCH 08/29] refactored per col from map back to vec

---
 optd-datafusion-repr/src/bin/test_optimize.rs |   4 +-
 optd-datafusion-repr/src/cost/base_cost.rs    | 143 ++++++++----------
 2 files changed, 69 insertions(+), 78 deletions(-)

diff --git a/optd-datafusion-repr/src/bin/test_optimize.rs b/optd-datafusion-repr/src/bin/test_optimize.rs
index c9a1feb8..eb7a80a1 100644
--- a/optd-datafusion-repr/src/bin/test_optimize.rs
+++ b/optd-datafusion-repr/src/bin/test_optimize.rs
@@ -1,4 +1,4 @@
-use std::{collections::HashMap, sync::Arc};
+use std::sync::Arc;
 
 use optd_core::{
     cascades::CascadesOptimizer,
@@ -45,7 +45,7 @@ pub fn main() {
         Box::new(OptCostModel::new(
             [("t1", 1000), ("t2", 100), ("t3", 10000)]
                 .into_iter()
-                .map(|(x, y)| (x.to_string(), DataFusionPerTableStats::new(y, HashMap::new())))
+                .map(|(x, y)| (x.to_string(), DataFusionPerTableStats::new(y, vec![])))
                 .collect(),
         )),
         vec![],
diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index 62517b89..9466d072 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -94,7 +94,7 @@ impl MostCommonValues for MockMostCommonValues {
 #[derive(Serialize, Deserialize)]
 pub struct PerTableStats<M: MostCommonValues, D: Distribution> {
     row_cnt: usize,
-    per_column_stats_map: HashMap<usize, PerColumnStats<M, D>>,
+    per_column_stats_vec: Vec<Option<PerColumnStats<M, D>>>,
 }
 
 impl DataFusionPerTableStats {
@@ -150,20 +150,22 @@ impl DataFusionPerTableStats {
         }
 
         // Assemble the per-column stats.
-        let mut per_column_stats_map = HashMap::with_capacity(col_cnt);
+        let mut per_column_stats_vec = Vec::with_capacity(col_cnt);
         for i in 0..col_cnt {
-            if Self::is_type_supported(&col_types[i]) {
-                per_column_stats_map.insert(i, PerColumnStats::new(
+            per_column_stats_vec.push(if Self::is_type_supported(&col_types[i]) {
+                Some(PerColumnStats::new(
                     mcvs[i].take().unwrap(),
                     hlls[i].n_distinct(),
                     null_cnt[i] as f64 / row_cnt as f64,
                     distr[i].take().unwrap(),
-                ));
-            }
+                ))
+            } else {
+                None
+            });
         }
         Ok(Self {
             row_cnt,
-            per_column_stats_map,
+            per_column_stats_vec,
         })
     }
 
@@ -733,14 +735,18 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         join_on_selectivity * join_filter_selectivity
     }
 
-    fn get_per_col_stats(&self, col_ref: &ColumnRef) -> Option<&PerColumnStats<M, D>> {
+    fn get_per_column_stats_from_col_ref(&self, col_ref: &ColumnRef) -> Option<&PerColumnStats<M, D>> {
         if let ColumnRef::BaseTableColumnRef { table, col_idx } = col_ref {
-            self.per_table_stats_map.get(table).and_then(|per_table_stats| per_table_stats.per_column_stats_map.get(col_idx))
+            self.get_per_column_stats(table, *col_idx)
         } else {
             None
         }
     }
 
+    fn get_per_column_stats(&self, table: &str, col_idx: usize) -> Option<&PerColumnStats<M, D>> {
+        self.per_table_stats_map.get(table).and_then(|per_table_stats| per_table_stats.per_column_stats_vec[col_idx].as_ref())
+    }
+
     fn get_join_on_selectivity(
         &self,
         join_typ: JoinType,
@@ -750,8 +756,8 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         // multiply the selectivities of all individual conditions together
         on_col_ref_pairs.into_iter().map(|on_col_ref_pair| {
             // the formula for each pair is min(1 / ndistinct1, 1 / ndistinct2) (see https://postgrespro.com/blog/pgsql/5969618)
-            let ndistincts = vec![on_col_ref_pair.0, on_col_ref_pair.1].into_iter().map(|on_col_ref| {
-                match self.get_per_col_stats(&column_refs[on_col_ref.index()]) {
+            let ndistincts = vec![on_col_ref_pair.0, on_col_ref_pair.1].into_iter().map(|on_col_ref_expr| {
+                match self.get_per_column_stats_from_col_ref(&column_refs[on_col_ref_expr.index()]) {
                     Some(per_col_stats) => per_col_stats.ndistinct,
                     None => DEFAULT_NUM_DISTINCT,
                 }
@@ -936,31 +942,21 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         value: &Value,
         is_eq: bool,
     ) -> f64 {
-        if let Some(per_table_stats) = self.per_table_stats_map.get(table) {
-            if let Some(per_column_stats) = per_table_stats.per_column_stats_map.get(&col_idx)
-            {
-                let eq_freq = if let Some(freq) = per_column_stats.mcvs.freq(value) {
-                    freq
-                } else {
-                    let non_mcv_freq = 1.0 - per_column_stats.mcvs.total_freq();
-                    // always safe because usize is at least as large as i32
-                    let ndistinct_as_usize = per_column_stats.ndistinct as usize;
-                    let non_mcv_cnt = ndistinct_as_usize - per_column_stats.mcvs.cnt();
-                    // note that nulls are not included in ndistinct so we don't need to do non_mcv_cnt - 1 if null_frac > 0
-                    (non_mcv_freq - per_column_stats.null_frac) / (non_mcv_cnt as f64)
-                };
-                if is_eq {
-                    eq_freq
-                } else {
-                    1.0 - eq_freq - per_column_stats.null_frac
-                }
+        if let Some(per_column_stats) = self.get_per_column_stats(table, col_idx) {
+            let eq_freq = if let Some(freq) = per_column_stats.mcvs.freq(value) {
+                freq
             } else {
-                #[allow(clippy::collapsible_else_if)]
-                if is_eq {
-                    DEFAULT_EQ_SEL
-                } else {
-                    1.0 - DEFAULT_EQ_SEL
-                }
+                let non_mcv_freq = 1.0 - per_column_stats.mcvs.total_freq();
+                // always safe because usize is at least as large as i32
+                let ndistinct_as_usize = per_column_stats.ndistinct as usize;
+                let non_mcv_cnt = ndistinct_as_usize - per_column_stats.mcvs.cnt();
+                // note that nulls are not included in ndistinct so we don't need to do non_mcv_cnt - 1 if null_frac > 0
+                (non_mcv_freq - per_column_stats.null_frac) / (non_mcv_cnt as f64)
+            };
+            if is_eq {
+                eq_freq
+            } else {
+                1.0 - eq_freq - per_column_stats.null_frac
             }
         } else {
             #[allow(clippy::collapsible_else_if)]
@@ -986,46 +982,41 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         is_col_lt_val: bool,
         is_col_eq_val: bool,
     ) -> f64 {
-        if let Some(per_table_stats) = self.per_table_stats_map.get(table) {
-            if let Some(per_column_stats) = per_table_stats.per_column_stats_map.get(&col_idx)
-            {
-                // because distr does not include the values in MCVs, we need to compute the CDFs there as well
-                // because nulls return false in any comparison, they are never included when computing range selectivity
-                let distr_leq_freq = per_column_stats.distr.cdf(value);
-                let value_clone = value.clone(); // clone the value so that we can move it into the closure to avoid lifetime issues
-                                                 // TODO: in a future PR, figure out how to make Values comparable. rn I just hardcoded as_i32() to work around this
-                let pred = Box::new(move |val: &Value| val.as_i32() <= value_clone.as_i32());
-                let mcvs_leq_freq = per_column_stats.mcvs.freq_over_pred(pred);
-                let total_leq_freq = distr_leq_freq + mcvs_leq_freq;
-
-                // depending on whether value is in mcvs or not, we use different logic to turn total_leq_cdf into total_lt_cdf
-                // this logic just so happens to be the exact same logic as get_column_equality_selectivity implements
-                let total_lt_freq = total_leq_freq
-                    - self.get_column_equality_selectivity(table, col_idx, value, true);
-
-                // use either total_leq_freq or total_lt_freq to get the selectivity
-                if is_col_lt_val {
-                    if is_col_eq_val {
-                        // this branch means <=
-                        total_leq_freq
-                    } else {
-                        // this branch means <
-                        total_lt_freq
-                    }
+        if let Some(per_column_stats) = self.get_per_column_stats(table, col_idx) {
+            // because distr does not include the values in MCVs, we need to compute the CDFs there as well
+            // because nulls return false in any comparison, they are never included when computing range selectivity
+            let distr_leq_freq = per_column_stats.distr.cdf(value);
+            let value_clone = value.clone(); // clone the value so that we can move it into the closure to avoid lifetime issues
+                                                // TODO: in a future PR, figure out how to make Values comparable. rn I just hardcoded as_i32() to work around this
+            let pred = Box::new(move |val: &Value| val.as_i32() <= value_clone.as_i32());
+            let mcvs_leq_freq = per_column_stats.mcvs.freq_over_pred(pred);
+            let total_leq_freq = distr_leq_freq + mcvs_leq_freq;
+
+            // depending on whether value is in mcvs or not, we use different logic to turn total_leq_cdf into total_lt_cdf
+            // this logic just so happens to be the exact same logic as get_column_equality_selectivity implements
+            let total_lt_freq = total_leq_freq
+                - self.get_column_equality_selectivity(table, col_idx, value, true);
+
+            // use either total_leq_freq or total_lt_freq to get the selectivity
+            if is_col_lt_val {
+                if is_col_eq_val {
+                    // this branch means <=
+                    total_leq_freq
                 } else {
-                    // clippy wants me to collapse this into an else if, but keeping two nested if else statements is clearer
-                    #[allow(clippy::collapsible_else_if)]
-                    if is_col_eq_val {
-                        // this branch means >=, which is 1 - < - null_frac
-                        // we need to subtract null_frac since that isn't included in >= either
-                        1.0 - total_lt_freq - per_column_stats.null_frac
-                    } else {
-                        // this branch means >. same logic as above
-                        1.0 - total_leq_freq - per_column_stats.null_frac
-                    }
+                    // this branch means <
+                    total_lt_freq
                 }
             } else {
-                DEFAULT_INEQ_SEL
+                // clippy wants me to collapse this into an else if, but keeping two nested if else statements is clearer
+                #[allow(clippy::collapsible_else_if)]
+                if is_col_eq_val {
+                    // this branch means >=, which is 1 - < - null_frac
+                    // we need to subtract null_frac since that isn't included in >= either
+                    1.0 - total_lt_freq - per_column_stats.null_frac
+                } else {
+                    // this branch means >. same logic as above
+                    1.0 - total_leq_freq - per_column_stats.null_frac
+                }
             }
         } else {
             DEFAULT_INEQ_SEL
@@ -1057,10 +1048,10 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
 }
 
 impl<M: MostCommonValues, D: Distribution> PerTableStats<M, D> {
-    pub fn new(row_cnt: usize, per_column_stats_map: HashMap<usize, PerColumnStats<M, D>>) -> Self {
+    pub fn new(row_cnt: usize, per_column_stats_vec: Vec<Option<PerColumnStats<M, D>>>) -> Self {
         Self {
             row_cnt,
-            per_column_stats_map,
+            per_column_stats_vec,
         }
     }
 }
@@ -1152,7 +1143,7 @@ mod tests {
         OptCostModel::new(
             vec![(
                 String::from(TABLE1_NAME),
-                PerTableStats::new(100, vec![(0, per_column_stats)].into_iter().collect()),
+                PerTableStats::new(100, vec![Some(per_column_stats)]),
             )]
             .into_iter()
             .collect(),
@@ -1167,7 +1158,7 @@ mod tests {
         OptCostModel::new(
             vec![(
                 String::from(TABLE1_NAME),
-                PerTableStats::new(100, vec![(0, per_column_stats0), (1, per_column_stats1)].into_iter().collect()),
+                PerTableStats::new(100, vec![Some(per_column_stats0), Some(per_column_stats1)]),
             )]
             .into_iter()
             .collect(),

From 9e8b4f2fe2c3b92a4817d1cbc3ff6e7fa99ea22f Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 09:21:52 -0400
Subject: [PATCH 09/29] add comment explaining why per-column stats are a Vec of Options

---
 optd-datafusion-repr/src/cost/base_cost.rs | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index 9466d072..964929f3 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -94,6 +94,11 @@ impl MostCommonValues for MockMostCommonValues {
 #[derive(Serialize, Deserialize)]
 pub struct PerTableStats<M: MostCommonValues, D: Distribution> {
     row_cnt: usize,
+    // This is a Vec of Options instead of just a Vec because some columns may not have stats
+    //   due to their type being non-comparable.
+    // Further, I chose to represent it as a Vec of Options instead of a HashMap because a Vec
+    //   of Options clearly differentiates between two different failure modes: "out-of-bounds
+    //   access" and "column has no stats".
     per_column_stats_vec: Vec<Option<PerColumnStats<M, D>>>,
 }
 

From 3ff6d24944a50eaa0c4d4bf4c5385deb518701ef Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 09:30:01 -0400
Subject: [PATCH 10/29] fixed joinsel eq test to use two different tables

---
 optd-datafusion-repr/src/cost/base_cost.rs | 37 ++++++++++++++--------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index 964929f3..ad5ae1ac 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -14,6 +14,7 @@ use datafusion::arrow::array::{
     Int32Array, Int8Array, RecordBatch, RecordBatchIterator, RecordBatchReader, UInt16Array,
     UInt32Array, UInt8Array,
 };
+use datafusion_expr::col;
 use itertools::Itertools;
 use optd_core::{
     cascades::{CascadesOptimizer, RelNodeContext},
@@ -711,11 +712,15 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
                     ExprList::new(filter_expr_trees),
                 ).into_rel_node())
             };
+            println!("on_col_ref_pairs={:?}, filter_expr_tree={:?}", on_col_ref_pairs, filter_expr_tree);
             self.get_join_selectivity_core(join_typ, on_col_ref_pairs, filter_expr_tree, column_refs)
         } else {
+            println!("b, expr_tree={:?}, column_refs={:?}", expr_tree, column_refs);
             if let Some(on_col_ref_pair) = Self::get_on_col_ref_pair(expr_tree.clone(), column_refs) {
+                println!("c");
                 self.get_join_selectivity_core(join_typ, vec![on_col_ref_pair], None, column_refs)
             } else {
+                println!("d");
                 self.get_join_selectivity_core(join_typ, vec![], Some(expr_tree), column_refs)
             }
         }
@@ -1139,7 +1144,8 @@ mod tests {
         }
     }
 
-    const TABLE1_NAME: &str = "t1";
+    const TABLE1_NAME: &str = "table1";
+    const TABLE2_NAME: &str = "table2";
 
     // one column is sufficient for all filter selectivity tests
     fn create_one_column_cost_model(
@@ -1156,14 +1162,17 @@ mod tests {
     }
 
     // two columns is sufficient for all join selectivity tests
-    fn create_two_column_cost_model(
-        per_column_stats0: TestPerColumnStats,
-        per_column_stats1: TestPerColumnStats,
+    fn create_two_table_cost_model(
+        tbl1_per_column_stats: TestPerColumnStats,
+        tbl2_per_column_stats: TestPerColumnStats,
     ) -> OptCostModel<TestMostCommonValues, TestDistribution> {
         OptCostModel::new(
             vec![(
                 String::from(TABLE1_NAME),
-                PerTableStats::new(100, vec![Some(per_column_stats0), Some(per_column_stats1)]),
+                PerTableStats::new(100, vec![Some(tbl1_per_column_stats)]),
+            ), (
+                String::from(TABLE2_NAME),
+                PerTableStats::new(100, vec![Some(tbl2_per_column_stats)]),
             )]
             .into_iter()
             .collect(),
@@ -1789,7 +1798,7 @@ mod tests {
 
     #[test]
     fn test_joinsel_colref_eq_colref_no_nulls() {
-        let cost_model = create_two_column_cost_model(TestPerColumnStats::new(
+        let cost_model = create_two_table_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::empty(),
             5,
             0.0,
@@ -1806,8 +1815,8 @@ mod tests {
             table: String::from(TABLE1_NAME),
             col_idx: 0,
         }, ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE1_NAME),
-            col_idx: 1,
+            table: String::from(TABLE2_NAME),
+            col_idx: 0,
         }];
         assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.2);
         assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.2);
@@ -1815,7 +1824,7 @@ mod tests {
 
     #[test]
     fn test_joinsel_and() {
-        let cost_model = create_two_column_cost_model(TestPerColumnStats::new(
+        let cost_model = create_two_table_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::empty(),
             5,
             0.0,
@@ -1834,8 +1843,8 @@ mod tests {
             table: String::from(TABLE1_NAME),
             col_idx: 0,
         }, ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE1_NAME),
-            col_idx: 1,
+            table: String::from(TABLE2_NAME),
+            col_idx: 0,
         }];
         assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.04);
         assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.04);
@@ -1843,7 +1852,7 @@ mod tests {
 
     #[test]
     fn test_joinsel_or() {
-        let cost_model = create_two_column_cost_model(TestPerColumnStats::new(
+        let cost_model = create_two_table_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::empty(),
             5,
             0.0,
@@ -1862,8 +1871,8 @@ mod tests {
             table: String::from(TABLE1_NAME),
             col_idx: 0,
         }, ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE1_NAME),
-            col_idx: 1,
+            table: String::from(TABLE2_NAME),
+            col_idx: 0,
         }];
         assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.36);
         assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.36);

From bbbfbfc2abadcbffe13075b6ff69b15c5fd22c7b Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 09:30:49 -0400
Subject: [PATCH 11/29] removed joinsel or test

---
 optd-datafusion-repr/src/cost/base_cost.rs | 28 ----------------------
 1 file changed, 28 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index ad5ae1ac..d7fb762b 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -1849,32 +1849,4 @@ mod tests {
         assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.04);
         assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.04);
     }
-
-    #[test]
-    fn test_joinsel_or() {
-        let cost_model = create_two_table_cost_model(TestPerColumnStats::new(
-            TestMostCommonValues::empty(),
-            5,
-            0.0,
-            TestDistribution::empty(),
-        ), TestPerColumnStats::new(
-            TestMostCommonValues::empty(),
-            3,
-            0.0,
-            TestDistribution::empty(),
-        ));
-        let eq0and1 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1));
-        let eq1and0 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1));
-        let expr_tree = log_op(LogOpType::Or, vec![eq0and1.clone(), eq1and0.clone()]);
-        let expr_tree_rev = log_op(LogOpType::Or, vec![eq1and0.clone(), eq0and1.clone()]);
-        let column_refs = vec![ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE1_NAME),
-            col_idx: 0,
-        }, ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE2_NAME),
-            col_idx: 0,
-        }];
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.36);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.36);
-    }
 }

From 4c792f9620a4cc26ec0e164cf42badb084bbbf4b Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 09:40:16 -0400
Subject: [PATCH 12/29] oncond comment

---
 optd-datafusion-repr/src/cost/base_cost.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index d7fb762b..a486193d 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -1797,7 +1797,7 @@ mod tests {
     }
 
     #[test]
-    fn test_joinsel_colref_eq_colref_no_nulls() {
+    fn test_joinsel_oncond() {
         let cost_model = create_two_table_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::empty(),
             5,
@@ -1823,7 +1823,7 @@ mod tests {
     }
 
     #[test]
-    fn test_joinsel_and() {
+    fn test_joinsel_and_with_oncond() {
         let cost_model = create_two_table_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::empty(),
             5,
@@ -1849,4 +1849,6 @@ mod tests {
         assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.04);
         assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.04);
     }
+
+    // We don't test joinsel or with oncond because if there is an oncond (on condition), the top-level operator must be an AND
 }

From 2e23e20352831c228e15029d163a55d251774f13 Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 09:45:33 -0400
Subject: [PATCH 13/29] wrote unit tests for join sel

---
 optd-datafusion-repr/src/cost/base_cost.rs | 93 ++++++++++++++++++++--
 1 file changed, 86 insertions(+), 7 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index a486193d..cc3d2596 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -1075,10 +1075,9 @@ mod tests {
     use std::collections::HashMap;
 
     use crate::{
-        plan_nodes::{
+        cost::base_cost::DEFAULT_EQ_SEL, plan_nodes::{
             BinOpExpr, BinOpType, ColumnRefExpr, ConstantExpr, Expr, ExprList, JoinType, LogOpExpr, LogOpType, OptRelNode, OptRelNodeRef, UnOpExpr, UnOpType
-        },
-        properties::column_ref::ColumnRef,
+        }, properties::column_ref::ColumnRef
     };
 
     use super::{Distribution, MostCommonValues, OptCostModel, PerColumnStats, PerTableStats};
@@ -1805,7 +1804,7 @@ mod tests {
             TestDistribution::empty(),
         ), TestPerColumnStats::new(
             TestMostCommonValues::empty(),
-            3,
+            4,
             0.0,
             TestDistribution::empty(),
         ));
@@ -1823,7 +1822,7 @@ mod tests {
     }
 
     #[test]
-    fn test_joinsel_and_with_oncond() {
+    fn test_joinsel_and_of_onconds() {
         let cost_model = create_two_table_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::empty(),
             5,
@@ -1831,12 +1830,12 @@ mod tests {
             TestDistribution::empty(),
         ), TestPerColumnStats::new(
             TestMostCommonValues::empty(),
-            3,
+            4,
             0.0,
             TestDistribution::empty(),
         ));
         let eq0and1 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1));
-        let eq1and0 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1));
+        let eq1and0 = bin_op(BinOpType::Eq, col_ref(1), col_ref(0));
         let expr_tree = log_op(LogOpType::And, vec![eq0and1.clone(), eq1and0.clone()]);
         let expr_tree_rev = log_op(LogOpType::And, vec![eq1and0.clone(), eq0and1.clone()]);
         let column_refs = vec![ColumnRef::BaseTableColumnRef {
@@ -1850,5 +1849,85 @@ mod tests {
         assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.04);
     }
 
+    #[test]
+    fn test_joinsel_and_of_oncond_and_filter() {
+        let cost_model = create_two_table_cost_model(TestPerColumnStats::new(
+            TestMostCommonValues::empty(),
+            5,
+            0.0,
+            TestDistribution::empty(),
+        ), TestPerColumnStats::new(
+            TestMostCommonValues::empty(),
+            4,
+            0.0,
+            TestDistribution::empty(),
+        ));
+        let eq0and1 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1));
+        let eq100 = bin_op(BinOpType::Eq, col_ref(1), cnst(Value::Int32(100)));
+        let expr_tree = log_op(LogOpType::And, vec![eq0and1.clone(), eq100.clone()]);
+        let expr_tree_rev = log_op(LogOpType::And, vec![eq100.clone(), eq0and1.clone()]);
+        let column_refs = vec![ColumnRef::BaseTableColumnRef {
+            table: String::from(TABLE1_NAME),
+            col_idx: 0,
+        }, ColumnRef::BaseTableColumnRef {
+            table: String::from(TABLE2_NAME),
+            col_idx: 0,
+        }];
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.05);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.05);
+    }
+
+    #[test]
+    fn test_joinsel_and_of_filters() {
+        let cost_model = create_two_table_cost_model(TestPerColumnStats::new(
+            TestMostCommonValues::empty(),
+            5,
+            0.0,
+            TestDistribution::empty(),
+        ), TestPerColumnStats::new(
+            TestMostCommonValues::empty(),
+            4,
+            0.0,
+            TestDistribution::empty(),
+        ));
+        let neq12 = bin_op(BinOpType::Neq, col_ref(0), cnst(Value::Int32(12)));
+        let eq100 = bin_op(BinOpType::Eq, col_ref(1), cnst(Value::Int32(100)));
+        let expr_tree = log_op(LogOpType::And, vec![neq12.clone(), eq100.clone()]);
+        let expr_tree_rev = log_op(LogOpType::And, vec![eq100.clone(), neq12.clone()]);
+        let column_refs = vec![ColumnRef::BaseTableColumnRef {
+            table: String::from(TABLE1_NAME),
+            col_idx: 0,
+        }, ColumnRef::BaseTableColumnRef {
+            table: String::from(TABLE2_NAME),
+            col_idx: 0,
+        }];
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.2);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.2);
+    }
+
+    #[test]
+    fn test_joinsel_colref_eq_colref_same_table_not_oncond() {
+        let cost_model = create_two_table_cost_model(TestPerColumnStats::new(
+            TestMostCommonValues::empty(),
+            5,
+            0.0,
+            TestDistribution::empty(),
+        ), TestPerColumnStats::new(
+            TestMostCommonValues::empty(),
+            4,
+            0.0,
+            TestDistribution::empty(),
+        ));
+        let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(0));
+        let column_refs = vec![ColumnRef::BaseTableColumnRef {
+            table: String::from(TABLE1_NAME),
+            col_idx: 0,
+        }, ColumnRef::BaseTableColumnRef {
+            table: String::from(TABLE2_NAME),
+            col_idx: 0,
+        }];
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), DEFAULT_EQ_SEL);
+    }
+
     // We don't test joinsel or with oncond because if there is an oncond (on condition), the top-level operator must be an AND
 }

From 3428ebb439c53d6bbe028248c4c31bd312ace272 Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 09:46:29 -0400
Subject: [PATCH 14/29] now checking join type inner

---
 optd-datafusion-repr/src/cost/base_cost.rs | 34 ++++++++++++----------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index cc3d2596..85cba8de 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -14,7 +14,6 @@ use datafusion::arrow::array::{
     Int32Array, Int8Array, RecordBatch, RecordBatchIterator, RecordBatchReader, UInt16Array,
     UInt32Array, UInt8Array,
 };
-use datafusion_expr::col;
 use itertools::Itertools;
 use optd_core::{
     cascades::{CascadesOptimizer, RelNodeContext},
@@ -763,20 +762,25 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         on_col_ref_pairs: Vec<(ColumnRefExpr, ColumnRefExpr)>,
         column_refs: &GroupColumnRefs
     ) -> f64 {
-        // multiply the selectivities of all individual conditions together
-        on_col_ref_pairs.into_iter().map(|on_col_ref_pair| {
-            // the formula for each pair is min(1 / ndistinct1, 1 / ndistinct2) (see https://postgrespro.com/blog/pgsql/5969618)
-            let ndistincts = vec![on_col_ref_pair.0, on_col_ref_pair.1].into_iter().map(|on_col_ref_expr| {
-                match self.get_per_column_stats_from_col_ref(&column_refs[on_col_ref_expr.index()]) {
-                    Some(per_col_stats) => per_col_stats.ndistinct,
-                    None => DEFAULT_NUM_DISTINCT,
-                }
-            });
-            // using reduce(f64::min) is the idiomatic workaround to the fact that f64 does not implement Ord due to NaN
-            let selectivity = ndistincts.map(|ndistinct| 1.0 / ndistinct as f64).reduce(f64::min).expect("reduce() only returns None if the iterator is empty, which is impossible since col_ref_nodes.len() == 2");
-            assert!(!selectivity.is_nan(), "it should be impossible for selectivity to be NaN since n-distinct is never 0");
-            selectivity
-        }).product()
+        match join_typ {
+            JoinType::Inner => {
+                // multiply the selectivities of all individual conditions together
+                on_col_ref_pairs.into_iter().map(|on_col_ref_pair| {
+                    // the formula for each pair is min(1 / ndistinct1, 1 / ndistinct2) (see https://postgrespro.com/blog/pgsql/5969618)
+                    let ndistincts = vec![on_col_ref_pair.0, on_col_ref_pair.1].into_iter().map(|on_col_ref_expr| {
+                        match self.get_per_column_stats_from_col_ref(&column_refs[on_col_ref_expr.index()]) {
+                            Some(per_col_stats) => per_col_stats.ndistinct,
+                            None => DEFAULT_NUM_DISTINCT,
+                        }
+                    });
+                    // using reduce(f64::min) is the idiomatic workaround to the fact that f64 does not implement Ord due to NaN
+                    let selectivity = ndistincts.map(|ndistinct| 1.0 / ndistinct as f64).reduce(f64::min).expect("reduce() only returns None if the iterator is empty, which is impossible since col_ref_nodes.len() == 2");
+                    assert!(!selectivity.is_nan(), "it should be impossible for selectivity to be NaN since n-distinct is never 0");
+                    selectivity
+                }).product()
+            }
+            _ => unimplemented!(),
+        }
     }
 
     /// Comparison operators are the base case for recursion in get_filter_selectivity()

From 673e4aabf469cc78890b1854f6c5d104a35a5449 Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 09:55:31 -0400
Subject: [PATCH 15/29] fixed q11: use default selectivity for non-base-table column refs

---
 optd-datafusion-repr/src/cost/base_cost.rs | 51 ++++++++++------------
 1 file changed, 24 insertions(+), 27 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index 85cba8de..c93ba762 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -588,6 +588,7 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         expr_tree: OptRelNodeRef,
         column_refs: &GroupColumnRefs,
     ) -> f64 {
+        println!("expr_tree={:?}", expr_tree);
         assert!(expr_tree.typ.is_expression());
         match &expr_tree.typ {
             OptRelNodeTyp::Constant(_) => Self::get_constant_selectivity(expr_tree),
@@ -711,15 +712,11 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
                     ExprList::new(filter_expr_trees),
                 ).into_rel_node())
             };
-            println!("on_col_ref_pairs={:?}, filter_expr_tree={:?}", on_col_ref_pairs, filter_expr_tree);
             self.get_join_selectivity_core(join_typ, on_col_ref_pairs, filter_expr_tree, column_refs)
         } else {
-            println!("b, expr_tree={:?}, column_refs={:?}", expr_tree, column_refs);
             if let Some(on_col_ref_pair) = Self::get_on_col_ref_pair(expr_tree.clone(), column_refs) {
-                println!("c");
                 self.get_join_selectivity_core(join_typ, vec![on_col_ref_pair], None, column_refs)
             } else {
-                println!("d");
                 self.get_join_selectivity_core(join_typ, vec![], Some(expr_tree), column_refs)
             }
         }
@@ -774,7 +771,7 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
                         }
                     });
                     // using reduce(f64::min) is the idiomatic workaround to the fact that f64 does not implement Ord due to NaN
-                    let selectivity = ndistincts.map(|ndistinct| 1.0 / ndistinct as f64).reduce(f64::min).expect("reduce() only returns None if the iterator is empty, which is impossible since col_ref_nodes.len() == 2");
+                    let selectivity = ndistincts.map(|ndistinct| 1.0 / ndistinct as f64).reduce(f64::min).expect("reduce() only returns None if the iterator is empty, which is impossible since col_ref_exprs.len() == 2");
                     assert!(!selectivity.is_nan(), "it should be impossible for selectivity to be NaN since n-distinct is never 0");
                     selectivity
                 }).product()
@@ -794,25 +791,25 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         assert!(comp_bin_op_typ.is_comparison());
 
         // I intentionally performed moves on left and right. This way, we don't accidentally use them after this block
-        let (col_ref_nodes, non_col_ref_nodes, is_left_col_ref) = Self::get_semantic_nodes(left, right);
+        let (col_ref_exprs, non_col_ref_exprs, is_left_col_ref) = Self::get_semantic_nodes(left, right);
 
         // handle the different cases of column nodes
-        if col_ref_nodes.is_empty() {
+        if col_ref_exprs.is_empty() {
             UNIMPLEMENTED_SEL
-        } else if col_ref_nodes.len() == 1 {
-            let col_ref_node = col_ref_nodes
+        } else if col_ref_exprs.len() == 1 {
+            let col_ref_expr = col_ref_exprs
                 .first()
-                .expect("we just checked that col_ref_nodes.len() == 1");
-            let col_ref_idx = col_ref_node.index();
+                .expect("we just checked that col_ref_exprs.len() == 1");
+            let col_ref_idx = col_ref_expr.index();
 
             if let ColumnRef::BaseTableColumnRef { table, col_idx } = &column_refs[col_ref_idx] {
-                let non_col_ref_node = non_col_ref_nodes
+                let non_col_ref_expr = non_col_ref_exprs
                     .first()
-                    .expect("non_col_ref_nodes should have a value since col_ref_nodes.len() == 1");
+                    .expect("non_col_ref_exprs should have a value since col_ref_exprs.len() == 1");
 
-                match non_col_ref_node.as_ref().typ {
+                match non_col_ref_expr.as_ref().typ {
                     OptRelNodeTyp::Constant(_) => {
-                        let value = non_col_ref_node
+                        let value = non_col_ref_expr
                             .as_ref()
                             .data
                             .as_ref()
@@ -861,46 +858,46 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
                     OptRelNodeTyp::Cast => UNIMPLEMENTED_SEL,
                     _ => unimplemented!(
                         "unhandled case of comparing a column ref node to {}",
-                        non_col_ref_node.as_ref().typ
+                        non_col_ref_expr.as_ref().typ
                     ),
                 }
             } else {
-                unimplemented!("non base table column refs need to be implemented")
+                Self::get_default_comparison_op_selectivity(comp_bin_op_typ)
             }
-        } else if col_ref_nodes.len() == 2 {
+        } else if col_ref_exprs.len() == 2 {
             Self::get_default_comparison_op_selectivity(comp_bin_op_typ)
         } else {
-            unreachable!("we could have at most pushed left and right into col_ref_nodes")
+            unreachable!("we could have at most pushed left and right into col_ref_exprs")
         }
     }
 
     /// Convert the left and right child nodes of some operation to what they semantically are
     /// This is convenient to avoid repeating the same logic just with "left" and "right" swapped
     fn get_semantic_nodes(left: OptRelNodeRef, right: OptRelNodeRef) -> (Vec<ColumnRefExpr>, Vec<OptRelNodeRef>, bool) {
-        let mut col_ref_nodes = vec![];
-        let mut non_col_ref_nodes = vec![];
+        let mut col_ref_exprs = vec![];
+        let mut non_col_ref_exprs = vec![];
         let is_left_col_ref;
         // I intentionally performed moves on left and right. This way, we don't accidentally use them after this block
-        // We always want to use "col_ref_node" and "non_col_ref_node" instead of "left" or "right"
+        // We always want to use "col_ref_expr" and "non_col_ref_expr" instead of "left" or "right"
         if left.as_ref().typ == OptRelNodeTyp::ColumnRef {
             is_left_col_ref = true;
-            col_ref_nodes.push(
+            col_ref_exprs.push(
                 ColumnRefExpr::from_rel_node(left)
                     .expect("we already checked that the type is ColumnRef"),
             );
         } else {
             is_left_col_ref = false;
-            non_col_ref_nodes.push(left);
+            non_col_ref_exprs.push(left);
         }
         if right.as_ref().typ == OptRelNodeTyp::ColumnRef {
-            col_ref_nodes.push(
+            col_ref_exprs.push(
                 ColumnRefExpr::from_rel_node(right)
                     .expect("we already checked that the type is ColumnRef"),
             );
         } else {
-            non_col_ref_nodes.push(right);
+            non_col_ref_exprs.push(right);
         }
-        (col_ref_nodes, non_col_ref_nodes, is_left_col_ref)
+        (col_ref_exprs, non_col_ref_exprs, is_left_col_ref)
     }
 
     /// The default selectivity of a comparison expression

From 23d0abf711867fbbc3eb853bbb9d0f0abc64d34f Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 10:14:26 -0400
Subject: [PATCH 16/29] custom row counts for two-table test cost model; add outer join oncond test

---
 optd-datafusion-repr/src/cost/base_cost.rs | 58 ++++++++++++++++++----
 1 file changed, 48 insertions(+), 10 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index c93ba762..421e515d 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -588,7 +588,6 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         expr_tree: OptRelNodeRef,
         column_refs: &GroupColumnRefs,
     ) -> f64 {
-        println!("expr_tree={:?}", expr_tree);
         assert!(expr_tree.typ.is_expression());
         match &expr_tree.typ {
             OptRelNodeTyp::Constant(_) => Self::get_constant_selectivity(expr_tree),
@@ -1161,18 +1160,28 @@ mod tests {
         )
     }
 
-    // two columns is sufficient for all join selectivity tests
+    /// Two columns is sufficient for all join selectivity tests
     fn create_two_table_cost_model(
         tbl1_per_column_stats: TestPerColumnStats,
         tbl2_per_column_stats: TestPerColumnStats,
+    ) -> OptCostModel<TestMostCommonValues, TestDistribution> {
+        create_two_table_cost_model_custom_row_cnts(tbl1_per_column_stats, tbl2_per_column_stats, 100, 100)
+    }
+
+    /// We need custom row counts because some join algorithms rely on the row cnt
+    fn create_two_table_cost_model_custom_row_cnts(
+        tbl1_per_column_stats: TestPerColumnStats,
+        tbl2_per_column_stats: TestPerColumnStats,
+        tbl1_row_cnt: usize,
+        tbl2_row_cnt: usize,
     ) -> OptCostModel<TestMostCommonValues, TestDistribution> {
         OptCostModel::new(
             vec![(
                 String::from(TABLE1_NAME),
-                PerTableStats::new(100, vec![Some(tbl1_per_column_stats)]),
+                PerTableStats::new(tbl1_row_cnt, vec![Some(tbl1_per_column_stats)]),
             ), (
                 String::from(TABLE2_NAME),
-                PerTableStats::new(100, vec![Some(tbl2_per_column_stats)]),
+                PerTableStats::new(tbl2_row_cnt, vec![Some(tbl2_per_column_stats)]),
             )]
             .into_iter()
             .collect(),
@@ -1790,14 +1799,14 @@ mod tests {
     }
 
     #[test]
-    fn test_joinsel_const() {
+    fn test_joinsel_inner_const() {
         let cost_model = create_one_column_cost_model(get_empty_per_col_stats());
         assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(true)), &vec![]), 1.0);
         assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(false)), &vec![]), 0.0);
     }
 
     #[test]
-    fn test_joinsel_oncond() {
+    fn test_joinsel_inner_oncond() {
         let cost_model = create_two_table_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::empty(),
             5,
@@ -1823,7 +1832,7 @@ mod tests {
     }
 
     #[test]
-    fn test_joinsel_and_of_onconds() {
+    fn test_joinsel_inner_and_of_onconds() {
         let cost_model = create_two_table_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::empty(),
             5,
@@ -1851,7 +1860,7 @@ mod tests {
     }
 
     #[test]
-    fn test_joinsel_and_of_oncond_and_filter() {
+    fn test_joinsel_inner_and_of_oncond_and_filter() {
         let cost_model = create_two_table_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::empty(),
             5,
@@ -1879,7 +1888,7 @@ mod tests {
     }
 
     #[test]
-    fn test_joinsel_and_of_filters() {
+    fn test_joinsel_inner_and_of_filters() {
         let cost_model = create_two_table_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::empty(),
             5,
@@ -1907,7 +1916,7 @@ mod tests {
     }
 
     #[test]
-    fn test_joinsel_colref_eq_colref_same_table_not_oncond() {
+    fn test_joinsel_inner_colref_eq_colref_same_table_is_not_oncond() {
         let cost_model = create_two_table_cost_model(TestPerColumnStats::new(
             TestMostCommonValues::empty(),
             5,
@@ -1931,4 +1940,33 @@ mod tests {
     }
 
     // We don't test joinsel or with oncond because if there is an oncond (on condition), the top-level operator must be an AND
+
+    #[test]
+    fn test_joinsel_outer_oncond() {
+        let cost_model = create_two_table_cost_model(TestPerColumnStats::new(
+            TestMostCommonValues::empty(),
+            5,
+            0.0,
+            TestDistribution::empty(),
+        ), TestPerColumnStats::new(
+            TestMostCommonValues::empty(),
+            4,
+            0.0,
+            TestDistribution::empty(),
+        ));
+        // since we're talking about left and right outer joins, the order actually matters now
+        let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1));
+        let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0));
+        let column_refs = vec![ColumnRef::BaseTableColumnRef {
+            table: String::from(TABLE1_NAME),
+            col_idx: 0,
+        }, ColumnRef::BaseTableColumnRef {
+            table: String::from(TABLE2_NAME),
+            col_idx: 0,
+        }];
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree.clone(), &column_refs), DEFAULT_EQ_SEL);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), DEFAULT_EQ_SEL);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), DEFAULT_EQ_SEL);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree.clone(), &column_refs), DEFAULT_EQ_SEL);
+    }
 }

From 38368541a92f1a16b513e37e419d347a925a19ff Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 10:32:53 -0400
Subject: [PATCH 17/29] wrote unit tests for outer sel

---
 optd-datafusion-repr/src/cost/base_cost.rs | 131 +++++++++++++++++++--
 1 file changed, 124 insertions(+), 7 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index 421e515d..af7653d7 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -1941,9 +1941,10 @@ mod tests {
 
     // We don't test joinsel or with oncond because if there is an oncond (on condition), the top-level operator must be an AND
 
+    /// Unique oncond means an oncondition on columns which are unique in both tables
     #[test]
-    fn test_joinsel_outer_oncond() {
-        let cost_model = create_two_table_cost_model(TestPerColumnStats::new(
+    fn test_joinsel_outer_unique_oncond() {
+        let cost_model = create_two_table_cost_model_custom_row_cnts(TestPerColumnStats::new(
             TestMostCommonValues::empty(),
             5,
             0.0,
@@ -1953,7 +1954,75 @@ mod tests {
             4,
             0.0,
             TestDistribution::empty(),
-        ));
+        ), 5, 4);
+        // since we're talking about left and right outer joins, the order actually matters now
+        let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1));
+        let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0));
+        let column_refs = vec![ColumnRef::BaseTableColumnRef {
+            table: String::from(TABLE1_NAME),
+            col_idx: 0,
+        }, ColumnRef::BaseTableColumnRef {
+            table: String::from(TABLE2_NAME),
+            col_idx: 0,
+        }];
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree.clone(), &column_refs), 0.25);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), 0.25);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), 0.2);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree.clone(), &column_refs), 0.2);
+    }
+
+    /// Non-unique oncond means the column is not unique in either table
+    /// Inner always >= row count means that the inner join result is >= the row count of both tables
+    #[test]
+    fn test_joinsel_outer_nonunique_oncond_inner_always_geq_rowcnt() {
+        let cost_model = create_two_table_cost_model_custom_row_cnts(TestPerColumnStats::new(
+            TestMostCommonValues::empty(),
+            5,
+            0.0,
+            TestDistribution::empty(),
+        ), TestPerColumnStats::new(
+            TestMostCommonValues::empty(),
+            4,
+            0.0,
+            TestDistribution::empty(),
+        ), 10, 8);
+        // since we're talking about left and right outer joins, the order actually matters now
+        let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1));
+        let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0));
+        let column_refs = vec![ColumnRef::BaseTableColumnRef {
+            table: String::from(TABLE1_NAME),
+            col_idx: 0,
+        }, ColumnRef::BaseTableColumnRef {
+            table: String::from(TABLE2_NAME),
+            col_idx: 0,
+        }];
+        // sanity check the expected inner sel
+        let expected_inner_sel = 0.2;
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel);
+        // check the outer sels
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree.clone(), &column_refs), 0.2);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), 0.2);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), 0.2);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree.clone(), &column_refs), 0.2);
+    }
+
+    /// Non-unique oncond means the column is not unique in either table
+    /// Inner sometimes < row count means that the inner join result < the row count of at least one table.
+    ///   Note that without a join filter, it's impossible to be less than the row count of both tables
+    #[test]
+    fn test_joinsel_outer_nonunique_oncond_inner_sometimes_lt_rowcnt() {
+        let cost_model = create_two_table_cost_model_custom_row_cnts(TestPerColumnStats::new(
+            TestMostCommonValues::empty(),
+            10,
+            0.0,
+            TestDistribution::empty(),
+        ), TestPerColumnStats::new(
+            TestMostCommonValues::empty(),
+            2,
+            0.0,
+            TestDistribution::empty(),
+        ), 20, 4);
         // since we're talking about left and right outer joins, the order actually matters now
         let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1));
         let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0));
@@ -1964,9 +2033,57 @@ mod tests {
             table: String::from(TABLE2_NAME),
             col_idx: 0,
         }];
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree.clone(), &column_refs), DEFAULT_EQ_SEL);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), DEFAULT_EQ_SEL);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), DEFAULT_EQ_SEL);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree.clone(), &column_refs), DEFAULT_EQ_SEL);
+        // sanity check the expected inner sel
+        let expected_inner_sel = 0.1;
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel);
+        // check the outer sels
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree.clone(), &column_refs), 0.25);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), 0.25);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), 0.1);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree.clone(), &column_refs), 0.1);
+    }
+
+    /// Unique oncond means an oncondition on columns which are unique in both tables
+    /// Filter means we're adding a join filter
+    /// Inner sometimes < row count means that the inner join result < the row count of at least one table.
+    #[test]
+    fn test_joinsel_outer_unique_oncond_filter_inner_sometimes_lt_rowcnt() {
+        let cost_model = create_two_table_cost_model_custom_row_cnts(TestPerColumnStats::new(
+            TestMostCommonValues::empty(),
+            50,
+            0.0,
+            TestDistribution::new(vec![
+                (Value::Int32(128), 0.4)
+            ]),
+        ), TestPerColumnStats::new(
+            TestMostCommonValues::empty(),
+            4,
+            0.0,
+            TestDistribution::empty(),
+        ), 50, 4);
+        // since we're talking about left and right outer joins, the order actually matters now
+        let eq0and1 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1));
+        let eq1and0 = bin_op(BinOpType::Eq, col_ref(1), col_ref(0));
+        let filter = bin_op(BinOpType::Leq, col_ref(0), cnst(Value::Int32(128)));
+        let expr_tree = log_op(LogOpType::And, vec![eq0and1, filter.clone()]);
+        // inner rev means its the inner expr (the eq op) whose children are being reversed, as opposed to the and op
+        let expr_tree_inner_rev = log_op(LogOpType::And, vec![eq1and0, filter.clone()]);
+        let column_refs = vec![ColumnRef::BaseTableColumnRef {
+            table: String::from(TABLE1_NAME),
+            col_idx: 0,
+        }, ColumnRef::BaseTableColumnRef {
+            table: String::from(TABLE2_NAME),
+            col_idx: 0,
+        }];
+        // sanity check the expected inner sel
+        let expected_inner_sel = 0.008;
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_inner_rev.clone(), &column_refs), expected_inner_sel);
+        // check the outer sels
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree.clone(), &column_refs), 0.25);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree_inner_rev.clone(), &column_refs), 0.25);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree_inner_rev.clone(), &column_refs), 0.02);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree.clone(), &column_refs), 0.02);
     }
 }

From 2f473058db9bf4544e2d3b79d48b4745d6af0258 Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 11:04:32 -0400
Subject: [PATCH 18/29] refactored unit tests to pass row cnt properly

---
 optd-datafusion-repr/src/cost/base_cost.rs | 166 ++++++++++++---------
 1 file changed, 95 insertions(+), 71 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index af7653d7..b35b403a 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -493,7 +493,7 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
                             let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false);
                             // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information
                             if let Some(expr_tree) = expr_trees.first() {
-                                self.get_join_selectivity(*join_typ, Arc::clone(expr_tree), &column_refs)
+                                self.get_join_selectivity(*join_typ, Arc::clone(expr_tree), &column_refs, row_cnt_1, row_cnt_2)
                             } else {
                                 panic!("encountered a join without an expression")
                             }
@@ -689,6 +689,8 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         join_typ: JoinType,
         expr_tree: OptRelNodeRef,
         column_refs: &GroupColumnRefs,
+        left_row_cnt: f64,
+        right_row_cnt: f64,
     ) -> f64 {
         assert!(expr_tree.typ.is_expression());
         if expr_tree.typ == OptRelNodeTyp::LogOp(LogOpType::And) {
@@ -711,12 +713,12 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
                     ExprList::new(filter_expr_trees),
                 ).into_rel_node())
             };
-            self.get_join_selectivity_core(join_typ, on_col_ref_pairs, filter_expr_tree, column_refs)
+            self.get_join_selectivity_core(join_typ, on_col_ref_pairs, filter_expr_tree, column_refs, left_row_cnt, right_row_cnt)
         } else {
             if let Some(on_col_ref_pair) = Self::get_on_col_ref_pair(expr_tree.clone(), column_refs) {
-                self.get_join_selectivity_core(join_typ, vec![on_col_ref_pair], None, column_refs)
+                self.get_join_selectivity_core(join_typ, vec![on_col_ref_pair], None, column_refs, left_row_cnt, right_row_cnt)
             } else {
-                self.get_join_selectivity_core(join_typ, vec![], Some(expr_tree), column_refs)
+                self.get_join_selectivity_core(join_typ, vec![], Some(expr_tree), column_refs, left_row_cnt, right_row_cnt)
             }
         }
     }
@@ -727,8 +729,10 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         on_col_ref_pairs: Vec<(ColumnRefExpr, ColumnRefExpr)>,
         filter_expr_tree: Option<OptRelNodeRef>,
         column_refs: &GroupColumnRefs,
+        left_row_cnt: f64,
+        right_row_cnt: f64,
     ) -> f64 {
-        let join_on_selectivity = self.get_join_on_selectivity(join_typ, on_col_ref_pairs, column_refs);
+        let join_on_selectivity = self.get_join_on_selectivity(on_col_ref_pairs, column_refs);
         // Currently, there is no difference in how we handle a join filter and a select filter, so we use the same function
         // One difference (that we *don't* care about right now) is that join filters can contain expressions from multiple
         //   different tables. Currently, this doesn't affect the get_filter_selectivity() function, but this may change in
@@ -737,7 +741,13 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
             Some(filter_expr_tree) => self.get_filter_selectivity(filter_expr_tree, column_refs),
             None => 1.0,
         };
-        join_on_selectivity * join_filter_selectivity
+        let inner_join_selectivity = join_on_selectivity * join_filter_selectivity;
+        match join_typ {
+            JoinType::Inner => inner_join_selectivity,
+            JoinType::LeftOuter => f64::max(inner_join_selectivity, 1.0 / right_row_cnt),
+            JoinType::RightOuter => f64::max(inner_join_selectivity, 1.0 / left_row_cnt),
+            _ => unimplemented!()
+        }
     }
 
     fn get_per_column_stats_from_col_ref(&self, col_ref: &ColumnRef) -> Option<&PerColumnStats<M, D>> {
@@ -752,31 +762,27 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         self.per_table_stats_map.get(table).and_then(|per_table_stats| per_table_stats.per_column_stats_vec[col_idx].as_ref())
     }
 
+    /// Get the selectivity of the on conditions
+    /// Note that the selectivity of the on conditions does not depend on join type. Join type is accounted for separately in get_join_selectivity_core()
     fn get_join_on_selectivity(
         &self,
-        join_typ: JoinType,
         on_col_ref_pairs: Vec<(ColumnRefExpr, ColumnRefExpr)>,
         column_refs: &GroupColumnRefs
     ) -> f64 {
-        match join_typ {
-            JoinType::Inner => {
-                // multiply the selectivities of all individual conditions together
-                on_col_ref_pairs.into_iter().map(|on_col_ref_pair| {
-                    // the formula for each pair is min(1 / ndistinct1, 1 / ndistinct2) (see https://postgrespro.com/blog/pgsql/5969618)
-                    let ndistincts = vec![on_col_ref_pair.0, on_col_ref_pair.1].into_iter().map(|on_col_ref_expr| {
-                        match self.get_per_column_stats_from_col_ref(&column_refs[on_col_ref_expr.index()]) {
-                            Some(per_col_stats) => per_col_stats.ndistinct,
-                            None => DEFAULT_NUM_DISTINCT,
-                        }
-                    });
-                    // using reduce(f64::min) is the idiomatic workaround to the fact that f64 does not implement Ord due to NaN
-                    let selectivity = ndistincts.map(|ndistinct| 1.0 / ndistinct as f64).reduce(f64::min).expect("reduce() only returns None if the iterator is empty, which is impossible since col_ref_exprs.len() == 2");
-                    assert!(!selectivity.is_nan(), "it should be impossible for selectivity to be NaN since n-distinct is never 0");
-                    selectivity
-                }).product()
-            }
-            _ => unimplemented!(),
-        }
+        // multiply the selectivities of all individual conditions together
+        on_col_ref_pairs.into_iter().map(|on_col_ref_pair| {
+            // the formula for each pair is min(1 / ndistinct1, 1 / ndistinct2) (see https://postgrespro.com/blog/pgsql/5969618)
+            let ndistincts = vec![on_col_ref_pair.0, on_col_ref_pair.1].into_iter().map(|on_col_ref_expr| {
+                match self.get_per_column_stats_from_col_ref(&column_refs[on_col_ref_expr.index()]) {
+                    Some(per_col_stats) => per_col_stats.ndistinct,
+                    None => DEFAULT_NUM_DISTINCT,
+                }
+            });
+            // using reduce(f64::min) is the idiomatic workaround to the fact that f64 does not implement Ord due to NaN
+            let selectivity = ndistincts.map(|ndistinct| 1.0 / ndistinct as f64).reduce(f64::min).expect("reduce() only returns None if the iterator is empty, which is impossible since col_ref_exprs.len() == 2");
+            assert!(!selectivity.is_nan(), "it should be impossible for selectivity to be NaN since n-distinct is never 0");
+            selectivity
+        }).product()
     }
 
     /// Comparison operators are the base case for recursion in get_filter_selectivity()
@@ -1077,11 +1083,12 @@ mod tests {
     use crate::{
         cost::base_cost::DEFAULT_EQ_SEL, plan_nodes::{
             BinOpExpr, BinOpType, ColumnRefExpr, ConstantExpr, Expr, ExprList, JoinType, LogOpExpr, LogOpType, OptRelNode, OptRelNodeRef, UnOpExpr, UnOpType
-        }, properties::column_ref::ColumnRef
+        }, properties::column_ref::{ColumnRef, GroupColumnRefs}
     };
 
     use super::{Distribution, MostCommonValues, OptCostModel, PerColumnStats, PerTableStats};
     type TestPerColumnStats = PerColumnStats<TestMostCommonValues, TestDistribution>;
+    type TestOptCostModel = OptCostModel<TestMostCommonValues, TestDistribution>;
 
     struct TestMostCommonValues {
         mcvs: HashMap<Value, f64>,
@@ -1149,7 +1156,7 @@ mod tests {
     // one column is sufficient for all filter selectivity tests
     fn create_one_column_cost_model(
         per_column_stats: TestPerColumnStats,
-    ) -> OptCostModel<TestMostCommonValues, TestDistribution> {
+    ) -> TestOptCostModel {
         OptCostModel::new(
             vec![(
                 String::from(TABLE1_NAME),
@@ -1164,7 +1171,7 @@ mod tests {
     fn create_two_table_cost_model(
         tbl1_per_column_stats: TestPerColumnStats,
         tbl2_per_column_stats: TestPerColumnStats,
-    ) -> OptCostModel<TestMostCommonValues, TestDistribution> {
+    ) -> TestOptCostModel {
         create_two_table_cost_model_custom_row_cnts(tbl1_per_column_stats, tbl2_per_column_stats, 100, 100)
     }
 
@@ -1174,7 +1181,7 @@ mod tests {
         tbl2_per_column_stats: TestPerColumnStats,
         tbl1_row_cnt: usize,
         tbl2_row_cnt: usize,
-    ) -> OptCostModel<TestMostCommonValues, TestDistribution> {
+    ) -> TestOptCostModel {
         OptCostModel::new(
             vec![(
                 String::from(TABLE1_NAME),
@@ -1798,11 +1805,22 @@ mod tests {
         );
     }
 
+    /// A wrapper around get_join_selectivity that extracts the table row counts from the cost model
+    fn test_get_join_selectivity(cost_model: &TestOptCostModel, reverse_tables: bool, join_typ: JoinType, expr_tree: OptRelNodeRef, column_refs: &GroupColumnRefs) -> f64 {
+        let table1_row_cnt = cost_model.per_table_stats_map[TABLE1_NAME].row_cnt as f64;
+        let table2_row_cnt = cost_model.per_table_stats_map[TABLE2_NAME].row_cnt as f64;
+        if reverse_tables {
+            cost_model.get_join_selectivity(join_typ, expr_tree, column_refs, table1_row_cnt, table2_row_cnt)
+        } else {
+            cost_model.get_join_selectivity(join_typ, expr_tree, column_refs, table2_row_cnt, table1_row_cnt)
+        }
+    }
+
     #[test]
     fn test_joinsel_inner_const() {
         let cost_model = create_one_column_cost_model(get_empty_per_col_stats());
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(true)), &vec![]), 1.0);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(false)), &vec![]), 0.0);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(true)), &vec![], f64::NAN, f64::NAN), 1.0);
+        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(false)), &vec![], f64::NAN, f64::NAN), 0.0);
     }
 
     #[test]
@@ -1827,8 +1845,8 @@ mod tests {
             table: String::from(TABLE2_NAME),
             col_idx: 0,
         }];
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.2);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.2);
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), 0.2);
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev, &column_refs), 0.2);
     }
 
     #[test]
@@ -1855,8 +1873,8 @@ mod tests {
             table: String::from(TABLE2_NAME),
             col_idx: 0,
         }];
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.04);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.04);
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), 0.04);
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev, &column_refs), 0.04);
     }
 
     #[test]
@@ -1883,8 +1901,8 @@ mod tests {
             table: String::from(TABLE2_NAME),
             col_idx: 0,
         }];
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.05);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.05);
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), 0.05);
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev, &column_refs), 0.05);
     }
 
     #[test]
@@ -1911,8 +1929,8 @@ mod tests {
             table: String::from(TABLE2_NAME),
             col_idx: 0,
         }];
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.2);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.2);
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), 0.2);
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev, &column_refs), 0.2);
     }
 
     #[test]
@@ -1936,12 +1954,28 @@ mod tests {
             table: String::from(TABLE2_NAME),
             col_idx: 0,
         }];
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), DEFAULT_EQ_SEL);
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), DEFAULT_EQ_SEL);
     }
 
     // We don't test joinsel or with oncond because if there is an oncond (on condition), the top-level operator must be an AND
 
+    /// I made this helper function to avoid copying all eight lines over and over
+    fn assert_joinsel_outer_selectivity(cost_model: &TestOptCostModel, expr_tree: OptRelNodeRef, expr_tree_rev: OptRelNodeRef, column_refs: &GroupColumnRefs, expected_table1_outer_sel: f64, expected_table2_outer_sel: f64) {
+        // all table 1 outer combinations
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::LeftOuter, expr_tree.clone(), &column_refs), expected_table1_outer_sel);
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), expected_table1_outer_sel);
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, true, JoinType::RightOuter, expr_tree.clone(), &column_refs), expected_table1_outer_sel);
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, true, JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), expected_table1_outer_sel);
+        // all table 2 outer combinations
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, true, JoinType::LeftOuter, expr_tree.clone(), &column_refs), expected_table2_outer_sel);
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, true, JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), expected_table2_outer_sel);
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::RightOuter, expr_tree.clone(), &column_refs), expected_table2_outer_sel);
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), expected_table2_outer_sel);
+    }
+
     /// Unique oncond means an oncondition on columns which are unique in both tables
+    /// There's only one case if both columns are unique and have different row counts: the inner will be < 1 / row count
+    ///   of one table and = 1 / row count of another
     #[test]
     fn test_joinsel_outer_unique_oncond() {
         let cost_model = create_two_table_cost_model_custom_row_cnts(TestPerColumnStats::new(
@@ -1955,7 +1989,7 @@ mod tests {
             0.0,
             TestDistribution::empty(),
         ), 5, 4);
-        // since we're talking about left and right outer joins, the order actually matters now
+        // the left/right of the join refers to the tables, not the order of columns in the predicate
         let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1));
         let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0));
         let column_refs = vec![ColumnRef::BaseTableColumnRef {
@@ -1965,14 +1999,11 @@ mod tests {
             table: String::from(TABLE2_NAME),
             col_idx: 0,
         }];
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree.clone(), &column_refs), 0.25);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), 0.25);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), 0.2);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree.clone(), &column_refs), 0.2);
+        assert_joinsel_outer_selectivity(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.25, 0.2);
     }
 
     /// Non-unique oncond means the column is not unique in either table
-    /// Inner always >= row count means that the inner join result is >= the row count of both tables
+    /// Inner always >= row count means that the inner join result is >= 1 / the row count of both tables
     #[test]
     fn test_joinsel_outer_nonunique_oncond_inner_always_geq_rowcnt() {
         let cost_model = create_two_table_cost_model_custom_row_cnts(TestPerColumnStats::new(
@@ -1986,7 +2017,7 @@ mod tests {
             0.0,
             TestDistribution::empty(),
         ), 10, 8);
-        // since we're talking about left and right outer joins, the order actually matters now
+        // the left/right of the join refers to the tables, not the order of columns in the predicate
         let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1));
         let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0));
         let column_refs = vec![ColumnRef::BaseTableColumnRef {
@@ -1998,17 +2029,14 @@ mod tests {
         }];
         // sanity check the expected inner sel
         let expected_inner_sel = 0.2;
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel);
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel);
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel);
         // check the outer sels
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree.clone(), &column_refs), 0.2);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), 0.2);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), 0.2);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree.clone(), &column_refs), 0.2);
+        assert_joinsel_outer_selectivity(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.2, 0.2);
     }
 
     /// Non-unique oncond means the column is not unique in either table
-    /// Inner sometimes < row count means that the inner join result < the row count of at least one table.
+    /// Inner sometimes < row count means that the inner join result < 1 / the row count of exactly one table.
     ///   Note that without a join filter, it's impossible to be less than the row count of both tables
     #[test]
     fn test_joinsel_outer_nonunique_oncond_inner_sometimes_lt_rowcnt() {
@@ -2023,7 +2051,7 @@ mod tests {
             0.0,
             TestDistribution::empty(),
         ), 20, 4);
-        // since we're talking about left and right outer joins, the order actually matters now
+        // the left/right of the join refers to the tables, not the order of columns in the predicate
         let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1));
         let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0));
         let column_refs = vec![ColumnRef::BaseTableColumnRef {
@@ -2035,20 +2063,17 @@ mod tests {
         }];
         // sanity check the expected inner sel
         let expected_inner_sel = 0.1;
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel);
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel);
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel);
         // check the outer sels
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree.clone(), &column_refs), 0.25);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), 0.25);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), 0.1);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree.clone(), &column_refs), 0.1);
+        assert_joinsel_outer_selectivity(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.25, 0.1);
     }
 
     /// Unique oncond means an oncondition on columns which are unique in both tables
     /// Filter means we're adding a join filter
-    /// Inner sometimes < row count means that the inner join result < the row count of at least one table.
+    /// There's only one case if both columns are unique and there's a filter: the inner will be < 1 / row count of both tables
     #[test]
-    fn test_joinsel_outer_unique_oncond_filter_inner_sometimes_lt_rowcnt() {
+    fn test_joinsel_outer_unique_oncond_filter() {
         let cost_model = create_two_table_cost_model_custom_row_cnts(TestPerColumnStats::new(
             TestMostCommonValues::empty(),
             50,
@@ -2062,7 +2087,7 @@ mod tests {
             0.0,
             TestDistribution::empty(),
         ), 50, 4);
-        // since we're talking about left and right outer joins, the order actually matters now
+        // the left/right of the join refers to the tables, not the order of columns in the predicate
         let eq0and1 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1));
         let eq1and0 = bin_op(BinOpType::Eq, col_ref(1), col_ref(0));
         let filter = bin_op(BinOpType::Leq, col_ref(0), cnst(Value::Int32(128)));
@@ -2078,12 +2103,11 @@ mod tests {
         }];
         // sanity check the expected inner sel
         let expected_inner_sel = 0.008;
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_inner_rev.clone(), &column_refs), expected_inner_sel);
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel);
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_inner_rev.clone(), &column_refs), expected_inner_sel);
         // check the outer sels
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree.clone(), &column_refs), 0.25);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree_inner_rev.clone(), &column_refs), 0.25);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree_inner_rev.clone(), &column_refs), 0.02);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree.clone(), &column_refs), 0.02);
+        assert_joinsel_outer_selectivity(&cost_model, expr_tree, expr_tree_inner_rev, &column_refs, 0.25, 0.02);
     }
+
+    // I didn't test any non-unique cases with filter. The non-unique tests without filter should cover that
 }

From ff49d83658553231147851ba8a25979dc70bb1b7 Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 11:07:31 -0400
Subject: [PATCH 19/29] fixed inverted reverse_tables check in unit test helper

---
 optd-datafusion-repr/src/cost/base_cost.rs | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index b35b403a..366af7af 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -723,6 +723,7 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         }
     }
 
+    /// The core logic of join selectivity which assumes we've already separated the expression into the on conditions and the filters
     fn get_join_selectivity_core(
         &self,
         join_typ: JoinType,
@@ -1809,7 +1810,7 @@ mod tests {
     fn test_get_join_selectivity(cost_model: &TestOptCostModel, reverse_tables: bool, join_typ: JoinType, expr_tree: OptRelNodeRef, column_refs: &GroupColumnRefs) -> f64 {
         let table1_row_cnt = cost_model.per_table_stats_map[TABLE1_NAME].row_cnt as f64;
         let table2_row_cnt = cost_model.per_table_stats_map[TABLE2_NAME].row_cnt as f64;
-        if reverse_tables {
+        if !reverse_tables {
             cost_model.get_join_selectivity(join_typ, expr_tree, column_refs, table1_row_cnt, table2_row_cnt)
         } else {
             cost_model.get_join_selectivity(join_typ, expr_tree, column_refs, table2_row_cnt, table1_row_cnt)
@@ -1960,7 +1961,7 @@ mod tests {
     // We don't test joinsel or with oncond because if there is an oncond (on condition), the top-level operator must be an AND
 
     /// I made this helper function to avoid copying all eight lines over and over
-    fn assert_joinsel_outer_selectivity(cost_model: &TestOptCostModel, expr_tree: OptRelNodeRef, expr_tree_rev: OptRelNodeRef, column_refs: &GroupColumnRefs, expected_table1_outer_sel: f64, expected_table2_outer_sel: f64) {
+    fn assert_joinsel_outer_selectivities(cost_model: &TestOptCostModel, expr_tree: OptRelNodeRef, expr_tree_rev: OptRelNodeRef, column_refs: &GroupColumnRefs, expected_table1_outer_sel: f64, expected_table2_outer_sel: f64) {
         // all table 1 outer combinations
         assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::LeftOuter, expr_tree.clone(), &column_refs), expected_table1_outer_sel);
         assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), expected_table1_outer_sel);
@@ -1999,7 +2000,12 @@ mod tests {
             table: String::from(TABLE2_NAME),
             col_idx: 0,
         }];
-        assert_joinsel_outer_selectivity(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.25, 0.2);
+        // sanity check the expected inner sel
+        let expected_inner_sel = 0.2;
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel);
+        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel);
+        // check the outer sels
+        assert_joinsel_outer_selectivities(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.25, 0.2);
     }
 
     /// Non-unique oncond means the column is not unique in either table
@@ -2032,7 +2038,7 @@ mod tests {
         assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel);
         assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel);
         // check the outer sels
-        assert_joinsel_outer_selectivity(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.2, 0.2);
+        assert_joinsel_outer_selectivities(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.2, 0.2);
     }
 
     /// Non-unique oncond means the column is not unique in either table
@@ -2066,7 +2072,7 @@ mod tests {
         assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel);
         assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel);
         // check the outer sels
-        assert_joinsel_outer_selectivity(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.25, 0.1);
+        assert_joinsel_outer_selectivities(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.25, 0.1);
     }
 
     /// Unique oncond means an oncondition on columns which are unique in both tables
@@ -2106,7 +2112,7 @@ mod tests {
         assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel);
         assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_inner_rev.clone(), &column_refs), expected_inner_sel);
         // check the outer sels
-        assert_joinsel_outer_selectivity(&cost_model, expr_tree, expr_tree_inner_rev, &column_refs, 0.25, 0.02);
+        assert_joinsel_outer_selectivities(&cost_model, expr_tree, expr_tree_inner_rev, &column_refs, 0.25, 0.02);
     }
 
     // I didn't test any non-unique cases with filter. The non-unique tests without filter should cover that

From 3711de2d413db829ada968711b421610d7a3cb63 Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 11:08:32 -0400
Subject: [PATCH 20/29] added join selectivity to hash join row count estimate

---
 optd-datafusion-repr/src/cost/base_cost.rs | 26 ++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index 366af7af..5f713c8f 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -514,11 +514,33 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
                 let (_, compute_cost, _) = Self::cost_tuple(&children[1]);
                 Self::cost(row_cnt, compute_cost * row_cnt, 0.0)
             }
-            OptRelNodeTyp::PhysicalHashJoin(_) => {
+            OptRelNodeTyp::PhysicalHashJoin(join_typ) => {
                 let (row_cnt_1, _, _) = Self::cost_tuple(&children[0]);
                 let (row_cnt_2, _, _) = Self::cost_tuple(&children[1]);
+                let selectivity = match context {
+                    Some(context) => {
+                        if let Some(optimizer) = optimizer {
+                            let column_refs = optimizer
+                            .get_property_by_group::<ColumnRefPropertyBuilder>(
+                                context.group_id,
+                                1,
+                            );
+                            let expr_group_id = context.children_group_ids[2];
+                            let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false);
+                            // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information
+                            if let Some(expr_tree) = expr_trees.first() {
+                                self.get_join_selectivity(*join_typ, Arc::clone(expr_tree), &column_refs, row_cnt_1, row_cnt_2)
+                            } else {
+                                panic!("encountered a join without an expression")
+                            }
+                        } else {
+                            DEFAULT_UNK_SEL
+                        }
+                    }
+                    None => DEFAULT_UNK_SEL,
+                };
                 Self::cost(
-                    row_cnt_1.min(row_cnt_2).max(1.0),
+                    (row_cnt_1 * row_cnt_2 * selectivity).max(1.0),
                     row_cnt_1 * 2.0 + row_cnt_2,
                     0.0,
                 )

From fb141a6f70973f6a45ce20ad352823d9a910066a Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 12:23:38 -0400
Subject: [PATCH 21/29] undid hash join sel

---
 optd-datafusion-repr/src/cost/base_cost.rs | 23 +---------------------
 1 file changed, 1 insertion(+), 22 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index 5f713c8f..2996682f 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -517,28 +517,7 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
             OptRelNodeTyp::PhysicalHashJoin(join_typ) => {
                 let (row_cnt_1, _, _) = Self::cost_tuple(&children[0]);
                 let (row_cnt_2, _, _) = Self::cost_tuple(&children[1]);
-                let selectivity = match context {
-                    Some(context) => {
-                        if let Some(optimizer) = optimizer {
-                            let column_refs = optimizer
-                            .get_property_by_group::<ColumnRefPropertyBuilder>(
-                                context.group_id,
-                                1,
-                            );
-                            let expr_group_id = context.children_group_ids[2];
-                            let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false);
-                            // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information
-                            if let Some(expr_tree) = expr_trees.first() {
-                                self.get_join_selectivity(*join_typ, Arc::clone(expr_tree), &column_refs, row_cnt_1, row_cnt_2)
-                            } else {
-                                panic!("encountered a join without an expression")
-                            }
-                        } else {
-                            DEFAULT_UNK_SEL
-                        }
-                    }
-                    None => DEFAULT_UNK_SEL,
-                };
+                let selectivity = DEFAULT_UNK_SEL;
                 Self::cost(
                     (row_cnt_1 * row_cnt_2 * selectivity).max(1.0),
                     row_cnt_1 * 2.0 + row_cnt_2,

From 5be618e3d2a4d007f8920f5704d7ca3a24c241a6 Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 12:31:43 -0400
Subject: [PATCH 22/29] cross join

---
 optd-datafusion-repr/src/cost/base_cost.rs | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index 5221840e..21cd0686 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -520,7 +520,7 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
                 let (_, compute_cost, _) = Self::cost_tuple(&children[1]);
                 Self::cost(row_cnt, compute_cost * row_cnt, 0.0)
             }
-            OptRelNodeTyp::PhysicalHashJoin(join_typ) => {
+            OptRelNodeTyp::PhysicalHashJoin(_) => {
                 let (row_cnt_1, _, _) = Self::cost_tuple(&children[0]);
                 let (row_cnt_2, _, _) = Self::cost_tuple(&children[1]);
                 let selectivity = DEFAULT_UNK_SEL;
@@ -797,7 +797,7 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         left_row_cnt: f64,
         right_row_cnt: f64,
     ) -> f64 {
-        let join_on_selectivity = self.get_join_on_selectivity(on_col_ref_pairs, column_refs);
+        let join_on_selectivity = self.get_join_on_selectivity(&on_col_ref_pairs, column_refs);
         // Currently, there is no difference in how we handle a join filter and a select filter, so we use the same function
         // One difference (that we *don't* care about right now) is that join filters can contain expressions from multiple
         //   different tables. Currently, this doesn't affect the get_filter_selectivity() function, but this may change in
@@ -811,7 +811,11 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
             JoinType::Inner => inner_join_selectivity,
             JoinType::LeftOuter => f64::max(inner_join_selectivity, 1.0 / right_row_cnt),
             JoinType::RightOuter => f64::max(inner_join_selectivity, 1.0 / left_row_cnt),
-            _ => unimplemented!()
+            JoinType::Cross => {
+                assert!(on_col_ref_pairs.is_empty(), "Cross joins should not have on columns");
+                join_filter_selectivity
+            },
+            _ => unimplemented!("join_typ={} is not implemented", join_typ)
         }
     }
 
@@ -831,13 +835,13 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
     /// Note that the selectivity of the on conditions does not depend on join type. Join type is accounted for separately in get_join_selectivity_core()
     fn get_join_on_selectivity(
         &self,
-        on_col_ref_pairs: Vec<(ColumnRefExpr, ColumnRefExpr)>,
+        on_col_ref_pairs: &Vec<(ColumnRefExpr, ColumnRefExpr)>,
         column_refs: &GroupColumnRefs
     ) -> f64 {
         // multiply the selectivities of all individual conditions together
         on_col_ref_pairs.into_iter().map(|on_col_ref_pair| {
             // the formula for each pair is min(1 / ndistinct1, 1 / ndistinct2) (see https://postgrespro.com/blog/pgsql/5969618)
-            let ndistincts = vec![on_col_ref_pair.0, on_col_ref_pair.1].into_iter().map(|on_col_ref_expr| {
+            let ndistincts = vec![&on_col_ref_pair.0, &on_col_ref_pair.1].into_iter().map(|on_col_ref_expr| {
                 match self.get_per_column_stats_from_col_ref(&column_refs[on_col_ref_expr.index()]) {
                     Some(per_col_stats) => per_col_stats.ndistinct,
                     None => DEFAULT_NUM_DISTINCT,

From 4d0f753ec6e4fdbc9720f31cf4f69845765aea58 Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 12:31:48 -0400
Subject: [PATCH 23/29] fmt

---
 optd-datafusion-repr/src/cost/base_cost.rs | 893 +++++++++++++++------
 1 file changed, 647 insertions(+), 246 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index 21cd0686..66dbbbc1 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -1,11 +1,12 @@
 use std::{collections::HashMap, sync::Arc};
 
 use crate::plan_nodes::{
-    BinOpType, ColumnRefExpr, ConstantExpr, ConstantType, Expr, ExprList, LogOpExpr, LogOpType, OptRelNode, UnOpType
+    BinOpType, ColumnRefExpr, ConstantExpr, ConstantType, Expr, ExprList, LogOpExpr, LogOpType,
+    OptRelNode, UnOpType,
 };
 use crate::properties::column_ref::{ColumnRefPropertyBuilder, GroupColumnRefs};
 use crate::{
-    plan_nodes::{OptRelNodeRef, OptRelNodeTyp, JoinType},
+    plan_nodes::{JoinType, OptRelNodeRef, OptRelNodeTyp},
     properties::column_ref::ColumnRef,
 };
 use arrow_schema::{ArrowError, DataType};
@@ -491,15 +492,21 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
                     Some(context) => {
                         if let Some(optimizer) = optimizer {
                             let column_refs = optimizer
-                            .get_property_by_group::<ColumnRefPropertyBuilder>(
-                                context.group_id,
-                                1,
-                            );
+                                .get_property_by_group::<ColumnRefPropertyBuilder>(
+                                    context.group_id,
+                                    1,
+                                );
                             let expr_group_id = context.children_group_ids[2];
                             let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false);
                             // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information
                             if let Some(expr_tree) = expr_trees.first() {
-                                self.get_join_selectivity(*join_typ, Arc::clone(expr_tree), &column_refs, row_cnt_1, row_cnt_2)
+                                self.get_join_selectivity(
+                                    *join_typ,
+                                    Arc::clone(expr_tree),
+                                    &column_refs,
+                                    row_cnt_1,
+                                    row_cnt_2,
+                                )
                             } else {
                                 panic!("encountered a join without an expression")
                             }
@@ -710,21 +717,34 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
 
     /// Check if an expr_tree is a join condition, returning the join on col ref pair if it is
     /// The reason the check and the info are in the same function is because their code is almost identical
-    fn get_on_col_ref_pair(expr_tree: OptRelNodeRef, column_refs: &GroupColumnRefs) -> Option<(ColumnRefExpr, ColumnRefExpr)> {
+    fn get_on_col_ref_pair(
+        expr_tree: OptRelNodeRef,
+        column_refs: &GroupColumnRefs,
+    ) -> Option<(ColumnRefExpr, ColumnRefExpr)> {
         // We perform three checks to see if a child_expr_tree is an on_col_ref_pair
         // 1. Check that it's equality
         if expr_tree.typ == OptRelNodeTyp::BinOp(BinOpType::Eq) {
             let left_child = expr_tree.child(0);
             let right_child = expr_tree.child(1);
             // 2. Check that both sides are column refs
-            if left_child.typ == OptRelNodeTyp::ColumnRef && right_child.typ == OptRelNodeTyp::ColumnRef {
+            if left_child.typ == OptRelNodeTyp::ColumnRef
+                && right_child.typ == OptRelNodeTyp::ColumnRef
+            {
                 // 3. Check that both sides don't belong to the same table (if we don't know, that means they don't belong)
-                let left_col_ref_expr = ColumnRefExpr::from_rel_node(left_child).expect("we already checked that the type is ColumnRef");
-                let right_col_ref_expr = ColumnRefExpr::from_rel_node(right_child).expect("we already checked that the type is ColumnRef");
+                let left_col_ref_expr = ColumnRefExpr::from_rel_node(left_child)
+                    .expect("we already checked that the type is ColumnRef");
+                let right_col_ref_expr = ColumnRefExpr::from_rel_node(right_child)
+                    .expect("we already checked that the type is ColumnRef");
                 let left_col_ref = &column_refs[left_col_ref_expr.index()];
                 let right_col_ref = &column_refs[right_col_ref_expr.index()];
-                let is_same_table = if let ColumnRef::BaseTableColumnRef { table: left_table, .. } = left_col_ref {
-                    if let ColumnRef::BaseTableColumnRef { table: right_table, .. } = right_col_ref {
+                let is_same_table = if let ColumnRef::BaseTableColumnRef {
+                    table: left_table, ..
+                } = left_col_ref
+                {
+                    if let ColumnRef::BaseTableColumnRef {
+                        table: right_table, ..
+                    } = right_col_ref
+                    {
                         left_table == right_table
                     } else {
                         false
@@ -761,10 +781,14 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
             let mut on_col_ref_pairs = vec![];
             let mut filter_expr_trees = vec![];
             for child_expr_tree in &expr_tree.children {
-                if let Some(on_col_ref_pair) = Self::get_on_col_ref_pair(child_expr_tree.clone(), column_refs) {
+                if let Some(on_col_ref_pair) =
+                    Self::get_on_col_ref_pair(child_expr_tree.clone(), column_refs)
+                {
                     on_col_ref_pairs.push(on_col_ref_pair)
                 } else {
-                    let child_expr = Expr::from_rel_node(child_expr_tree.clone()).expect("everything that is a direct child of an And node must be an expression");
+                    let child_expr = Expr::from_rel_node(child_expr_tree.clone()).expect(
+                        "everything that is a direct child of an And node must be an expression",
+                    );
                     filter_expr_trees.push(child_expr);
                 }
             }
@@ -772,17 +796,39 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
             let filter_expr_tree = if filter_expr_trees.is_empty() {
                 None
             } else {
-                Some(LogOpExpr::new(
-                    LogOpType::And,
-                    ExprList::new(filter_expr_trees),
-                ).into_rel_node())
+                Some(
+                    LogOpExpr::new(LogOpType::And, ExprList::new(filter_expr_trees))
+                        .into_rel_node(),
+                )
             };
-            self.get_join_selectivity_core(join_typ, on_col_ref_pairs, filter_expr_tree, column_refs, left_row_cnt, right_row_cnt)
+            self.get_join_selectivity_core(
+                join_typ,
+                on_col_ref_pairs,
+                filter_expr_tree,
+                column_refs,
+                left_row_cnt,
+                right_row_cnt,
+            )
         } else {
-            if let Some(on_col_ref_pair) = Self::get_on_col_ref_pair(expr_tree.clone(), column_refs) {
-                self.get_join_selectivity_core(join_typ, vec![on_col_ref_pair], None, column_refs, left_row_cnt, right_row_cnt)
+            if let Some(on_col_ref_pair) = Self::get_on_col_ref_pair(expr_tree.clone(), column_refs)
+            {
+                self.get_join_selectivity_core(
+                    join_typ,
+                    vec![on_col_ref_pair],
+                    None,
+                    column_refs,
+                    left_row_cnt,
+                    right_row_cnt,
+                )
             } else {
-                self.get_join_selectivity_core(join_typ, vec![], Some(expr_tree), column_refs, left_row_cnt, right_row_cnt)
+                self.get_join_selectivity_core(
+                    join_typ,
+                    vec![],
+                    Some(expr_tree),
+                    column_refs,
+                    left_row_cnt,
+                    right_row_cnt,
+                )
             }
         }
     }
@@ -812,14 +858,20 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
             JoinType::LeftOuter => f64::max(inner_join_selectivity, 1.0 / right_row_cnt),
             JoinType::RightOuter => f64::max(inner_join_selectivity, 1.0 / left_row_cnt),
             JoinType::Cross => {
-                assert!(on_col_ref_pairs.is_empty(), "Cross joins should not have on columns");
+                assert!(
+                    on_col_ref_pairs.is_empty(),
+                    "Cross joins should not have on columns"
+                );
                 join_filter_selectivity
-            },
-            _ => unimplemented!("join_typ={} is not implemented", join_typ)
+            }
+            _ => unimplemented!("join_typ={} is not implemented", join_typ),
         }
     }
 
-    fn get_per_column_stats_from_col_ref(&self, col_ref: &ColumnRef) -> Option<&PerColumnStats<M, D>> {
+    fn get_per_column_stats_from_col_ref(
+        &self,
+        col_ref: &ColumnRef,
+    ) -> Option<&PerColumnStats<M, D>> {
         if let ColumnRef::BaseTableColumnRef { table, col_idx } = col_ref {
             self.get_per_column_stats(table, *col_idx)
         } else {
@@ -828,7 +880,9 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
     }
 
     fn get_per_column_stats(&self, table: &str, col_idx: usize) -> Option<&PerColumnStats<M, D>> {
-        self.per_table_stats_map.get(table).and_then(|per_table_stats| per_table_stats.per_column_stats_vec[col_idx].as_ref())
+        self.per_table_stats_map
+            .get(table)
+            .and_then(|per_table_stats| per_table_stats.per_column_stats_vec[col_idx].as_ref())
     }
 
     /// Get the selectivity of the on conditions
@@ -836,7 +890,7 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
     fn get_join_on_selectivity(
         &self,
         on_col_ref_pairs: &Vec<(ColumnRefExpr, ColumnRefExpr)>,
-        column_refs: &GroupColumnRefs
+        column_refs: &GroupColumnRefs,
     ) -> f64 {
         // multiply the selectivities of all individual conditions together
         on_col_ref_pairs.into_iter().map(|on_col_ref_pair| {
@@ -865,7 +919,8 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         assert!(comp_bin_op_typ.is_comparison());
 
         // I intentionally performed moves on left and right. This way, we don't accidentally use them after this block
-        let (col_ref_exprs, non_col_ref_exprs, is_left_col_ref) = Self::get_semantic_nodes(left, right);
+        let (col_ref_exprs, non_col_ref_exprs, is_left_col_ref) =
+            Self::get_semantic_nodes(left, right);
 
         // handle the different cases of column nodes
         if col_ref_exprs.is_empty() {
@@ -947,7 +1002,10 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
 
     /// Convert the left and right child nodes of some operation to what they semantically are
     /// This is convenient to avoid repeating the same logic just with "left" and "right" swapped
-    fn get_semantic_nodes(left: OptRelNodeRef, right: OptRelNodeRef) -> (Vec<ColumnRefExpr>, Vec<OptRelNodeRef>, bool) {
+    fn get_semantic_nodes(
+        left: OptRelNodeRef,
+        right: OptRelNodeRef,
+    ) -> (Vec<ColumnRefExpr>, Vec<OptRelNodeRef>, bool) {
         let mut col_ref_exprs = vec![];
         let mut non_col_ref_exprs = vec![];
         let is_left_col_ref;
@@ -1004,7 +1062,9 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
                         0.0
                     }
                 } else {
-                    unreachable!("if the typ is ConstantType::Bool, the value should be a Value::Bool")
+                    unreachable!(
+                        "if the typ is ConstantType::Bool, the value should be a Value::Bool"
+                    )
                 }
             } else {
                 panic!("selectivity is not defined on constants which are not bools")
@@ -1072,15 +1132,15 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
             // because nulls return false in any comparison, they are never included when computing range selectivity
             let distr_leq_freq = per_column_stats.distr.cdf(value);
             let value_clone = value.clone(); // clone the value so that we can move it into the closure to avoid lifetime issues
-                                                // TODO: in a future PR, figure out how to make Values comparable. rn I just hardcoded as_i32() to work around this
+                                             // TODO: in a future PR, figure out how to make Values comparable. rn I just hardcoded as_i32() to work around this
             let pred = Box::new(move |val: &Value| val.as_i32() <= value_clone.as_i32());
             let mcvs_leq_freq = per_column_stats.mcvs.freq_over_pred(pred);
             let total_leq_freq = distr_leq_freq + mcvs_leq_freq;
 
             // depending on whether value is in mcvs or not, we use different logic to turn total_leq_cdf into total_lt_cdf
             // this logic just so happens to be the exact same logic as get_column_equality_selectivity implements
-            let total_lt_freq = total_leq_freq
-                - self.get_column_equality_selectivity(table, col_idx, value, true);
+            let total_lt_freq =
+                total_leq_freq - self.get_column_equality_selectivity(table, col_idx, value, true);
 
             // use either total_leq_freq or total_lt_freq to get the selectivity
             if is_col_lt_val {
@@ -1150,9 +1210,12 @@ mod tests {
     use std::collections::HashMap;
 
     use crate::{
-        cost::base_cost::DEFAULT_EQ_SEL, plan_nodes::{
-            BinOpExpr, BinOpType, ColumnRefExpr, ConstantExpr, Expr, ExprList, JoinType, LogOpExpr, LogOpType, OptRelNode, OptRelNodeRef, UnOpExpr, UnOpType
-        }, properties::column_ref::{ColumnRef, GroupColumnRefs}
+        cost::base_cost::DEFAULT_EQ_SEL,
+        plan_nodes::{
+            BinOpExpr, BinOpType, ColumnRefExpr, ConstantExpr, Expr, ExprList, JoinType, LogOpExpr,
+            LogOpType, OptRelNode, OptRelNodeRef, UnOpExpr, UnOpType,
+        },
+        properties::column_ref::{ColumnRef, GroupColumnRefs},
     };
 
     use super::{Distribution, MostCommonValues, OptCostModel, PerColumnStats, PerTableStats};
@@ -1223,9 +1286,7 @@ mod tests {
     const TABLE2_NAME: &str = "table2";
 
     // one column is sufficient for all filter selectivity tests
-    fn create_one_column_cost_model(
-        per_column_stats: TestPerColumnStats,
-    ) -> TestOptCostModel {
+    fn create_one_column_cost_model(per_column_stats: TestPerColumnStats) -> TestOptCostModel {
         OptCostModel::new(
             vec![(
                 String::from(TABLE1_NAME),
@@ -1241,7 +1302,12 @@ mod tests {
         tbl1_per_column_stats: TestPerColumnStats,
         tbl2_per_column_stats: TestPerColumnStats,
     ) -> TestOptCostModel {
-        create_two_table_cost_model_custom_row_cnts(tbl1_per_column_stats, tbl2_per_column_stats, 100, 100)
+        create_two_table_cost_model_custom_row_cnts(
+            tbl1_per_column_stats,
+            tbl2_per_column_stats,
+            100,
+            100,
+        )
     }
 
     /// We need custom row counts because some join algorithms rely on the row cnt
@@ -1252,13 +1318,16 @@ mod tests {
         tbl2_row_cnt: usize,
     ) -> TestOptCostModel {
         OptCostModel::new(
-            vec![(
-                String::from(TABLE1_NAME),
-                PerTableStats::new(tbl1_row_cnt, vec![Some(tbl1_per_column_stats)]),
-            ), (
-                String::from(TABLE2_NAME),
-                PerTableStats::new(tbl2_row_cnt, vec![Some(tbl2_per_column_stats)]),
-            )]
+            vec![
+                (
+                    String::from(TABLE1_NAME),
+                    PerTableStats::new(tbl1_row_cnt, vec![Some(tbl1_per_column_stats)]),
+                ),
+                (
+                    String::from(TABLE2_NAME),
+                    PerTableStats::new(tbl2_row_cnt, vec![Some(tbl2_per_column_stats)]),
+                ),
+            ]
             .into_iter()
             .collect(),
         )
@@ -1321,8 +1390,14 @@ mod tests {
     #[test]
     fn test_filtersel_const() {
         let cost_model = create_one_column_cost_model(get_empty_per_col_stats());
-        assert_approx_eq::assert_approx_eq!(cost_model.get_filter_selectivity(cnst(Value::Bool(true)), &vec![]), 1.0);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_filter_selectivity(cnst(Value::Bool(false)), &vec![]), 0.0);
+        assert_approx_eq::assert_approx_eq!(
+            cost_model.get_filter_selectivity(cnst(Value::Bool(true)), &vec![]),
+            1.0
+        );
+        assert_approx_eq::assert_approx_eq!(
+            cost_model.get_filter_selectivity(cnst(Value::Bool(false)), &vec![]),
+            0.0
+        );
     }
 
     #[test]
@@ -1875,171 +1950,367 @@ mod tests {
     }
 
     /// A wrapper around get_join_selectivity that extracts the table row counts from the cost model
-    fn test_get_join_selectivity(cost_model: &TestOptCostModel, reverse_tables: bool, join_typ: JoinType, expr_tree: OptRelNodeRef, column_refs: &GroupColumnRefs) -> f64 {
+    fn test_get_join_selectivity(
+        cost_model: &TestOptCostModel,
+        reverse_tables: bool,
+        join_typ: JoinType,
+        expr_tree: OptRelNodeRef,
+        column_refs: &GroupColumnRefs,
+    ) -> f64 {
         let table1_row_cnt = cost_model.per_table_stats_map[TABLE1_NAME].row_cnt as f64;
         let table2_row_cnt = cost_model.per_table_stats_map[TABLE2_NAME].row_cnt as f64;
         if !reverse_tables {
-            cost_model.get_join_selectivity(join_typ, expr_tree, column_refs, table1_row_cnt, table2_row_cnt)
+            cost_model.get_join_selectivity(
+                join_typ,
+                expr_tree,
+                column_refs,
+                table1_row_cnt,
+                table2_row_cnt,
+            )
         } else {
-            cost_model.get_join_selectivity(join_typ, expr_tree, column_refs, table2_row_cnt, table1_row_cnt)
+            cost_model.get_join_selectivity(
+                join_typ,
+                expr_tree,
+                column_refs,
+                table2_row_cnt,
+                table1_row_cnt,
+            )
         }
     }
 
     #[test]
     fn test_joinsel_inner_const() {
         let cost_model = create_one_column_cost_model(get_empty_per_col_stats());
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(true)), &vec![], f64::NAN, f64::NAN), 1.0);
-        assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(false)), &vec![], f64::NAN, f64::NAN), 0.0);
+        assert_approx_eq::assert_approx_eq!(
+            cost_model.get_join_selectivity(
+                JoinType::Inner,
+                cnst(Value::Bool(true)),
+                &vec![],
+                f64::NAN,
+                f64::NAN
+            ),
+            1.0
+        );
+        assert_approx_eq::assert_approx_eq!(
+            cost_model.get_join_selectivity(
+                JoinType::Inner,
+                cnst(Value::Bool(false)),
+                &vec![],
+                f64::NAN,
+                f64::NAN
+            ),
+            0.0
+        );
     }
 
     #[test]
     fn test_joinsel_inner_oncond() {
-        let cost_model = create_two_table_cost_model(TestPerColumnStats::new(
-            TestMostCommonValues::empty(),
-            5,
-            0.0,
-            TestDistribution::empty(),
-        ), TestPerColumnStats::new(
-            TestMostCommonValues::empty(),
-            4,
-            0.0,
-            TestDistribution::empty(),
-        ));
+        let cost_model = create_two_table_cost_model(
+            TestPerColumnStats::new(
+                TestMostCommonValues::empty(),
+                5,
+                0.0,
+                TestDistribution::empty(),
+            ),
+            TestPerColumnStats::new(
+                TestMostCommonValues::empty(),
+                4,
+                0.0,
+                TestDistribution::empty(),
+            ),
+        );
         let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1));
         let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0));
-        let column_refs = vec![ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE1_NAME),
-            col_idx: 0,
-        }, ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE2_NAME),
-            col_idx: 0,
-        }];
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), 0.2);
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev, &column_refs), 0.2);
+        let column_refs = vec![
+            ColumnRef::BaseTableColumnRef {
+                table: String::from(TABLE1_NAME),
+                col_idx: 0,
+            },
+            ColumnRef::BaseTableColumnRef {
+                table: String::from(TABLE2_NAME),
+                col_idx: 0,
+            },
+        ];
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs),
+            0.2
+        );
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(
+                &cost_model,
+                false,
+                JoinType::Inner,
+                expr_tree_rev,
+                &column_refs
+            ),
+            0.2
+        );
     }
 
     #[test]
     fn test_joinsel_inner_and_of_onconds() {
-        let cost_model = create_two_table_cost_model(TestPerColumnStats::new(
-            TestMostCommonValues::empty(),
-            5,
-            0.0,
-            TestDistribution::empty(),
-        ), TestPerColumnStats::new(
-            TestMostCommonValues::empty(),
-            4,
-            0.0,
-            TestDistribution::empty(),
-        ));
+        let cost_model = create_two_table_cost_model(
+            TestPerColumnStats::new(
+                TestMostCommonValues::empty(),
+                5,
+                0.0,
+                TestDistribution::empty(),
+            ),
+            TestPerColumnStats::new(
+                TestMostCommonValues::empty(),
+                4,
+                0.0,
+                TestDistribution::empty(),
+            ),
+        );
         let eq0and1 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1));
         let eq1and0 = bin_op(BinOpType::Eq, col_ref(1), col_ref(0));
         let expr_tree = log_op(LogOpType::And, vec![eq0and1.clone(), eq1and0.clone()]);
         let expr_tree_rev = log_op(LogOpType::And, vec![eq1and0.clone(), eq0and1.clone()]);
-        let column_refs = vec![ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE1_NAME),
-            col_idx: 0,
-        }, ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE2_NAME),
-            col_idx: 0,
-        }];
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), 0.04);
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev, &column_refs), 0.04);
+        let column_refs = vec![
+            ColumnRef::BaseTableColumnRef {
+                table: String::from(TABLE1_NAME),
+                col_idx: 0,
+            },
+            ColumnRef::BaseTableColumnRef {
+                table: String::from(TABLE2_NAME),
+                col_idx: 0,
+            },
+        ];
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs),
+            0.04
+        );
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(
+                &cost_model,
+                false,
+                JoinType::Inner,
+                expr_tree_rev,
+                &column_refs
+            ),
+            0.04
+        );
     }
 
     #[test]
     fn test_joinsel_inner_and_of_oncond_and_filter() {
-        let cost_model = create_two_table_cost_model(TestPerColumnStats::new(
-            TestMostCommonValues::empty(),
-            5,
-            0.0,
-            TestDistribution::empty(),
-        ), TestPerColumnStats::new(
-            TestMostCommonValues::empty(),
-            4,
-            0.0,
-            TestDistribution::empty(),
-        ));
+        let cost_model = create_two_table_cost_model(
+            TestPerColumnStats::new(
+                TestMostCommonValues::empty(),
+                5,
+                0.0,
+                TestDistribution::empty(),
+            ),
+            TestPerColumnStats::new(
+                TestMostCommonValues::empty(),
+                4,
+                0.0,
+                TestDistribution::empty(),
+            ),
+        );
         let eq0and1 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1));
         let eq100 = bin_op(BinOpType::Eq, col_ref(1), cnst(Value::Int32(100)));
         let expr_tree = log_op(LogOpType::And, vec![eq0and1.clone(), eq100.clone()]);
         let expr_tree_rev = log_op(LogOpType::And, vec![eq100.clone(), eq0and1.clone()]);
-        let column_refs = vec![ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE1_NAME),
-            col_idx: 0,
-        }, ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE2_NAME),
-            col_idx: 0,
-        }];
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), 0.05);
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev, &column_refs), 0.05);
+        let column_refs = vec![
+            ColumnRef::BaseTableColumnRef {
+                table: String::from(TABLE1_NAME),
+                col_idx: 0,
+            },
+            ColumnRef::BaseTableColumnRef {
+                table: String::from(TABLE2_NAME),
+                col_idx: 0,
+            },
+        ];
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs),
+            0.05
+        );
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(
+                &cost_model,
+                false,
+                JoinType::Inner,
+                expr_tree_rev,
+                &column_refs
+            ),
+            0.05
+        );
     }
 
     #[test]
     fn test_joinsel_inner_and_of_filters() {
-        let cost_model = create_two_table_cost_model(TestPerColumnStats::new(
-            TestMostCommonValues::empty(),
-            5,
-            0.0,
-            TestDistribution::empty(),
-        ), TestPerColumnStats::new(
-            TestMostCommonValues::empty(),
-            4,
-            0.0,
-            TestDistribution::empty(),
-        ));
+        let cost_model = create_two_table_cost_model(
+            TestPerColumnStats::new(
+                TestMostCommonValues::empty(),
+                5,
+                0.0,
+                TestDistribution::empty(),
+            ),
+            TestPerColumnStats::new(
+                TestMostCommonValues::empty(),
+                4,
+                0.0,
+                TestDistribution::empty(),
+            ),
+        );
         let neq12 = bin_op(BinOpType::Neq, col_ref(0), cnst(Value::Int32(12)));
         let eq100 = bin_op(BinOpType::Eq, col_ref(1), cnst(Value::Int32(100)));
         let expr_tree = log_op(LogOpType::And, vec![neq12.clone(), eq100.clone()]);
         let expr_tree_rev = log_op(LogOpType::And, vec![eq100.clone(), neq12.clone()]);
-        let column_refs = vec![ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE1_NAME),
-            col_idx: 0,
-        }, ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE2_NAME),
-            col_idx: 0,
-        }];
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), 0.2);
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev, &column_refs), 0.2);
+        let column_refs = vec![
+            ColumnRef::BaseTableColumnRef {
+                table: String::from(TABLE1_NAME),
+                col_idx: 0,
+            },
+            ColumnRef::BaseTableColumnRef {
+                table: String::from(TABLE2_NAME),
+                col_idx: 0,
+            },
+        ];
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs),
+            0.2
+        );
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(
+                &cost_model,
+                false,
+                JoinType::Inner,
+                expr_tree_rev,
+                &column_refs
+            ),
+            0.2
+        );
     }
 
     #[test]
     fn test_joinsel_inner_colref_eq_colref_same_table_is_not_oncond() {
-        let cost_model = create_two_table_cost_model(TestPerColumnStats::new(
-            TestMostCommonValues::empty(),
-            5,
-            0.0,
-            TestDistribution::empty(),
-        ), TestPerColumnStats::new(
-            TestMostCommonValues::empty(),
-            4,
-            0.0,
-            TestDistribution::empty(),
-        ));
+        let cost_model = create_two_table_cost_model(
+            TestPerColumnStats::new(
+                TestMostCommonValues::empty(),
+                5,
+                0.0,
+                TestDistribution::empty(),
+            ),
+            TestPerColumnStats::new(
+                TestMostCommonValues::empty(),
+                4,
+                0.0,
+                TestDistribution::empty(),
+            ),
+        );
         let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(0));
-        let column_refs = vec![ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE1_NAME),
-            col_idx: 0,
-        }, ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE2_NAME),
-            col_idx: 0,
-        }];
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), DEFAULT_EQ_SEL);
+        let column_refs = vec![
+            ColumnRef::BaseTableColumnRef {
+                table: String::from(TABLE1_NAME),
+                col_idx: 0,
+            },
+            ColumnRef::BaseTableColumnRef {
+                table: String::from(TABLE2_NAME),
+                col_idx: 0,
+            },
+        ];
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs),
+            DEFAULT_EQ_SEL
+        );
     }
 
     // We don't test joinsel or with oncond because if there is an oncond (on condition), the top-level operator must be an AND
 
     /// I made this helper function to avoid copying all eight lines over and over
-    fn assert_joinsel_outer_selectivities(cost_model: &TestOptCostModel, expr_tree: OptRelNodeRef, expr_tree_rev: OptRelNodeRef, column_refs: &GroupColumnRefs, expected_table1_outer_sel: f64, expected_table2_outer_sel: f64) {
+    fn assert_joinsel_outer_selectivities(
+        cost_model: &TestOptCostModel,
+        expr_tree: OptRelNodeRef,
+        expr_tree_rev: OptRelNodeRef,
+        column_refs: &GroupColumnRefs,
+        expected_table1_outer_sel: f64,
+        expected_table2_outer_sel: f64,
+    ) {
         // all table 1 outer combinations
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::LeftOuter, expr_tree.clone(), &column_refs), expected_table1_outer_sel);
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), expected_table1_outer_sel);
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, true, JoinType::RightOuter, expr_tree.clone(), &column_refs), expected_table1_outer_sel);
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, true, JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), expected_table1_outer_sel);
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(
+                &cost_model,
+                false,
+                JoinType::LeftOuter,
+                expr_tree.clone(),
+                &column_refs
+            ),
+            expected_table1_outer_sel
+        );
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(
+                &cost_model,
+                false,
+                JoinType::LeftOuter,
+                expr_tree_rev.clone(),
+                &column_refs
+            ),
+            expected_table1_outer_sel
+        );
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(
+                &cost_model,
+                true,
+                JoinType::RightOuter,
+                expr_tree.clone(),
+                &column_refs
+            ),
+            expected_table1_outer_sel
+        );
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(
+                &cost_model,
+                true,
+                JoinType::RightOuter,
+                expr_tree_rev.clone(),
+                &column_refs
+            ),
+            expected_table1_outer_sel
+        );
         // all table 2 outer combinations
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, true, JoinType::LeftOuter, expr_tree.clone(), &column_refs), expected_table2_outer_sel);
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, true, JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), expected_table2_outer_sel);
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::RightOuter, expr_tree.clone(), &column_refs), expected_table2_outer_sel);
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), expected_table2_outer_sel);
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(
+                &cost_model,
+                true,
+                JoinType::LeftOuter,
+                expr_tree.clone(),
+                &column_refs
+            ),
+            expected_table2_outer_sel
+        );
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(
+                &cost_model,
+                true,
+                JoinType::LeftOuter,
+                expr_tree_rev.clone(),
+                &column_refs
+            ),
+            expected_table2_outer_sel
+        );
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(
+                &cost_model,
+                false,
+                JoinType::RightOuter,
+                expr_tree.clone(),
+                &column_refs
+            ),
+            expected_table2_outer_sel
+        );
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(
+                &cost_model,
+                false,
+                JoinType::RightOuter,
+                expr_tree_rev.clone(),
+                &column_refs
+            ),
+            expected_table2_outer_sel
+        );
     }
 
     /// Unique oncond means an oncondition on columns which are unique in both tables
@@ -2047,66 +2318,132 @@ mod tests {
     ///   of one table and = 1 / row count of another
     #[test]
     fn test_joinsel_outer_unique_oncond() {
-        let cost_model = create_two_table_cost_model_custom_row_cnts(TestPerColumnStats::new(
-            TestMostCommonValues::empty(),
+        let cost_model = create_two_table_cost_model_custom_row_cnts(
+            TestPerColumnStats::new(
+                TestMostCommonValues::empty(),
+                5,
+                0.0,
+                TestDistribution::empty(),
+            ),
+            TestPerColumnStats::new(
+                TestMostCommonValues::empty(),
+                4,
+                0.0,
+                TestDistribution::empty(),
+            ),
             5,
-            0.0,
-            TestDistribution::empty(),
-        ), TestPerColumnStats::new(
-            TestMostCommonValues::empty(),
             4,
-            0.0,
-            TestDistribution::empty(),
-        ), 5, 4);
+        );
         // the left/right of the join refers to the tables, not the order of columns in the predicate
         let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1));
         let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0));
-        let column_refs = vec![ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE1_NAME),
-            col_idx: 0,
-        }, ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE2_NAME),
-            col_idx: 0,
-        }];
+        let column_refs = vec![
+            ColumnRef::BaseTableColumnRef {
+                table: String::from(TABLE1_NAME),
+                col_idx: 0,
+            },
+            ColumnRef::BaseTableColumnRef {
+                table: String::from(TABLE2_NAME),
+                col_idx: 0,
+            },
+        ];
         // sanity check the expected inner sel
         let expected_inner_sel = 0.2;
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel);
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel);
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(
+                &cost_model,
+                false,
+                JoinType::Inner,
+                expr_tree.clone(),
+                &column_refs
+            ),
+            expected_inner_sel
+        );
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(
+                &cost_model,
+                false,
+                JoinType::Inner,
+                expr_tree_rev.clone(),
+                &column_refs
+            ),
+            expected_inner_sel
+        );
         // check the outer sels
-        assert_joinsel_outer_selectivities(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.25, 0.2);
+        assert_joinsel_outer_selectivities(
+            &cost_model,
+            expr_tree,
+            expr_tree_rev,
+            &column_refs,
+            0.25,
+            0.2,
+        );
     }
 
     /// Non-unique oncond means the column is not unique in either table
     /// Inner always >= row count means that the inner join result is >= 1 / the row count of both tables
     #[test]
     fn test_joinsel_outer_nonunique_oncond_inner_always_geq_rowcnt() {
-        let cost_model = create_two_table_cost_model_custom_row_cnts(TestPerColumnStats::new(
-            TestMostCommonValues::empty(),
-            5,
-            0.0,
-            TestDistribution::empty(),
-        ), TestPerColumnStats::new(
-            TestMostCommonValues::empty(),
-            4,
-            0.0,
-            TestDistribution::empty(),
-        ), 10, 8);
+        let cost_model = create_two_table_cost_model_custom_row_cnts(
+            TestPerColumnStats::new(
+                TestMostCommonValues::empty(),
+                5,
+                0.0,
+                TestDistribution::empty(),
+            ),
+            TestPerColumnStats::new(
+                TestMostCommonValues::empty(),
+                4,
+                0.0,
+                TestDistribution::empty(),
+            ),
+            10,
+            8,
+        );
         // the left/right of the join refers to the tables, not the order of columns in the predicate
         let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1));
         let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0));
-        let column_refs = vec![ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE1_NAME),
-            col_idx: 0,
-        }, ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE2_NAME),
-            col_idx: 0,
-        }];
+        let column_refs = vec![
+            ColumnRef::BaseTableColumnRef {
+                table: String::from(TABLE1_NAME),
+                col_idx: 0,
+            },
+            ColumnRef::BaseTableColumnRef {
+                table: String::from(TABLE2_NAME),
+                col_idx: 0,
+            },
+        ];
         // sanity check the expected inner sel
         let expected_inner_sel = 0.2;
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel);
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel);
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(
+                &cost_model,
+                false,
+                JoinType::Inner,
+                expr_tree.clone(),
+                &column_refs
+            ),
+            expected_inner_sel
+        );
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(
+                &cost_model,
+                false,
+                JoinType::Inner,
+                expr_tree_rev.clone(),
+                &column_refs
+            ),
+            expected_inner_sel
+        );
         // check the outer sels
-        assert_joinsel_outer_selectivities(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.2, 0.2);
+        assert_joinsel_outer_selectivities(
+            &cost_model,
+            expr_tree,
+            expr_tree_rev,
+            &column_refs,
+            0.2,
+            0.2,
+        );
     }
 
     /// Non-unique oncond means the column is not unique in either table
@@ -2114,33 +2451,66 @@ mod tests {
     ///   Note that without a join filter, it's impossible to be less than the row count of both tables
     #[test]
     fn test_joinsel_outer_nonunique_oncond_inner_sometimes_lt_rowcnt() {
-        let cost_model = create_two_table_cost_model_custom_row_cnts(TestPerColumnStats::new(
-            TestMostCommonValues::empty(),
-            10,
-            0.0,
-            TestDistribution::empty(),
-        ), TestPerColumnStats::new(
-            TestMostCommonValues::empty(),
-            2,
-            0.0,
-            TestDistribution::empty(),
-        ), 20, 4);
+        let cost_model = create_two_table_cost_model_custom_row_cnts(
+            TestPerColumnStats::new(
+                TestMostCommonValues::empty(),
+                10,
+                0.0,
+                TestDistribution::empty(),
+            ),
+            TestPerColumnStats::new(
+                TestMostCommonValues::empty(),
+                2,
+                0.0,
+                TestDistribution::empty(),
+            ),
+            20,
+            4,
+        );
         // the left/right of the join refers to the tables, not the order of columns in the predicate
         let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1));
         let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0));
-        let column_refs = vec![ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE1_NAME),
-            col_idx: 0,
-        }, ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE2_NAME),
-            col_idx: 0,
-        }];
+        let column_refs = vec![
+            ColumnRef::BaseTableColumnRef {
+                table: String::from(TABLE1_NAME),
+                col_idx: 0,
+            },
+            ColumnRef::BaseTableColumnRef {
+                table: String::from(TABLE2_NAME),
+                col_idx: 0,
+            },
+        ];
         // sanity check the expected inner sel
         let expected_inner_sel = 0.1;
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel);
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel);
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(
+                &cost_model,
+                false,
+                JoinType::Inner,
+                expr_tree.clone(),
+                &column_refs
+            ),
+            expected_inner_sel
+        );
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(
+                &cost_model,
+                false,
+                JoinType::Inner,
+                expr_tree_rev.clone(),
+                &column_refs
+            ),
+            expected_inner_sel
+        );
         // check the outer sels
-        assert_joinsel_outer_selectivities(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.25, 0.1);
+        assert_joinsel_outer_selectivities(
+            &cost_model,
+            expr_tree,
+            expr_tree_rev,
+            &column_refs,
+            0.25,
+            0.1,
+        );
     }
 
     /// Unique oncond means an oncondition on columns which are unique in both tables
@@ -2148,19 +2518,22 @@ mod tests {
     /// There's only one case if both columns are unique and there's a filter: the inner will be < 1 / row count of both tables
     #[test]
     fn test_joinsel_outer_unique_oncond_filter() {
-        let cost_model = create_two_table_cost_model_custom_row_cnts(TestPerColumnStats::new(
-            TestMostCommonValues::empty(),
+        let cost_model = create_two_table_cost_model_custom_row_cnts(
+            TestPerColumnStats::new(
+                TestMostCommonValues::empty(),
+                50,
+                0.0,
+                TestDistribution::new(vec![(Value::Int32(128), 0.4)]),
+            ),
+            TestPerColumnStats::new(
+                TestMostCommonValues::empty(),
+                4,
+                0.0,
+                TestDistribution::empty(),
+            ),
             50,
-            0.0,
-            TestDistribution::new(vec![
-                (Value::Int32(128), 0.4)
-            ]),
-        ), TestPerColumnStats::new(
-            TestMostCommonValues::empty(),
             4,
-            0.0,
-            TestDistribution::empty(),
-        ), 50, 4);
+        );
         // the left/right of the join refers to the tables, not the order of columns in the predicate
         let eq0and1 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1));
         let eq1and0 = bin_op(BinOpType::Eq, col_ref(1), col_ref(0));
@@ -2168,19 +2541,47 @@ mod tests {
         let expr_tree = log_op(LogOpType::And, vec![eq0and1, filter.clone()]);
         // inner rev means its the inner expr (the eq op) whose children are being reversed, as opposed to the and op
         let expr_tree_inner_rev = log_op(LogOpType::And, vec![eq1and0, filter.clone()]);
-        let column_refs = vec![ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE1_NAME),
-            col_idx: 0,
-        }, ColumnRef::BaseTableColumnRef {
-            table: String::from(TABLE2_NAME),
-            col_idx: 0,
-        }];
+        let column_refs = vec![
+            ColumnRef::BaseTableColumnRef {
+                table: String::from(TABLE1_NAME),
+                col_idx: 0,
+            },
+            ColumnRef::BaseTableColumnRef {
+                table: String::from(TABLE2_NAME),
+                col_idx: 0,
+            },
+        ];
         // sanity check the expected inner sel
         let expected_inner_sel = 0.008;
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel);
-        assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_inner_rev.clone(), &column_refs), expected_inner_sel);
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(
+                &cost_model,
+                false,
+                JoinType::Inner,
+                expr_tree.clone(),
+                &column_refs
+            ),
+            expected_inner_sel
+        );
+        assert_approx_eq::assert_approx_eq!(
+            test_get_join_selectivity(
+                &cost_model,
+                false,
+                JoinType::Inner,
+                expr_tree_inner_rev.clone(),
+                &column_refs
+            ),
+            expected_inner_sel
+        );
         // check the outer sels
-        assert_joinsel_outer_selectivities(&cost_model, expr_tree, expr_tree_inner_rev, &column_refs, 0.25, 0.02);
+        assert_joinsel_outer_selectivities(
+            &cost_model,
+            expr_tree,
+            expr_tree_inner_rev,
+            &column_refs,
+            0.25,
+            0.02,
+        );
     }
 
     // I didn't test any non-unique cases with filter. The non-unique tests without filter should cover that

From 2f5d67429f34d4160b23791b1b664181a7836e55 Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 12:34:10 -0400
Subject: [PATCH 24/29] clippy

---
 optd-datafusion-repr/src/cost/base_cost.rs | 37 +++++++++++-----------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index 66dbbbc1..87d91656 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -810,6 +810,7 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
                 right_row_cnt,
             )
         } else {
+            #[allow(clippy::collapsible_else_if)]
             if let Some(on_col_ref_pair) = Self::get_on_col_ref_pair(expr_tree.clone(), column_refs)
             {
                 self.get_join_selectivity_core(
@@ -889,11 +890,11 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
     /// Note that the selectivity of the on conditions does not depend on join type. Join type is accounted for separately in get_join_selectivity_core()
     fn get_join_on_selectivity(
         &self,
-        on_col_ref_pairs: &Vec<(ColumnRefExpr, ColumnRefExpr)>,
+        on_col_ref_pairs: &[(ColumnRefExpr, ColumnRefExpr)],
         column_refs: &GroupColumnRefs,
     ) -> f64 {
         // multiply the selectivities of all individual conditions together
-        on_col_ref_pairs.into_iter().map(|on_col_ref_pair| {
+        on_col_ref_pairs.iter().map(|on_col_ref_pair| {
             // the formula for each pair is min(1 / ndistinct1, 1 / ndistinct2) (see https://postgrespro.com/blog/pgsql/5969618)
             let ndistincts = vec![&on_col_ref_pair.0, &on_col_ref_pair.1].into_iter().map(|on_col_ref_expr| {
                 match self.get_per_column_stats_from_col_ref(&column_refs[on_col_ref_expr.index()]) {
@@ -2232,82 +2233,82 @@ mod tests {
         // all table 1 outer combinations
         assert_approx_eq::assert_approx_eq!(
             test_get_join_selectivity(
-                &cost_model,
+                cost_model,
                 false,
                 JoinType::LeftOuter,
                 expr_tree.clone(),
-                &column_refs
+                column_refs
             ),
             expected_table1_outer_sel
         );
         assert_approx_eq::assert_approx_eq!(
             test_get_join_selectivity(
-                &cost_model,
+                cost_model,
                 false,
                 JoinType::LeftOuter,
                 expr_tree_rev.clone(),
-                &column_refs
+                column_refs
             ),
             expected_table1_outer_sel
         );
         assert_approx_eq::assert_approx_eq!(
             test_get_join_selectivity(
-                &cost_model,
+                cost_model,
                 true,
                 JoinType::RightOuter,
                 expr_tree.clone(),
-                &column_refs
+                column_refs
             ),
             expected_table1_outer_sel
         );
         assert_approx_eq::assert_approx_eq!(
             test_get_join_selectivity(
-                &cost_model,
+                cost_model,
                 true,
                 JoinType::RightOuter,
                 expr_tree_rev.clone(),
-                &column_refs
+                column_refs
             ),
             expected_table1_outer_sel
         );
         // all table 2 outer combinations
         assert_approx_eq::assert_approx_eq!(
             test_get_join_selectivity(
-                &cost_model,
+                cost_model,
                 true,
                 JoinType::LeftOuter,
                 expr_tree.clone(),
-                &column_refs
+                column_refs
             ),
             expected_table2_outer_sel
         );
         assert_approx_eq::assert_approx_eq!(
             test_get_join_selectivity(
-                &cost_model,
+                cost_model,
                 true,
                 JoinType::LeftOuter,
                 expr_tree_rev.clone(),
-                &column_refs
+                column_refs
             ),
             expected_table2_outer_sel
         );
         assert_approx_eq::assert_approx_eq!(
             test_get_join_selectivity(
-                &cost_model,
+                cost_model,
                 false,
                 JoinType::RightOuter,
                 expr_tree.clone(),
-                &column_refs
+                column_refs
             ),
             expected_table2_outer_sel
         );
         assert_approx_eq::assert_approx_eq!(
             test_get_join_selectivity(
-                &cost_model,
+                cost_model,
                 false,
                 JoinType::RightOuter,
                 expr_tree_rev.clone(),
-                &column_refs
+                column_refs
             ),
             expected_table2_outer_sel
         );

From cfbca671df004607ea7e162528014812c695777b Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 12:52:00 -0400
Subject: [PATCH 25/29] hash join working

---
 optd-datafusion-repr/src/cost/base_cost.rs | 80 +++++++++++++++-------
 1 file changed, 56 insertions(+), 24 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index 87d91656..e9d4dddc 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -466,11 +466,8 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
                             let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false);
                             // there may be more than one expression tree in a group (you can see this trivially as you can just swap the order of two subtrees for commutative operators)
                             // however, we just take an arbitrary expression tree from the group to compute selectivity
-                            if let Some(expr_tree) = expr_trees.first() {
-                                self.get_filter_selectivity(Arc::clone(expr_tree), &column_refs)
-                            } else {
-                                panic!("encountered a PhysicalFilter without an expression")
-                            }
+                            let expr_tree = expr_trees.first().expect("expression missing");
+                            self.get_filter_selectivity(expr_tree.clone(), &column_refs)
                         } else {
                             DEFAULT_UNK_SEL
                         }
@@ -499,17 +496,14 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
                             let expr_group_id = context.children_group_ids[2];
                             let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false);
                             // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information
-                            if let Some(expr_tree) = expr_trees.first() {
-                                self.get_join_selectivity(
-                                    *join_typ,
-                                    Arc::clone(expr_tree),
-                                    &column_refs,
-                                    row_cnt_1,
-                                    row_cnt_2,
-                                )
-                            } else {
-                                panic!("encountered a join without an expression")
-                            }
+                            let expr_tree = expr_trees.first().expect("expression missing");
+                            self.get_join_selectivity_from_expr_tree(
+                                *join_typ,
+                                expr_tree.clone(),
+                                &column_refs,
+                                row_cnt_1,
+                                row_cnt_2,
+                            )
                         } else {
                             DEFAULT_UNK_SEL
                         }
@@ -527,10 +521,38 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
                 let (_, compute_cost, _) = Self::cost_tuple(&children[1]);
                 Self::cost(row_cnt, compute_cost * row_cnt, 0.0)
             }
-            OptRelNodeTyp::PhysicalHashJoin(_) => {
+            OptRelNodeTyp::PhysicalHashJoin(join_typ) => {
                 let (row_cnt_1, _, _) = Self::cost_tuple(&children[0]);
                 let (row_cnt_2, _, _) = Self::cost_tuple(&children[1]);
-                let selectivity = DEFAULT_UNK_SEL;
+                let selectivity = match context {
+                    Some(context) => {
+                        if let Some(optimizer) = optimizer {
+                            let column_refs = optimizer
+                                .get_property_by_group::<ColumnRefPropertyBuilder>(
+                                    context.group_id,
+                                    1,
+                                );
+                            let left_keys_group_id = context.children_group_ids[2];
+                            let right_keys_group_id = context.children_group_ids[3];
+                            let left_keys_list = optimizer.get_all_group_bindings(left_keys_group_id, false);
+                            let right_keys_list = optimizer.get_all_group_bindings(right_keys_group_id, false);
+                            // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information
+                            let left_keys = left_keys_list.first().expect("left keys missing");
+                            let right_keys = right_keys_list.first().expect("right keys missing");
+                            self.get_join_selectivity_from_keys(
+                                *join_typ,
+                                ExprList::from_rel_node(left_keys.clone()).expect("left_keys should be an ExprList"),
+                                ExprList::from_rel_node(right_keys.clone()).expect("right_keys should be an ExprList"),
+                                &column_refs,
+                                row_cnt_1,
+                                row_cnt_2,
+                            )
+                        } else {
+                            DEFAULT_UNK_SEL
+                        }
+                    }
+                    None => DEFAULT_UNK_SEL,
+                };
                 Self::cost(
                     (row_cnt_1 * row_cnt_2 * selectivity).max(1.0),
                     row_cnt_1 * 2.0 + row_cnt_2,
@@ -768,7 +790,7 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
     /// The expr_tree input must be a "mixed expression tree", just like with get_filter_selectivity()
     /// This is a "wrapper" to separate the equality conditions from the filter conditions before calling
     ///   the "main" get_join_selectivity_core() function.
-    fn get_join_selectivity(
+    fn get_join_selectivity_from_expr_tree(
         &self,
         join_typ: JoinType,
         expr_tree: OptRelNodeRef,
@@ -834,6 +856,16 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         }
     }
 
+    /// A wrapper to convert the join keys to the format expected by get_join_selectivity_core()
+    fn get_join_selectivity_from_keys(&self, join_typ: JoinType, left_keys: ExprList, right_keys: ExprList, column_refs: &GroupColumnRefs, left_row_cnt: f64, right_row_cnt: f64) -> f64 {
+        assert!(left_keys.len() == right_keys.len());
+        // I assume that the keys are already in the right order s.t. the ith key of left_keys corresponds with the ith key of right_keys
+        let on_col_ref_pairs = left_keys.to_vec().into_iter().zip(right_keys.to_vec().into_iter()).map(|(left_key, right_key)| {
+            (ColumnRefExpr::from_rel_node(left_key.into_rel_node()).expect("keys should be ColumnRefExprs"), ColumnRefExpr::from_rel_node(right_key.into_rel_node()).expect("keys should be ColumnRefExprs"))
+        }).collect_vec();
+        self.get_join_selectivity_core(join_typ, on_col_ref_pairs, None, column_refs, left_row_cnt, right_row_cnt)
+    }
+
     /// The core logic of join selectivity which assumes we've already separated the expression into the on conditions and the filters
     fn get_join_selectivity_core(
         &self,
@@ -1950,7 +1982,7 @@ mod tests {
         );
     }
 
-    /// A wrapper around get_join_selectivity that extracts the table row counts from the cost model
+    /// A wrapper around get_join_selectivity_from_expr_tree that extracts the table row counts from the cost model
     fn test_get_join_selectivity(
         cost_model: &TestOptCostModel,
         reverse_tables: bool,
@@ -1961,7 +1993,7 @@ mod tests {
         let table1_row_cnt = cost_model.per_table_stats_map[TABLE1_NAME].row_cnt as f64;
         let table2_row_cnt = cost_model.per_table_stats_map[TABLE2_NAME].row_cnt as f64;
         if !reverse_tables {
-            cost_model.get_join_selectivity(
+            cost_model.get_join_selectivity_from_expr_tree(
                 join_typ,
                 expr_tree,
                 column_refs,
@@ -1969,7 +2001,7 @@ mod tests {
                 table2_row_cnt,
             )
         } else {
-            cost_model.get_join_selectivity(
+            cost_model.get_join_selectivity_from_expr_tree(
                 join_typ,
                 expr_tree,
                 column_refs,
@@ -1983,7 +2015,7 @@ mod tests {
     fn test_joinsel_inner_const() {
         let cost_model = create_one_column_cost_model(get_empty_per_col_stats());
         assert_approx_eq::assert_approx_eq!(
-            cost_model.get_join_selectivity(
+            cost_model.get_join_selectivity_from_expr_tree(
                 JoinType::Inner,
                 cnst(Value::Bool(true)),
                 &vec![],
@@ -1993,7 +2025,7 @@ mod tests {
             1.0
         );
         assert_approx_eq::assert_approx_eq!(
-            cost_model.get_join_selectivity(
+            cost_model.get_join_selectivity_from_expr_tree(
                 JoinType::Inner,
                 cnst(Value::Bool(false)),
                 &vec![],

From 0a38dde0340557157ead325ed9542bb3efcd7f46 Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 12:52:47 -0400
Subject: [PATCH 26/29] clip

---
 optd-datafusion-repr/src/cost/base_cost.rs | 47 +++++++++++++++++-----
 1 file changed, 38 insertions(+), 9 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index e9d4dddc..53948a13 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -534,15 +534,19 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
                                 );
                             let left_keys_group_id = context.children_group_ids[2];
                             let right_keys_group_id = context.children_group_ids[3];
-                            let left_keys_list = optimizer.get_all_group_bindings(left_keys_group_id, false);
-                            let right_keys_list = optimizer.get_all_group_bindings(right_keys_group_id, false);
+                            let left_keys_list =
+                                optimizer.get_all_group_bindings(left_keys_group_id, false);
+                            let right_keys_list =
+                                optimizer.get_all_group_bindings(right_keys_group_id, false);
                             // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information
                             let left_keys = left_keys_list.first().expect("left keys missing");
                             let right_keys = right_keys_list.first().expect("right keys missing");
                             self.get_join_selectivity_from_keys(
                                 *join_typ,
-                                ExprList::from_rel_node(left_keys.clone()).expect("left_keys should be an ExprList"),
-                                ExprList::from_rel_node(right_keys.clone()).expect("right_keys should be an ExprList"),
+                                ExprList::from_rel_node(left_keys.clone())
+                                    .expect("left_keys should be an ExprList"),
+                                ExprList::from_rel_node(right_keys.clone())
+                                    .expect("right_keys should be an ExprList"),
                                 &column_refs,
                                 row_cnt_1,
                                 row_cnt_2,
@@ -857,13 +861,38 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
     }
 
     /// A wrapper to convert the join keys to the format expected by get_join_selectivity_core()
-    fn get_join_selectivity_from_keys(&self, join_typ: JoinType, left_keys: ExprList, right_keys: ExprList, column_refs: &GroupColumnRefs, left_row_cnt: f64, right_row_cnt: f64) -> f64 {
+    fn get_join_selectivity_from_keys(
+        &self,
+        join_typ: JoinType,
+        left_keys: ExprList,
+        right_keys: ExprList,
+        column_refs: &GroupColumnRefs,
+        left_row_cnt: f64,
+        right_row_cnt: f64,
+    ) -> f64 {
         assert!(left_keys.len() == right_keys.len());
         // I assume that the keys are already in the right order s.t. the ith key of left_keys corresponds with the ith key of right_keys
-        let on_col_ref_pairs = left_keys.to_vec().into_iter().zip(right_keys.to_vec().into_iter()).map(|(left_key, right_key)| {
-            (ColumnRefExpr::from_rel_node(left_key.into_rel_node()).expect("keys should be ColumnRefExprs"), ColumnRefExpr::from_rel_node(right_key.into_rel_node()).expect("keys should be ColumnRefExprs"))
-        }).collect_vec();
-        self.get_join_selectivity_core(join_typ, on_col_ref_pairs, None, column_refs, left_row_cnt, right_row_cnt)
+        let on_col_ref_pairs = left_keys
+            .to_vec()
+            .into_iter()
+            .zip(right_keys.to_vec())
+            .map(|(left_key, right_key)| {
+                (
+                    ColumnRefExpr::from_rel_node(left_key.into_rel_node())
+                        .expect("keys should be ColumnRefExprs"),
+                    ColumnRefExpr::from_rel_node(right_key.into_rel_node())
+                        .expect("keys should be ColumnRefExprs"),
+                )
+            })
+            .collect_vec();
+        self.get_join_selectivity_core(
+            join_typ,
+            on_col_ref_pairs,
+            None,
+            column_refs,
+            left_row_cnt,
+            right_row_cnt,
+        )
     }
 
     /// The core logic of join selectivity which assumes we've already separated the expression into the on conditions and the filters

From 805aaa99e76d421deba12902a7e99685d20312a0 Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 14:49:04 -0400
Subject: [PATCH 27/29] fixed context and optimizer stuff

---
 optd-datafusion-repr/src/cost/base_cost.rs | 136 +++++++++------------
 1 file changed, 60 insertions(+), 76 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index 53948a13..cfada1c0 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -454,27 +454,21 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
             OptRelNodeTyp::PhysicalFilter => {
                 let (row_cnt, _, _) = Self::cost_tuple(&children[0]);
                 let (_, compute_cost, _) = Self::cost_tuple(&children[1]);
-                let selectivity = match context {
-                    Some(context) => {
-                        if let Some(optimizer) = optimizer {
-                            let column_refs = optimizer
-                                .get_property_by_group::<ColumnRefPropertyBuilder>(
-                                    context.group_id,
-                                    1,
-                                );
-                            let expr_group_id = context.children_group_ids[1];
-                            let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false);
-                            // there may be more than one expression tree in a group (you can see this trivially as you can just swap the order of two subtrees for commutative operators)
-                            // however, we just take an arbitrary expression tree from the group to compute selectivity
-                            let expr_tree = expr_trees.first().expect("expression missing");
-                            self.get_filter_selectivity(expr_tree.clone(), &column_refs)
-                        } else {
-                            DEFAULT_UNK_SEL
-                        }
-                    }
-                    None => DEFAULT_UNK_SEL,
+                let selectivity = if let (Some(context), Some(optimizer)) = (context, optimizer) {
+                    let column_refs = optimizer
+                        .get_property_by_group::<ColumnRefPropertyBuilder>(
+                            context.group_id,
+                            1,
+                        );
+                    let expr_group_id = context.children_group_ids[1];
+                    let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false);
+                    // there may be more than one expression tree in a group (you can see this trivially as you can just swap the order of two subtrees for commutative operators)
+                    // however, we just take an arbitrary expression tree from the group to compute selectivity
+                    let expr_tree = expr_trees.first().expect("expression missing");
+                    self.get_filter_selectivity(expr_tree.clone(), &column_refs)
+                } else {
+                    DEFAULT_UNK_SEL
                 };
-
                 Self::cost(
                     (row_cnt * selectivity).max(1.0),
                     row_cnt * compute_cost,
@@ -485,30 +479,25 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
                 let (row_cnt_1, _, _) = Self::cost_tuple(&children[0]);
                 let (row_cnt_2, _, _) = Self::cost_tuple(&children[1]);
                 let (_, compute_cost, _) = Self::cost_tuple(&children[2]);
-                let selectivity = match context {
-                    Some(context) => {
-                        if let Some(optimizer) = optimizer {
-                            let column_refs = optimizer
-                                .get_property_by_group::<ColumnRefPropertyBuilder>(
-                                    context.group_id,
-                                    1,
-                                );
-                            let expr_group_id = context.children_group_ids[2];
-                            let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false);
-                            // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information
-                            let expr_tree = expr_trees.first().expect("expression missing");
-                            self.get_join_selectivity_from_expr_tree(
-                                *join_typ,
-                                expr_tree.clone(),
-                                &column_refs,
-                                row_cnt_1,
-                                row_cnt_2,
-                            )
-                        } else {
-                            DEFAULT_UNK_SEL
-                        }
-                    }
-                    None => DEFAULT_UNK_SEL,
+                let selectivity = if let (Some(context), Some(optimizer)) = (context, optimizer) {
+                    let column_refs = optimizer
+                        .get_property_by_group::<ColumnRefPropertyBuilder>(
+                            context.group_id,
+                            1,
+                        );
+                    let expr_group_id = context.children_group_ids[2];
+                    let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false);
+                    // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information
+                    let expr_tree = expr_trees.first().expect("expression missing");
+                    self.get_join_selectivity_from_expr_tree(
+                        *join_typ,
+                        expr_tree.clone(),
+                        &column_refs,
+                        row_cnt_1,
+                        row_cnt_2,
+                    )
+                } else {
+                    DEFAULT_UNK_SEL
                 };
                 Self::cost(
                     (row_cnt_1 * row_cnt_2 * selectivity).max(1.0),
@@ -524,38 +513,33 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
             OptRelNodeTyp::PhysicalHashJoin(join_typ) => {
                 let (row_cnt_1, _, _) = Self::cost_tuple(&children[0]);
                 let (row_cnt_2, _, _) = Self::cost_tuple(&children[1]);
-                let selectivity = match context {
-                    Some(context) => {
-                        if let Some(optimizer) = optimizer {
-                            let column_refs = optimizer
-                                .get_property_by_group::<ColumnRefPropertyBuilder>(
-                                    context.group_id,
-                                    1,
-                                );
-                            let left_keys_group_id = context.children_group_ids[2];
-                            let right_keys_group_id = context.children_group_ids[3];
-                            let left_keys_list =
-                                optimizer.get_all_group_bindings(left_keys_group_id, false);
-                            let right_keys_list =
-                                optimizer.get_all_group_bindings(right_keys_group_id, false);
-                            // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information
-                            let left_keys = left_keys_list.first().expect("left keys missing");
-                            let right_keys = right_keys_list.first().expect("right keys missing");
-                            self.get_join_selectivity_from_keys(
-                                *join_typ,
-                                ExprList::from_rel_node(left_keys.clone())
-                                    .expect("left_keys should be an ExprList"),
-                                ExprList::from_rel_node(right_keys.clone())
-                                    .expect("right_keys should be an ExprList"),
-                                &column_refs,
-                                row_cnt_1,
-                                row_cnt_2,
-                            )
-                        } else {
-                            DEFAULT_UNK_SEL
-                        }
-                    }
-                    None => DEFAULT_UNK_SEL,
+                let selectivity = if let (Some(context), Some(optimizer)) = (context, optimizer) {
+                    let column_refs = optimizer
+                        .get_property_by_group::<ColumnRefPropertyBuilder>(
+                            context.group_id,
+                            1,
+                        );
+                    let left_keys_group_id = context.children_group_ids[2];
+                    let right_keys_group_id = context.children_group_ids[3];
+                    let left_keys_list =
+                        optimizer.get_all_group_bindings(left_keys_group_id, false);
+                    let right_keys_list =
+                        optimizer.get_all_group_bindings(right_keys_group_id, false);
+                    // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information
+                    let left_keys = left_keys_list.first().expect("left keys missing");
+                    let right_keys = right_keys_list.first().expect("right keys missing");
+                    self.get_join_selectivity_from_keys(
+                        *join_typ,
+                        ExprList::from_rel_node(left_keys.clone())
+                            .expect("left_keys should be an ExprList"),
+                        ExprList::from_rel_node(right_keys.clone())
+                            .expect("right_keys should be an ExprList"),
+                        &column_refs,
+                        row_cnt_1,
+                        row_cnt_2,
+                    )
+                } else {
+                    DEFAULT_UNK_SEL
                 };
                 Self::cost(
                     (row_cnt_1 * row_cnt_2 * selectivity).max(1.0),

From 900a10b9a660c178432448afb05243a7c580b4fc Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 14:51:45 -0400
Subject: [PATCH 28/29] pr changes

---
 optd-datafusion-repr/src/cost/base_cost.rs | 23 +++++++++-------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index cfada1c0..5c8802d6 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -725,13 +725,13 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
         }
     }
 
-    /// Check if an expr_tree is a join condition, returning the join on col ref pair if it is
-    /// The reason the check and the info are in the same function is because their code is almost identical
+    /// Check if an expr_tree is a join condition, returning the join on col ref pair if it is.
+    /// The check and the info are in the same function because their code is almost identical.
+    /// It only picks out equality conditions between two column refs on different tables.
     fn get_on_col_ref_pair(
         expr_tree: OptRelNodeRef,
         column_refs: &GroupColumnRefs,
     ) -> Option<(ColumnRefExpr, ColumnRefExpr)> {
-        // We perform three checks to see if a child_expr_tree is an on_col_ref_pair
         // 1. Check that it's equality
         if expr_tree.typ == OptRelNodeTyp::BinOp(BinOpType::Eq) {
             let left_child = expr_tree.child(0);
@@ -747,18 +747,13 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
                     .expect("we already checked that the type is ColumnRef");
                 let left_col_ref = &column_refs[left_col_ref_expr.index()];
                 let right_col_ref = &column_refs[right_col_ref_expr.index()];
-                let is_same_table = if let ColumnRef::BaseTableColumnRef {
+                let is_same_table = if let (ColumnRef::BaseTableColumnRef {
                     table: left_table, ..
-                } = left_col_ref
+                }, ColumnRef::BaseTableColumnRef {
+                    table: right_table, ..
+                }) = (left_col_ref, right_col_ref)
                 {
-                    if let ColumnRef::BaseTableColumnRef {
-                        table: right_table, ..
-                    } = right_col_ref
-                    {
-                        left_table == right_table
-                    } else {
-                        false
-                    }
+                    left_table == right_table
                 } else {
                     false
                 };
@@ -947,7 +942,7 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
                     None => DEFAULT_NUM_DISTINCT,
                 }
             });
-            // using reduce(f64::min) is the idiomatic workaround to the fact that f64 does not implement Ord due to NaN
+            // reduce(f64::min) is the idiomatic substitute for min(), which is unavailable because f64 does not implement Ord (due to NaN)
             let selectivity = ndistincts.map(|ndistinct| 1.0 / ndistinct as f64).reduce(f64::min).expect("reduce() only returns None if the iterator is empty, which is impossible since col_ref_exprs.len() == 2");
             assert!(!selectivity.is_nan(), "it should be impossible for selectivity to be NaN since n-distinct is never 0");
             selectivity

From d30dc19ea6783fd68f41d7463f0b255d75e3caf7 Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 31 Mar 2024 14:52:00 -0400
Subject: [PATCH 29/29] fmt and clippy

---
 optd-datafusion-repr/src/cost/base_cost.rs | 28 +++++++++-------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs
index 5c8802d6..5fc0fb1c 100644
--- a/optd-datafusion-repr/src/cost/base_cost.rs
+++ b/optd-datafusion-repr/src/cost/base_cost.rs
@@ -456,10 +456,7 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
                 let (_, compute_cost, _) = Self::cost_tuple(&children[1]);
                 let selectivity = if let (Some(context), Some(optimizer)) = (context, optimizer) {
                     let column_refs = optimizer
-                        .get_property_by_group::<ColumnRefPropertyBuilder>(
-                            context.group_id,
-                            1,
-                        );
+                        .get_property_by_group::<ColumnRefPropertyBuilder>(context.group_id, 1);
                     let expr_group_id = context.children_group_ids[1];
                     let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false);
                     // there may be more than one expression tree in a group (you can see this trivially as you can just swap the order of two subtrees for commutative operators)
@@ -481,10 +478,7 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
                 let (_, compute_cost, _) = Self::cost_tuple(&children[2]);
                 let selectivity = if let (Some(context), Some(optimizer)) = (context, optimizer) {
                     let column_refs = optimizer
-                        .get_property_by_group::<ColumnRefPropertyBuilder>(
-                            context.group_id,
-                            1,
-                        );
+                        .get_property_by_group::<ColumnRefPropertyBuilder>(context.group_id, 1);
                     let expr_group_id = context.children_group_ids[2];
                     let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false);
                     // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information
@@ -515,10 +509,7 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
                 let (row_cnt_2, _, _) = Self::cost_tuple(&children[1]);
                 let selectivity = if let (Some(context), Some(optimizer)) = (context, optimizer) {
                     let column_refs = optimizer
-                        .get_property_by_group::<ColumnRefPropertyBuilder>(
-                            context.group_id,
-                            1,
-                        );
+                        .get_property_by_group::<ColumnRefPropertyBuilder>(context.group_id, 1);
                     let left_keys_group_id = context.children_group_ids[2];
                     let right_keys_group_id = context.children_group_ids[3];
                     let left_keys_list =
@@ -747,11 +738,14 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
                     .expect("we already checked that the type is ColumnRef");
                 let left_col_ref = &column_refs[left_col_ref_expr.index()];
                 let right_col_ref = &column_refs[right_col_ref_expr.index()];
-                let is_same_table = if let (ColumnRef::BaseTableColumnRef {
-                    table: left_table, ..
-                }, ColumnRef::BaseTableColumnRef {
-                    table: right_table, ..
-                }) = (left_col_ref, right_col_ref)
+                let is_same_table = if let (
+                    ColumnRef::BaseTableColumnRef {
+                        table: left_table, ..
+                    },
+                    ColumnRef::BaseTableColumnRef {
+                        table: right_table, ..
+                    },
+                ) = (left_col_ref, right_col_ref)
                 {
                     left_table == right_table
                 } else {