From f223a93ca55aa17225f2c4ebedba9940d8ce5362 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sat, 30 Mar 2024 13:57:39 -0400 Subject: [PATCH 01/29] done with join sel skeleton --- optd-core/src/cascades/optimizer.rs | 4 + optd-datafusion-repr/src/cost/base_cost.rs | 87 ++++++++++++++++++---- 2 files changed, 75 insertions(+), 16 deletions(-) diff --git a/optd-core/src/cascades/optimizer.rs b/optd-core/src/cascades/optimizer.rs index a2e4ea7b..d24eec70 100644 --- a/optd-core/src/cascades/optimizer.rs +++ b/optd-core/src/cascades/optimizer.rs @@ -317,6 +317,10 @@ impl CascadesOptimizer { self.memo.merge_group(group_a, group_b); } + /// Get the properties of a Cascades group + /// P is the type of the property you expect + /// idx is the idx of the property you want. The order of properties is defined + /// by the property_builders parameter in CascadesOptimizer::new() pub fn get_property_by_group>( &self, group_id: GroupId, diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index 55a789ee..973d1f49 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -5,7 +5,7 @@ use crate::plan_nodes::{ }; use crate::properties::column_ref::{ColumnRefPropertyBuilder, GroupColumnRefs}; use crate::{ - plan_nodes::{OptRelNodeRef, OptRelNodeTyp}, + plan_nodes::{OptRelNodeRef, OptRelNodeTyp, JoinType}, properties::column_ref::ColumnRef, }; use arrow_schema::{ArrowError, DataType}; @@ -323,8 +323,11 @@ const DEFAULT_EQ_SEL: f64 = 0.005; const DEFAULT_INEQ_SEL: f64 = 0.3333333333333333; // Default selectivity estimate for pattern-match operators such as LIKE const DEFAULT_MATCH_SEL: f64 = 0.005; +// Default selectivity if we have no information +const DEFAULT_UNK_SEL: f64 = 0.005; -const INVALID_SEL: f64 = 0.01; +// A placeholder for todo!() for codepaths which are accessed by plannertest +const TODO_SEL: f64 = 0.01; impl OptCostModel { pub fn row_cnt(Cost(cost): &Cost) -> f64 { @@ -428,10 +431,10 @@ impl CostModel for OptCostM row_cnt.min(fetch as f64) } } else { - panic!("compute_cost() should not be called if optimizer is None") + (row_cnt * DEFAULT_UNK_SEL).max(1.0) } } else { - panic!("compute_cost() should not be called if context is None") + (row_cnt * DEFAULT_UNK_SEL).max(1.0) }; Self::cost(row_cnt, compute_cost, 0.0) } @@ -456,10 +459,10 @@ impl CostModel for OptCostM panic!("encountered a PhysicalFilter without an expression") } } else { - panic!("compute_cost() should not be called if optimizer is None") + DEFAULT_UNK_SEL } } - None => panic!("compute_cost() should not be called if context is None"), + None => DEFAULT_UNK_SEL, }; Self::cost( @@ -468,11 +471,32 @@ impl CostModel for OptCostM 0.0, ) } - OptRelNodeTyp::PhysicalNestedLoopJoin(_) => { + OptRelNodeTyp::PhysicalNestedLoopJoin(join_typ) => { let (row_cnt_1, _, _) = Self::cost_tuple(&children[0]); let (row_cnt_2, _, _) = Self::cost_tuple(&children[1]); let (_, compute_cost, _) = Self::cost_tuple(&children[2]); - let selectivity = 0.01; + let selectivity = match context { + Some(context) => { + if let Some(optimizer) = optimizer { + let column_refs = optimizer + .get_property_by_group::( + context.group_id, + 1, + ); + let expr_group_id = context.children_group_ids[2]; + let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false); + // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information + if let Some(expr_tree) = expr_trees.first() { + self.get_join_selectivity(*join_typ, Arc::clone(expr_tree), &column_refs) + } else { + panic!("encountered a join without an expression") + } + } else { + DEFAULT_UNK_SEL + } + } + None => DEFAULT_UNK_SEL, + }; Self::cost( (row_cnt_1 * row_cnt_2 * selectivity).max(1.0), row_cnt_1 * row_cnt_2 * compute_cost + row_cnt_1, @@ -580,7 +604,7 @@ impl OptCostModel { let right_child = expr_tree.child(1); if bin_op_typ.is_comparison() { - self.get_comparison_op_selectivity( + self.get_filter_comp_op_selectivity( *bin_op_typ, left_child, right_child, @@ -595,19 +619,50 @@ impl OptCostModel { } } OptRelNodeTyp::LogOp(log_op_typ) => { - self.get_log_op_selectivity(*log_op_typ, &expr_tree.children, column_refs) + self.get_filter_log_op_selectivity(*log_op_typ, &expr_tree.children, column_refs) } OptRelNodeTyp::Func(_) => todo!("check bool type or else panic"), OptRelNodeTyp::SortOrder(_) => { panic!("the selectivity of sort order expressions is undefined") } - OptRelNodeTyp::Between => INVALID_SEL, + OptRelNodeTyp::Between => TODO_SEL, OptRelNodeTyp::Cast => todo!("check bool type or else panic"), OptRelNodeTyp::Like => DEFAULT_MATCH_SEL, OptRelNodeTyp::DataType(_) => { panic!("the selectivity of a data type is not defined") } - OptRelNodeTyp::InList => INVALID_SEL, + OptRelNodeTyp::InList => TODO_SEL, + _ => unreachable!( + "all expression OptRelNodeTyp were enumerated. this should be unreachable" + ), + } + } + + /// The expr_tree input must be a "mixed expression tree", just like with get_filter_selectivity() + fn get_join_selectivity( + &self, + join_typ: JoinType, + expr_tree: OptRelNodeRef, + column_refs: &GroupColumnRefs, + ) -> f64 { + assert!(expr_tree.typ.is_expression()); + match &expr_tree.typ { + OptRelNodeTyp::Constant(_) => TODO_SEL, + OptRelNodeTyp::ColumnRef => todo!("check bool type or else panic"), + OptRelNodeTyp::UnOp(_) => todo!(), + OptRelNodeTyp::BinOp(_) => TODO_SEL, + OptRelNodeTyp::LogOp(_) => TODO_SEL, + OptRelNodeTyp::Func(_) => todo!("check bool type or else panic"), + OptRelNodeTyp::SortOrder(_) => { + panic!("the selectivity of sort order expressions is undefined") + } + OptRelNodeTyp::Between => todo!(), + OptRelNodeTyp::Cast => todo!("check bool type or else panic"), + OptRelNodeTyp::Like => todo!(), + OptRelNodeTyp::DataType(_) => { + panic!("the selectivity of a data type is not defined") + } + OptRelNodeTyp::InList => todo!(), _ => unreachable!( "all expression OptRelNodeTyp were enumerated. this should be unreachable" ), @@ -615,7 +670,7 @@ impl OptCostModel { } /// Comparison operators are the base case for recursion in get_filter_selectivity() - fn get_comparison_op_selectivity( + fn get_filter_comp_op_selectivity( &self, comp_bin_op_typ: BinOpType, left: OptRelNodeRef, @@ -652,7 +707,7 @@ impl OptCostModel { // handle the different cases of column nodes if col_ref_nodes.is_empty() { - INVALID_SEL + TODO_SEL } else if col_ref_nodes.len() == 1 { let col_ref_node = col_ref_nodes .pop() @@ -712,7 +767,7 @@ impl OptCostModel { OptRelNodeTyp::BinOp(_) => { Self::get_default_comparison_op_selectivity(comp_bin_op_typ) } - OptRelNodeTyp::Cast => INVALID_SEL, + OptRelNodeTyp::Cast => TODO_SEL, _ => unimplemented!( "unhandled case of comparing a column ref node to {}", non_col_ref_node.as_ref().typ @@ -852,7 +907,7 @@ impl OptCostModel { } } - fn get_log_op_selectivity( + fn get_filter_log_op_selectivity( &self, log_op_typ: LogOpType, children: &[OptRelNodeRef], From a20b8f5a017db945af5c460b96bfbae5469f32f4 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sat, 30 Mar 2024 14:16:42 -0400 Subject: [PATCH 02/29] added filtersel and joinsel const --- optd-datafusion-repr/src/cost/base_cost.rs | 98 ++++++++++++++++------ 1 file changed, 74 insertions(+), 24 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index 973d1f49..161a8440 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -584,7 +584,7 @@ impl OptCostModel { ) -> f64 { assert!(expr_tree.typ.is_expression()); match &expr_tree.typ { - OptRelNodeTyp::Constant(_) => todo!("check bool type or else panic"), + OptRelNodeTyp::Constant(_) => Self::get_constant_selectivity(expr_tree), OptRelNodeTyp::ColumnRef => todo!("check bool type or else panic"), OptRelNodeTyp::UnOp(un_op_typ) => { assert!(expr_tree.children.len() == 1); @@ -647,7 +647,7 @@ impl OptCostModel { ) -> f64 { assert!(expr_tree.typ.is_expression()); match &expr_tree.typ { - OptRelNodeTyp::Constant(_) => TODO_SEL, + OptRelNodeTyp::Constant(_) => Self::get_constant_selectivity(expr_tree), OptRelNodeTyp::ColumnRef => todo!("check bool type or else panic"), OptRelNodeTyp::UnOp(_) => todo!(), OptRelNodeTyp::BinOp(_) => TODO_SEL, @@ -798,6 +798,31 @@ impl OptCostModel { } } + fn get_constant_selectivity(const_node: OptRelNodeRef) -> f64 { + if let OptRelNodeTyp::Constant(const_typ) = const_node.typ { + if matches!(const_typ, ConstantType::Bool) { + let value = const_node + .as_ref() + .data + .as_ref() + .expect("constants should have data"); + if let Value::Bool(bool_value) = value { + if *bool_value { + 1.0 + } else { + 0.0 + } + } else { + unreachable!("if the typ is ConstantType::Bool, the value should be a Value::Bool") + } + } else { + panic!("selectivity is not defined on constants which are not bools") + } + } else { + panic!("get_constant_selectivity must be called on a constant") + } + } + /// Get the selectivity of an expression of the form "column equals value" (or "value equals column") /// Will handle the case of statistics missing /// Equality predicates are handled entirely differently from range predicates so this is its own function @@ -950,8 +975,7 @@ mod tests { use crate::{ plan_nodes::{ - BinOpExpr, BinOpType, ColumnRefExpr, ConstantExpr, Expr, ExprList, LogOpExpr, - LogOpType, OptRelNode, OptRelNodeRef, UnOpExpr, UnOpType, + BinOpExpr, BinOpType, ColumnRefExpr, ConstantExpr, Expr, ExprList, JoinType, LogOpExpr, LogOpType, OptRelNode, OptRelNodeRef, UnOpExpr, UnOpType }, properties::column_ref::ColumnRef, }; @@ -1077,8 +1101,27 @@ mod tests { .into_rel_node() } + /// The reason this isn't an associated function of PerColumnStats is because that would require + /// adding an empty() function to the trait definitions of MostCommonValues and Distribution, + /// which I wanted to avoid + fn get_empty_per_col_stats() -> TestPerColumnStats { + TestPerColumnStats::new( + TestMostCommonValues::empty(), + 0, + 0.0, + TestDistribution::empty(), + ) + } + #[test] - fn test_colref_eq_constint_in_mcv() { + fn test_filtersel_const() { + let cost_model = create_one_column_cost_model(get_empty_per_col_stats()); + assert_approx_eq::assert_approx_eq!(cost_model.get_filter_selectivity(cnst(Value::Bool(true)), &vec![]), 1.0); + assert_approx_eq::assert_approx_eq!(cost_model.get_filter_selectivity(cnst(Value::Bool(false)), &vec![]), 0.0); + } + + #[test] + fn test_filtersel_colref_eq_constint_in_mcv() { let cost_model = create_one_column_cost_model(TestPerColumnStats::new( TestMostCommonValues::new(vec![(Value::Int32(1), 0.3)]), 0, @@ -1102,7 +1145,7 @@ mod tests { } #[test] - fn test_colref_eq_constint_not_in_mcv_no_nulls() { + fn test_filtersel_colref_eq_constint_not_in_mcv_no_nulls() { let cost_model = create_one_column_cost_model(TestPerColumnStats::new( TestMostCommonValues::new(vec![(Value::Int32(1), 0.2), (Value::Int32(3), 0.44)]), 5, @@ -1126,7 +1169,7 @@ mod tests { } #[test] - fn test_colref_eq_constint_not_in_mcv_with_nulls() { + fn test_filtersel_colref_eq_constint_not_in_mcv_with_nulls() { let cost_model = create_one_column_cost_model(TestPerColumnStats::new( TestMostCommonValues::new(vec![(Value::Int32(1), 0.2), (Value::Int32(3), 0.44)]), 5, @@ -1151,7 +1194,7 @@ mod tests { /// I only have one test for NEQ since I'll assume that it uses the same underlying logic as EQ #[test] - fn test_colref_neq_constint_in_mcv() { + fn test_filtersel_colref_neq_constint_in_mcv() { let cost_model = create_one_column_cost_model(TestPerColumnStats::new( TestMostCommonValues::new(vec![(Value::Int32(1), 0.3)]), 0, @@ -1175,7 +1218,7 @@ mod tests { } #[test] - fn test_colref_leq_constint_no_mcvs_in_range() { + fn test_filtersel_colref_leq_constint_no_mcvs_in_range() { let cost_model = create_one_column_cost_model(TestPerColumnStats::new( TestMostCommonValues::empty(), 10, @@ -1199,7 +1242,7 @@ mod tests { } #[test] - fn test_colref_leq_constint_no_mcvs_in_range_with_nulls() { + fn test_filtersel_colref_leq_constint_no_mcvs_in_range_with_nulls() { let cost_model = create_one_column_cost_model(TestPerColumnStats::new( TestMostCommonValues::empty(), 10, @@ -1223,7 +1266,7 @@ mod tests { } #[test] - fn test_colref_leq_constint_with_mcvs_in_range_not_at_border() { + fn test_filtersel_colref_leq_constint_with_mcvs_in_range_not_at_border() { let cost_model = create_one_column_cost_model(TestPerColumnStats::new( TestMostCommonValues { mcvs: vec![ @@ -1256,7 +1299,7 @@ mod tests { } #[test] - fn test_colref_leq_constint_with_mcv_at_border() { + fn test_filtersel_colref_leq_constint_with_mcv_at_border() { let cost_model = create_one_column_cost_model(TestPerColumnStats::new( TestMostCommonValues::new(vec![ (Value::Int32(6), 0.05), @@ -1285,7 +1328,7 @@ mod tests { } #[test] - fn test_colref_lt_constint_no_mcvs_in_range() { + fn test_filtersel_colref_lt_constint_no_mcvs_in_range() { let cost_model = create_one_column_cost_model(TestPerColumnStats::new( TestMostCommonValues::empty(), 10, @@ -1309,7 +1352,7 @@ mod tests { } #[test] - fn test_colref_lt_constint_no_mcvs_in_range_with_nulls() { + fn test_filtersel_colref_lt_constint_no_mcvs_in_range_with_nulls() { let cost_model = create_one_column_cost_model(TestPerColumnStats::new( TestMostCommonValues::empty(), 9, // 90% of the values aren't nulls since null_frac = 0.1. if there are 9 distinct non-null values, each will have 0.1 frequency @@ -1333,7 +1376,7 @@ mod tests { } #[test] - fn test_colref_lt_constint_with_mcvs_in_range_not_at_border() { + fn test_filtersel_colref_lt_constint_with_mcvs_in_range_not_at_border() { let cost_model = create_one_column_cost_model(TestPerColumnStats::new( TestMostCommonValues { mcvs: vec![ @@ -1366,7 +1409,7 @@ mod tests { } #[test] - fn test_colref_lt_constint_with_mcv_at_border() { + fn test_filtersel_colref_lt_constint_with_mcv_at_border() { let cost_model = create_one_column_cost_model(TestPerColumnStats::new( TestMostCommonValues { mcvs: vec![ @@ -1401,7 +1444,7 @@ mod tests { /// I have fewer tests for GT since I'll assume that it uses the same underlying logic as LEQ /// The only interesting thing to test is that if there are nulls, those aren't included in GT #[test] - fn test_colref_gt_constint_no_nulls() { + fn test_filtersel_colref_gt_constint_no_nulls() { let cost_model = create_one_column_cost_model(TestPerColumnStats::new( TestMostCommonValues::empty(), 10, @@ -1425,7 +1468,7 @@ mod tests { } #[test] - fn test_colref_gt_constint_with_nulls() { + fn test_filtersel_colref_gt_constint_with_nulls() { let cost_model = create_one_column_cost_model(TestPerColumnStats::new( TestMostCommonValues::empty(), 10, @@ -1451,7 +1494,7 @@ mod tests { /// As with above, I have one test without nulls and one test with nulls #[test] - fn test_colref_geq_constint_no_nulls() { + fn test_filtersel_colref_geq_constint_no_nulls() { let cost_model = create_one_column_cost_model(TestPerColumnStats::new( TestMostCommonValues::empty(), 10, @@ -1475,7 +1518,7 @@ mod tests { } #[test] - fn test_colref_geq_constint_with_nulls() { + fn test_filtersel_colref_geq_constint_with_nulls() { let cost_model = create_one_column_cost_model(TestPerColumnStats::new( TestMostCommonValues::empty(), 9, // 90% of the values aren't nulls since null_frac = 0.1. if there are 9 distinct non-null values, each will have 0.1 frequency @@ -1500,7 +1543,7 @@ mod tests { } #[test] - fn test_and() { + fn test_filtersel_and() { let cost_model = create_one_column_cost_model(TestPerColumnStats::new( TestMostCommonValues { mcvs: vec![ @@ -1540,7 +1583,7 @@ mod tests { } #[test] - fn test_or() { + fn test_filtersel_or() { let cost_model = create_one_column_cost_model(TestPerColumnStats::new( TestMostCommonValues { mcvs: vec![ @@ -1580,7 +1623,7 @@ mod tests { } #[test] - fn test_not_no_nulls() { + fn test_filtersel_not_no_nulls() { let cost_model = create_one_column_cost_model(TestPerColumnStats::new( TestMostCommonValues::new(vec![(Value::Int32(1), 0.3)]), 0, @@ -1602,7 +1645,7 @@ mod tests { } #[test] - fn test_not_with_nulls() { + fn test_filtersel_not_with_nulls() { let cost_model = create_one_column_cost_model(TestPerColumnStats::new( TestMostCommonValues::new(vec![(Value::Int32(1), 0.3)]), 0, @@ -1624,4 +1667,11 @@ mod tests { 0.7 ); } + + #[test] + fn test_joinsel_const() { + let cost_model = create_one_column_cost_model(get_empty_per_col_stats()); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(true)), &vec![]), 1.0); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(false)), &vec![]), 0.0); + } } From ccf068dd06c55b9f2bb1625dfd399f1da7f7d131 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sat, 30 Mar 2024 19:21:07 -0400 Subject: [PATCH 03/29] made get semantic nodes a function --- optd-datafusion-repr/src/cost/base_cost.rs | 238 +++++++++++++++++---- 1 file changed, 193 insertions(+), 45 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index 161a8440..fa71828e 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -323,11 +323,12 @@ const DEFAULT_EQ_SEL: f64 = 0.005; const DEFAULT_INEQ_SEL: f64 = 0.3333333333333333; // Default selectivity estimate for pattern-match operators such as LIKE const DEFAULT_MATCH_SEL: f64 = 0.005; +const DEFAULT_NUM_DISTINCT: u64 = 200; // Default selectivity if we have no information const DEFAULT_UNK_SEL: f64 = 0.005; -// A placeholder for todo!() for codepaths which are accessed by plannertest -const TODO_SEL: f64 = 0.01; +// A placeholder for unimplemented!() for codepaths which are accessed by plannertest +const UNIMPLEMENTED_SEL: f64 = 0.01; impl OptCostModel { pub fn row_cnt(Cost(cost): &Cost) -> f64 { @@ -585,7 +586,7 @@ impl OptCostModel { assert!(expr_tree.typ.is_expression()); match &expr_tree.typ { OptRelNodeTyp::Constant(_) => Self::get_constant_selectivity(expr_tree), - OptRelNodeTyp::ColumnRef => todo!("check bool type or else panic"), + OptRelNodeTyp::ColumnRef => unimplemented!("check bool type or else panic"), OptRelNodeTyp::UnOp(un_op_typ) => { assert!(expr_tree.children.len() == 1); let child = expr_tree.child(0); @@ -621,17 +622,17 @@ impl OptCostModel { OptRelNodeTyp::LogOp(log_op_typ) => { self.get_filter_log_op_selectivity(*log_op_typ, &expr_tree.children, column_refs) } - OptRelNodeTyp::Func(_) => todo!("check bool type or else panic"), + OptRelNodeTyp::Func(_) => unimplemented!("check bool type or else panic"), OptRelNodeTyp::SortOrder(_) => { panic!("the selectivity of sort order expressions is undefined") } - OptRelNodeTyp::Between => TODO_SEL, - OptRelNodeTyp::Cast => todo!("check bool type or else panic"), + OptRelNodeTyp::Between => UNIMPLEMENTED_SEL, + OptRelNodeTyp::Cast => unimplemented!("check bool type or else panic"), OptRelNodeTyp::Like => DEFAULT_MATCH_SEL, OptRelNodeTyp::DataType(_) => { panic!("the selectivity of a data type is not defined") } - OptRelNodeTyp::InList => TODO_SEL, + OptRelNodeTyp::InList => UNIMPLEMENTED_SEL, _ => unreachable!( "all expression OptRelNodeTyp were enumerated. this should be unreachable" ), @@ -648,21 +649,43 @@ impl OptCostModel { assert!(expr_tree.typ.is_expression()); match &expr_tree.typ { OptRelNodeTyp::Constant(_) => Self::get_constant_selectivity(expr_tree), - OptRelNodeTyp::ColumnRef => todo!("check bool type or else panic"), - OptRelNodeTyp::UnOp(_) => todo!(), - OptRelNodeTyp::BinOp(_) => TODO_SEL, - OptRelNodeTyp::LogOp(_) => TODO_SEL, - OptRelNodeTyp::Func(_) => todo!("check bool type or else panic"), + OptRelNodeTyp::ColumnRef => unimplemented!("check bool type or else panic"), + OptRelNodeTyp::UnOp(_) => unimplemented!(), + OptRelNodeTyp::BinOp(bin_op_typ) => { + assert!(expr_tree.children.len() == 2); + let left_child = expr_tree.child(0); + let right_child = expr_tree.child(1); + + if bin_op_typ.is_comparison() { + self.get_join_comp_op_selectivity( + join_typ, + *bin_op_typ, + left_child, + right_child, + column_refs, + ) + } else if bin_op_typ.is_numerical() { + panic!( + "the selectivity of operations that return numerical values is undefined" + ) + } else { + unreachable!("all BinOpTypes should be true for at least one is_*() function") + } + }, + OptRelNodeTyp::LogOp(log_op_typ) => { + self.get_join_log_op_selectivity(join_typ, *log_op_typ, &expr_tree.children, column_refs) + }, + OptRelNodeTyp::Func(_) => unimplemented!("check bool type or else panic"), OptRelNodeTyp::SortOrder(_) => { panic!("the selectivity of sort order expressions is undefined") } - OptRelNodeTyp::Between => todo!(), - OptRelNodeTyp::Cast => todo!("check bool type or else panic"), - OptRelNodeTyp::Like => todo!(), + OptRelNodeTyp::Between => unimplemented!(), + OptRelNodeTyp::Cast => unimplemented!("check bool type or else panic"), + OptRelNodeTyp::Like => unimplemented!(), OptRelNodeTyp::DataType(_) => { panic!("the selectivity of a data type is not defined") } - OptRelNodeTyp::InList => todo!(), + OptRelNodeTyp::InList => unimplemented!(), _ => unreachable!( "all expression OptRelNodeTyp were enumerated. this should be unreachable" ), @@ -679,44 +702,21 @@ impl OptCostModel { ) -> f64 { assert!(comp_bin_op_typ.is_comparison()); - // it's more convenient to refer to the children based on whether they're column nodes or not - // rather than by left/right - let mut col_ref_nodes = vec![]; - let mut non_col_ref_nodes = vec![]; - let is_left_col_ref; // I intentionally performed moves on left and right. This way, we don't accidentally use them after this block - // We always want to use "col_ref_node" and "non_col_ref_node" instead of "left" or "right" - if left.as_ref().typ == OptRelNodeTyp::ColumnRef { - is_left_col_ref = true; - col_ref_nodes.push( - ColumnRefExpr::from_rel_node(left) - .expect("we already checked that the type is ColumnRef"), - ); - } else { - is_left_col_ref = false; - non_col_ref_nodes.push(left); - } - if right.as_ref().typ == OptRelNodeTyp::ColumnRef { - col_ref_nodes.push( - ColumnRefExpr::from_rel_node(right) - .expect("we already checked that the type is ColumnRef"), - ); - } else { - non_col_ref_nodes.push(right); - } + let (col_ref_nodes, non_col_ref_nodes, is_left_col_ref) = Self::get_semantic_nodes(left, right); // handle the different cases of column nodes if col_ref_nodes.is_empty() { - TODO_SEL + UNIMPLEMENTED_SEL } else if col_ref_nodes.len() == 1 { let col_ref_node = col_ref_nodes - .pop() + .first() .expect("we just checked that col_ref_nodes.len() == 1"); let col_ref_idx = col_ref_node.index(); if let ColumnRef::BaseTableColumnRef { table, col_idx } = &column_refs[col_ref_idx] { let non_col_ref_node = non_col_ref_nodes - .pop() + .first() .expect("non_col_ref_nodes should have a value since col_ref_nodes.len() == 1"); match non_col_ref_node.as_ref().typ { @@ -767,7 +767,7 @@ impl OptCostModel { OptRelNodeTyp::BinOp(_) => { Self::get_default_comparison_op_selectivity(comp_bin_op_typ) } - OptRelNodeTyp::Cast => TODO_SEL, + OptRelNodeTyp::Cast => UNIMPLEMENTED_SEL, _ => unimplemented!( "unhandled case of comparing a column ref node to {}", non_col_ref_node.as_ref().typ @@ -783,6 +783,94 @@ impl OptCostModel { } } + /// Comparison operators are the base case for recursion in get_join_selectivity() + fn get_join_comp_op_selectivity( + &self, + join_typ: JoinType, + comp_bin_op_typ: BinOpType, + left: OptRelNodeRef, + right: OptRelNodeRef, + column_refs: &GroupColumnRefs, + ) -> f64 { + assert!(comp_bin_op_typ.is_comparison()); + + // I intentionally performed moves on left and right. This way, we don't accidentally use them after this block + let (col_ref_nodes, _, _) = Self::get_semantic_nodes(left, right); + + // handle the different cases of column nodes + if col_ref_nodes.is_empty() { + unimplemented!() + } else if col_ref_nodes.len() == 1 { + unimplemented!() + } else if col_ref_nodes.len() == 2 { + match join_typ { + JoinType::Inner => { + // the statistics objects of the referenced columns + let col_ref_stats_list = col_ref_nodes.iter().map(|col_ref_node| { + let col_ref_idx = col_ref_node.index(); + if let ColumnRef::BaseTableColumnRef { table, col_idx } = &column_refs[col_ref_idx] { + if let Some(per_table_stats) = self.per_table_stats_map.get(table) { + if let Some(Some(per_column_stats)) = per_table_stats.per_column_stats_vec.get(*col_idx) + { + Some(per_column_stats) + } else { + None + } + } else { + None + } + } else { + None + } + }); + let ndistincts = col_ref_stats_list.map(|col_ref_stats| { + if let Some(col_ref_stats) = col_ref_stats { + col_ref_stats.ndistinct + } else { + DEFAULT_NUM_DISTINCT + } + }); + // using reduce(f64::min) is the idiomatic workaround to the fact that f64 does not implement Ord due to NaN + let selectivity = ndistincts.map(|ndistinct| 1.0 / ndistinct as f64).reduce(f64::min).expect("reduce() only returns None if the iterator is empty, which is impossible since col_ref_nodes.len() == 2"); + assert!(!selectivity.is_nan(), "it should be impossible for selectivity to be NaN since n-distinct is never 0"); + selectivity + } + _ => unimplemented!() + } + } else { + unreachable!("we could have at most pushed left and right into col_ref_nodes") + } + } + + /// Convert the left and right child nodes of some operation to what they semantically are + /// This is convenient to avoid repeating the same logic just with "left" and "right" swapped + fn get_semantic_nodes(left: OptRelNodeRef, right: OptRelNodeRef) -> (Vec, Vec, bool) { + let mut col_ref_nodes = vec![]; + let mut non_col_ref_nodes = vec![]; + let is_left_col_ref; + // I intentionally performed moves on left and right. This way, we don't accidentally use them after this block + // We always want to use "col_ref_node" and "non_col_ref_node" instead of "left" or "right" + if left.as_ref().typ == OptRelNodeTyp::ColumnRef { + is_left_col_ref = true; + col_ref_nodes.push( + ColumnRefExpr::from_rel_node(left) + .expect("we already checked that the type is ColumnRef"), + ); + } else { + is_left_col_ref = false; + non_col_ref_nodes.push(left); + } + if right.as_ref().typ == OptRelNodeTyp::ColumnRef { + col_ref_nodes.push( + ColumnRefExpr::from_rel_node(right) + .expect("we already checked that the type is ColumnRef"), + ); + } else { + non_col_ref_nodes.push(right); + } + (col_ref_nodes, non_col_ref_nodes, is_left_col_ref) + } + /// The default selectivity of a comparison expression /// Used when one side of the comparison is a column while the other side is something too /// complex/impossible to evaluate (subquery, UDF, another column, we have no stats, etc.) @@ -949,6 +1037,24 @@ impl OptCostModel { } } + fn get_join_log_op_selectivity( + &self, + join_typ: JoinType, + log_op_typ: LogOpType, + children: &[OptRelNodeRef], + column_refs: &GroupColumnRefs, + ) -> f64 { + let children_sel = children + .iter() + .map(|expr| self.get_join_selectivity(join_typ, expr.clone(), column_refs)); + + match log_op_typ { + LogOpType::And => children_sel.product(), + // the formula is 1.0 - the probability of _none_ of the events happening + LogOpType::Or => 1.0 - children_sel.fold(1.0, |acc, sel| acc * (1.0 - sel)), + } + } + pub fn get_row_cnt(&self, table: &str) -> Option { self.per_table_stats_map .get(table) @@ -1045,7 +1151,7 @@ mod tests { const TABLE1_NAME: &str = "t1"; - // one column is sufficient for all filter selectivity predicates + // one column is sufficient for all filter selectivity tests fn create_one_column_cost_model( per_column_stats: TestPerColumnStats, ) -> OptCostModel { @@ -1059,6 +1165,21 @@ mod tests { ) } + // two columns is sufficient for all join selectivity tests + fn create_two_column_cost_model( + per_column_stats1: TestPerColumnStats, + per_column_stats2: TestPerColumnStats, + ) -> OptCostModel { + OptCostModel::new( + vec![( + String::from(TABLE1_NAME), + PerTableStats::new(100, vec![Some(per_column_stats1), Some(per_column_stats2)]), + )] + .into_iter() + .collect(), + ) + } + fn col_ref(idx: u64) -> OptRelNodeRef { // this conversion is always safe because idx was originally a usize let idx_as_usize = idx as usize; @@ -1394,6 +1515,7 @@ mod tests { )); let expr_tree = bin_op(BinOpType::Lt, col_ref(0), cnst(Value::Int32(15))); let expr_tree_rev = bin_op(BinOpType::Gt, cnst(Value::Int32(15)), col_ref(0)); + // TODO(phw2): make column_refs a function let column_refs = vec![ColumnRef::BaseTableColumnRef { table: String::from(TABLE1_NAME), col_idx: 0, @@ -1674,4 +1796,30 @@ mod tests { assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(true)), &vec![]), 1.0); assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(false)), &vec![]), 0.0); } + + #[test] + fn test_joinsel_colref_eq_colref_no_mcvs_no_nulls() { + let cost_model = create_two_column_cost_model(TestPerColumnStats::new( + TestMostCommonValues::empty(), + 5, + 0.0, + TestDistribution::empty(), + ), TestPerColumnStats::new( + TestMostCommonValues::empty(), + 3, + 0.0, + TestDistribution::empty(), + )); + let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); + let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0)); + let column_refs = vec![ColumnRef::BaseTableColumnRef { + table: String::from(TABLE1_NAME), + col_idx: 0, + }, ColumnRef::BaseTableColumnRef { + table: String::from(TABLE1_NAME), + col_idx: 1, + }]; + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.2); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.2); + } } From 88a15e94190205f0a27d876699a6c27978b90ec5 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sat, 30 Mar 2024 19:27:10 -0400 Subject: [PATCH 04/29] added tests for log op join sel --- optd-datafusion-repr/src/cost/base_cost.rs | 56 ++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index fa71828e..ba4a8136 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -1822,4 +1822,60 @@ mod tests { assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.2); assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.2); } + + #[test] + fn test_joinsel_and() { + let cost_model = create_two_column_cost_model(TestPerColumnStats::new( + TestMostCommonValues::empty(), + 5, + 0.0, + TestDistribution::empty(), + ), TestPerColumnStats::new( + TestMostCommonValues::empty(), + 3, + 0.0, + TestDistribution::empty(), + )); + let eq0and1 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); + let eq1and0 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); + let expr_tree = log_op(LogOpType::And, vec![eq0and1.clone(), eq1and0.clone()]); + let expr_tree_rev = log_op(LogOpType::And, vec![eq1and0.clone(), eq0and1.clone()]); + let column_refs = vec![ColumnRef::BaseTableColumnRef { + table: String::from(TABLE1_NAME), + col_idx: 0, + }, ColumnRef::BaseTableColumnRef { + table: String::from(TABLE1_NAME), + col_idx: 1, + }]; + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.04); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.04); + } + + #[test] + fn test_joinsel_or() { + let cost_model = create_two_column_cost_model(TestPerColumnStats::new( + TestMostCommonValues::empty(), + 5, + 0.0, + TestDistribution::empty(), + ), TestPerColumnStats::new( + TestMostCommonValues::empty(), + 3, + 0.0, + TestDistribution::empty(), + )); + let eq0and1 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); + let eq1and0 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); + let expr_tree = log_op(LogOpType::Or, vec![eq0and1.clone(), eq1and0.clone()]); + let expr_tree_rev = log_op(LogOpType::Or, vec![eq1and0.clone(), eq0and1.clone()]); + let column_refs = vec![ColumnRef::BaseTableColumnRef { + table: String::from(TABLE1_NAME), + col_idx: 0, + }, ColumnRef::BaseTableColumnRef { + table: String::from(TABLE1_NAME), + col_idx: 1, + }]; + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.36); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.36); + } } From c0155625a52e08e79c528fd810eb4fa0d5fafd4d Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sat, 30 Mar 2024 21:13:14 -0400 Subject: [PATCH 05/29] refactored per_col_vec to per_col_map to avoid double options --- optd-datafusion-repr/src/cost/base_cost.rs | 137 ++++++++++----------- optd-datafusion-repr/src/plan_nodes.rs | 2 + optd-perftest/src/datafusion_dbms.rs | 8 +- 3 files changed, 70 insertions(+), 77 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index ba4a8136..07e0660a 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -94,7 +94,7 @@ impl MostCommonValues for MockMostCommonValues { #[derive(Serialize, Deserialize)] pub struct PerTableStats { row_cnt: usize, - per_column_stats_vec: Vec>>, + per_column_stats_map: HashMap>, } impl DataFusionPerTableStats { @@ -150,22 +150,20 @@ impl DataFusionPerTableStats { } // Assemble the per-column stats. - let mut per_column_stats_vec = Vec::with_capacity(col_cnt); + let mut per_column_stats_map = HashMap::with_capacity(col_cnt); for i in 0..col_cnt { - per_column_stats_vec.push(if Self::is_type_supported(&col_types[i]) { - Some(PerColumnStats::new( + if Self::is_type_supported(&col_types[i]) { + per_column_stats_map.insert(i, PerColumnStats::new( mcvs[i].take().unwrap(), hlls[i].n_distinct(), null_cnt[i] as f64 / row_cnt as f64, distr[i].take().unwrap(), - )) - } else { - None - }); + )); + } } Ok(Self { row_cnt, - per_column_stats_vec, + per_column_stats_map, }) } @@ -640,12 +638,15 @@ impl OptCostModel { } /// The expr_tree input must be a "mixed expression tree", just like with get_filter_selectivity() - fn get_join_selectivity( + /// The "wrapper" is here to separate the equality conditions from the filter conditions before calling + /// the "main" get_join_selectivity() function. + fn get_join_selectivity_wrapper( &self, join_typ: JoinType, expr_tree: OptRelNodeRef, column_refs: &GroupColumnRefs, ) -> f64 { + println!("get_join_selectivity(): called on expr_tree={}", expr_tree); assert!(expr_tree.typ.is_expression()); match &expr_tree.typ { OptRelNodeTyp::Constant(_) => Self::get_constant_selectivity(expr_tree), @@ -692,6 +693,55 @@ impl OptCostModel { } } + fn get_join_selectivity( + &self, + join_typ: JoinType, + on_col_ref_pairs: Vec<(ColumnRefExpr, ColumnRefExpr)>, + filter_expr_tree: Option, + column_refs: &GroupColumnRefs, + ) -> f64 { + let join_on_selectivity = self.get_join_on_selectivity(join_typ, on_col_ref_pairs, column_refs); + // Currently, there is no difference in how we handle a join filter and a select filter, so we use the same function + // One difference (that we *don't* care about right now) is that join filters can contain expressions from multiple + // different tables. Currently, this doesn't affect the get_filter_selectivity() function, but this may change in + // the future + let join_filter_selectivity = match filter_expr_tree { + Some(filter_expr_tree) => self.get_filter_selectivity(filter_expr_tree, column_refs), + None => 1.0, + }; + join_on_selectivity * join_filter_selectivity + } + + fn get_per_col_stats(&self, col_ref: &ColumnRef) -> Option<&PerColumnStats> { + if let ColumnRef::BaseTableColumnRef { table, col_idx } = col_ref { + self.per_table_stats_map.get(table).and_then(|per_table_stats| per_table_stats.per_column_stats_map.get(col_idx)) + } else { + None + } + } + + fn get_join_on_selectivity( + &self, + join_typ: JoinType, + on_col_ref_pairs: Vec<(ColumnRefExpr, ColumnRefExpr)>, + column_refs: &GroupColumnRefs + ) -> f64 { + // multiply the selectivities of all individual conditions together + on_col_ref_pairs.into_iter().map(|on_col_ref_pair| { + // the formula for each pair is min(1 / ndistinct1, 1 / ndistinct2) (see https://postgrespro.com/blog/pgsql/5969618) + let ndistincts = vec![on_col_ref_pair.0, on_col_ref_pair.1].into_iter().map(|on_col_ref| { + match self.get_per_col_stats(&column_refs[on_col_ref.index()]) { + Some(per_col_stats) => per_col_stats.ndistinct, + None => DEFAULT_NUM_DISTINCT, + } + }); + // using reduce(f64::min) is the idiomatic workaround to the fact that f64 does not implement Ord due to NaN + let selectivity = ndistincts.map(|ndistinct| 1.0 / ndistinct as f64).reduce(f64::min).expect("reduce() only returns None if the iterator is empty, which is impossible since col_ref_nodes.len() == 2"); + assert!(!selectivity.is_nan(), "it should be impossible for selectivity to be NaN since n-distinct is never 0"); + selectivity + }).product() + } + /// Comparison operators are the base case for recursion in get_filter_selectivity() fn get_filter_comp_op_selectivity( &self, @@ -783,65 +833,6 @@ impl OptCostModel { } } - /// Comparison operators are the base case for recursion in get_join_selectivity() - fn get_join_comp_op_selectivity( - &self, - join_typ: JoinType, - comp_bin_op_typ: BinOpType, - left: OptRelNodeRef, - right: OptRelNodeRef, - column_refs: &GroupColumnRefs, - ) -> f64 { - assert!(comp_bin_op_typ.is_comparison()); - - // I intentionally performed moves on left and right. This way, we don't accidentally use them after this block - let (col_ref_nodes, _, _) = Self::get_semantic_nodes(left, right); - - // handle the different cases of column nodes - if col_ref_nodes.is_empty() { - unimplemented!() - } else if col_ref_nodes.len() == 1 { - unimplemented!() - } else if col_ref_nodes.len() == 2 { - match join_typ { - JoinType::Inner => { - // the statistics objects of the referenced columns - let col_ref_stats_list = col_ref_nodes.iter().map(|col_ref_node| { - let col_ref_idx = col_ref_node.index(); - if let ColumnRef::BaseTableColumnRef { table, col_idx } = &column_refs[col_ref_idx] { - if let Some(per_table_stats) = self.per_table_stats_map.get(table) { - if let Some(Some(per_column_stats)) = per_table_stats.per_column_stats_vec.get(*col_idx) - { - Some(per_column_stats) - } else { - None - } - } else { - None - } - } else { - None - } - }); - let ndistincts = col_ref_stats_list.map(|col_ref_stats| { - if let Some(col_ref_stats) = col_ref_stats { - col_ref_stats.ndistinct - } else { - DEFAULT_NUM_DISTINCT - } - }); - // using reduce(f64::min) is the idiomatic workaround to the fact that f64 does not implement Ord due to NaN - let selectivity = ndistincts.map(|ndistinct| 1.0 / ndistinct as f64).reduce(f64::min).expect("reduce() only returns None if the iterator is empty, which is impossible since col_ref_nodes.len() == 2"); - assert!(!selectivity.is_nan(), "it should be impossible for selectivity to be NaN since n-distinct is never 0"); - selectivity - } - _ => unimplemented!() - } - } else { - unreachable!("we could have at most pushed left and right into col_ref_nodes") - } - } - /// Convert the left and right child nodes of some operation to what they semantically are /// This is convenient to avoid repeating the same logic just with "left" and "right" swapped fn get_semantic_nodes(left: OptRelNodeRef, right: OptRelNodeRef) -> (Vec, Vec, bool) { @@ -925,7 +916,7 @@ impl OptCostModel { is_eq: bool, ) -> f64 { if let Some(per_table_stats) = self.per_table_stats_map.get(table) { - if let Some(Some(per_column_stats)) = per_table_stats.per_column_stats_vec.get(col_idx) + if let Some(per_column_stats) = per_table_stats.per_column_stats_map.get(&col_idx) { let eq_freq = if let Some(freq) = per_column_stats.mcvs.freq(value) { freq @@ -975,7 +966,7 @@ impl OptCostModel { is_col_eq_val: bool, ) -> f64 { if let Some(per_table_stats) = self.per_table_stats_map.get(table) { - if let Some(Some(per_column_stats)) = per_table_stats.per_column_stats_vec.get(col_idx) + if let Some(per_column_stats) = per_table_stats.per_column_stats_map.get(&col_idx) { // because distr does not include the values in MCVs, we need to compute the CDFs there as well // because nulls return false in any comparison, they are never included when computing range selectivity @@ -1063,10 +1054,10 @@ impl OptCostModel { } impl PerTableStats { - pub fn new(row_cnt: usize, per_column_stats_vec: Vec>>) -> Self { + pub fn new(row_cnt: usize, per_column_stats_map: HashMap>) -> Self { Self { row_cnt, - per_column_stats_vec, + per_column_stats_map, } } } diff --git a/optd-datafusion-repr/src/plan_nodes.rs b/optd-datafusion-repr/src/plan_nodes.rs index faf27e3a..e872b3a9 100644 --- a/optd-datafusion-repr/src/plan_nodes.rs +++ b/optd-datafusion-repr/src/plan_nodes.rs @@ -39,6 +39,8 @@ pub use sort::{LogicalSort, PhysicalSort}; use crate::properties::schema::{Schema, SchemaPropertyBuilder}; +/// OptRelNodeTyp FAQ: +/// - The define_plan_node!() macro defines what the children of each join node are #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum OptRelNodeTyp { Placeholder(GroupId), diff --git a/optd-perftest/src/datafusion_dbms.rs b/optd-perftest/src/datafusion_dbms.rs index e98d93e6..204b85e2 100644 --- a/optd-perftest/src/datafusion_dbms.rs +++ b/optd-perftest/src/datafusion_dbms.rs @@ -145,13 +145,13 @@ impl DatafusionDBMS { let mut estcards = vec![]; for (query_id, sql_fpath) in tpch_kit.get_sql_fpath_ordered_iter(tpch_config)? { - let sql = fs::read_to_string(sql_fpath)?; - let estcard = self.eval_query_estcard(&sql).await?; - estcards.push(estcard); println!( - "done evaluating datafusion's estcard for TPC-H Q{}", + "about to evaluate datafusion's estcard for TPC-H Q{}", query_id ); + let sql = fs::read_to_string(sql_fpath)?; + let estcard = self.eval_query_estcard(&sql).await?; + estcards.push(estcard); } Ok(estcards) From fdb1c8c7c3f517d38f48072daa248937c4a34f1e Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sat, 30 Mar 2024 22:07:39 -0400 Subject: [PATCH 06/29] wrote wrapper to extract join on condition --- optd-datafusion-repr/src/bin/test_optimize.rs | 4 +- optd-datafusion-repr/src/cost/base_cost.rs | 135 +++++++++--------- 2 files changed, 71 insertions(+), 68 deletions(-) diff --git a/optd-datafusion-repr/src/bin/test_optimize.rs b/optd-datafusion-repr/src/bin/test_optimize.rs index eb7a80a1..c9a1feb8 100644 --- a/optd-datafusion-repr/src/bin/test_optimize.rs +++ b/optd-datafusion-repr/src/bin/test_optimize.rs @@ -1,4 +1,4 @@ -use std::sync::Arc; +use std::{collections::HashMap, sync::Arc}; use optd_core::{ cascades::CascadesOptimizer, @@ -45,7 +45,7 @@ pub fn main() { Box::new(OptCostModel::new( [("t1", 1000), ("t2", 100), ("t3", 10000)] .into_iter() - .map(|(x, y)| (x.to_string(), DataFusionPerTableStats::new(y, vec![]))) + .map(|(x, y)| (x.to_string(), DataFusionPerTableStats::new(y, HashMap::new()))) .collect(), )), vec![], diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index 07e0660a..62517b89 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -1,7 +1,7 @@ use std::{collections::HashMap, sync::Arc}; use crate::plan_nodes::{ - BinOpType, ColumnRefExpr, ConstantExpr, ConstantType, LogOpType, OptRelNode, UnOpType, + BinOpType, ColumnRefExpr, ConstantExpr, ConstantType, Expr, ExprList, LogOpExpr, LogOpType, OptRelNode, UnOpType }; use crate::properties::column_ref::{ColumnRefPropertyBuilder, GroupColumnRefs}; use crate::{ @@ -637,63 +637,84 @@ impl OptCostModel { } } + /// Check if an expr_tree is a join condition, returning the join on col ref pair if it is + /// The reason the check and the info are in the same function is because their code is almost identical + fn get_on_col_ref_pair(expr_tree: OptRelNodeRef, column_refs: &GroupColumnRefs) -> Option<(ColumnRefExpr, ColumnRefExpr)> { + // We perform three checks to see if a child_expr_tree is an on_col_ref_pair + // 1. Check that it's equality + if expr_tree.typ == OptRelNodeTyp::BinOp(BinOpType::Eq) { + let left_child = expr_tree.child(0); + let right_child = expr_tree.child(1); + // 2. Check that both sides are column refs + if left_child.typ == OptRelNodeTyp::ColumnRef && right_child.typ == OptRelNodeTyp::ColumnRef { + // 3. Check that both sides don't belong to the same table (if we don't know, that means they don't belong) + let left_col_ref_expr = ColumnRefExpr::from_rel_node(left_child).expect("we already checked that the type is ColumnRef"); + let right_col_ref_expr = ColumnRefExpr::from_rel_node(right_child).expect("we already checked that the type is ColumnRef"); + let left_col_ref = &column_refs[left_col_ref_expr.index()]; + let right_col_ref = &column_refs[right_col_ref_expr.index()]; + let is_same_table = if let ColumnRef::BaseTableColumnRef { table: left_table, .. } = left_col_ref { + if let ColumnRef::BaseTableColumnRef { table: right_table, .. } = right_col_ref { + left_table == right_table + } else { + false + } + } else { + false + }; + if !is_same_table { + Some((left_col_ref_expr, right_col_ref_expr)) + } else { + None + } + } else { + None + } + } else { + None + } + } + /// The expr_tree input must be a "mixed expression tree", just like with get_filter_selectivity() - /// The "wrapper" is here to separate the equality conditions from the filter conditions before calling - /// the "main" get_join_selectivity() function. - fn get_join_selectivity_wrapper( + /// This is a "wrapper" to separate the equality conditions from the filter conditions before calling + /// the "main" get_join_selectivity_core() function. + fn get_join_selectivity( &self, join_typ: JoinType, expr_tree: OptRelNodeRef, column_refs: &GroupColumnRefs, ) -> f64 { - println!("get_join_selectivity(): called on expr_tree={}", expr_tree); assert!(expr_tree.typ.is_expression()); - match &expr_tree.typ { - OptRelNodeTyp::Constant(_) => Self::get_constant_selectivity(expr_tree), - OptRelNodeTyp::ColumnRef => unimplemented!("check bool type or else panic"), - OptRelNodeTyp::UnOp(_) => unimplemented!(), - OptRelNodeTyp::BinOp(bin_op_typ) => { - assert!(expr_tree.children.len() == 2); - let left_child = expr_tree.child(0); - let right_child = expr_tree.child(1); - - if bin_op_typ.is_comparison() { - self.get_join_comp_op_selectivity( - join_typ, - *bin_op_typ, - left_child, - right_child, - column_refs, - ) - } else if bin_op_typ.is_numerical() { - panic!( - "the selectivity of operations that return numerical values is undefined" - ) + if expr_tree.typ == OptRelNodeTyp::LogOp(LogOpType::And) { + let mut on_col_ref_pairs = vec![]; + let mut filter_expr_trees = vec![]; + for child_expr_tree in &expr_tree.children { + if let Some(on_col_ref_pair) = Self::get_on_col_ref_pair(child_expr_tree.clone(), column_refs) { + on_col_ref_pairs.push(on_col_ref_pair) } else { - unreachable!("all BinOpTypes should be true for at least one is_*() function") + let child_expr = Expr::from_rel_node(child_expr_tree.clone()).expect("everything that is a direct child of an And node must be an expression"); + filter_expr_trees.push(child_expr); } - }, - OptRelNodeTyp::LogOp(log_op_typ) => { - self.get_join_log_op_selectivity(join_typ, *log_op_typ, &expr_tree.children, column_refs) - }, - OptRelNodeTyp::Func(_) => unimplemented!("check bool type or else panic"), - OptRelNodeTyp::SortOrder(_) => { - panic!("the selectivity of sort order expressions is undefined") } - OptRelNodeTyp::Between => unimplemented!(), - OptRelNodeTyp::Cast => unimplemented!("check bool type or else panic"), - OptRelNodeTyp::Like => unimplemented!(), - OptRelNodeTyp::DataType(_) => { - panic!("the selectivity of a data type is not defined") + assert!(on_col_ref_pairs.len() + filter_expr_trees.len() == expr_tree.children.len()); + let filter_expr_tree = if filter_expr_trees.is_empty() { + None + } else { + Some(LogOpExpr::new( + LogOpType::And, + ExprList::new(filter_expr_trees), + ).into_rel_node()) + }; + self.get_join_selectivity_core(join_typ, on_col_ref_pairs, filter_expr_tree, column_refs) + } else { + if let Some(on_col_ref_pair) = Self::get_on_col_ref_pair(expr_tree.clone(), column_refs) { + self.get_join_selectivity_core(join_typ, vec![on_col_ref_pair], None, column_refs) + } else { + self.get_join_selectivity_core(join_typ, vec![], Some(expr_tree), column_refs) } - OptRelNodeTyp::InList => unimplemented!(), - _ => unreachable!( - "all expression OptRelNodeTyp were enumerated. this should be unreachable" - ), } } - fn get_join_selectivity( + fn get_join_selectivity_core( &self, join_typ: JoinType, on_col_ref_pairs: Vec<(ColumnRefExpr, ColumnRefExpr)>, @@ -1028,24 +1049,6 @@ impl OptCostModel { } } - fn get_join_log_op_selectivity( - &self, - join_typ: JoinType, - log_op_typ: LogOpType, - children: &[OptRelNodeRef], - column_refs: &GroupColumnRefs, - ) -> f64 { - let children_sel = children - .iter() - .map(|expr| self.get_join_selectivity(join_typ, expr.clone(), column_refs)); - - match log_op_typ { - LogOpType::And => children_sel.product(), - // the formula is 1.0 - the probability of _none_ of the events happening - LogOpType::Or => 1.0 - children_sel.fold(1.0, |acc, sel| acc * (1.0 - sel)), - } - } - pub fn get_row_cnt(&self, table: &str) -> Option { self.per_table_stats_map .get(table) @@ -1149,7 +1152,7 @@ mod tests { OptCostModel::new( vec![( String::from(TABLE1_NAME), - PerTableStats::new(100, vec![Some(per_column_stats)]), + PerTableStats::new(100, vec![(0, per_column_stats)].into_iter().collect()), )] .into_iter() .collect(), @@ -1158,13 +1161,13 @@ mod tests { // two columns is sufficient for all join selectivity tests fn create_two_column_cost_model( + per_column_stats0: TestPerColumnStats, per_column_stats1: TestPerColumnStats, - per_column_stats2: TestPerColumnStats, ) -> OptCostModel { OptCostModel::new( vec![( String::from(TABLE1_NAME), - PerTableStats::new(100, vec![Some(per_column_stats1), Some(per_column_stats2)]), + PerTableStats::new(100, vec![(0, per_column_stats0), (1, per_column_stats1)].into_iter().collect()), )] .into_iter() .collect(), @@ -1789,7 +1792,7 @@ mod tests { } #[test] - fn test_joinsel_colref_eq_colref_no_mcvs_no_nulls() { + fn test_joinsel_colref_eq_colref_no_nulls() { let cost_model = create_two_column_cost_model(TestPerColumnStats::new( TestMostCommonValues::empty(), 5, From 25c2b75faf7775dca3e425afeaa47cc374e51fa3 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 09:07:26 -0400 Subject: [PATCH 07/29] no cache -> rebuild cache --- optd-perftest/src/cardtest.rs | 4 ++-- optd-perftest/src/datafusion_dbms.rs | 13 ++++++------- optd-perftest/src/main.rs | 8 ++++---- optd-perftest/tests/cardtest_integration.rs | 2 +- 4 files changed, 13 insertions(+), 14 deletions(-) diff --git a/optd-perftest/src/cardtest.rs b/optd-perftest/src/cardtest.rs index a7de677a..0b9158cb 100644 --- a/optd-perftest/src/cardtest.rs +++ b/optd-perftest/src/cardtest.rs @@ -103,14 +103,14 @@ pub trait CardtestRunnerDBMSHelper { pub async fn cardtest>( workspace_dpath: P, - no_cached_optd_stats: bool, + rebuild_cached_optd_stats: bool, pguser: &str, pgpassword: &str, tpch_config: TpchConfig, ) -> anyhow::Result>> { let pg_dbms = Box::new(PostgresDBMS::build(&workspace_dpath, pguser, pgpassword)?); let truecard_getter = pg_dbms.clone(); - let df_dbms = Box::new(DatafusionDBMS::new(&workspace_dpath, no_cached_optd_stats).await?); + let df_dbms = Box::new(DatafusionDBMS::new(&workspace_dpath, rebuild_cached_optd_stats).await?); let dbmss: Vec> = vec![pg_dbms, df_dbms]; let tpch_benchmark = Benchmark::Tpch(tpch_config.clone()); diff --git a/optd-perftest/src/datafusion_dbms.rs b/optd-perftest/src/datafusion_dbms.rs index 204b85e2..25f76b34 100644 --- a/optd-perftest/src/datafusion_dbms.rs +++ b/optd-perftest/src/datafusion_dbms.rs @@ -34,7 +34,7 @@ use regex::Regex; pub struct DatafusionDBMS { workspace_dpath: PathBuf, - no_cached_stats: bool, + rebuild_cached_stats: bool, ctx: SessionContext, } @@ -63,11 +63,11 @@ impl CardtestRunnerDBMSHelper for DatafusionDBMS { impl DatafusionDBMS { pub async fn new>( workspace_dpath: P, - no_cached_stats: bool, + rebuild_cached_stats: bool, ) -> anyhow::Result { Ok(DatafusionDBMS { workspace_dpath: workspace_dpath.as_ref().to_path_buf(), - no_cached_stats, + rebuild_cached_stats, ctx: Self::new_session_ctx(None).await?, }) } @@ -213,7 +213,7 @@ impl DatafusionDBMS { .workspace_dpath .join("datafusion_stats_caches") .join(format!("{}.json", benchmark_fname)); - if !self.no_cached_stats && stats_cache_fpath.exists() { + if !self.rebuild_cached_stats && stats_cache_fpath.exists() { let file = File::open(&stats_cache_fpath)?; Ok(serde_json::from_reader(file)?) } else { @@ -222,9 +222,8 @@ impl DatafusionDBMS { _ => unimplemented!(), }; - // regardless of whether self.no_cached_stats is true or false, we want to update the cache - // this way, even if we choose not to read from the cache, the cache still always has the - // most up to date version of the stats + // When self.rebuild_cached_stats is true, we *don't read* from the cache but we still + // *do write* to the cache. fs::create_dir_all(stats_cache_fpath.parent().unwrap())?; let file = File::create(&stats_cache_fpath)?; serde_json::to_writer(file, &base_table_stats)?; diff --git a/optd-perftest/src/main.rs b/optd-perftest/src/main.rs index 0611b746..6a28cfd0 100644 --- a/optd-perftest/src/main.rs +++ b/optd-perftest/src/main.rs @@ -39,11 +39,11 @@ enum Commands { #[clap(long)] #[clap(action)] #[clap(help = "Whether to use the cached optd stats/cache generated stats")] - // this is an option because you want to make it false whenever you update the + // this is an option because you want to make it true whenever you update the // code for how stats are generated in optd, in order to not use cached stats // I found that I almost always want to use the cache though, which is why the // system will use the cache by default - no_cached_optd_stats: bool, + rebuild_cached_optd_stats: bool, #[clap(long)] #[clap(default_value = "default_user")] @@ -77,7 +77,7 @@ async fn main() -> anyhow::Result<()> { scale_factor, seed, query_ids, - no_cached_optd_stats, + rebuild_cached_optd_stats, pguser, pgpassword, } => { @@ -89,7 +89,7 @@ async fn main() -> anyhow::Result<()> { }; let cardinfo_alldbs = cardtest::cardtest( &workspace_dpath, - no_cached_optd_stats, + rebuild_cached_optd_stats, &pguser, &pgpassword, tpch_config, diff --git a/optd-perftest/tests/cardtest_integration.rs b/optd-perftest/tests/cardtest_integration.rs index 8b5c242d..327d4fa7 100644 --- a/optd-perftest/tests/cardtest_integration.rs +++ b/optd-perftest/tests/cardtest_integration.rs @@ -44,7 +44,7 @@ mod tests { // make sure scale factor is low so the test runs fast "--scale-factor", "0.01", - "--no-cached-optd-stats", + "--rebuild-cached-optd-stats", "--pguser", "test_user", "--pgpassword", From 273aa0d07130d72e851bd25d2763a52219374c1a Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 09:20:37 -0400 Subject: [PATCH 08/29] refactored per col from map back to vec --- optd-datafusion-repr/src/bin/test_optimize.rs | 4 +- optd-datafusion-repr/src/cost/base_cost.rs | 143 ++++++++---------- 2 files changed, 69 insertions(+), 78 deletions(-) diff --git a/optd-datafusion-repr/src/bin/test_optimize.rs b/optd-datafusion-repr/src/bin/test_optimize.rs index c9a1feb8..eb7a80a1 100644 --- a/optd-datafusion-repr/src/bin/test_optimize.rs +++ b/optd-datafusion-repr/src/bin/test_optimize.rs @@ -1,4 +1,4 @@ -use std::{collections::HashMap, sync::Arc}; +use std::sync::Arc; use optd_core::{ cascades::CascadesOptimizer, @@ -45,7 +45,7 @@ pub fn main() { Box::new(OptCostModel::new( [("t1", 1000), ("t2", 100), ("t3", 10000)] .into_iter() - .map(|(x, y)| (x.to_string(), DataFusionPerTableStats::new(y, HashMap::new()))) + .map(|(x, y)| (x.to_string(), DataFusionPerTableStats::new(y, vec![]))) .collect(), )), vec![], diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index 62517b89..9466d072 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -94,7 +94,7 @@ impl MostCommonValues for MockMostCommonValues { #[derive(Serialize, Deserialize)] pub struct PerTableStats { row_cnt: usize, - per_column_stats_map: HashMap>, + per_column_stats_vec: Vec>>, } impl DataFusionPerTableStats { @@ -150,20 +150,22 @@ impl DataFusionPerTableStats { } // Assemble the per-column stats. - let mut per_column_stats_map = HashMap::with_capacity(col_cnt); + let mut per_column_stats_vec = Vec::with_capacity(col_cnt); for i in 0..col_cnt { - if Self::is_type_supported(&col_types[i]) { - per_column_stats_map.insert(i, PerColumnStats::new( + per_column_stats_vec.push(if Self::is_type_supported(&col_types[i]) { + Some(PerColumnStats::new( mcvs[i].take().unwrap(), hlls[i].n_distinct(), null_cnt[i] as f64 / row_cnt as f64, distr[i].take().unwrap(), - )); - } + )) + } else { + None + }); } Ok(Self { row_cnt, - per_column_stats_map, + per_column_stats_vec, }) } @@ -733,14 +735,18 @@ impl OptCostModel { join_on_selectivity * join_filter_selectivity } - fn get_per_col_stats(&self, col_ref: &ColumnRef) -> Option<&PerColumnStats> { + fn get_per_column_stats_from_col_ref(&self, col_ref: &ColumnRef) -> Option<&PerColumnStats> { if let ColumnRef::BaseTableColumnRef { table, col_idx } = col_ref { - self.per_table_stats_map.get(table).and_then(|per_table_stats| per_table_stats.per_column_stats_map.get(col_idx)) + self.get_per_column_stats(table, *col_idx) } else { None } } + fn get_per_column_stats(&self, table: &str, col_idx: usize) -> Option<&PerColumnStats> { + self.per_table_stats_map.get(table).and_then(|per_table_stats| per_table_stats.per_column_stats_vec[col_idx].as_ref()) + } + fn get_join_on_selectivity( &self, join_typ: JoinType, @@ -750,8 +756,8 @@ impl OptCostModel { // multiply the selectivities of all individual conditions together on_col_ref_pairs.into_iter().map(|on_col_ref_pair| { // the formula for each pair is min(1 / ndistinct1, 1 / ndistinct2) (see https://postgrespro.com/blog/pgsql/5969618) - let ndistincts = vec![on_col_ref_pair.0, on_col_ref_pair.1].into_iter().map(|on_col_ref| { - match self.get_per_col_stats(&column_refs[on_col_ref.index()]) { + let ndistincts = vec![on_col_ref_pair.0, on_col_ref_pair.1].into_iter().map(|on_col_ref_expr| { + match self.get_per_column_stats_from_col_ref(&column_refs[on_col_ref_expr.index()]) { Some(per_col_stats) => per_col_stats.ndistinct, None => DEFAULT_NUM_DISTINCT, } @@ -936,31 +942,21 @@ impl OptCostModel { value: &Value, is_eq: bool, ) -> f64 { - if let Some(per_table_stats) = self.per_table_stats_map.get(table) { - if let Some(per_column_stats) = per_table_stats.per_column_stats_map.get(&col_idx) - { - let eq_freq = if let Some(freq) = per_column_stats.mcvs.freq(value) { - freq - } else { - let non_mcv_freq = 1.0 - per_column_stats.mcvs.total_freq(); - // always safe because usize is at least as large as i32 - let ndistinct_as_usize = per_column_stats.ndistinct as usize; - let non_mcv_cnt = ndistinct_as_usize - per_column_stats.mcvs.cnt(); - // note that nulls are not included in ndistinct so we don't need to do non_mcv_cnt - 1 if null_frac > 0 - (non_mcv_freq - per_column_stats.null_frac) / (non_mcv_cnt as f64) - }; - if is_eq { - eq_freq - } else { - 1.0 - eq_freq - per_column_stats.null_frac - } + if let Some(per_column_stats) = self.get_per_column_stats(table, col_idx) { + let eq_freq = if let Some(freq) = per_column_stats.mcvs.freq(value) { + freq } else { - #[allow(clippy::collapsible_else_if)] - if is_eq { - DEFAULT_EQ_SEL - } else { - 1.0 - DEFAULT_EQ_SEL - } + let non_mcv_freq = 1.0 - per_column_stats.mcvs.total_freq(); + // always safe because usize is at least as large as i32 + let ndistinct_as_usize = per_column_stats.ndistinct as usize; + let non_mcv_cnt = ndistinct_as_usize - per_column_stats.mcvs.cnt(); + // note that nulls are not included in ndistinct so we don't need to do non_mcv_cnt - 1 if null_frac > 0 + (non_mcv_freq - per_column_stats.null_frac) / (non_mcv_cnt as f64) + }; + if is_eq { + eq_freq + } else { + 1.0 - eq_freq - per_column_stats.null_frac } } else { #[allow(clippy::collapsible_else_if)] @@ -986,46 +982,41 @@ impl OptCostModel { is_col_lt_val: bool, is_col_eq_val: bool, ) -> f64 { - if let Some(per_table_stats) = self.per_table_stats_map.get(table) { - if let Some(per_column_stats) = per_table_stats.per_column_stats_map.get(&col_idx) - { - // because distr does not include the values in MCVs, we need to compute the CDFs there as well - // because nulls return false in any comparison, they are never included when computing range selectivity - let distr_leq_freq = per_column_stats.distr.cdf(value); - let value_clone = value.clone(); // clone the value so that we can move it into the closure to avoid lifetime issues - // TODO: in a future PR, figure out how to make Values comparable. rn I just hardcoded as_i32() to work around this - let pred = Box::new(move |val: &Value| val.as_i32() <= value_clone.as_i32()); - let mcvs_leq_freq = per_column_stats.mcvs.freq_over_pred(pred); - let total_leq_freq = distr_leq_freq + mcvs_leq_freq; - - // depending on whether value is in mcvs or not, we use different logic to turn total_leq_cdf into total_lt_cdf - // this logic just so happens to be the exact same logic as get_column_equality_selectivity implements - let total_lt_freq = total_leq_freq - - self.get_column_equality_selectivity(table, col_idx, value, true); - - // use either total_leq_freq or total_lt_freq to get the selectivity - if is_col_lt_val { - if is_col_eq_val { - // this branch means <= - total_leq_freq - } else { - // this branch means < - total_lt_freq - } + if let Some(per_column_stats) = self.get_per_column_stats(table, col_idx) { + // because distr does not include the values in MCVs, we need to compute the CDFs there as well + // because nulls return false in any comparison, they are never included when computing range selectivity + let distr_leq_freq = per_column_stats.distr.cdf(value); + let value_clone = value.clone(); // clone the value so that we can move it into the closure to avoid lifetime issues + // TODO: in a future PR, figure out how to make Values comparable. rn I just hardcoded as_i32() to work around this + let pred = Box::new(move |val: &Value| val.as_i32() <= value_clone.as_i32()); + let mcvs_leq_freq = per_column_stats.mcvs.freq_over_pred(pred); + let total_leq_freq = distr_leq_freq + mcvs_leq_freq; + + // depending on whether value is in mcvs or not, we use different logic to turn total_leq_cdf into total_lt_cdf + // this logic just so happens to be the exact same logic as get_column_equality_selectivity implements + let total_lt_freq = total_leq_freq + - self.get_column_equality_selectivity(table, col_idx, value, true); + + // use either total_leq_freq or total_lt_freq to get the selectivity + if is_col_lt_val { + if is_col_eq_val { + // this branch means <= + total_leq_freq } else { - // clippy wants me to collapse this into an else if, but keeping two nested if else statements is clearer - #[allow(clippy::collapsible_else_if)] - if is_col_eq_val { - // this branch means >=, which is 1 - < - null_frac - // we need to subtract null_frac since that isn't included in >= either - 1.0 - total_lt_freq - per_column_stats.null_frac - } else { - // this branch means >. same logic as above - 1.0 - total_leq_freq - per_column_stats.null_frac - } + // this branch means < + total_lt_freq } } else { - DEFAULT_INEQ_SEL + // clippy wants me to collapse this into an else if, but keeping two nested if else statements is clearer + #[allow(clippy::collapsible_else_if)] + if is_col_eq_val { + // this branch means >=, which is 1 - < - null_frac + // we need to subtract null_frac since that isn't included in >= either + 1.0 - total_lt_freq - per_column_stats.null_frac + } else { + // this branch means >. same logic as above + 1.0 - total_leq_freq - per_column_stats.null_frac + } } } else { DEFAULT_INEQ_SEL @@ -1057,10 +1048,10 @@ impl OptCostModel { } impl PerTableStats { - pub fn new(row_cnt: usize, per_column_stats_map: HashMap>) -> Self { + pub fn new(row_cnt: usize, per_column_stats_vec: Vec>>) -> Self { Self { row_cnt, - per_column_stats_map, + per_column_stats_vec, } } } @@ -1152,7 +1143,7 @@ mod tests { OptCostModel::new( vec![( String::from(TABLE1_NAME), - PerTableStats::new(100, vec![(0, per_column_stats)].into_iter().collect()), + PerTableStats::new(100, vec![Some(per_column_stats)]), )] .into_iter() .collect(), @@ -1167,7 +1158,7 @@ mod tests { OptCostModel::new( vec![( String::from(TABLE1_NAME), - PerTableStats::new(100, vec![(0, per_column_stats0), (1, per_column_stats1)].into_iter().collect()), + PerTableStats::new(100, vec![Some(per_column_stats0), Some(per_column_stats1)]), )] .into_iter() .collect(), From 9e8b4f2fe2c3b92a4817d1cbc3ff6e7fa99ea22f Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 09:21:52 -0400 Subject: [PATCH 09/29] cmt --- optd-datafusion-repr/src/cost/base_cost.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index 9466d072..964929f3 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -94,6 +94,11 @@ impl MostCommonValues for MockMostCommonValues { #[derive(Serialize, Deserialize)] pub struct PerTableStats { row_cnt: usize, + // This is a Vec of Options instead of just a Vec because some columns may not have stats + // due to their type being non-comparable. + // Further, I chose to represent it as a Vec of Options instead of a HashMap because a Vec + // of Options clearly differentiates between two different failure modes: "out-of-bounds + // access" and "column has no stats". per_column_stats_vec: Vec>>, } From 3ff6d24944a50eaa0c4d4bf4c5385deb518701ef Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 09:30:01 -0400 Subject: [PATCH 10/29] fixed joinsel eq test to use two diff tables --- optd-datafusion-repr/src/cost/base_cost.rs | 37 ++++++++++++++-------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index 964929f3..ad5ae1ac 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -14,6 +14,7 @@ use datafusion::arrow::array::{ Int32Array, Int8Array, RecordBatch, RecordBatchIterator, RecordBatchReader, UInt16Array, UInt32Array, UInt8Array, }; +use datafusion_expr::col; use itertools::Itertools; use optd_core::{ cascades::{CascadesOptimizer, RelNodeContext}, @@ -711,11 +712,15 @@ impl OptCostModel { ExprList::new(filter_expr_trees), ).into_rel_node()) }; + println!("on_col_ref_pairs={:?}, filter_expr_tree={:?}", on_col_ref_pairs, filter_expr_tree); self.get_join_selectivity_core(join_typ, on_col_ref_pairs, filter_expr_tree, column_refs) } else { + println!("b, expr_tree={:?}, column_refs={:?}", expr_tree, column_refs); if let Some(on_col_ref_pair) = Self::get_on_col_ref_pair(expr_tree.clone(), column_refs) { + println!("c"); self.get_join_selectivity_core(join_typ, vec![on_col_ref_pair], None, column_refs) } else { + println!("d"); self.get_join_selectivity_core(join_typ, vec![], Some(expr_tree), column_refs) } } @@ -1139,7 +1144,8 @@ mod tests { } } - const TABLE1_NAME: &str = "t1"; + const TABLE1_NAME: &str = "table1"; + const TABLE2_NAME: &str = "table2"; // one column is sufficient for all filter selectivity tests fn create_one_column_cost_model( @@ -1156,14 +1162,17 @@ mod tests { } // two columns is sufficient for all join selectivity tests - fn create_two_column_cost_model( - per_column_stats0: TestPerColumnStats, - per_column_stats1: TestPerColumnStats, + fn create_two_table_cost_model( + tbl1_per_column_stats: TestPerColumnStats, + tbl2_per_column_stats: TestPerColumnStats, ) -> OptCostModel { OptCostModel::new( vec![( String::from(TABLE1_NAME), - PerTableStats::new(100, vec![Some(per_column_stats0), Some(per_column_stats1)]), + PerTableStats::new(100, vec![Some(tbl1_per_column_stats)]), + ), ( + String::from(TABLE2_NAME), + PerTableStats::new(100, vec![Some(tbl2_per_column_stats)]), )] .into_iter() .collect(), @@ -1789,7 +1798,7 @@ mod tests { #[test] fn test_joinsel_colref_eq_colref_no_nulls() { - let cost_model = create_two_column_cost_model(TestPerColumnStats::new( + let cost_model = create_two_table_cost_model(TestPerColumnStats::new( TestMostCommonValues::empty(), 5, 0.0, @@ -1806,8 +1815,8 @@ mod tests { table: String::from(TABLE1_NAME), col_idx: 0, }, ColumnRef::BaseTableColumnRef { - table: String::from(TABLE1_NAME), - col_idx: 1, + table: String::from(TABLE2_NAME), + col_idx: 0, }]; assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.2); assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.2); @@ -1815,7 +1824,7 @@ mod tests { #[test] fn test_joinsel_and() { - let cost_model = create_two_column_cost_model(TestPerColumnStats::new( + let cost_model = create_two_table_cost_model(TestPerColumnStats::new( TestMostCommonValues::empty(), 5, 0.0, @@ -1834,8 +1843,8 @@ mod tests { table: String::from(TABLE1_NAME), col_idx: 0, }, ColumnRef::BaseTableColumnRef { - table: String::from(TABLE1_NAME), - col_idx: 1, + table: String::from(TABLE2_NAME), + col_idx: 0, }]; assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.04); assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.04); @@ -1843,7 +1852,7 @@ mod tests { #[test] fn test_joinsel_or() { - let cost_model = create_two_column_cost_model(TestPerColumnStats::new( + let cost_model = create_two_table_cost_model(TestPerColumnStats::new( TestMostCommonValues::empty(), 5, 0.0, @@ -1862,8 +1871,8 @@ mod tests { table: String::from(TABLE1_NAME), col_idx: 0, }, ColumnRef::BaseTableColumnRef { - table: String::from(TABLE1_NAME), - col_idx: 1, + table: String::from(TABLE2_NAME), + col_idx: 0, }]; assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.36); assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.36); From bbbfbfc2abadcbffe13075b6ff69b15c5fd22c7b Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 09:30:49 -0400 Subject: [PATCH 11/29] removed joinsel or test --- optd-datafusion-repr/src/cost/base_cost.rs | 28 ---------------------- 1 file changed, 28 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index ad5ae1ac..d7fb762b 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -1849,32 +1849,4 @@ mod tests { assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.04); assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.04); } - - #[test] - fn test_joinsel_or() { - let cost_model = create_two_table_cost_model(TestPerColumnStats::new( - TestMostCommonValues::empty(), - 5, - 0.0, - TestDistribution::empty(), - ), TestPerColumnStats::new( - TestMostCommonValues::empty(), - 3, - 0.0, - TestDistribution::empty(), - )); - let eq0and1 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); - let eq1and0 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); - let expr_tree = log_op(LogOpType::Or, vec![eq0and1.clone(), eq1and0.clone()]); - let expr_tree_rev = log_op(LogOpType::Or, vec![eq1and0.clone(), eq0and1.clone()]); - let column_refs = vec![ColumnRef::BaseTableColumnRef { - table: String::from(TABLE1_NAME), - col_idx: 0, - }, ColumnRef::BaseTableColumnRef { - table: String::from(TABLE2_NAME), - col_idx: 0, - }]; - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.36); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.36); - } } From 4c792f9620a4cc26ec0e164cf42badb084bbbf4b Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 09:40:16 -0400 Subject: [PATCH 12/29] oncond comment --- optd-datafusion-repr/src/cost/base_cost.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index d7fb762b..a486193d 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -1797,7 +1797,7 @@ mod tests { } #[test] - fn test_joinsel_colref_eq_colref_no_nulls() { + fn test_joinsel_oncond() { let cost_model = create_two_table_cost_model(TestPerColumnStats::new( TestMostCommonValues::empty(), 5, @@ -1823,7 +1823,7 @@ mod tests { } #[test] - fn test_joinsel_and() { + fn test_joinsel_and_with_oncond() { let cost_model = create_two_table_cost_model(TestPerColumnStats::new( TestMostCommonValues::empty(), 5, @@ -1849,4 +1849,6 @@ mod tests { assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.04); assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.04); } + + // We don't test joinsel or with oncond because if there is an oncond (on condition), the top-level operator must be an AND } From 2e23e20352831c228e15029d163a55d251774f13 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 09:45:33 -0400 Subject: [PATCH 13/29] wrote unit tests for join sel --- optd-datafusion-repr/src/cost/base_cost.rs | 93 ++++++++++++++++++++-- 1 file changed, 86 insertions(+), 7 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index a486193d..cc3d2596 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -1075,10 +1075,9 @@ mod tests { use std::collections::HashMap; use crate::{ - plan_nodes::{ + cost::base_cost::DEFAULT_EQ_SEL, plan_nodes::{ BinOpExpr, BinOpType, ColumnRefExpr, ConstantExpr, Expr, ExprList, JoinType, LogOpExpr, LogOpType, OptRelNode, OptRelNodeRef, UnOpExpr, UnOpType - }, - properties::column_ref::ColumnRef, + }, properties::column_ref::ColumnRef }; use super::{Distribution, MostCommonValues, OptCostModel, PerColumnStats, PerTableStats}; @@ -1805,7 +1804,7 @@ mod tests { TestDistribution::empty(), ), TestPerColumnStats::new( TestMostCommonValues::empty(), - 3, + 4, 0.0, TestDistribution::empty(), )); @@ -1823,7 +1822,7 @@ mod tests { } #[test] - fn test_joinsel_and_with_oncond() { + fn test_joinsel_and_of_onconds() { let cost_model = create_two_table_cost_model(TestPerColumnStats::new( TestMostCommonValues::empty(), 5, @@ -1831,12 +1830,12 @@ mod tests { TestDistribution::empty(), ), TestPerColumnStats::new( TestMostCommonValues::empty(), - 3, + 4, 0.0, TestDistribution::empty(), )); let eq0and1 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); - let eq1and0 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); + let eq1and0 = bin_op(BinOpType::Eq, col_ref(1), col_ref(0)); let expr_tree = log_op(LogOpType::And, vec![eq0and1.clone(), eq1and0.clone()]); let expr_tree_rev = log_op(LogOpType::And, vec![eq1and0.clone(), eq0and1.clone()]); let column_refs = vec![ColumnRef::BaseTableColumnRef { @@ -1850,5 +1849,85 @@ mod tests { assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.04); } + #[test] + fn test_joinsel_and_of_oncond_and_filter() { + let cost_model = create_two_table_cost_model(TestPerColumnStats::new( + TestMostCommonValues::empty(), + 5, + 0.0, + TestDistribution::empty(), + ), TestPerColumnStats::new( + TestMostCommonValues::empty(), + 4, + 0.0, + TestDistribution::empty(), + )); + let eq0and1 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); + let eq100 = bin_op(BinOpType::Eq, col_ref(1), cnst(Value::Int32(100))); + let expr_tree = log_op(LogOpType::And, vec![eq0and1.clone(), eq100.clone()]); + let expr_tree_rev = log_op(LogOpType::And, vec![eq100.clone(), eq0and1.clone()]); + let column_refs = vec![ColumnRef::BaseTableColumnRef { + table: String::from(TABLE1_NAME), + col_idx: 0, + }, ColumnRef::BaseTableColumnRef { + table: String::from(TABLE2_NAME), + col_idx: 0, + }]; + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.05); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.05); + } + + #[test] + fn test_joinsel_and_of_filters() { + let cost_model = create_two_table_cost_model(TestPerColumnStats::new( + TestMostCommonValues::empty(), + 5, + 0.0, + TestDistribution::empty(), + ), TestPerColumnStats::new( + TestMostCommonValues::empty(), + 4, + 0.0, + TestDistribution::empty(), + )); + let neq12 = bin_op(BinOpType::Neq, col_ref(0), cnst(Value::Int32(12))); + let eq100 = bin_op(BinOpType::Eq, col_ref(1), cnst(Value::Int32(100))); + let expr_tree = log_op(LogOpType::And, vec![neq12.clone(), eq100.clone()]); + let expr_tree_rev = log_op(LogOpType::And, vec![eq100.clone(), neq12.clone()]); + let column_refs = vec![ColumnRef::BaseTableColumnRef { + table: String::from(TABLE1_NAME), + col_idx: 0, + }, ColumnRef::BaseTableColumnRef { + table: String::from(TABLE2_NAME), + col_idx: 0, + }]; + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.2); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.2); + } + + #[test] + fn test_joinsel_colref_eq_colref_same_table_not_oncond() { + let cost_model = create_two_table_cost_model(TestPerColumnStats::new( + TestMostCommonValues::empty(), + 5, + 0.0, + TestDistribution::empty(), + ), TestPerColumnStats::new( + TestMostCommonValues::empty(), + 4, + 0.0, + TestDistribution::empty(), + )); + let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(0)); + let column_refs = vec![ColumnRef::BaseTableColumnRef { + table: String::from(TABLE1_NAME), + col_idx: 0, + }, ColumnRef::BaseTableColumnRef { + table: String::from(TABLE2_NAME), + col_idx: 0, + }]; + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), DEFAULT_EQ_SEL); + } + // We don't test joinsel or with oncond because if there is an oncond (on condition), the top-level operator must be an AND } From 3428ebb439c53d6bbe028248c4c31bd312ace272 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 09:46:29 -0400 Subject: [PATCH 14/29] now checking join type inner --- optd-datafusion-repr/src/cost/base_cost.rs | 34 ++++++++++++---------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index cc3d2596..85cba8de 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -14,7 +14,6 @@ use datafusion::arrow::array::{ Int32Array, Int8Array, RecordBatch, RecordBatchIterator, RecordBatchReader, UInt16Array, UInt32Array, UInt8Array, }; -use datafusion_expr::col; use itertools::Itertools; use optd_core::{ cascades::{CascadesOptimizer, RelNodeContext}, @@ -763,20 +762,25 @@ impl OptCostModel { on_col_ref_pairs: Vec<(ColumnRefExpr, ColumnRefExpr)>, column_refs: &GroupColumnRefs ) -> f64 { - // multiply the selectivities of all individual conditions together - on_col_ref_pairs.into_iter().map(|on_col_ref_pair| { - // the formula for each pair is min(1 / ndistinct1, 1 / ndistinct2) (see https://postgrespro.com/blog/pgsql/5969618) - let ndistincts = vec![on_col_ref_pair.0, on_col_ref_pair.1].into_iter().map(|on_col_ref_expr| { - match self.get_per_column_stats_from_col_ref(&column_refs[on_col_ref_expr.index()]) { - Some(per_col_stats) => per_col_stats.ndistinct, - None => DEFAULT_NUM_DISTINCT, - } - }); - // using reduce(f64::min) is the idiomatic workaround to the fact that f64 does not implement Ord due to NaN - let selectivity = ndistincts.map(|ndistinct| 1.0 / ndistinct as f64).reduce(f64::min).expect("reduce() only returns None if the iterator is empty, which is impossible since col_ref_nodes.len() == 2"); - assert!(!selectivity.is_nan(), "it should be impossible for selectivity to be NaN since n-distinct is never 0"); - selectivity - }).product() + match join_typ { + JoinType::Inner => { + // multiply the selectivities of all individual conditions together + on_col_ref_pairs.into_iter().map(|on_col_ref_pair| { + // the formula for each pair is min(1 / ndistinct1, 1 / ndistinct2) (see https://postgrespro.com/blog/pgsql/5969618) + let ndistincts = vec![on_col_ref_pair.0, on_col_ref_pair.1].into_iter().map(|on_col_ref_expr| { + match self.get_per_column_stats_from_col_ref(&column_refs[on_col_ref_expr.index()]) { + Some(per_col_stats) => per_col_stats.ndistinct, + None => DEFAULT_NUM_DISTINCT, + } + }); + // using reduce(f64::min) is the idiomatic workaround to the fact that f64 does not implement Ord due to NaN + let selectivity = ndistincts.map(|ndistinct| 1.0 / ndistinct as f64).reduce(f64::min).expect("reduce() only returns None if the iterator is empty, which is impossible since col_ref_nodes.len() == 2"); + assert!(!selectivity.is_nan(), "it should be impossible for selectivity to be NaN since n-distinct is never 0"); + selectivity + }).product() + } + _ => unimplemented!(), + } } /// Comparison operators are the base case for recursion in get_filter_selectivity() From 673e4aabf469cc78890b1854f6c5d104a35a5449 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 09:55:31 -0400 Subject: [PATCH 15/29] fixed q11 --- optd-datafusion-repr/src/cost/base_cost.rs | 51 ++++++++++------------ 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index 85cba8de..c93ba762 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -588,6 +588,7 @@ impl OptCostModel { expr_tree: OptRelNodeRef, column_refs: &GroupColumnRefs, ) -> f64 { + println!("expr_tree={:?}", expr_tree); assert!(expr_tree.typ.is_expression()); match &expr_tree.typ { OptRelNodeTyp::Constant(_) => Self::get_constant_selectivity(expr_tree), @@ -711,15 +712,11 @@ impl OptCostModel { ExprList::new(filter_expr_trees), ).into_rel_node()) }; - println!("on_col_ref_pairs={:?}, filter_expr_tree={:?}", on_col_ref_pairs, filter_expr_tree); self.get_join_selectivity_core(join_typ, on_col_ref_pairs, filter_expr_tree, column_refs) } else { - println!("b, expr_tree={:?}, column_refs={:?}", expr_tree, column_refs); if let Some(on_col_ref_pair) = Self::get_on_col_ref_pair(expr_tree.clone(), column_refs) { - println!("c"); self.get_join_selectivity_core(join_typ, vec![on_col_ref_pair], None, column_refs) } else { - println!("d"); self.get_join_selectivity_core(join_typ, vec![], Some(expr_tree), column_refs) } } @@ -774,7 +771,7 @@ impl OptCostModel { } }); // using reduce(f64::min) is the idiomatic workaround to the fact that f64 does not implement Ord due to NaN - let selectivity = ndistincts.map(|ndistinct| 1.0 / ndistinct as f64).reduce(f64::min).expect("reduce() only returns None if the iterator is empty, which is impossible since col_ref_nodes.len() == 2"); + let selectivity = ndistincts.map(|ndistinct| 1.0 / ndistinct as f64).reduce(f64::min).expect("reduce() only returns None if the iterator is empty, which is impossible since col_ref_exprs.len() == 2"); assert!(!selectivity.is_nan(), "it should be impossible for selectivity to be NaN since n-distinct is never 0"); selectivity }).product() @@ -794,25 +791,25 @@ impl OptCostModel { assert!(comp_bin_op_typ.is_comparison()); // I intentionally performed moves on left and right. This way, we don't accidentally use them after this block - let (col_ref_nodes, non_col_ref_nodes, is_left_col_ref) = Self::get_semantic_nodes(left, right); + let (col_ref_exprs, non_col_ref_exprs, is_left_col_ref) = Self::get_semantic_nodes(left, right); // handle the different cases of column nodes - if col_ref_nodes.is_empty() { + if col_ref_exprs.is_empty() { UNIMPLEMENTED_SEL - } else if col_ref_nodes.len() == 1 { - let col_ref_node = col_ref_nodes + } else if col_ref_exprs.len() == 1 { + let col_ref_expr = col_ref_exprs .first() - .expect("we just checked that col_ref_nodes.len() == 1"); - let col_ref_idx = col_ref_node.index(); + .expect("we just checked that col_ref_exprs.len() == 1"); + let col_ref_idx = col_ref_expr.index(); if let ColumnRef::BaseTableColumnRef { table, col_idx } = &column_refs[col_ref_idx] { - let non_col_ref_node = non_col_ref_nodes + let non_col_ref_expr = non_col_ref_exprs .first() - .expect("non_col_ref_nodes should have a value since col_ref_nodes.len() == 1"); + .expect("non_col_ref_exprs should have a value since col_ref_exprs.len() == 1"); - match non_col_ref_node.as_ref().typ { + match non_col_ref_expr.as_ref().typ { OptRelNodeTyp::Constant(_) => { - let value = non_col_ref_node + let value = non_col_ref_expr .as_ref() .data .as_ref() @@ -861,46 +858,46 @@ impl OptCostModel { OptRelNodeTyp::Cast => UNIMPLEMENTED_SEL, _ => unimplemented!( "unhandled case of comparing a column ref node to {}", - non_col_ref_node.as_ref().typ + non_col_ref_expr.as_ref().typ ), } } else { - unimplemented!("non base table column refs need to be implemented") + Self::get_default_comparison_op_selectivity(comp_bin_op_typ) } - } else if col_ref_nodes.len() == 2 { + } else if col_ref_exprs.len() == 2 { Self::get_default_comparison_op_selectivity(comp_bin_op_typ) } else { - unreachable!("we could have at most pushed left and right into col_ref_nodes") + unreachable!("we could have at most pushed left and right into col_ref_exprs") } } /// Convert the left and right child nodes of some operation to what they semantically are /// This is convenient to avoid repeating the same logic just with "left" and "right" swapped fn get_semantic_nodes(left: OptRelNodeRef, right: OptRelNodeRef) -> (Vec, Vec, bool) { - let mut col_ref_nodes = vec![]; - let mut non_col_ref_nodes = vec![]; + let mut col_ref_exprs = vec![]; + let mut non_col_ref_exprs = vec![]; let is_left_col_ref; // I intentionally performed moves on left and right. This way, we don't accidentally use them after this block - // We always want to use "col_ref_node" and "non_col_ref_node" instead of "left" or "right" + // We always want to use "col_ref_expr" and "non_col_ref_expr" instead of "left" or "right" if left.as_ref().typ == OptRelNodeTyp::ColumnRef { is_left_col_ref = true; - col_ref_nodes.push( + col_ref_exprs.push( ColumnRefExpr::from_rel_node(left) .expect("we already checked that the type is ColumnRef"), ); } else { is_left_col_ref = false; - non_col_ref_nodes.push(left); + non_col_ref_exprs.push(left); } if right.as_ref().typ == OptRelNodeTyp::ColumnRef { - col_ref_nodes.push( + col_ref_exprs.push( ColumnRefExpr::from_rel_node(right) .expect("we already checked that the type is ColumnRef"), ); } else { - non_col_ref_nodes.push(right); + non_col_ref_exprs.push(right); } - (col_ref_nodes, non_col_ref_nodes, is_left_col_ref) + (col_ref_exprs, non_col_ref_exprs, is_left_col_ref) } /// The default selectivity of a comparison expression From 23d0abf711867fbbc3eb853bbb9d0f0abc64d34f Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 10:14:26 -0400 Subject: [PATCH 16/29] cust row cnt --- optd-datafusion-repr/src/cost/base_cost.rs | 58 ++++++++++++++++++---- 1 file changed, 48 insertions(+), 10 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index c93ba762..421e515d 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -588,7 +588,6 @@ impl OptCostModel { expr_tree: OptRelNodeRef, column_refs: &GroupColumnRefs, ) -> f64 { - println!("expr_tree={:?}", expr_tree); assert!(expr_tree.typ.is_expression()); match &expr_tree.typ { OptRelNodeTyp::Constant(_) => Self::get_constant_selectivity(expr_tree), @@ -1161,18 +1160,28 @@ mod tests { ) } - // two columns is sufficient for all join selectivity tests + /// Two columns is sufficient for all join selectivity tests fn create_two_table_cost_model( tbl1_per_column_stats: TestPerColumnStats, tbl2_per_column_stats: TestPerColumnStats, + ) -> OptCostModel { + create_two_table_cost_model_custom_row_cnts(tbl1_per_column_stats, tbl2_per_column_stats, 100, 100) + } + + /// We need custom row counts because some join algorithms rely on the row cnt + fn create_two_table_cost_model_custom_row_cnts( + tbl1_per_column_stats: TestPerColumnStats, + tbl2_per_column_stats: TestPerColumnStats, + tbl1_row_cnt: usize, + tbl2_row_cnt: usize, ) -> OptCostModel { OptCostModel::new( vec![( String::from(TABLE1_NAME), - PerTableStats::new(100, vec![Some(tbl1_per_column_stats)]), + PerTableStats::new(tbl1_row_cnt, vec![Some(tbl1_per_column_stats)]), ), ( String::from(TABLE2_NAME), - PerTableStats::new(100, vec![Some(tbl2_per_column_stats)]), + PerTableStats::new(tbl2_row_cnt, vec![Some(tbl2_per_column_stats)]), )] .into_iter() .collect(), @@ -1790,14 +1799,14 @@ mod tests { } #[test] - fn test_joinsel_const() { + fn test_joinsel_inner_const() { let cost_model = create_one_column_cost_model(get_empty_per_col_stats()); assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(true)), &vec![]), 1.0); assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(false)), &vec![]), 0.0); } #[test] - fn test_joinsel_oncond() { + fn test_joinsel_inner_oncond() { let cost_model = create_two_table_cost_model(TestPerColumnStats::new( TestMostCommonValues::empty(), 5, @@ -1823,7 +1832,7 @@ mod tests { } #[test] - fn test_joinsel_and_of_onconds() { + fn test_joinsel_inner_and_of_onconds() { let cost_model = create_two_table_cost_model(TestPerColumnStats::new( TestMostCommonValues::empty(), 5, @@ -1851,7 +1860,7 @@ mod tests { } #[test] - fn test_joinsel_and_of_oncond_and_filter() { + fn test_joinsel_inner_and_of_oncond_and_filter() { let cost_model = create_two_table_cost_model(TestPerColumnStats::new( TestMostCommonValues::empty(), 5, @@ -1879,7 +1888,7 @@ mod tests { } #[test] - fn test_joinsel_and_of_filters() { + fn test_joinsel_inner_and_of_filters() { let cost_model = create_two_table_cost_model(TestPerColumnStats::new( TestMostCommonValues::empty(), 5, @@ -1907,7 +1916,7 @@ mod tests { } #[test] - fn test_joinsel_colref_eq_colref_same_table_not_oncond() { + fn test_joinsel_inner_colref_eq_colref_same_table_is_not_oncond() { let cost_model = create_two_table_cost_model(TestPerColumnStats::new( TestMostCommonValues::empty(), 5, @@ -1931,4 +1940,33 @@ mod tests { } // We don't test joinsel or with oncond because if there is an oncond (on condition), the top-level operator must be an AND + + #[test] + fn test_joinsel_outer_oncond() { + let cost_model = create_two_table_cost_model(TestPerColumnStats::new( + TestMostCommonValues::empty(), + 5, + 0.0, + TestDistribution::empty(), + ), TestPerColumnStats::new( + TestMostCommonValues::empty(), + 4, + 0.0, + TestDistribution::empty(), + )); + // since we're talking about left and right outer joins, the order actually matters now + let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); + let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0)); + let column_refs = vec![ColumnRef::BaseTableColumnRef { + table: String::from(TABLE1_NAME), + col_idx: 0, + }, ColumnRef::BaseTableColumnRef { + table: String::from(TABLE2_NAME), + col_idx: 0, + }]; + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree.clone(), &column_refs), DEFAULT_EQ_SEL); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), DEFAULT_EQ_SEL); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), DEFAULT_EQ_SEL); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree.clone(), &column_refs), DEFAULT_EQ_SEL); + } } From 38368541a92f1a16b513e37e419d347a925a19ff Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 10:32:53 -0400 Subject: [PATCH 17/29] wrote unit tests for outer sel --- optd-datafusion-repr/src/cost/base_cost.rs | 131 +++++++++++++++++++-- 1 file changed, 124 insertions(+), 7 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index 421e515d..af7653d7 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -1941,9 +1941,10 @@ mod tests { // We don't test joinsel or with oncond because if there is an oncond (on condition), the top-level operator must be an AND + /// Unique oncond means an oncondition on columns which are unique in both tables #[test] - fn test_joinsel_outer_oncond() { - let cost_model = create_two_table_cost_model(TestPerColumnStats::new( + fn test_joinsel_outer_unique_oncond() { + let cost_model = create_two_table_cost_model_custom_row_cnts(TestPerColumnStats::new( TestMostCommonValues::empty(), 5, 0.0, @@ -1953,7 +1954,75 @@ mod tests { 4, 0.0, TestDistribution::empty(), - )); + ), 5, 4); + // since we're talking about left and right outer joins, the order actually matters now + let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); + let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0)); + let column_refs = vec![ColumnRef::BaseTableColumnRef { + table: String::from(TABLE1_NAME), + col_idx: 0, + }, ColumnRef::BaseTableColumnRef { + table: String::from(TABLE2_NAME), + col_idx: 0, + }]; + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree.clone(), &column_refs), 0.25); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), 0.25); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), 0.2); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree.clone(), &column_refs), 0.2); + } + + /// Non-unique oncond means the column is not unique in either table + /// Inner always >= row count means that the inner join result is >= the row count of both tables + #[test] + fn test_joinsel_outer_nonunique_oncond_inner_always_geq_rowcnt() { + let cost_model = create_two_table_cost_model_custom_row_cnts(TestPerColumnStats::new( + TestMostCommonValues::empty(), + 5, + 0.0, + TestDistribution::empty(), + ), TestPerColumnStats::new( + TestMostCommonValues::empty(), + 4, + 0.0, + TestDistribution::empty(), + ), 10, 8); + // since we're talking about left and right outer joins, the order actually matters now + let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); + let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0)); + let column_refs = vec![ColumnRef::BaseTableColumnRef { + table: String::from(TABLE1_NAME), + col_idx: 0, + }, ColumnRef::BaseTableColumnRef { + table: String::from(TABLE2_NAME), + col_idx: 0, + }]; + // sanity check the expected inner sel + let expected_inner_sel = 0.2; + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel); + // check the outer sels + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree.clone(), &column_refs), 0.2); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), 0.2); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), 0.2); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree.clone(), &column_refs), 0.2); + } + + /// Non-unique oncond means the column is not unique in either table + /// Inner sometimes < row count means that the inner join result < the row count of at least one table. + /// Note that without a join filter, it's impossible to be less than the row count of both tables + #[test] + fn test_joinsel_outer_nonunique_oncond_inner_sometimes_lt_rowcnt() { + let cost_model = create_two_table_cost_model_custom_row_cnts(TestPerColumnStats::new( + TestMostCommonValues::empty(), + 10, + 0.0, + TestDistribution::empty(), + ), TestPerColumnStats::new( + TestMostCommonValues::empty(), + 2, + 0.0, + TestDistribution::empty(), + ), 20, 4); // since we're talking about left and right outer joins, the order actually matters now let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0)); @@ -1964,9 +2033,57 @@ mod tests { table: String::from(TABLE2_NAME), col_idx: 0, }]; - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree.clone(), &column_refs), DEFAULT_EQ_SEL); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), DEFAULT_EQ_SEL); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), DEFAULT_EQ_SEL); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree.clone(), &column_refs), DEFAULT_EQ_SEL); + // sanity check the expected inner sel + let expected_inner_sel = 0.1; + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel); + // check the outer sels + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree.clone(), &column_refs), 0.25); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), 0.25); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), 0.1); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree.clone(), &column_refs), 0.1); + } + + /// Unique oncond means an oncondition on columns which are unique in both tables + /// Filter means we're adding a join filter + /// Inner sometimes < row count means that the inner join result < the row count of at least one table. + #[test] + fn test_joinsel_outer_unique_oncond_filter_inner_sometimes_lt_rowcnt() { + let cost_model = create_two_table_cost_model_custom_row_cnts(TestPerColumnStats::new( + TestMostCommonValues::empty(), + 50, + 0.0, + TestDistribution::new(vec![ + (Value::Int32(128), 0.4) + ]), + ), TestPerColumnStats::new( + TestMostCommonValues::empty(), + 4, + 0.0, + TestDistribution::empty(), + ), 50, 4); + // since we're talking about left and right outer joins, the order actually matters now + let eq0and1 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); + let eq1and0 = bin_op(BinOpType::Eq, col_ref(1), col_ref(0)); + let filter = bin_op(BinOpType::Leq, col_ref(0), cnst(Value::Int32(128))); + let expr_tree = log_op(LogOpType::And, vec![eq0and1, filter.clone()]); + // inner rev means its the inner expr (the eq op) whose children are being reversed, as opposed to the and op + let expr_tree_inner_rev = log_op(LogOpType::And, vec![eq1and0, filter.clone()]); + let column_refs = vec![ColumnRef::BaseTableColumnRef { + table: String::from(TABLE1_NAME), + col_idx: 0, + }, ColumnRef::BaseTableColumnRef { + table: String::from(TABLE2_NAME), + col_idx: 0, + }]; + // sanity check the expected inner sel + let expected_inner_sel = 0.008; + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_inner_rev.clone(), &column_refs), expected_inner_sel); + // check the outer sels + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree.clone(), &column_refs), 0.25); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree_inner_rev.clone(), &column_refs), 0.25); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree_inner_rev.clone(), &column_refs), 0.02); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree.clone(), &column_refs), 0.02); } } From 2f473058db9bf4544e2d3b79d48b4745d6af0258 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 11:04:32 -0400 Subject: [PATCH 18/29] refactored unit tests to pass row cnt properly --- optd-datafusion-repr/src/cost/base_cost.rs | 166 ++++++++++++--------- 1 file changed, 95 insertions(+), 71 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index af7653d7..b35b403a 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -493,7 +493,7 @@ impl CostModel for OptCostM let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false); // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information if let Some(expr_tree) = expr_trees.first() { - self.get_join_selectivity(*join_typ, Arc::clone(expr_tree), &column_refs) + self.get_join_selectivity(*join_typ, Arc::clone(expr_tree), &column_refs, row_cnt_1, row_cnt_2) } else { panic!("encountered a join without an expression") } @@ -689,6 +689,8 @@ impl OptCostModel { join_typ: JoinType, expr_tree: OptRelNodeRef, column_refs: &GroupColumnRefs, + left_row_cnt: f64, + right_row_cnt: f64, ) -> f64 { assert!(expr_tree.typ.is_expression()); if expr_tree.typ == OptRelNodeTyp::LogOp(LogOpType::And) { @@ -711,12 +713,12 @@ impl OptCostModel { ExprList::new(filter_expr_trees), ).into_rel_node()) }; - self.get_join_selectivity_core(join_typ, on_col_ref_pairs, filter_expr_tree, column_refs) + self.get_join_selectivity_core(join_typ, on_col_ref_pairs, filter_expr_tree, column_refs, left_row_cnt, right_row_cnt) } else { if let Some(on_col_ref_pair) = Self::get_on_col_ref_pair(expr_tree.clone(), column_refs) { - self.get_join_selectivity_core(join_typ, vec![on_col_ref_pair], None, column_refs) + self.get_join_selectivity_core(join_typ, vec![on_col_ref_pair], None, column_refs, left_row_cnt, right_row_cnt) } else { - self.get_join_selectivity_core(join_typ, vec![], Some(expr_tree), column_refs) + self.get_join_selectivity_core(join_typ, vec![], Some(expr_tree), column_refs, left_row_cnt, right_row_cnt) } } } @@ -727,8 +729,10 @@ impl OptCostModel { on_col_ref_pairs: Vec<(ColumnRefExpr, ColumnRefExpr)>, filter_expr_tree: Option, column_refs: &GroupColumnRefs, + left_row_cnt: f64, + right_row_cnt: f64, ) -> f64 { - let join_on_selectivity = self.get_join_on_selectivity(join_typ, on_col_ref_pairs, column_refs); + let join_on_selectivity = self.get_join_on_selectivity(on_col_ref_pairs, column_refs); // Currently, there is no difference in how we handle a join filter and a select filter, so we use the same function // One difference (that we *don't* care about right now) is that join filters can contain expressions from multiple // different tables. Currently, this doesn't affect the get_filter_selectivity() function, but this may change in @@ -737,7 +741,13 @@ impl OptCostModel { Some(filter_expr_tree) => self.get_filter_selectivity(filter_expr_tree, column_refs), None => 1.0, }; - join_on_selectivity * join_filter_selectivity + let inner_join_selectivity = join_on_selectivity * join_filter_selectivity; + match join_typ { + JoinType::Inner => inner_join_selectivity, + JoinType::LeftOuter => f64::max(inner_join_selectivity, 1.0 / right_row_cnt), + JoinType::RightOuter => f64::max(inner_join_selectivity, 1.0 / left_row_cnt), + _ => unimplemented!() + } } fn get_per_column_stats_from_col_ref(&self, col_ref: &ColumnRef) -> Option<&PerColumnStats> { @@ -752,31 +762,27 @@ impl OptCostModel { self.per_table_stats_map.get(table).and_then(|per_table_stats| per_table_stats.per_column_stats_vec[col_idx].as_ref()) } + /// Get the selectivity of the on conditions + /// Note that the selectivity of the on conditions does not depend on join type. Join type is accounted for separately in get_join_selectivity_core() fn get_join_on_selectivity( &self, - join_typ: JoinType, on_col_ref_pairs: Vec<(ColumnRefExpr, ColumnRefExpr)>, column_refs: &GroupColumnRefs ) -> f64 { - match join_typ { - JoinType::Inner => { - // multiply the selectivities of all individual conditions together - on_col_ref_pairs.into_iter().map(|on_col_ref_pair| { - // the formula for each pair is min(1 / ndistinct1, 1 / ndistinct2) (see https://postgrespro.com/blog/pgsql/5969618) - let ndistincts = vec![on_col_ref_pair.0, on_col_ref_pair.1].into_iter().map(|on_col_ref_expr| { - match self.get_per_column_stats_from_col_ref(&column_refs[on_col_ref_expr.index()]) { - Some(per_col_stats) => per_col_stats.ndistinct, - None => DEFAULT_NUM_DISTINCT, - } - }); - // using reduce(f64::min) is the idiomatic workaround to the fact that f64 does not implement Ord due to NaN - let selectivity = ndistincts.map(|ndistinct| 1.0 / ndistinct as f64).reduce(f64::min).expect("reduce() only returns None if the iterator is empty, which is impossible since col_ref_exprs.len() == 2"); - assert!(!selectivity.is_nan(), "it should be impossible for selectivity to be NaN since n-distinct is never 0"); - selectivity - }).product() - } - _ => unimplemented!(), - } + // multiply the selectivities of all individual conditions together + on_col_ref_pairs.into_iter().map(|on_col_ref_pair| { + // the formula for each pair is min(1 / ndistinct1, 1 / ndistinct2) (see https://postgrespro.com/blog/pgsql/5969618) + let ndistincts = vec![on_col_ref_pair.0, on_col_ref_pair.1].into_iter().map(|on_col_ref_expr| { + match self.get_per_column_stats_from_col_ref(&column_refs[on_col_ref_expr.index()]) { + Some(per_col_stats) => per_col_stats.ndistinct, + None => DEFAULT_NUM_DISTINCT, + } + }); + // using reduce(f64::min) is the idiomatic workaround to the fact that f64 does not implement Ord due to NaN + let selectivity = ndistincts.map(|ndistinct| 1.0 / ndistinct as f64).reduce(f64::min).expect("reduce() only returns None if the iterator is empty, which is impossible since col_ref_exprs.len() == 2"); + assert!(!selectivity.is_nan(), "it should be impossible for selectivity to be NaN since n-distinct is never 0"); + selectivity + }).product() } /// Comparison operators are the base case for recursion in get_filter_selectivity() @@ -1077,11 +1083,12 @@ mod tests { use crate::{ cost::base_cost::DEFAULT_EQ_SEL, plan_nodes::{ BinOpExpr, BinOpType, ColumnRefExpr, ConstantExpr, Expr, ExprList, JoinType, LogOpExpr, LogOpType, OptRelNode, OptRelNodeRef, UnOpExpr, UnOpType - }, properties::column_ref::ColumnRef + }, properties::column_ref::{ColumnRef, GroupColumnRefs} }; use super::{Distribution, MostCommonValues, OptCostModel, PerColumnStats, PerTableStats}; type TestPerColumnStats = PerColumnStats; + type TestOptCostModel = OptCostModel; struct TestMostCommonValues { mcvs: HashMap, @@ -1149,7 +1156,7 @@ mod tests { // one column is sufficient for all filter selectivity tests fn create_one_column_cost_model( per_column_stats: TestPerColumnStats, - ) -> OptCostModel { + ) -> TestOptCostModel { OptCostModel::new( vec![( String::from(TABLE1_NAME), @@ -1164,7 +1171,7 @@ mod tests { fn create_two_table_cost_model( tbl1_per_column_stats: TestPerColumnStats, tbl2_per_column_stats: TestPerColumnStats, - ) -> OptCostModel { + ) -> TestOptCostModel { create_two_table_cost_model_custom_row_cnts(tbl1_per_column_stats, tbl2_per_column_stats, 100, 100) } @@ -1174,7 +1181,7 @@ mod tests { tbl2_per_column_stats: TestPerColumnStats, tbl1_row_cnt: usize, tbl2_row_cnt: usize, - ) -> OptCostModel { + ) -> TestOptCostModel { OptCostModel::new( vec![( String::from(TABLE1_NAME), @@ -1798,11 +1805,22 @@ mod tests { ); } + /// A wrapper around get_join_selectivity that extracts the table row counts from the cost model + fn test_get_join_selectivity(cost_model: &TestOptCostModel, reverse_tables: bool, join_typ: JoinType, expr_tree: OptRelNodeRef, column_refs: &GroupColumnRefs) -> f64 { + let table1_row_cnt = cost_model.per_table_stats_map[TABLE1_NAME].row_cnt as f64; + let table2_row_cnt = cost_model.per_table_stats_map[TABLE2_NAME].row_cnt as f64; + if reverse_tables { + cost_model.get_join_selectivity(join_typ, expr_tree, column_refs, table1_row_cnt, table2_row_cnt) + } else { + cost_model.get_join_selectivity(join_typ, expr_tree, column_refs, table2_row_cnt, table1_row_cnt) + } + } + #[test] fn test_joinsel_inner_const() { let cost_model = create_one_column_cost_model(get_empty_per_col_stats()); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(true)), &vec![]), 1.0); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(false)), &vec![]), 0.0); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(true)), &vec![], f64::NAN, f64::NAN), 1.0); + assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(false)), &vec![], f64::NAN, f64::NAN), 0.0); } #[test] @@ -1827,8 +1845,8 @@ mod tests { table: String::from(TABLE2_NAME), col_idx: 0, }]; - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.2); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.2); + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), 0.2); + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev, &column_refs), 0.2); } #[test] @@ -1855,8 +1873,8 @@ mod tests { table: String::from(TABLE2_NAME), col_idx: 0, }]; - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.04); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.04); + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), 0.04); + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev, &column_refs), 0.04); } #[test] @@ -1883,8 +1901,8 @@ mod tests { table: String::from(TABLE2_NAME), col_idx: 0, }]; - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.05); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.05); + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), 0.05); + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev, &column_refs), 0.05); } #[test] @@ -1911,8 +1929,8 @@ mod tests { table: String::from(TABLE2_NAME), col_idx: 0, }]; - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), 0.2); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev, &column_refs), 0.2); + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), 0.2); + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev, &column_refs), 0.2); } #[test] @@ -1936,12 +1954,28 @@ mod tests { table: String::from(TABLE2_NAME), col_idx: 0, }]; - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree, &column_refs), DEFAULT_EQ_SEL); + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), DEFAULT_EQ_SEL); } // We don't test joinsel or with oncond because if there is an oncond (on condition), the top-level operator must be an AND + /// I made this helper function to avoid copying all eight lines over and over + fn assert_joinsel_outer_selectivity(cost_model: &TestOptCostModel, expr_tree: OptRelNodeRef, expr_tree_rev: OptRelNodeRef, column_refs: &GroupColumnRefs, expected_table1_outer_sel: f64, expected_table2_outer_sel: f64) { + // all table 1 outer combinations + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::LeftOuter, expr_tree.clone(), &column_refs), expected_table1_outer_sel); + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), expected_table1_outer_sel); + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, true, JoinType::RightOuter, expr_tree.clone(), &column_refs), expected_table1_outer_sel); + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, true, JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), expected_table1_outer_sel); + // all table 2 outer combinations + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, true, JoinType::LeftOuter, expr_tree.clone(), &column_refs), expected_table2_outer_sel); + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, true, JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), expected_table2_outer_sel); + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::RightOuter, expr_tree.clone(), &column_refs), expected_table2_outer_sel); + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), expected_table2_outer_sel); + } + /// Unique oncond means an oncondition on columns which are unique in both tables + /// There's only one case if both columns are unique and have different row counts: the inner will be < 1 / row count + /// of one table and = 1 / row count of another #[test] fn test_joinsel_outer_unique_oncond() { let cost_model = create_two_table_cost_model_custom_row_cnts(TestPerColumnStats::new( @@ -1955,7 +1989,7 @@ mod tests { 0.0, TestDistribution::empty(), ), 5, 4); - // since we're talking about left and right outer joins, the order actually matters now + // the left/right of the join refers to the tables, not the order of columns in the predicate let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0)); let column_refs = vec![ColumnRef::BaseTableColumnRef { @@ -1965,14 +1999,11 @@ mod tests { table: String::from(TABLE2_NAME), col_idx: 0, }]; - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree.clone(), &column_refs), 0.25); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), 0.25); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), 0.2); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree.clone(), &column_refs), 0.2); + assert_joinsel_outer_selectivity(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.25, 0.2); } /// Non-unique oncond means the column is not unique in either table - /// Inner always >= row count means that the inner join result is >= the row count of both tables + /// Inner always >= row count means that the inner join result is >= 1 / the row count of both tables #[test] fn test_joinsel_outer_nonunique_oncond_inner_always_geq_rowcnt() { let cost_model = create_two_table_cost_model_custom_row_cnts(TestPerColumnStats::new( @@ -1986,7 +2017,7 @@ mod tests { 0.0, TestDistribution::empty(), ), 10, 8); - // since we're talking about left and right outer joins, the order actually matters now + // the left/right of the join refers to the tables, not the order of columns in the predicate let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0)); let column_refs = vec![ColumnRef::BaseTableColumnRef { @@ -1998,17 +2029,14 @@ mod tests { }]; // sanity check the expected inner sel let expected_inner_sel = 0.2; - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel); + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel); + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel); // check the outer sels - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree.clone(), &column_refs), 0.2); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), 0.2); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), 0.2); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree.clone(), &column_refs), 0.2); + assert_joinsel_outer_selectivity(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.2, 0.2); } /// Non-unique oncond means the column is not unique in either table - /// Inner sometimes < row count means that the inner join result < the row count of at least one table. + /// Inner sometimes < row count means that the inner join result < 1 / the row count of exactly one table. /// Note that without a join filter, it's impossible to be less than the row count of both tables #[test] fn test_joinsel_outer_nonunique_oncond_inner_sometimes_lt_rowcnt() { @@ -2023,7 +2051,7 @@ mod tests { 0.0, TestDistribution::empty(), ), 20, 4); - // since we're talking about left and right outer joins, the order actually matters now + // the left/right of the join refers to the tables, not the order of columns in the predicate let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0)); let column_refs = vec![ColumnRef::BaseTableColumnRef { @@ -2035,20 +2063,17 @@ mod tests { }]; // sanity check the expected inner sel let expected_inner_sel = 0.1; - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel); + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel); + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel); // check the outer sels - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree.clone(), &column_refs), 0.25); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), 0.25); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), 0.1); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree.clone(), &column_refs), 0.1); + assert_joinsel_outer_selectivity(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.25, 0.1); } /// Unique oncond means an oncondition on columns which are unique in both tables /// Filter means we're adding a join filter - /// Inner sometimes < row count means that the inner join result < the row count of at least one table. + /// There's only one case if both columns are unique and there's a filter: the inner will be < 1 / row count of both tables #[test] - fn test_joinsel_outer_unique_oncond_filter_inner_sometimes_lt_rowcnt() { + fn test_joinsel_outer_unique_oncond_filter() { let cost_model = create_two_table_cost_model_custom_row_cnts(TestPerColumnStats::new( TestMostCommonValues::empty(), 50, @@ -2062,7 +2087,7 @@ mod tests { 0.0, TestDistribution::empty(), ), 50, 4); - // since we're talking about left and right outer joins, the order actually matters now + // the left/right of the join refers to the tables, not the order of columns in the predicate let eq0and1 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); let eq1and0 = bin_op(BinOpType::Eq, col_ref(1), col_ref(0)); let filter = bin_op(BinOpType::Leq, col_ref(0), cnst(Value::Int32(128))); @@ -2078,12 +2103,11 @@ mod tests { }]; // sanity check the expected inner sel let expected_inner_sel = 0.008; - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, expr_tree_inner_rev.clone(), &column_refs), expected_inner_sel); + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel); + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_inner_rev.clone(), &column_refs), expected_inner_sel); // check the outer sels - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree.clone(), &column_refs), 0.25); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree_inner_rev.clone(), &column_refs), 0.25); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::LeftOuter, expr_tree_inner_rev.clone(), &column_refs), 0.02); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::RightOuter, expr_tree.clone(), &column_refs), 0.02); + assert_joinsel_outer_selectivity(&cost_model, expr_tree, expr_tree_inner_rev, &column_refs, 0.25, 0.02); } + + // I didn't test any non-unique cases with filter. The non-unique tests without filter should cover that } From ff49d83658553231147851ba8a25979dc70bb1b7 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 11:07:31 -0400 Subject: [PATCH 19/29] fixed bug in unittests --- optd-datafusion-repr/src/cost/base_cost.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index b35b403a..366af7af 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -723,6 +723,7 @@ impl OptCostModel { } } + /// The core logic of join selectivity which assumes we've already separated the expression into the on conditions and the filters fn get_join_selectivity_core( &self, join_typ: JoinType, @@ -1809,7 +1810,7 @@ mod tests { fn test_get_join_selectivity(cost_model: &TestOptCostModel, reverse_tables: bool, join_typ: JoinType, expr_tree: OptRelNodeRef, column_refs: &GroupColumnRefs) -> f64 { let table1_row_cnt = cost_model.per_table_stats_map[TABLE1_NAME].row_cnt as f64; let table2_row_cnt = cost_model.per_table_stats_map[TABLE2_NAME].row_cnt as f64; - if reverse_tables { + if !reverse_tables { cost_model.get_join_selectivity(join_typ, expr_tree, column_refs, table1_row_cnt, table2_row_cnt) } else { cost_model.get_join_selectivity(join_typ, expr_tree, column_refs, table2_row_cnt, table1_row_cnt) @@ -1960,7 +1961,7 @@ mod tests { // We don't test joinsel or with oncond because if there is an oncond (on condition), the top-level operator must be an AND /// I made this helper function to avoid copying all eight lines over and over - fn assert_joinsel_outer_selectivity(cost_model: &TestOptCostModel, expr_tree: OptRelNodeRef, expr_tree_rev: OptRelNodeRef, column_refs: &GroupColumnRefs, expected_table1_outer_sel: f64, expected_table2_outer_sel: f64) { + fn assert_joinsel_outer_selectivities(cost_model: &TestOptCostModel, expr_tree: OptRelNodeRef, expr_tree_rev: OptRelNodeRef, column_refs: &GroupColumnRefs, expected_table1_outer_sel: f64, expected_table2_outer_sel: f64) { // all table 1 outer combinations assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::LeftOuter, expr_tree.clone(), &column_refs), expected_table1_outer_sel); assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), expected_table1_outer_sel); @@ -1999,7 +2000,12 @@ mod tests { table: String::from(TABLE2_NAME), col_idx: 0, }]; - assert_joinsel_outer_selectivity(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.25, 0.2); + // sanity check the expected inner sel + let expected_inner_sel = 0.2; + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel); + assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel); + // check the outer sels + assert_joinsel_outer_selectivities(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.25, 0.2); } /// Non-unique oncond means the column is not unique in either table @@ -2032,7 +2038,7 @@ mod tests { assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel); assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel); // check the outer sels - assert_joinsel_outer_selectivity(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.2, 0.2); + assert_joinsel_outer_selectivities(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.2, 0.2); } /// Non-unique oncond means the column is not unique in either table @@ -2066,7 +2072,7 @@ mod tests { assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel); assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel); // check the outer sels - assert_joinsel_outer_selectivity(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.25, 0.1); + assert_joinsel_outer_selectivities(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.25, 0.1); } /// Unique oncond means an oncondition on columns which are unique in both tables @@ -2106,7 +2112,7 @@ mod tests { assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel); assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_inner_rev.clone(), &column_refs), expected_inner_sel); // check the outer sels - assert_joinsel_outer_selectivity(&cost_model, expr_tree, expr_tree_inner_rev, &column_refs, 0.25, 0.02); + assert_joinsel_outer_selectivities(&cost_model, expr_tree, expr_tree_inner_rev, &column_refs, 0.25, 0.02); } // I didn't test any non-unique cases with filter. The non-unique tests without filter should cover that From 3711de2d413db829ada968711b421610d7a3cb63 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 11:08:32 -0400 Subject: [PATCH 20/29] added sel to hashjoin --- optd-datafusion-repr/src/cost/base_cost.rs | 26 ++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index 366af7af..5f713c8f 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -514,11 +514,33 @@ impl CostModel for OptCostM let (_, compute_cost, _) = Self::cost_tuple(&children[1]); Self::cost(row_cnt, compute_cost * row_cnt, 0.0) } - OptRelNodeTyp::PhysicalHashJoin(_) => { + OptRelNodeTyp::PhysicalHashJoin(join_typ) => { let (row_cnt_1, _, _) = Self::cost_tuple(&children[0]); let (row_cnt_2, _, _) = Self::cost_tuple(&children[1]); + let selectivity = match context { + Some(context) => { + if let Some(optimizer) = optimizer { + let column_refs = optimizer + .get_property_by_group::( + context.group_id, + 1, + ); + let expr_group_id = context.children_group_ids[2]; + let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false); + // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information + if let Some(expr_tree) = expr_trees.first() { + self.get_join_selectivity(*join_typ, Arc::clone(expr_tree), &column_refs, row_cnt_1, row_cnt_2) + } else { + panic!("encountered a join without an expression") + } + } else { + DEFAULT_UNK_SEL + } + } + None => DEFAULT_UNK_SEL, + }; Self::cost( - row_cnt_1.min(row_cnt_2).max(1.0), + (row_cnt_1 * row_cnt_2 * selectivity).max(1.0), row_cnt_1 * 2.0 + row_cnt_2, 0.0, ) From fb141a6f70973f6a45ce20ad352823d9a910066a Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 12:23:38 -0400 Subject: [PATCH 21/29] undid hash join sel --- optd-datafusion-repr/src/cost/base_cost.rs | 23 +--------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index 5f713c8f..2996682f 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -517,28 +517,7 @@ impl CostModel for OptCostM OptRelNodeTyp::PhysicalHashJoin(join_typ) => { let (row_cnt_1, _, _) = Self::cost_tuple(&children[0]); let (row_cnt_2, _, _) = Self::cost_tuple(&children[1]); - let selectivity = match context { - Some(context) => { - if let Some(optimizer) = optimizer { - let column_refs = optimizer - .get_property_by_group::( - context.group_id, - 1, - ); - let expr_group_id = context.children_group_ids[2]; - let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false); - // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information - if let Some(expr_tree) = expr_trees.first() { - self.get_join_selectivity(*join_typ, Arc::clone(expr_tree), &column_refs, row_cnt_1, row_cnt_2) - } else { - panic!("encountered a join without an expression") - } - } else { - DEFAULT_UNK_SEL - } - } - None => DEFAULT_UNK_SEL, - }; + let selectivity = DEFAULT_UNK_SEL; Self::cost( (row_cnt_1 * row_cnt_2 * selectivity).max(1.0), row_cnt_1 * 2.0 + row_cnt_2, From 5be618e3d2a4d007f8920f5704d7ca3a24c241a6 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 12:31:43 -0400 Subject: [PATCH 22/29] cross join --- optd-datafusion-repr/src/cost/base_cost.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index 5221840e..21cd0686 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -520,7 +520,7 @@ impl CostModel for OptCostM let (_, compute_cost, _) = Self::cost_tuple(&children[1]); Self::cost(row_cnt, compute_cost * row_cnt, 0.0) } - OptRelNodeTyp::PhysicalHashJoin(join_typ) => { + OptRelNodeTyp::PhysicalHashJoin(_) => { let (row_cnt_1, _, _) = Self::cost_tuple(&children[0]); let (row_cnt_2, _, _) = Self::cost_tuple(&children[1]); let selectivity = DEFAULT_UNK_SEL; @@ -797,7 +797,7 @@ impl OptCostModel { left_row_cnt: f64, right_row_cnt: f64, ) -> f64 { - let join_on_selectivity = self.get_join_on_selectivity(on_col_ref_pairs, column_refs); + let join_on_selectivity = self.get_join_on_selectivity(&on_col_ref_pairs, column_refs); // Currently, there is no difference in how we handle a join filter and a select filter, so we use the same function // One difference (that we *don't* care about right now) is that join filters can contain expressions from multiple // different tables. Currently, this doesn't affect the get_filter_selectivity() function, but this may change in @@ -811,7 +811,11 @@ impl OptCostModel { JoinType::Inner => inner_join_selectivity, JoinType::LeftOuter => f64::max(inner_join_selectivity, 1.0 / right_row_cnt), JoinType::RightOuter => f64::max(inner_join_selectivity, 1.0 / left_row_cnt), - _ => unimplemented!() + JoinType::Cross => { + assert!(on_col_ref_pairs.is_empty(), "Cross joins should not have on columns"); + join_filter_selectivity + }, + _ => unimplemented!("join_typ={} is not implemented", join_typ) } } @@ -831,13 +835,13 @@ impl OptCostModel { /// Note that the selectivity of the on conditions does not depend on join type. Join type is accounted for separately in get_join_selectivity_core() fn get_join_on_selectivity( &self, - on_col_ref_pairs: Vec<(ColumnRefExpr, ColumnRefExpr)>, + on_col_ref_pairs: &Vec<(ColumnRefExpr, ColumnRefExpr)>, column_refs: &GroupColumnRefs ) -> f64 { // multiply the selectivities of all individual conditions together on_col_ref_pairs.into_iter().map(|on_col_ref_pair| { // the formula for each pair is min(1 / ndistinct1, 1 / ndistinct2) (see https://postgrespro.com/blog/pgsql/5969618) - let ndistincts = vec![on_col_ref_pair.0, on_col_ref_pair.1].into_iter().map(|on_col_ref_expr| { + let ndistincts = vec![&on_col_ref_pair.0, &on_col_ref_pair.1].into_iter().map(|on_col_ref_expr| { match self.get_per_column_stats_from_col_ref(&column_refs[on_col_ref_expr.index()]) { Some(per_col_stats) => per_col_stats.ndistinct, None => DEFAULT_NUM_DISTINCT, From 4d0f753ec6e4fdbc9720f31cf4f69845765aea58 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 12:31:48 -0400 Subject: [PATCH 23/29] fmt --- optd-datafusion-repr/src/cost/base_cost.rs | 893 +++++++++++++++------ 1 file changed, 647 insertions(+), 246 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index 21cd0686..66dbbbc1 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -1,11 +1,12 @@ use std::{collections::HashMap, sync::Arc}; use crate::plan_nodes::{ - BinOpType, ColumnRefExpr, ConstantExpr, ConstantType, Expr, ExprList, LogOpExpr, LogOpType, OptRelNode, UnOpType + BinOpType, ColumnRefExpr, ConstantExpr, ConstantType, Expr, ExprList, LogOpExpr, LogOpType, + OptRelNode, UnOpType, }; use crate::properties::column_ref::{ColumnRefPropertyBuilder, GroupColumnRefs}; use crate::{ - plan_nodes::{OptRelNodeRef, OptRelNodeTyp, JoinType}, + plan_nodes::{JoinType, OptRelNodeRef, OptRelNodeTyp}, properties::column_ref::ColumnRef, }; use arrow_schema::{ArrowError, DataType}; @@ -491,15 +492,21 @@ impl CostModel for OptCostM Some(context) => { if let Some(optimizer) = optimizer { let column_refs = optimizer - .get_property_by_group::( - context.group_id, - 1, - ); + .get_property_by_group::( + context.group_id, + 1, + ); let expr_group_id = context.children_group_ids[2]; let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false); // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information if let Some(expr_tree) = expr_trees.first() { - self.get_join_selectivity(*join_typ, Arc::clone(expr_tree), &column_refs, row_cnt_1, row_cnt_2) + self.get_join_selectivity( + *join_typ, + Arc::clone(expr_tree), + &column_refs, + row_cnt_1, + row_cnt_2, + ) } else { panic!("encountered a join without an expression") } @@ -710,21 +717,34 @@ impl OptCostModel { /// Check if an expr_tree is a join condition, returning the join on col ref pair if it is /// The reason the check and the info are in the same function is because their code is almost identical - fn get_on_col_ref_pair(expr_tree: OptRelNodeRef, column_refs: &GroupColumnRefs) -> Option<(ColumnRefExpr, ColumnRefExpr)> { + fn get_on_col_ref_pair( + expr_tree: OptRelNodeRef, + column_refs: &GroupColumnRefs, + ) -> Option<(ColumnRefExpr, ColumnRefExpr)> { // We perform three checks to see if a child_expr_tree is an on_col_ref_pair // 1. Check that it's equality if expr_tree.typ == OptRelNodeTyp::BinOp(BinOpType::Eq) { let left_child = expr_tree.child(0); let right_child = expr_tree.child(1); // 2. Check that both sides are column refs - if left_child.typ == OptRelNodeTyp::ColumnRef && right_child.typ == OptRelNodeTyp::ColumnRef { + if left_child.typ == OptRelNodeTyp::ColumnRef + && right_child.typ == OptRelNodeTyp::ColumnRef + { // 3. Check that both sides don't belong to the same table (if we don't know, that means they don't belong) - let left_col_ref_expr = ColumnRefExpr::from_rel_node(left_child).expect("we already checked that the type is ColumnRef"); - let right_col_ref_expr = ColumnRefExpr::from_rel_node(right_child).expect("we already checked that the type is ColumnRef"); + let left_col_ref_expr = ColumnRefExpr::from_rel_node(left_child) + .expect("we already checked that the type is ColumnRef"); + let right_col_ref_expr = ColumnRefExpr::from_rel_node(right_child) + .expect("we already checked that the type is ColumnRef"); let left_col_ref = &column_refs[left_col_ref_expr.index()]; let right_col_ref = &column_refs[right_col_ref_expr.index()]; - let is_same_table = if let ColumnRef::BaseTableColumnRef { table: left_table, .. } = left_col_ref { - if let ColumnRef::BaseTableColumnRef { table: right_table, .. } = right_col_ref { + let is_same_table = if let ColumnRef::BaseTableColumnRef { + table: left_table, .. + } = left_col_ref + { + if let ColumnRef::BaseTableColumnRef { + table: right_table, .. + } = right_col_ref + { left_table == right_table } else { false @@ -761,10 +781,14 @@ impl OptCostModel { let mut on_col_ref_pairs = vec![]; let mut filter_expr_trees = vec![]; for child_expr_tree in &expr_tree.children { - if let Some(on_col_ref_pair) = Self::get_on_col_ref_pair(child_expr_tree.clone(), column_refs) { + if let Some(on_col_ref_pair) = + Self::get_on_col_ref_pair(child_expr_tree.clone(), column_refs) + { on_col_ref_pairs.push(on_col_ref_pair) } else { - let child_expr = Expr::from_rel_node(child_expr_tree.clone()).expect("everything that is a direct child of an And node must be an expression"); + let child_expr = Expr::from_rel_node(child_expr_tree.clone()).expect( + "everything that is a direct child of an And node must be an expression", + ); filter_expr_trees.push(child_expr); } } @@ -772,17 +796,39 @@ impl OptCostModel { let filter_expr_tree = if filter_expr_trees.is_empty() { None } else { - Some(LogOpExpr::new( - LogOpType::And, - ExprList::new(filter_expr_trees), - ).into_rel_node()) + Some( + LogOpExpr::new(LogOpType::And, ExprList::new(filter_expr_trees)) + .into_rel_node(), + ) }; - self.get_join_selectivity_core(join_typ, on_col_ref_pairs, filter_expr_tree, column_refs, left_row_cnt, right_row_cnt) + self.get_join_selectivity_core( + join_typ, + on_col_ref_pairs, + filter_expr_tree, + column_refs, + left_row_cnt, + right_row_cnt, + ) } else { - if let Some(on_col_ref_pair) = Self::get_on_col_ref_pair(expr_tree.clone(), column_refs) { - self.get_join_selectivity_core(join_typ, vec![on_col_ref_pair], None, column_refs, left_row_cnt, right_row_cnt) + if let Some(on_col_ref_pair) = Self::get_on_col_ref_pair(expr_tree.clone(), column_refs) + { + self.get_join_selectivity_core( + join_typ, + vec![on_col_ref_pair], + None, + column_refs, + left_row_cnt, + right_row_cnt, + ) } else { - self.get_join_selectivity_core(join_typ, vec![], Some(expr_tree), column_refs, left_row_cnt, right_row_cnt) + self.get_join_selectivity_core( + join_typ, + vec![], + Some(expr_tree), + column_refs, + left_row_cnt, + right_row_cnt, + ) } } } @@ -812,14 +858,20 @@ impl OptCostModel { JoinType::LeftOuter => f64::max(inner_join_selectivity, 1.0 / right_row_cnt), JoinType::RightOuter => f64::max(inner_join_selectivity, 1.0 / left_row_cnt), JoinType::Cross => { - assert!(on_col_ref_pairs.is_empty(), "Cross joins should not have on columns"); + assert!( + on_col_ref_pairs.is_empty(), + "Cross joins should not have on columns" + ); join_filter_selectivity - }, - _ => unimplemented!("join_typ={} is not implemented", join_typ) + } + _ => unimplemented!("join_typ={} is not implemented", join_typ), } } - fn get_per_column_stats_from_col_ref(&self, col_ref: &ColumnRef) -> Option<&PerColumnStats> { + fn get_per_column_stats_from_col_ref( + &self, + col_ref: &ColumnRef, + ) -> Option<&PerColumnStats> { if let ColumnRef::BaseTableColumnRef { table, col_idx } = col_ref { self.get_per_column_stats(table, *col_idx) } else { @@ -828,7 +880,9 @@ impl OptCostModel { } fn get_per_column_stats(&self, table: &str, col_idx: usize) -> Option<&PerColumnStats> { - self.per_table_stats_map.get(table).and_then(|per_table_stats| per_table_stats.per_column_stats_vec[col_idx].as_ref()) + self.per_table_stats_map + .get(table) + .and_then(|per_table_stats| per_table_stats.per_column_stats_vec[col_idx].as_ref()) } /// Get the selectivity of the on conditions @@ -836,7 +890,7 @@ impl OptCostModel { fn get_join_on_selectivity( &self, on_col_ref_pairs: &Vec<(ColumnRefExpr, ColumnRefExpr)>, - column_refs: &GroupColumnRefs + column_refs: &GroupColumnRefs, ) -> f64 { // multiply the selectivities of all individual conditions together on_col_ref_pairs.into_iter().map(|on_col_ref_pair| { @@ -865,7 +919,8 @@ impl OptCostModel { assert!(comp_bin_op_typ.is_comparison()); // I intentionally performed moves on left and right. This way, we don't accidentally use them after this block - let (col_ref_exprs, non_col_ref_exprs, is_left_col_ref) = Self::get_semantic_nodes(left, right); + let (col_ref_exprs, non_col_ref_exprs, is_left_col_ref) = + Self::get_semantic_nodes(left, right); // handle the different cases of column nodes if col_ref_exprs.is_empty() { @@ -947,7 +1002,10 @@ impl OptCostModel { /// Convert the left and right child nodes of some operation to what they semantically are /// This is convenient to avoid repeating the same logic just with "left" and "right" swapped - fn get_semantic_nodes(left: OptRelNodeRef, right: OptRelNodeRef) -> (Vec, Vec, bool) { + fn get_semantic_nodes( + left: OptRelNodeRef, + right: OptRelNodeRef, + ) -> (Vec, Vec, bool) { let mut col_ref_exprs = vec![]; let mut non_col_ref_exprs = vec![]; let is_left_col_ref; @@ -1004,7 +1062,9 @@ impl OptCostModel { 0.0 } } else { - unreachable!("if the typ is ConstantType::Bool, the value should be a Value::Bool") + unreachable!( + "if the typ is ConstantType::Bool, the value should be a Value::Bool" + ) } } else { panic!("selectivity is not defined on constants which are not bools") @@ -1072,15 +1132,15 @@ impl OptCostModel { // because nulls return false in any comparison, they are never included when computing range selectivity let distr_leq_freq = per_column_stats.distr.cdf(value); let value_clone = value.clone(); // clone the value so that we can move it into the closure to avoid lifetime issues - // TODO: in a future PR, figure out how to make Values comparable. rn I just hardcoded as_i32() to work around this + // TODO: in a future PR, figure out how to make Values comparable. rn I just hardcoded as_i32() to work around this let pred = Box::new(move |val: &Value| val.as_i32() <= value_clone.as_i32()); let mcvs_leq_freq = per_column_stats.mcvs.freq_over_pred(pred); let total_leq_freq = distr_leq_freq + mcvs_leq_freq; // depending on whether value is in mcvs or not, we use different logic to turn total_leq_cdf into total_lt_cdf // this logic just so happens to be the exact same logic as get_column_equality_selectivity implements - let total_lt_freq = total_leq_freq - - self.get_column_equality_selectivity(table, col_idx, value, true); + let total_lt_freq = + total_leq_freq - self.get_column_equality_selectivity(table, col_idx, value, true); // use either total_leq_freq or total_lt_freq to get the selectivity if is_col_lt_val { @@ -1150,9 +1210,12 @@ mod tests { use std::collections::HashMap; use crate::{ - cost::base_cost::DEFAULT_EQ_SEL, plan_nodes::{ - BinOpExpr, BinOpType, ColumnRefExpr, ConstantExpr, Expr, ExprList, JoinType, LogOpExpr, LogOpType, OptRelNode, OptRelNodeRef, UnOpExpr, UnOpType - }, properties::column_ref::{ColumnRef, GroupColumnRefs} + cost::base_cost::DEFAULT_EQ_SEL, + plan_nodes::{ + BinOpExpr, BinOpType, ColumnRefExpr, ConstantExpr, Expr, ExprList, JoinType, LogOpExpr, + LogOpType, OptRelNode, OptRelNodeRef, UnOpExpr, UnOpType, + }, + properties::column_ref::{ColumnRef, GroupColumnRefs}, }; use super::{Distribution, MostCommonValues, OptCostModel, PerColumnStats, PerTableStats}; @@ -1223,9 +1286,7 @@ mod tests { const TABLE2_NAME: &str = "table2"; // one column is sufficient for all filter selectivity tests - fn create_one_column_cost_model( - per_column_stats: TestPerColumnStats, - ) -> TestOptCostModel { + fn create_one_column_cost_model(per_column_stats: TestPerColumnStats) -> TestOptCostModel { OptCostModel::new( vec![( String::from(TABLE1_NAME), @@ -1241,7 +1302,12 @@ mod tests { tbl1_per_column_stats: TestPerColumnStats, tbl2_per_column_stats: TestPerColumnStats, ) -> TestOptCostModel { - create_two_table_cost_model_custom_row_cnts(tbl1_per_column_stats, tbl2_per_column_stats, 100, 100) + create_two_table_cost_model_custom_row_cnts( + tbl1_per_column_stats, + tbl2_per_column_stats, + 100, + 100, + ) } /// We need custom row counts because some join algorithms rely on the row cnt @@ -1252,13 +1318,16 @@ mod tests { tbl2_row_cnt: usize, ) -> TestOptCostModel { OptCostModel::new( - vec![( - String::from(TABLE1_NAME), - PerTableStats::new(tbl1_row_cnt, vec![Some(tbl1_per_column_stats)]), - ), ( - String::from(TABLE2_NAME), - PerTableStats::new(tbl2_row_cnt, vec![Some(tbl2_per_column_stats)]), - )] + vec![ + ( + String::from(TABLE1_NAME), + PerTableStats::new(tbl1_row_cnt, vec![Some(tbl1_per_column_stats)]), + ), + ( + String::from(TABLE2_NAME), + PerTableStats::new(tbl2_row_cnt, vec![Some(tbl2_per_column_stats)]), + ), + ] .into_iter() .collect(), ) @@ -1321,8 +1390,14 @@ mod tests { #[test] fn test_filtersel_const() { let cost_model = create_one_column_cost_model(get_empty_per_col_stats()); - assert_approx_eq::assert_approx_eq!(cost_model.get_filter_selectivity(cnst(Value::Bool(true)), &vec![]), 1.0); - assert_approx_eq::assert_approx_eq!(cost_model.get_filter_selectivity(cnst(Value::Bool(false)), &vec![]), 0.0); + assert_approx_eq::assert_approx_eq!( + cost_model.get_filter_selectivity(cnst(Value::Bool(true)), &vec![]), + 1.0 + ); + assert_approx_eq::assert_approx_eq!( + cost_model.get_filter_selectivity(cnst(Value::Bool(false)), &vec![]), + 0.0 + ); } #[test] @@ -1875,171 +1950,367 @@ mod tests { } /// A wrapper around get_join_selectivity that extracts the table row counts from the cost model - fn test_get_join_selectivity(cost_model: &TestOptCostModel, reverse_tables: bool, join_typ: JoinType, expr_tree: OptRelNodeRef, column_refs: &GroupColumnRefs) -> f64 { + fn test_get_join_selectivity( + cost_model: &TestOptCostModel, + reverse_tables: bool, + join_typ: JoinType, + expr_tree: OptRelNodeRef, + column_refs: &GroupColumnRefs, + ) -> f64 { let table1_row_cnt = cost_model.per_table_stats_map[TABLE1_NAME].row_cnt as f64; let table2_row_cnt = cost_model.per_table_stats_map[TABLE2_NAME].row_cnt as f64; if !reverse_tables { - cost_model.get_join_selectivity(join_typ, expr_tree, column_refs, table1_row_cnt, table2_row_cnt) + cost_model.get_join_selectivity( + join_typ, + expr_tree, + column_refs, + table1_row_cnt, + table2_row_cnt, + ) } else { - cost_model.get_join_selectivity(join_typ, expr_tree, column_refs, table2_row_cnt, table1_row_cnt) + cost_model.get_join_selectivity( + join_typ, + expr_tree, + column_refs, + table2_row_cnt, + table1_row_cnt, + ) } } #[test] fn test_joinsel_inner_const() { let cost_model = create_one_column_cost_model(get_empty_per_col_stats()); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(true)), &vec![], f64::NAN, f64::NAN), 1.0); - assert_approx_eq::assert_approx_eq!(cost_model.get_join_selectivity(JoinType::Inner, cnst(Value::Bool(false)), &vec![], f64::NAN, f64::NAN), 0.0); + assert_approx_eq::assert_approx_eq!( + cost_model.get_join_selectivity( + JoinType::Inner, + cnst(Value::Bool(true)), + &vec![], + f64::NAN, + f64::NAN + ), + 1.0 + ); + assert_approx_eq::assert_approx_eq!( + cost_model.get_join_selectivity( + JoinType::Inner, + cnst(Value::Bool(false)), + &vec![], + f64::NAN, + f64::NAN + ), + 0.0 + ); } #[test] fn test_joinsel_inner_oncond() { - let cost_model = create_two_table_cost_model(TestPerColumnStats::new( - TestMostCommonValues::empty(), - 5, - 0.0, - TestDistribution::empty(), - ), TestPerColumnStats::new( - TestMostCommonValues::empty(), - 4, - 0.0, - TestDistribution::empty(), - )); + let cost_model = create_two_table_cost_model( + TestPerColumnStats::new( + TestMostCommonValues::empty(), + 5, + 0.0, + TestDistribution::empty(), + ), + TestPerColumnStats::new( + TestMostCommonValues::empty(), + 4, + 0.0, + TestDistribution::empty(), + ), + ); let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0)); - let column_refs = vec![ColumnRef::BaseTableColumnRef { - table: String::from(TABLE1_NAME), - col_idx: 0, - }, ColumnRef::BaseTableColumnRef { - table: String::from(TABLE2_NAME), - col_idx: 0, - }]; - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), 0.2); - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev, &column_refs), 0.2); + let column_refs = vec![ + ColumnRef::BaseTableColumnRef { + table: String::from(TABLE1_NAME), + col_idx: 0, + }, + ColumnRef::BaseTableColumnRef { + table: String::from(TABLE2_NAME), + col_idx: 0, + }, + ]; + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), + 0.2 + ); + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity( + &cost_model, + false, + JoinType::Inner, + expr_tree_rev, + &column_refs + ), + 0.2 + ); } #[test] fn test_joinsel_inner_and_of_onconds() { - let cost_model = create_two_table_cost_model(TestPerColumnStats::new( - TestMostCommonValues::empty(), - 5, - 0.0, - TestDistribution::empty(), - ), TestPerColumnStats::new( - TestMostCommonValues::empty(), - 4, - 0.0, - TestDistribution::empty(), - )); + let cost_model = create_two_table_cost_model( + TestPerColumnStats::new( + TestMostCommonValues::empty(), + 5, + 0.0, + TestDistribution::empty(), + ), + TestPerColumnStats::new( + TestMostCommonValues::empty(), + 4, + 0.0, + TestDistribution::empty(), + ), + ); let eq0and1 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); let eq1and0 = bin_op(BinOpType::Eq, col_ref(1), col_ref(0)); let expr_tree = log_op(LogOpType::And, vec![eq0and1.clone(), eq1and0.clone()]); let expr_tree_rev = log_op(LogOpType::And, vec![eq1and0.clone(), eq0and1.clone()]); - let column_refs = vec![ColumnRef::BaseTableColumnRef { - table: String::from(TABLE1_NAME), - col_idx: 0, - }, ColumnRef::BaseTableColumnRef { - table: String::from(TABLE2_NAME), - col_idx: 0, - }]; - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), 0.04); - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev, &column_refs), 0.04); + let column_refs = vec![ + ColumnRef::BaseTableColumnRef { + table: String::from(TABLE1_NAME), + col_idx: 0, + }, + ColumnRef::BaseTableColumnRef { + table: String::from(TABLE2_NAME), + col_idx: 0, + }, + ]; + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), + 0.04 + ); + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity( + &cost_model, + false, + JoinType::Inner, + expr_tree_rev, + &column_refs + ), + 0.04 + ); } #[test] fn test_joinsel_inner_and_of_oncond_and_filter() { - let cost_model = create_two_table_cost_model(TestPerColumnStats::new( - TestMostCommonValues::empty(), - 5, - 0.0, - TestDistribution::empty(), - ), TestPerColumnStats::new( - TestMostCommonValues::empty(), - 4, - 0.0, - TestDistribution::empty(), - )); + let cost_model = create_two_table_cost_model( + TestPerColumnStats::new( + TestMostCommonValues::empty(), + 5, + 0.0, + TestDistribution::empty(), + ), + TestPerColumnStats::new( + TestMostCommonValues::empty(), + 4, + 0.0, + TestDistribution::empty(), + ), + ); let eq0and1 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); let eq100 = bin_op(BinOpType::Eq, col_ref(1), cnst(Value::Int32(100))); let expr_tree = log_op(LogOpType::And, vec![eq0and1.clone(), eq100.clone()]); let expr_tree_rev = log_op(LogOpType::And, vec![eq100.clone(), eq0and1.clone()]); - let column_refs = vec![ColumnRef::BaseTableColumnRef { - table: String::from(TABLE1_NAME), - col_idx: 0, - }, ColumnRef::BaseTableColumnRef { - table: String::from(TABLE2_NAME), - col_idx: 0, - }]; - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), 0.05); - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev, &column_refs), 0.05); + let column_refs = vec![ + ColumnRef::BaseTableColumnRef { + table: String::from(TABLE1_NAME), + col_idx: 0, + }, + ColumnRef::BaseTableColumnRef { + table: String::from(TABLE2_NAME), + col_idx: 0, + }, + ]; + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), + 0.05 + ); + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity( + &cost_model, + false, + JoinType::Inner, + expr_tree_rev, + &column_refs + ), + 0.05 + ); } #[test] fn test_joinsel_inner_and_of_filters() { - let cost_model = create_two_table_cost_model(TestPerColumnStats::new( - TestMostCommonValues::empty(), - 5, - 0.0, - TestDistribution::empty(), - ), TestPerColumnStats::new( - TestMostCommonValues::empty(), - 4, - 0.0, - TestDistribution::empty(), - )); + let cost_model = create_two_table_cost_model( + TestPerColumnStats::new( + TestMostCommonValues::empty(), + 5, + 0.0, + TestDistribution::empty(), + ), + TestPerColumnStats::new( + TestMostCommonValues::empty(), + 4, + 0.0, + TestDistribution::empty(), + ), + ); let neq12 = bin_op(BinOpType::Neq, col_ref(0), cnst(Value::Int32(12))); let eq100 = bin_op(BinOpType::Eq, col_ref(1), cnst(Value::Int32(100))); let expr_tree = log_op(LogOpType::And, vec![neq12.clone(), eq100.clone()]); let expr_tree_rev = log_op(LogOpType::And, vec![eq100.clone(), neq12.clone()]); - let column_refs = vec![ColumnRef::BaseTableColumnRef { - table: String::from(TABLE1_NAME), - col_idx: 0, - }, ColumnRef::BaseTableColumnRef { - table: String::from(TABLE2_NAME), - col_idx: 0, - }]; - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), 0.2); - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev, &column_refs), 0.2); + let column_refs = vec![ + ColumnRef::BaseTableColumnRef { + table: String::from(TABLE1_NAME), + col_idx: 0, + }, + ColumnRef::BaseTableColumnRef { + table: String::from(TABLE2_NAME), + col_idx: 0, + }, + ]; + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), + 0.2 + ); + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity( + &cost_model, + false, + JoinType::Inner, + expr_tree_rev, + &column_refs + ), + 0.2 + ); } #[test] fn test_joinsel_inner_colref_eq_colref_same_table_is_not_oncond() { - let cost_model = create_two_table_cost_model(TestPerColumnStats::new( - TestMostCommonValues::empty(), - 5, - 0.0, - TestDistribution::empty(), - ), TestPerColumnStats::new( - TestMostCommonValues::empty(), - 4, - 0.0, - TestDistribution::empty(), - )); + let cost_model = create_two_table_cost_model( + TestPerColumnStats::new( + TestMostCommonValues::empty(), + 5, + 0.0, + TestDistribution::empty(), + ), + TestPerColumnStats::new( + TestMostCommonValues::empty(), + 4, + 0.0, + TestDistribution::empty(), + ), + ); let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(0)); - let column_refs = vec![ColumnRef::BaseTableColumnRef { - table: String::from(TABLE1_NAME), - col_idx: 0, - }, ColumnRef::BaseTableColumnRef { - table: String::from(TABLE2_NAME), - col_idx: 0, - }]; - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), DEFAULT_EQ_SEL); + let column_refs = vec![ + ColumnRef::BaseTableColumnRef { + table: String::from(TABLE1_NAME), + col_idx: 0, + }, + ColumnRef::BaseTableColumnRef { + table: String::from(TABLE2_NAME), + col_idx: 0, + }, + ]; + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree, &column_refs), + DEFAULT_EQ_SEL + ); } // We don't test joinsel or with oncond because if there is an oncond (on condition), the top-level operator must be an AND /// I made this helper function to avoid copying all eight lines over and over - fn assert_joinsel_outer_selectivities(cost_model: &TestOptCostModel, expr_tree: OptRelNodeRef, expr_tree_rev: OptRelNodeRef, column_refs: &GroupColumnRefs, expected_table1_outer_sel: f64, expected_table2_outer_sel: f64) { + fn assert_joinsel_outer_selectivities( + cost_model: &TestOptCostModel, + expr_tree: OptRelNodeRef, + expr_tree_rev: OptRelNodeRef, + column_refs: &GroupColumnRefs, + expected_table1_outer_sel: f64, + expected_table2_outer_sel: f64, + ) { // all table 1 outer combinations - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::LeftOuter, expr_tree.clone(), &column_refs), expected_table1_outer_sel); - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), expected_table1_outer_sel); - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, true, JoinType::RightOuter, expr_tree.clone(), &column_refs), expected_table1_outer_sel); - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, true, JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), expected_table1_outer_sel); + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity( + &cost_model, + false, + JoinType::LeftOuter, + expr_tree.clone(), + &column_refs + ), + expected_table1_outer_sel + ); + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity( + &cost_model, + false, + JoinType::LeftOuter, + expr_tree_rev.clone(), + &column_refs + ), + expected_table1_outer_sel + ); + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity( + &cost_model, + true, + JoinType::RightOuter, + expr_tree.clone(), + &column_refs + ), + expected_table1_outer_sel + ); + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity( + &cost_model, + true, + JoinType::RightOuter, + expr_tree_rev.clone(), + &column_refs + ), + expected_table1_outer_sel + ); // all table 2 outer combinations - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, true, JoinType::LeftOuter, expr_tree.clone(), &column_refs), expected_table2_outer_sel); - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, true, JoinType::LeftOuter, expr_tree_rev.clone(), &column_refs), expected_table2_outer_sel); - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::RightOuter, expr_tree.clone(), &column_refs), expected_table2_outer_sel); - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::RightOuter, expr_tree_rev.clone(), &column_refs), expected_table2_outer_sel); + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity( + &cost_model, + true, + JoinType::LeftOuter, + expr_tree.clone(), + &column_refs + ), + expected_table2_outer_sel + ); + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity( + &cost_model, + true, + JoinType::LeftOuter, + expr_tree_rev.clone(), + &column_refs + ), + expected_table2_outer_sel + ); + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity( + &cost_model, + false, + JoinType::RightOuter, + expr_tree.clone(), + &column_refs + ), + expected_table2_outer_sel + ); + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity( + &cost_model, + false, + JoinType::RightOuter, + expr_tree_rev.clone(), + &column_refs + ), + expected_table2_outer_sel + ); } /// Unique oncond means an oncondition on columns which are unique in both tables @@ -2047,66 +2318,132 @@ mod tests { /// of one table and = 1 / row count of another #[test] fn test_joinsel_outer_unique_oncond() { - let cost_model = create_two_table_cost_model_custom_row_cnts(TestPerColumnStats::new( - TestMostCommonValues::empty(), + let cost_model = create_two_table_cost_model_custom_row_cnts( + TestPerColumnStats::new( + TestMostCommonValues::empty(), + 5, + 0.0, + TestDistribution::empty(), + ), + TestPerColumnStats::new( + TestMostCommonValues::empty(), + 4, + 0.0, + TestDistribution::empty(), + ), 5, - 0.0, - TestDistribution::empty(), - ), TestPerColumnStats::new( - TestMostCommonValues::empty(), 4, - 0.0, - TestDistribution::empty(), - ), 5, 4); + ); // the left/right of the join refers to the tables, not the order of columns in the predicate let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0)); - let column_refs = vec![ColumnRef::BaseTableColumnRef { - table: String::from(TABLE1_NAME), - col_idx: 0, - }, ColumnRef::BaseTableColumnRef { - table: String::from(TABLE2_NAME), - col_idx: 0, - }]; + let column_refs = vec![ + ColumnRef::BaseTableColumnRef { + table: String::from(TABLE1_NAME), + col_idx: 0, + }, + ColumnRef::BaseTableColumnRef { + table: String::from(TABLE2_NAME), + col_idx: 0, + }, + ]; // sanity check the expected inner sel let expected_inner_sel = 0.2; - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel); - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel); + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity( + &cost_model, + false, + JoinType::Inner, + expr_tree.clone(), + &column_refs + ), + expected_inner_sel + ); + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity( + &cost_model, + false, + JoinType::Inner, + expr_tree_rev.clone(), + &column_refs + ), + expected_inner_sel + ); // check the outer sels - assert_joinsel_outer_selectivities(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.25, 0.2); + assert_joinsel_outer_selectivities( + &cost_model, + expr_tree, + expr_tree_rev, + &column_refs, + 0.25, + 0.2, + ); } /// Non-unique oncond means the column is not unique in either table /// Inner always >= row count means that the inner join result is >= 1 / the row count of both tables #[test] fn test_joinsel_outer_nonunique_oncond_inner_always_geq_rowcnt() { - let cost_model = create_two_table_cost_model_custom_row_cnts(TestPerColumnStats::new( - TestMostCommonValues::empty(), - 5, - 0.0, - TestDistribution::empty(), - ), TestPerColumnStats::new( - TestMostCommonValues::empty(), - 4, - 0.0, - TestDistribution::empty(), - ), 10, 8); + let cost_model = create_two_table_cost_model_custom_row_cnts( + TestPerColumnStats::new( + TestMostCommonValues::empty(), + 5, + 0.0, + TestDistribution::empty(), + ), + TestPerColumnStats::new( + TestMostCommonValues::empty(), + 4, + 0.0, + TestDistribution::empty(), + ), + 10, + 8, + ); // the left/right of the join refers to the tables, not the order of columns in the predicate let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0)); - let column_refs = vec![ColumnRef::BaseTableColumnRef { - table: String::from(TABLE1_NAME), - col_idx: 0, - }, ColumnRef::BaseTableColumnRef { - table: String::from(TABLE2_NAME), - col_idx: 0, - }]; + let column_refs = vec![ + ColumnRef::BaseTableColumnRef { + table: String::from(TABLE1_NAME), + col_idx: 0, + }, + ColumnRef::BaseTableColumnRef { + table: String::from(TABLE2_NAME), + col_idx: 0, + }, + ]; // sanity check the expected inner sel let expected_inner_sel = 0.2; - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel); - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel); + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity( + &cost_model, + false, + JoinType::Inner, + expr_tree.clone(), + &column_refs + ), + expected_inner_sel + ); + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity( + &cost_model, + false, + JoinType::Inner, + expr_tree_rev.clone(), + &column_refs + ), + expected_inner_sel + ); // check the outer sels - assert_joinsel_outer_selectivities(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.2, 0.2); + assert_joinsel_outer_selectivities( + &cost_model, + expr_tree, + expr_tree_rev, + &column_refs, + 0.2, + 0.2, + ); } /// Non-unique oncond means the column is not unique in either table @@ -2114,33 +2451,66 @@ mod tests { /// Note that without a join filter, it's impossible to be less than the row count of both tables #[test] fn test_joinsel_outer_nonunique_oncond_inner_sometimes_lt_rowcnt() { - let cost_model = create_two_table_cost_model_custom_row_cnts(TestPerColumnStats::new( - TestMostCommonValues::empty(), - 10, - 0.0, - TestDistribution::empty(), - ), TestPerColumnStats::new( - TestMostCommonValues::empty(), - 2, - 0.0, - TestDistribution::empty(), - ), 20, 4); + let cost_model = create_two_table_cost_model_custom_row_cnts( + TestPerColumnStats::new( + TestMostCommonValues::empty(), + 10, + 0.0, + TestDistribution::empty(), + ), + TestPerColumnStats::new( + TestMostCommonValues::empty(), + 2, + 0.0, + TestDistribution::empty(), + ), + 20, + 4, + ); // the left/right of the join refers to the tables, not the order of columns in the predicate let expr_tree = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); let expr_tree_rev = bin_op(BinOpType::Eq, col_ref(1), col_ref(0)); - let column_refs = vec![ColumnRef::BaseTableColumnRef { - table: String::from(TABLE1_NAME), - col_idx: 0, - }, ColumnRef::BaseTableColumnRef { - table: String::from(TABLE2_NAME), - col_idx: 0, - }]; + let column_refs = vec![ + ColumnRef::BaseTableColumnRef { + table: String::from(TABLE1_NAME), + col_idx: 0, + }, + ColumnRef::BaseTableColumnRef { + table: String::from(TABLE2_NAME), + col_idx: 0, + }, + ]; // sanity check the expected inner sel let expected_inner_sel = 0.1; - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel); - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_rev.clone(), &column_refs), expected_inner_sel); + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity( + &cost_model, + false, + JoinType::Inner, + expr_tree.clone(), + &column_refs + ), + expected_inner_sel + ); + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity( + &cost_model, + false, + JoinType::Inner, + expr_tree_rev.clone(), + &column_refs + ), + expected_inner_sel + ); // check the outer sels - assert_joinsel_outer_selectivities(&cost_model, expr_tree, expr_tree_rev, &column_refs, 0.25, 0.1); + assert_joinsel_outer_selectivities( + &cost_model, + expr_tree, + expr_tree_rev, + &column_refs, + 0.25, + 0.1, + ); } /// Unique oncond means an oncondition on columns which are unique in both tables @@ -2148,19 +2518,22 @@ mod tests { /// There's only one case if both columns are unique and there's a filter: the inner will be < 1 / row count of both tables #[test] fn test_joinsel_outer_unique_oncond_filter() { - let cost_model = create_two_table_cost_model_custom_row_cnts(TestPerColumnStats::new( - TestMostCommonValues::empty(), + let cost_model = create_two_table_cost_model_custom_row_cnts( + TestPerColumnStats::new( + TestMostCommonValues::empty(), + 50, + 0.0, + TestDistribution::new(vec![(Value::Int32(128), 0.4)]), + ), + TestPerColumnStats::new( + TestMostCommonValues::empty(), + 4, + 0.0, + TestDistribution::empty(), + ), 50, - 0.0, - TestDistribution::new(vec![ - (Value::Int32(128), 0.4) - ]), - ), TestPerColumnStats::new( - TestMostCommonValues::empty(), 4, - 0.0, - TestDistribution::empty(), - ), 50, 4); + ); // the left/right of the join refers to the tables, not the order of columns in the predicate let eq0and1 = bin_op(BinOpType::Eq, col_ref(0), col_ref(1)); let eq1and0 = bin_op(BinOpType::Eq, col_ref(1), col_ref(0)); @@ -2168,19 +2541,47 @@ mod tests { let expr_tree = log_op(LogOpType::And, vec![eq0and1, filter.clone()]); // inner rev means its the inner expr (the eq op) whose children are being reversed, as opposed to the and op let expr_tree_inner_rev = log_op(LogOpType::And, vec![eq1and0, filter.clone()]); - let column_refs = vec![ColumnRef::BaseTableColumnRef { - table: String::from(TABLE1_NAME), - col_idx: 0, - }, ColumnRef::BaseTableColumnRef { - table: String::from(TABLE2_NAME), - col_idx: 0, - }]; + let column_refs = vec![ + ColumnRef::BaseTableColumnRef { + table: String::from(TABLE1_NAME), + col_idx: 0, + }, + ColumnRef::BaseTableColumnRef { + table: String::from(TABLE2_NAME), + col_idx: 0, + }, + ]; // sanity check the expected inner sel let expected_inner_sel = 0.008; - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree.clone(), &column_refs), expected_inner_sel); - assert_approx_eq::assert_approx_eq!(test_get_join_selectivity(&cost_model, false, JoinType::Inner, expr_tree_inner_rev.clone(), &column_refs), expected_inner_sel); + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity( + &cost_model, + false, + JoinType::Inner, + expr_tree.clone(), + &column_refs + ), + expected_inner_sel + ); + assert_approx_eq::assert_approx_eq!( + test_get_join_selectivity( + &cost_model, + false, + JoinType::Inner, + expr_tree_inner_rev.clone(), + &column_refs + ), + expected_inner_sel + ); // check the outer sels - assert_joinsel_outer_selectivities(&cost_model, expr_tree, expr_tree_inner_rev, &column_refs, 0.25, 0.02); + assert_joinsel_outer_selectivities( + &cost_model, + expr_tree, + expr_tree_inner_rev, + &column_refs, + 0.25, + 0.02, + ); } // I didn't test any non-unique cases with filter. The non-unique tests without filter should cover that From 2f5d67429f34d4160b23791b1b664181a7836e55 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 12:34:10 -0400 Subject: [PATCH 24/29] clippy --- optd-datafusion-repr/src/cost/base_cost.rs | 37 +++++++++++----------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index 66dbbbc1..87d91656 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -810,6 +810,7 @@ impl OptCostModel { right_row_cnt, ) } else { + #[allow(clippy::collapsible_else_if)] if let Some(on_col_ref_pair) = Self::get_on_col_ref_pair(expr_tree.clone(), column_refs) { self.get_join_selectivity_core( @@ -889,11 +890,11 @@ impl OptCostModel { /// Note that the selectivity of the on conditions does not depend on join type. Join type is accounted for separately in get_join_selectivity_core() fn get_join_on_selectivity( &self, - on_col_ref_pairs: &Vec<(ColumnRefExpr, ColumnRefExpr)>, + on_col_ref_pairs: &[(ColumnRefExpr, ColumnRefExpr)], column_refs: &GroupColumnRefs, ) -> f64 { // multiply the selectivities of all individual conditions together - on_col_ref_pairs.into_iter().map(|on_col_ref_pair| { + on_col_ref_pairs.iter().map(|on_col_ref_pair| { // the formula for each pair is min(1 / ndistinct1, 1 / ndistinct2) (see https://postgrespro.com/blog/pgsql/5969618) let ndistincts = vec![&on_col_ref_pair.0, &on_col_ref_pair.1].into_iter().map(|on_col_ref_expr| { match self.get_per_column_stats_from_col_ref(&column_refs[on_col_ref_expr.index()]) { @@ -2232,82 +2233,82 @@ mod tests { // all table 1 outer combinations assert_approx_eq::assert_approx_eq!( test_get_join_selectivity( - &cost_model, + cost_model, false, JoinType::LeftOuter, expr_tree.clone(), - &column_refs + column_refs ), expected_table1_outer_sel ); assert_approx_eq::assert_approx_eq!( test_get_join_selectivity( - &cost_model, + cost_model, false, JoinType::LeftOuter, expr_tree_rev.clone(), - &column_refs + column_refs ), expected_table1_outer_sel ); assert_approx_eq::assert_approx_eq!( test_get_join_selectivity( - &cost_model, + cost_model, true, JoinType::RightOuter, expr_tree.clone(), - &column_refs + column_refs ), expected_table1_outer_sel ); assert_approx_eq::assert_approx_eq!( test_get_join_selectivity( - &cost_model, + cost_model, true, JoinType::RightOuter, expr_tree_rev.clone(), - &column_refs + column_refs ), expected_table1_outer_sel ); // all table 2 outer combinations assert_approx_eq::assert_approx_eq!( test_get_join_selectivity( - &cost_model, + cost_model, true, JoinType::LeftOuter, expr_tree.clone(), - &column_refs + column_refs ), expected_table2_outer_sel ); assert_approx_eq::assert_approx_eq!( test_get_join_selectivity( - &cost_model, + cost_model, true, JoinType::LeftOuter, expr_tree_rev.clone(), - &column_refs + column_refs ), expected_table2_outer_sel ); assert_approx_eq::assert_approx_eq!( test_get_join_selectivity( - &cost_model, + cost_model, false, JoinType::RightOuter, expr_tree.clone(), - &column_refs + column_refs ), expected_table2_outer_sel ); assert_approx_eq::assert_approx_eq!( test_get_join_selectivity( - &cost_model, + cost_model, false, JoinType::RightOuter, expr_tree_rev.clone(), - &column_refs + column_refs ), expected_table2_outer_sel ); From cfbca671df004607ea7e162528014812c695777b Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 12:52:00 -0400 Subject: [PATCH 25/29] hash join working --- optd-datafusion-repr/src/cost/base_cost.rs | 80 +++++++++++++++------- 1 file changed, 56 insertions(+), 24 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index 87d91656..e9d4dddc 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -466,11 +466,8 @@ impl CostModel for OptCostM let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false); // there may be more than one expression tree in a group (you can see this trivially as you can just swap the order of two subtrees for commutative operators) // however, we just take an arbitrary expression tree from the group to compute selectivity - if let Some(expr_tree) = expr_trees.first() { - self.get_filter_selectivity(Arc::clone(expr_tree), &column_refs) - } else { - panic!("encountered a PhysicalFilter without an expression") - } + let expr_tree = expr_trees.first().expect("expression missing"); + self.get_filter_selectivity(expr_tree.clone(), &column_refs) } else { DEFAULT_UNK_SEL } @@ -499,17 +496,14 @@ impl CostModel for OptCostM let expr_group_id = context.children_group_ids[2]; let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false); // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information - if let Some(expr_tree) = expr_trees.first() { - self.get_join_selectivity( - *join_typ, - Arc::clone(expr_tree), - &column_refs, - row_cnt_1, - row_cnt_2, - ) - } else { - panic!("encountered a join without an expression") - } + let expr_tree = expr_trees.first().expect("expression missing"); + self.get_join_selectivity_from_expr_tree( + *join_typ, + expr_tree.clone(), + &column_refs, + row_cnt_1, + row_cnt_2, + ) } else { DEFAULT_UNK_SEL } @@ -527,10 +521,38 @@ impl CostModel for OptCostM let (_, compute_cost, _) = Self::cost_tuple(&children[1]); Self::cost(row_cnt, compute_cost * row_cnt, 0.0) } - OptRelNodeTyp::PhysicalHashJoin(_) => { + OptRelNodeTyp::PhysicalHashJoin(join_typ) => { let (row_cnt_1, _, _) = Self::cost_tuple(&children[0]); let (row_cnt_2, _, _) = Self::cost_tuple(&children[1]); - let selectivity = DEFAULT_UNK_SEL; + let selectivity = match context { + Some(context) => { + if let Some(optimizer) = optimizer { + let column_refs = optimizer + .get_property_by_group::( + context.group_id, + 1, + ); + let left_keys_group_id = context.children_group_ids[2]; + let right_keys_group_id = context.children_group_ids[3]; + let left_keys_list = optimizer.get_all_group_bindings(left_keys_group_id, false); + let right_keys_list = optimizer.get_all_group_bindings(right_keys_group_id, false); + // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information + let left_keys = left_keys_list.first().expect("left keys missing"); + let right_keys = right_keys_list.first().expect("right keys missing"); + self.get_join_selectivity_from_keys( + *join_typ, + ExprList::from_rel_node(left_keys.clone()).expect("left_keys should be an ExprList"), + ExprList::from_rel_node(right_keys.clone()).expect("right_keys should be an ExprList"), + &column_refs, + row_cnt_1, + row_cnt_2, + ) + } else { + DEFAULT_UNK_SEL + } + } + None => DEFAULT_UNK_SEL, + }; Self::cost( (row_cnt_1 * row_cnt_2 * selectivity).max(1.0), row_cnt_1 * 2.0 + row_cnt_2, @@ -768,7 +790,7 @@ impl OptCostModel { /// The expr_tree input must be a "mixed expression tree", just like with get_filter_selectivity() /// This is a "wrapper" to separate the equality conditions from the filter conditions before calling /// the "main" get_join_selectivity_core() function. - fn get_join_selectivity( + fn get_join_selectivity_from_expr_tree( &self, join_typ: JoinType, expr_tree: OptRelNodeRef, @@ -834,6 +856,16 @@ impl OptCostModel { } } + /// A wrapper to convert the join keys to the format expected by get_join_selectivity_core() + fn get_join_selectivity_from_keys(&self, join_typ: JoinType, left_keys: ExprList, right_keys: ExprList, column_refs: &GroupColumnRefs, left_row_cnt: f64, right_row_cnt: f64) -> f64 { + assert!(left_keys.len() == right_keys.len()); + // I assume that the keys are already in the right order s.t. the ith key of left_keys corresponds with the ith key of right_keys + let on_col_ref_pairs = left_keys.to_vec().into_iter().zip(right_keys.to_vec().into_iter()).map(|(left_key, right_key)| { + (ColumnRefExpr::from_rel_node(left_key.into_rel_node()).expect("keys should be ColumnRefExprs"), ColumnRefExpr::from_rel_node(right_key.into_rel_node()).expect("keys should be ColumnRefExprs")) + }).collect_vec(); + self.get_join_selectivity_core(join_typ, on_col_ref_pairs, None, column_refs, left_row_cnt, right_row_cnt) + } + /// The core logic of join selectivity which assumes we've already separated the expression into the on conditions and the filters fn get_join_selectivity_core( &self, @@ -1950,7 +1982,7 @@ mod tests { ); } - /// A wrapper around get_join_selectivity that extracts the table row counts from the cost model + /// A wrapper around get_join_selectivity_from_expr_tree that extracts the table row counts from the cost model fn test_get_join_selectivity( cost_model: &TestOptCostModel, reverse_tables: bool, @@ -1961,7 +1993,7 @@ mod tests { let table1_row_cnt = cost_model.per_table_stats_map[TABLE1_NAME].row_cnt as f64; let table2_row_cnt = cost_model.per_table_stats_map[TABLE2_NAME].row_cnt as f64; if !reverse_tables { - cost_model.get_join_selectivity( + cost_model.get_join_selectivity_from_expr_tree( join_typ, expr_tree, column_refs, @@ -1969,7 +2001,7 @@ mod tests { table2_row_cnt, ) } else { - cost_model.get_join_selectivity( + cost_model.get_join_selectivity_from_expr_tree( join_typ, expr_tree, column_refs, @@ -1983,7 +2015,7 @@ mod tests { fn test_joinsel_inner_const() { let cost_model = create_one_column_cost_model(get_empty_per_col_stats()); assert_approx_eq::assert_approx_eq!( - cost_model.get_join_selectivity( + cost_model.get_join_selectivity_from_expr_tree( JoinType::Inner, cnst(Value::Bool(true)), &vec![], @@ -1993,7 +2025,7 @@ mod tests { 1.0 ); assert_approx_eq::assert_approx_eq!( - cost_model.get_join_selectivity( + cost_model.get_join_selectivity_from_expr_tree( JoinType::Inner, cnst(Value::Bool(false)), &vec![], From 0a38dde0340557157ead325ed9542bb3efcd7f46 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 12:52:47 -0400 Subject: [PATCH 26/29] clip --- optd-datafusion-repr/src/cost/base_cost.rs | 47 +++++++++++++++++----- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index e9d4dddc..53948a13 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -534,15 +534,19 @@ impl CostModel for OptCostM ); let left_keys_group_id = context.children_group_ids[2]; let right_keys_group_id = context.children_group_ids[3]; - let left_keys_list = optimizer.get_all_group_bindings(left_keys_group_id, false); - let right_keys_list = optimizer.get_all_group_bindings(right_keys_group_id, false); + let left_keys_list = + optimizer.get_all_group_bindings(left_keys_group_id, false); + let right_keys_list = + optimizer.get_all_group_bindings(right_keys_group_id, false); // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information let left_keys = left_keys_list.first().expect("left keys missing"); let right_keys = right_keys_list.first().expect("right keys missing"); self.get_join_selectivity_from_keys( *join_typ, - ExprList::from_rel_node(left_keys.clone()).expect("left_keys should be an ExprList"), - ExprList::from_rel_node(right_keys.clone()).expect("right_keys should be an ExprList"), + ExprList::from_rel_node(left_keys.clone()) + .expect("left_keys should be an ExprList"), + ExprList::from_rel_node(right_keys.clone()) + .expect("right_keys should be an ExprList"), &column_refs, row_cnt_1, row_cnt_2, @@ -857,13 +861,38 @@ impl OptCostModel { } /// A wrapper to convert the join keys to the format expected by get_join_selectivity_core() - fn get_join_selectivity_from_keys(&self, join_typ: JoinType, left_keys: ExprList, right_keys: ExprList, column_refs: &GroupColumnRefs, left_row_cnt: f64, right_row_cnt: f64) -> f64 { + fn get_join_selectivity_from_keys( + &self, + join_typ: JoinType, + left_keys: ExprList, + right_keys: ExprList, + column_refs: &GroupColumnRefs, + left_row_cnt: f64, + right_row_cnt: f64, + ) -> f64 { assert!(left_keys.len() == right_keys.len()); // I assume that the keys are already in the right order s.t. the ith key of left_keys corresponds with the ith key of right_keys - let on_col_ref_pairs = left_keys.to_vec().into_iter().zip(right_keys.to_vec().into_iter()).map(|(left_key, right_key)| { - (ColumnRefExpr::from_rel_node(left_key.into_rel_node()).expect("keys should be ColumnRefExprs"), ColumnRefExpr::from_rel_node(right_key.into_rel_node()).expect("keys should be ColumnRefExprs")) - }).collect_vec(); - self.get_join_selectivity_core(join_typ, on_col_ref_pairs, None, column_refs, left_row_cnt, right_row_cnt) + let on_col_ref_pairs = left_keys + .to_vec() + .into_iter() + .zip(right_keys.to_vec()) + .map(|(left_key, right_key)| { + ( + ColumnRefExpr::from_rel_node(left_key.into_rel_node()) + .expect("keys should be ColumnRefExprs"), + ColumnRefExpr::from_rel_node(right_key.into_rel_node()) + .expect("keys should be ColumnRefExprs"), + ) + }) + .collect_vec(); + self.get_join_selectivity_core( + join_typ, + on_col_ref_pairs, + None, + column_refs, + left_row_cnt, + right_row_cnt, + ) } /// The core logic of join selectivity which assumes we've already separated the expression into the on conditions and the filters From 805aaa99e76d421deba12902a7e99685d20312a0 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 14:49:04 -0400 Subject: [PATCH 27/29] fixed context and optimizer stuff --- optd-datafusion-repr/src/cost/base_cost.rs | 136 +++++++++------------ 1 file changed, 60 insertions(+), 76 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index 53948a13..cfada1c0 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -454,27 +454,21 @@ impl CostModel for OptCostM OptRelNodeTyp::PhysicalFilter => { let (row_cnt, _, _) = Self::cost_tuple(&children[0]); let (_, compute_cost, _) = Self::cost_tuple(&children[1]); - let selectivity = match context { - Some(context) => { - if let Some(optimizer) = optimizer { - let column_refs = optimizer - .get_property_by_group::( - context.group_id, - 1, - ); - let expr_group_id = context.children_group_ids[1]; - let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false); - // there may be more than one expression tree in a group (you can see this trivially as you can just swap the order of two subtrees for commutative operators) - // however, we just take an arbitrary expression tree from the group to compute selectivity - let expr_tree = expr_trees.first().expect("expression missing"); - self.get_filter_selectivity(expr_tree.clone(), &column_refs) - } else { - DEFAULT_UNK_SEL - } - } - None => DEFAULT_UNK_SEL, + let selectivity = if let (Some(context), Some(optimizer)) = (context, optimizer) { + let column_refs = optimizer + .get_property_by_group::( + context.group_id, + 1, + ); + let expr_group_id = context.children_group_ids[1]; + let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false); + // there may be more than one expression tree in a group (you can see this trivially as you can just swap the order of two subtrees for commutative operators) + // however, we just take an arbitrary expression tree from the group to compute selectivity + let expr_tree = expr_trees.first().expect("expression missing"); + self.get_filter_selectivity(expr_tree.clone(), &column_refs) + } else { + DEFAULT_UNK_SEL }; - Self::cost( (row_cnt * selectivity).max(1.0), row_cnt * compute_cost, @@ -485,30 +479,25 @@ impl CostModel for OptCostM let (row_cnt_1, _, _) = Self::cost_tuple(&children[0]); let (row_cnt_2, _, _) = Self::cost_tuple(&children[1]); let (_, compute_cost, _) = Self::cost_tuple(&children[2]); - let selectivity = match context { - Some(context) => { - if let Some(optimizer) = optimizer { - let column_refs = optimizer - .get_property_by_group::( - context.group_id, - 1, - ); - let expr_group_id = context.children_group_ids[2]; - let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false); - // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information - let expr_tree = expr_trees.first().expect("expression missing"); - self.get_join_selectivity_from_expr_tree( - *join_typ, - expr_tree.clone(), - &column_refs, - row_cnt_1, - row_cnt_2, - ) - } else { - DEFAULT_UNK_SEL - } - } - None => DEFAULT_UNK_SEL, + let selectivity = if let (Some(context), Some(optimizer)) = (context, optimizer) { + let column_refs = optimizer + .get_property_by_group::( + context.group_id, + 1, + ); + let expr_group_id = context.children_group_ids[2]; + let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false); + // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information + let expr_tree = expr_trees.first().expect("expression missing"); + self.get_join_selectivity_from_expr_tree( + *join_typ, + expr_tree.clone(), + &column_refs, + row_cnt_1, + row_cnt_2, + ) + } else { + DEFAULT_UNK_SEL }; Self::cost( (row_cnt_1 * row_cnt_2 * selectivity).max(1.0), @@ -524,38 +513,33 @@ impl CostModel for OptCostM OptRelNodeTyp::PhysicalHashJoin(join_typ) => { let (row_cnt_1, _, _) = Self::cost_tuple(&children[0]); let (row_cnt_2, _, _) = Self::cost_tuple(&children[1]); - let selectivity = match context { - Some(context) => { - if let Some(optimizer) = optimizer { - let column_refs = optimizer - .get_property_by_group::( - context.group_id, - 1, - ); - let left_keys_group_id = context.children_group_ids[2]; - let right_keys_group_id = context.children_group_ids[3]; - let left_keys_list = - optimizer.get_all_group_bindings(left_keys_group_id, false); - let right_keys_list = - optimizer.get_all_group_bindings(right_keys_group_id, false); - // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information - let left_keys = left_keys_list.first().expect("left keys missing"); - let right_keys = right_keys_list.first().expect("right keys missing"); - self.get_join_selectivity_from_keys( - *join_typ, - ExprList::from_rel_node(left_keys.clone()) - .expect("left_keys should be an ExprList"), - ExprList::from_rel_node(right_keys.clone()) - .expect("right_keys should be an ExprList"), - &column_refs, - row_cnt_1, - row_cnt_2, - ) - } else { - DEFAULT_UNK_SEL - } - } - None => DEFAULT_UNK_SEL, + let selectivity = if let (Some(context), Some(optimizer)) = (context, optimizer) { + let column_refs = optimizer + .get_property_by_group::( + context.group_id, + 1, + ); + let left_keys_group_id = context.children_group_ids[2]; + let right_keys_group_id = context.children_group_ids[3]; + let left_keys_list = + optimizer.get_all_group_bindings(left_keys_group_id, false); + let right_keys_list = + optimizer.get_all_group_bindings(right_keys_group_id, false); + // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information + let left_keys = left_keys_list.first().expect("left keys missing"); + let right_keys = right_keys_list.first().expect("right keys missing"); + self.get_join_selectivity_from_keys( + *join_typ, + ExprList::from_rel_node(left_keys.clone()) + .expect("left_keys should be an ExprList"), + ExprList::from_rel_node(right_keys.clone()) + .expect("right_keys should be an ExprList"), + &column_refs, + row_cnt_1, + row_cnt_2, + ) + } else { + DEFAULT_UNK_SEL }; Self::cost( (row_cnt_1 * row_cnt_2 * selectivity).max(1.0), From 900a10b9a660c178432448afb05243a7c580b4fc Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 14:51:45 -0400 Subject: [PATCH 28/29] pr changes --- optd-datafusion-repr/src/cost/base_cost.rs | 23 +++++++++------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index cfada1c0..5c8802d6 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -725,13 +725,13 @@ impl OptCostModel { } } - /// Check if an expr_tree is a join condition, returning the join on col ref pair if it is - /// The reason the check and the info are in the same function is because their code is almost identical + /// Check if an expr_tree is a join condition, returning the join on col ref pair if it is. + /// The reason the check and the info are in the same function is because their code is almost identical. + /// It only picks out equality conditions between two column refs on different tables fn get_on_col_ref_pair( expr_tree: OptRelNodeRef, column_refs: &GroupColumnRefs, ) -> Option<(ColumnRefExpr, ColumnRefExpr)> { - // We perform three checks to see if a child_expr_tree is an on_col_ref_pair // 1. Check that it's equality if expr_tree.typ == OptRelNodeTyp::BinOp(BinOpType::Eq) { let left_child = expr_tree.child(0); @@ -747,18 +747,13 @@ impl OptCostModel { .expect("we already checked that the type is ColumnRef"); let left_col_ref = &column_refs[left_col_ref_expr.index()]; let right_col_ref = &column_refs[right_col_ref_expr.index()]; - let is_same_table = if let ColumnRef::BaseTableColumnRef { + let is_same_table = if let (ColumnRef::BaseTableColumnRef { table: left_table, .. - } = left_col_ref + }, ColumnRef::BaseTableColumnRef { + table: right_table, .. + }) = (left_col_ref, right_col_ref) { - if let ColumnRef::BaseTableColumnRef { - table: right_table, .. - } = right_col_ref - { - left_table == right_table - } else { - false - } + left_table == right_table } else { false }; @@ -947,7 +942,7 @@ impl OptCostModel { None => DEFAULT_NUM_DISTINCT, } }); - // using reduce(f64::min) is the idiomatic workaround to the fact that f64 does not implement Ord due to NaN + // using reduce(f64::min) is the idiomatic workaround to min() because f64 does not implement Ord due to NaN let selectivity = ndistincts.map(|ndistinct| 1.0 / ndistinct as f64).reduce(f64::min).expect("reduce() only returns None if the iterator is empty, which is impossible since col_ref_exprs.len() == 2"); assert!(!selectivity.is_nan(), "it should be impossible for selectivity to be NaN since n-distinct is never 0"); selectivity From d30dc19ea6783fd68f41d7463f0b255d75e3caf7 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 14:52:00 -0400 Subject: [PATCH 29/29] fmt and clippy --- optd-datafusion-repr/src/cost/base_cost.rs | 28 +++++++++------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index 5c8802d6..5fc0fb1c 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -456,10 +456,7 @@ impl CostModel for OptCostM let (_, compute_cost, _) = Self::cost_tuple(&children[1]); let selectivity = if let (Some(context), Some(optimizer)) = (context, optimizer) { let column_refs = optimizer - .get_property_by_group::( - context.group_id, - 1, - ); + .get_property_by_group::(context.group_id, 1); let expr_group_id = context.children_group_ids[1]; let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false); // there may be more than one expression tree in a group (you can see this trivially as you can just swap the order of two subtrees for commutative operators) @@ -481,10 +478,7 @@ impl CostModel for OptCostM let (_, compute_cost, _) = Self::cost_tuple(&children[2]); let selectivity = if let (Some(context), Some(optimizer)) = (context, optimizer) { let column_refs = optimizer - .get_property_by_group::( - context.group_id, - 1, - ); + .get_property_by_group::(context.group_id, 1); let expr_group_id = context.children_group_ids[2]; let expr_trees = optimizer.get_all_group_bindings(expr_group_id, false); // there may be more than one expression tree in a group. see comment in OptRelNodeTyp::PhysicalFilter(_) for more information @@ -515,10 +509,7 @@ impl CostModel for OptCostM let (row_cnt_2, _, _) = Self::cost_tuple(&children[1]); let selectivity = if let (Some(context), Some(optimizer)) = (context, optimizer) { let column_refs = optimizer - .get_property_by_group::( - context.group_id, - 1, - ); + .get_property_by_group::(context.group_id, 1); let left_keys_group_id = context.children_group_ids[2]; let right_keys_group_id = context.children_group_ids[3]; let left_keys_list = @@ -747,11 +738,14 @@ impl OptCostModel { .expect("we already checked that the type is ColumnRef"); let left_col_ref = &column_refs[left_col_ref_expr.index()]; let right_col_ref = &column_refs[right_col_ref_expr.index()]; - let is_same_table = if let (ColumnRef::BaseTableColumnRef { - table: left_table, .. - }, ColumnRef::BaseTableColumnRef { - table: right_table, .. - }) = (left_col_ref, right_col_ref) + let is_same_table = if let ( + ColumnRef::BaseTableColumnRef { + table: left_table, .. + }, + ColumnRef::BaseTableColumnRef { + table: right_table, .. + }, + ) = (left_col_ref, right_col_ref) { left_table == right_table } else {