Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: join selectivity #145

Merged
merged 30 commits into from
Mar 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
f223a93
done with join sel skeleton
wangpatrick57 Mar 30, 2024
a20b8f5
added filtersel and joinsel const
wangpatrick57 Mar 30, 2024
ccf068d
made get semantic nodes a function
wangpatrick57 Mar 30, 2024
88a15e9
added tests for log op join sel
wangpatrick57 Mar 30, 2024
c015562
refactored per_col_vec to per_col_map to avoid double options
wangpatrick57 Mar 31, 2024
fdb1c8c
wrote wrapper to extract join on condition
wangpatrick57 Mar 31, 2024
25c2b75
no cache -> rebuild cache
wangpatrick57 Mar 31, 2024
273aa0d
refactored per col from map back to vec
wangpatrick57 Mar 31, 2024
9e8b4f2
cmt
wangpatrick57 Mar 31, 2024
3ff6d24
fixed joinsel eq test to use two diff tables
wangpatrick57 Mar 31, 2024
bbbfbfc
removed joinsel or test
wangpatrick57 Mar 31, 2024
4c792f9
oncond comment
wangpatrick57 Mar 31, 2024
2e23e20
wrote unit tests for join sel
wangpatrick57 Mar 31, 2024
3428ebb
now checking join type inner
wangpatrick57 Mar 31, 2024
673e4aa
fixed q11
wangpatrick57 Mar 31, 2024
23d0abf
cust row cnt
wangpatrick57 Mar 31, 2024
3836854
wrote unit tests for outer sel
wangpatrick57 Mar 31, 2024
2f47305
refactored unit tests to pass row cnt properly
wangpatrick57 Mar 31, 2024
ff49d83
fixed bug in unittests
wangpatrick57 Mar 31, 2024
3711de2
added sel to hashjoin
wangpatrick57 Mar 31, 2024
fb141a6
undid hash join sel
wangpatrick57 Mar 31, 2024
7068c83
merged with main
wangpatrick57 Mar 31, 2024
5be618e
cross join
wangpatrick57 Mar 31, 2024
4d0f753
fmt
wangpatrick57 Mar 31, 2024
2f5d674
clippy
wangpatrick57 Mar 31, 2024
cfbca67
hash join working
wangpatrick57 Mar 31, 2024
0a38dde
clip
wangpatrick57 Mar 31, 2024
805aaa9
fixed context and optimizer stuff
wangpatrick57 Mar 31, 2024
900a10b
pr changes
wangpatrick57 Mar 31, 2024
d30dc19
fmt and clippy
wangpatrick57 Mar 31, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions optd-core/src/cascades/optimizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,10 @@ impl<T: RelNodeTyp> CascadesOptimizer<T> {
self.memo.merge_group(group_a, group_b);
}

/// Get the properties of a Cascades group.
/// `P` is the type of the property you expect.
/// `idx` is the index of the property you want. The order of properties is defined
/// by the `property_builders` parameter in `CascadesOptimizer::new()`.
pub fn get_property_by_group<P: PropertyBuilder<T>>(
&self,
group_id: GroupId,
Expand Down
1,331 changes: 1,158 additions & 173 deletions optd-datafusion-repr/src/cost/base_cost.rs

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions optd-datafusion-repr/src/plan_nodes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ pub use sort::{LogicalSort, PhysicalSort};

use crate::properties::schema::{Schema, SchemaPropertyBuilder};

/// OptRelNodeTyp FAQ:
/// - The define_plan_node!() macro defines what the children of each join node are
wangpatrick57 marked this conversation as resolved.
Show resolved Hide resolved
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum OptRelNodeTyp {
Placeholder(GroupId),
Expand Down
4 changes: 2 additions & 2 deletions optd-perftest/src/cardtest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,14 +103,14 @@ pub trait CardtestRunnerDBMSHelper {

pub async fn cardtest<P: AsRef<Path>>(
workspace_dpath: P,
no_cached_optd_stats: bool,
rebuild_cached_optd_stats: bool,
pguser: &str,
pgpassword: &str,
tpch_config: TpchConfig,
) -> anyhow::Result<HashMap<String, Vec<Cardinfo>>> {
let pg_dbms = Box::new(PostgresDBMS::build(&workspace_dpath, pguser, pgpassword)?);
let truecard_getter = pg_dbms.clone();
let df_dbms = Box::new(DatafusionDBMS::new(&workspace_dpath, no_cached_optd_stats).await?);
let df_dbms = Box::new(DatafusionDBMS::new(&workspace_dpath, rebuild_cached_optd_stats).await?);
let dbmss: Vec<Box<dyn CardtestRunnerDBMSHelper>> = vec![pg_dbms, df_dbms];

let tpch_benchmark = Benchmark::Tpch(tpch_config.clone());
Expand Down
21 changes: 10 additions & 11 deletions optd-perftest/src/datafusion_dbms.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ use regex::Regex;

pub struct DatafusionDBMS {
workspace_dpath: PathBuf,
no_cached_stats: bool,
rebuild_cached_stats: bool,
ctx: SessionContext,
}

Expand Down Expand Up @@ -63,11 +63,11 @@ impl CardtestRunnerDBMSHelper for DatafusionDBMS {
impl DatafusionDBMS {
pub async fn new<P: AsRef<Path>>(
workspace_dpath: P,
no_cached_stats: bool,
rebuild_cached_stats: bool,
) -> anyhow::Result<Self> {
Ok(DatafusionDBMS {
workspace_dpath: workspace_dpath.as_ref().to_path_buf(),
no_cached_stats,
rebuild_cached_stats,
ctx: Self::new_session_ctx(None).await?,
})
}
Expand Down Expand Up @@ -145,13 +145,13 @@ impl DatafusionDBMS {

let mut estcards = vec![];
for (query_id, sql_fpath) in tpch_kit.get_sql_fpath_ordered_iter(tpch_config)? {
let sql = fs::read_to_string(sql_fpath)?;
let estcard = self.eval_query_estcard(&sql).await?;
estcards.push(estcard);
println!(
"done evaluating datafusion's estcard for TPC-H Q{}",
"about to evaluate datafusion's estcard for TPC-H Q{}",
query_id
);
let sql = fs::read_to_string(sql_fpath)?;
let estcard = self.eval_query_estcard(&sql).await?;
estcards.push(estcard);
}

Ok(estcards)
Expand Down Expand Up @@ -213,7 +213,7 @@ impl DatafusionDBMS {
.workspace_dpath
.join("datafusion_stats_caches")
.join(format!("{}.json", benchmark_fname));
if !self.no_cached_stats && stats_cache_fpath.exists() {
if !self.rebuild_cached_stats && stats_cache_fpath.exists() {
let file = File::open(&stats_cache_fpath)?;
Ok(serde_json::from_reader(file)?)
} else {
Expand All @@ -222,9 +222,8 @@ impl DatafusionDBMS {
_ => unimplemented!(),
};

// regardless of whether self.no_cached_stats is true or false, we want to update the cache
// this way, even if we choose not to read from the cache, the cache still always has the
// most up to date version of the stats
// When self.rebuild_cached_stats is true, we *don't read* from the cache but we still
// *do write* to the cache.
fs::create_dir_all(stats_cache_fpath.parent().unwrap())?;
let file = File::create(&stats_cache_fpath)?;
serde_json::to_writer(file, &base_table_stats)?;
Expand Down
8 changes: 4 additions & 4 deletions optd-perftest/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@ enum Commands {
#[clap(long)]
#[clap(action)]
#[clap(help = "Whether to use the cached optd stats/cache generated stats")]
// this is an option because you want to make it false whenever you update the
// This is an option because you want to make it true whenever you update the
// code for how stats are generated in optd, in order to not use cached stats.
// I found that I almost always want to use the cache though, which is why the
// system will use the cache by default.
no_cached_optd_stats: bool,
rebuild_cached_optd_stats: bool,

#[clap(long)]
#[clap(default_value = "default_user")]
Expand Down Expand Up @@ -77,7 +77,7 @@ async fn main() -> anyhow::Result<()> {
scale_factor,
seed,
query_ids,
no_cached_optd_stats,
rebuild_cached_optd_stats,
pguser,
pgpassword,
} => {
Expand All @@ -89,7 +89,7 @@ async fn main() -> anyhow::Result<()> {
};
let cardinfo_alldbs = cardtest::cardtest(
&workspace_dpath,
no_cached_optd_stats,
rebuild_cached_optd_stats,
&pguser,
&pgpassword,
tpch_config,
Expand Down
2 changes: 1 addition & 1 deletion optd-perftest/tests/cardtest_integration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ mod tests {
// make sure scale factor is low so the test runs fast
"--scale-factor",
"0.01",
"--no-cached-optd-stats",
"--rebuild-cached-optd-stats",
"--pguser",
"test_user",
"--pgpassword",
Expand Down
Loading