From e386a29377c4138a6a2aee87750501b35edae86d Mon Sep 17 00:00:00 2001 From: David Sisson Date: Tue, 8 Oct 2024 14:58:18 -0700 Subject: [PATCH] feat: define sideband optimization hints (#705) --- proto/substrait/algebra.proto | 37 ++++++++++++++++++++++++++++ site/docs/relations/_config | 3 ++- site/docs/relations/common_fields.md | 26 +++++++++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 site/docs/relations/common_fields.md diff --git a/proto/substrait/algebra.proto b/proto/substrait/algebra.proto index 6c68f19dd..8279ece67 100644 --- a/proto/substrait/algebra.proto +++ b/proto/substrait/algebra.proto @@ -47,6 +47,12 @@ message RelCommon { substrait.extensions.AdvancedExtension advanced_extension = 10; + // Save or load a system-specific computation for use in optimizing a remote operation. + // The anchor refers to the source/destination of the computation. The computation type + // and number refer to the current relation. + repeated SavedComputation saved_computations = 11; + repeated LoadedComputation loaded_computations = 12; + // The statistics related to a hint (physical properties of records) message Stats { double row_count = 1; @@ -59,6 +65,37 @@ message RelCommon { substrait.extensions.AdvancedExtension advanced_extension = 10; } + + enum ComputationType { + COMPUTATION_TYPE_UNSPECIFIED = 0; + COMPUTATION_TYPE_HASHTABLE = 1; + COMPUTATION_TYPE_BLOOM_FILTER = 2; + COMPUTATION_TYPE_UNKNOWN = 9999; + } + + message SavedComputation { + // The value corresponds to a plan-unique number for that data structure. Any particular + // computation may be saved only once but it may be loaded multiple times. + int32 computation_id = 1; + // The type of this computation. While a plan may use COMPUTATION_TYPE_UNKNOWN for all + // of its types, it is recommended to use a more specific type so that the optimization + // is more portable. 
The consumer should be able to decide if an unknown type here + // matches the same unknown type in a different plan and ignore the optimization if they + // are mismatched. + ComputationType type = 2; + } + + message LoadedComputation { + // The value corresponds to the plan-unique number (computation_id) of a saved data structure. Any particular + // computation may be saved only once but it may be loaded multiple times. + int32 computation_id_reference = 1; + // The type of this computation. While a plan may use COMPUTATION_TYPE_UNKNOWN for all + // of its types, it is recommended to use a more specific type so that the optimization + // is more portable. The consumer should be able to decide if an unknown type here + // matches the same unknown type in a different plan and ignore the optimization if they + // are mismatched. + ComputationType type = 2; + } } } diff --git a/site/docs/relations/_config b/site/docs/relations/_config index 5a13776e1..b3a6085b8 100644 --- a/site/docs/relations/_config +++ b/site/docs/relations/_config @@ -1,6 +1,7 @@ arrange: - basics.md + - common_fields.md - logical_relations.md - physical_relations.md - user_defined_relations.md - - embedded_relations.md \ No newline at end of file + - embedded_relations.md diff --git a/site/docs/relations/common_fields.md b/site/docs/relations/common_fields.md new file mode 100644 index 000000000..37f0d4cf4 --- /dev/null +++ b/site/docs/relations/common_fields.md @@ -0,0 +1,26 @@ +# Common Fields + +Every relation contains a common section containing optional hints and emit behavior. + + +## Emit + +A relation which has a direct emit kind outputs the relation's output without reordering or selection. A relation that specifies an emit output mapping can output its output columns in any order and may leave output columns out. + +???+ info "Relation Output" + + * Many relations (such as Project) by default provide as their output the list of all their input columns plus any generated columns as its output columns. 
Review each relation to understand its specific output default. + + +## Hints + +Hints provide information that can improve performance but cannot be used to control the behavior. Table statistics, runtime constraints, name hints, and saved computations all fall into this category. + +???+ info "Hint Design" + + * If a hint is not present or has incorrect data, the consumer should be able to ignore it and still arrive at the correct result. + + +### Saved Computations + +Computations can be used to save a data structure for use elsewhere in the plan. For instance, consider a plan with a HashEquiJoin and an AggregateDistinct operation. The HashEquiJoin could save its hash table as saved computation id number 1, and the AggregateDistinct could load computation id number 1.