Skip to content

add logical plan distributed optimizer to query frontend #6974

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions pkg/api/queryapi/query_api.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ import (
"github.com/prometheus/prometheus/util/annotations"
"github.com/prometheus/prometheus/util/httputil"
v1 "github.com/prometheus/prometheus/web/api/v1"
"github.com/thanos-io/promql-engine/logicalplan"
"github.com/weaveworks/common/httpgrpc"

"github.com/cortexproject/cortex/pkg/distributed_execution"
"github.com/cortexproject/cortex/pkg/engine"
"github.com/cortexproject/cortex/pkg/querier"
"github.com/cortexproject/cortex/pkg/util"
Expand Down Expand Up @@ -110,7 +110,7 @@ func (q *QueryAPI) RangeQueryHandler(r *http.Request) (result apiFuncResult) {

byteLP := []byte(r.PostFormValue("plan"))
if len(byteLP) != 0 {
logicalPlan, err := logicalplan.Unmarshal(byteLP)
logicalPlan, err := distributed_execution.Unmarshal(byteLP)
if err != nil {
return apiFuncResult{nil, &apiError{errorInternal, fmt.Errorf("invalid logical plan: %v", err)}, nil, nil}
}
Expand Down Expand Up @@ -183,7 +183,7 @@ func (q *QueryAPI) InstantQueryHandler(r *http.Request) (result apiFuncResult) {

byteLP := []byte(r.PostFormValue("plan"))
if len(byteLP) != 0 {
logicalPlan, err := logicalplan.Unmarshal(byteLP)
logicalPlan, err := distributed_execution.Unmarshal(byteLP)
if err != nil {
return apiFuncResult{nil, &apiError{errorInternal, fmt.Errorf("invalid logical plan: %v", err)}, nil, nil}
}
Expand Down
40 changes: 40 additions & 0 deletions pkg/distributed_execution/distributed_optimizer.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package distributed_execution

import (
"fmt"

"github.com/prometheus/prometheus/util/annotations"
"github.com/thanos-io/promql-engine/logicalplan"
)

// This is a simplified implementation that only handles binary aggregation cases
// Future versions of the distributed optimizer are expected to:
// - Support more complex query patterns
// - Incorporate diverse optimization strategies
// - Extend support to node types beyond binary operations

type DistributedOptimizer struct{}

func (d *DistributedOptimizer) Optimize(root logicalplan.Node) (logicalplan.Node, annotations.Annotations, error) {
warns := annotations.New()

if root == nil {
return nil, *warns, fmt.Errorf("nil root node")
}

logicalplan.TraverseBottomUp(nil, &root, func(parent, current *logicalplan.Node) bool {

if (*current).Type() == logicalplan.BinaryNode {
ch := (*current).Children()

for _, child := range ch {
temp := (*child).Clone()
*child = NewRemoteNode()
*(*child).Children()[0] = temp
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even though it is just a dummy optimizer, we should probably add constraints to only mark as remote node if the child has aggregation. We don't want to optimize queries like up + up as each child returns raw data instead of aggregated data

}

return false
})
return root, *warns, nil
}
125 changes: 125 additions & 0 deletions pkg/distributed_execution/distributed_optimizer_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
package distributed_execution

import (
"testing"
"time"

"github.com/prometheus/prometheus/promql/parser"
"github.com/stretchr/testify/require"
"github.com/thanos-io/promql-engine/logicalplan"
"github.com/thanos-io/promql-engine/query"
)

func TestDistributedOptimizer(t *testing.T) {
now := time.Now()
testCases := []struct {
name string
query string
start time.Time
end time.Time
step time.Duration
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit. We don't have to parameterize start, end and step if they don't matter much in this test. We can just hardcode when creating the plan

remoteExecCount int
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's compare result logical plan instead of remoteExecCount. remoteExecCount can be misleading as remote node might be added to the wrong place

}{
{
name: "binary operation with aggregations",
query: "sum(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m])) + sum(rate(node_memory_Active_bytes[5m]))",
start: now,
end: now,
step: time.Minute,
remoteExecCount: 2,
},
{
name: "multiple binary operations with aggregations",
query: "sum(rate(http_requests_total{job=\"api\"}[5m])) + sum(rate(http_requests_total{job=\"web\"}[5m])) - sum(rate(http_requests_total{job=\"cache\"}[5m]))",
start: now,
end: now,
step: time.Minute,
remoteExecCount: 4,
},
{
name: "subquery with aggregation",
query: "sum(rate(container_network_transmit_bytes_total[5m:1m]))",
start: now,
end: now,
step: time.Minute,
remoteExecCount: 0,
},
{
name: "function applied on binary operation",
query: "rate(http_requests_total[5m]) + rate(http_errors_total[5m]) > bool 0",
start: now,
end: now,
step: time.Minute,
remoteExecCount: 4,
},
{
name: "numerical binary query",
query: "(1 + 1) + (1 + 1)",
start: now,
end: now,
step: time.Minute,
remoteExecCount: 0,
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
lp, _, err := CreateTestLogicalPlan(tc.query, tc.start, tc.end, tc.step)
require.NoError(t, err)

d := DistributedOptimizer{}
newRoot, _, err := d.Optimize((*lp).Root())
require.NoError(t, err)

remoteNodeCount := 0
logicalplan.TraverseBottomUp(nil, &newRoot, func(parent, current *logicalplan.Node) bool {
if RemoteNode == (*current).Type() {
remoteNodeCount++
}
return false
})
require.Equal(t, tc.remoteExecCount, remoteNodeCount)
})
}
}

func getStartAndEnd(start time.Time, end time.Time, step time.Duration) (time.Time, time.Time) {
if step == 0 {
return start, start
}
return start, end
}

func CreateTestLogicalPlan(qs string, start time.Time, end time.Time, step time.Duration) (*logicalplan.Plan, query.Options, error) {

start, end = getStartAndEnd(start, end, step)

qOpts := query.Options{
Start: start,
End: end,
Step: step,
StepsBatch: 10,
NoStepSubqueryIntervalFn: func(duration time.Duration) time.Duration {
return 0
},
LookbackDelta: 0,
EnablePerStepStats: false,
}

expr, err := parser.NewParser(qs, parser.WithFunctions(parser.Functions)).ParseExpr()
if err != nil {
return nil, qOpts, err
}

planOpts := logicalplan.PlanOptions{
DisableDuplicateLabelCheck: false,
}

logicalPlan, err := logicalplan.NewFromAST(expr, &qOpts, planOpts)
if err != nil {
return nil, qOpts, err
}
optimizedPlan, _ := logicalPlan.Optimize(logicalplan.DefaultOptimizers)

return &optimizedPlan, qOpts, nil
}
21 changes: 21 additions & 0 deletions pkg/distributed_execution/fragment_key.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package distributed_execution

type FragmentKey struct {
queryID uint64
fragmentID uint64
}

func MakeFragmentKey(queryID uint64, fragmentID uint64) *FragmentKey {
return &FragmentKey{
queryID: queryID,
fragmentID: fragmentID,
}
}

func (f FragmentKey) GetQueryID() uint64 {
return f.queryID
}

func (f FragmentKey) GetFragmentID() uint64 {
return f.fragmentID
}
72 changes: 72 additions & 0 deletions pkg/distributed_execution/remote_node.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package distributed_execution

import (
"encoding/json"
"fmt"

"github.com/prometheus/prometheus/promql/parser"
"github.com/thanos-io/promql-engine/logicalplan"
)

type NodeType = logicalplan.NodeType
type Node = logicalplan.Node
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Those seems not needed. When you return in the function you can just specify return type to be logicalplan.NodeType and logicalplan.Node


const (
RemoteNode = "RemoteNode"
)

// (to verify interface implementations)
var _ logicalplan.Node = (*Remote)(nil)

type Remote struct {
Op parser.ItemType
Expr Node `json:"-"`
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need Op?


FragmentKey FragmentKey
FragmentAddr string
}

func NewRemoteNode() Node {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This might need to take expr as a parameter

return &Remote{
// initialize the fragment key pointer first
FragmentKey: FragmentKey{},
}
}
func (r *Remote) Clone() Node {
return &Remote{Op: r.Op, Expr: r.Expr.Clone(), FragmentKey: r.FragmentKey, FragmentAddr: r.FragmentAddr}
}
func (r *Remote) Children() []*Node {
return []*Node{&r.Expr}
}
func (r *Remote) String() string {
return fmt.Sprintf("%s%s", r.Op.String(), r.Expr.String())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to mention the node name remote. Maybe similar to what Thanos has fmt.Sprintf(remote(%s), r.Expr.String())

}
func (r *Remote) ReturnType() parser.ValueType {
return r.Expr.ReturnType()
}
func (r *Remote) Type() NodeType { return RemoteNode }

type remote struct {
QueryID uint64
FragmentID uint64
FragmentAddr string
}

func (r *Remote) MarshalJSON() ([]byte, error) {
return json.Marshal(remote{
QueryID: r.FragmentKey.queryID,
FragmentID: r.FragmentKey.fragmentID,
FragmentAddr: r.FragmentAddr,
})
}

func (r *Remote) UnmarshalJSON(data []byte) error {
re := remote{}
if err := json.Unmarshal(data, &re); err != nil {
return err
}

r.FragmentKey = *MakeFragmentKey(re.QueryID, re.FragmentID)
r.FragmentAddr = re.FragmentAddr
return nil
}
76 changes: 76 additions & 0 deletions pkg/distributed_execution/remote_node_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
package distributed_execution

import (
"encoding/json"
"testing"

"github.com/prometheus/prometheus/promql/parser"
"github.com/stretchr/testify/require"
"github.com/thanos-io/promql-engine/logicalplan"
)

func TestRemoteNode(t *testing.T) {
t.Run("NewRemoteNode creates valid node", func(t *testing.T) {
node := NewRemoteNode()
require.NotNil(t, node)
require.IsType(t, &Remote{}, node)
require.Equal(t, (&Remote{}).Type(), node.Type())
})

t.Run("Clone creates correct copy", func(t *testing.T) {
original := &Remote{
Op: parser.ADD,
FragmentKey: FragmentKey{queryID: 1, fragmentID: 2},
FragmentAddr: "[IP_ADDRESS]:9090",
Expr: &logicalplan.NumberLiteral{Val: 42},
}

cloned := original.Clone()
require.NotNil(t, cloned)

remote, ok := cloned.(*Remote)
require.True(t, ok)
require.Equal(t, original.Op, remote.Op)
require.Equal(t, original.FragmentKey, remote.FragmentKey)
require.Equal(t, original.FragmentAddr, remote.FragmentAddr)
require.Equal(t, original.Expr.String(), remote.Expr.String())
})

t.Run("JSON marshaling/unmarshaling", func(t *testing.T) {
original := &Remote{
FragmentKey: *MakeFragmentKey(1, 2),
FragmentAddr: "[IP_ADDRESS]:9090",
}

data, err := json.Marshal(original)
require.NoError(t, err)

var unmarshaled Remote
err = json.Unmarshal(data, &unmarshaled)
require.NoError(t, err)

require.Equal(t, original.FragmentKey.queryID, unmarshaled.FragmentKey.queryID)
require.Equal(t, original.FragmentKey.fragmentID, unmarshaled.FragmentKey.fragmentID)
require.Equal(t, original.FragmentAddr, unmarshaled.FragmentAddr)
})

t.Run("Children returns correct nodes", func(t *testing.T) {
expr := &logicalplan.NumberLiteral{Val: 42}
node := &Remote{
Expr: expr,
}

children := node.Children()
require.Len(t, children, 1)
require.Equal(t, expr, *children[0])
})

t.Run("ReturnType matches expression type", func(t *testing.T) {
expr := &logicalplan.NumberLiteral{Val: 42}
node := &Remote{
Expr: expr,
}

require.Equal(t, expr.ReturnType(), node.ReturnType())
})
}
Loading
Loading