Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

POC: Cost Attribution Proposal 2 #9733

Draft
wants to merge 36 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
4e75934
Poc: cost attribution proposal 1.2
ying-jeanne Oct 24, 2024
0456d94
test update
ying-jeanne Oct 24, 2024
f4ebc07
address review comments and fix lint and test
ying-jeanne Oct 25, 2024
cb99a3f
fix lint and ci
ying-jeanne Oct 25, 2024
b0d3f0a
change max-cost-attribution-cardinality-per-user to 10k
ying-jeanne Oct 25, 2024
ebd6105
change custom registry path
ying-jeanne Oct 25, 2024
a438331
Add license for lint
ying-jeanne Oct 25, 2024
13a0b2c
add reset logics to handle overflow and recovery from overflow
ying-jeanne Oct 27, 2024
7f0b372
remove noop implementation
ying-jeanne Nov 5, 2024
3397578
Merge remote-tracking branch 'origin/main' into poc-cost-attribution-2
ying-jeanne Nov 5, 2024
50fa0ec
add new discarded sample metrics
ying-jeanne Nov 5, 2024
abdd0cc
fix test
ying-jeanne Nov 5, 2024
698a5c6
address comment to combine 2 config compare
ying-jeanne Nov 6, 2024
da6b00b
add logic for overflow
ying-jeanne Nov 7, 2024
2b5e3ff
improve tests for cost attribution service
ying-jeanne Nov 7, 2024
cb2a2b6
Don't hold labels from store-gateways in two forms, and don't convert…
grafanabot Nov 18, 2024
078e689
add per tenant cost attribution label limit
ying-jeanne Nov 18, 2024
3131a2f
Merge remote-tracking branch 'origin/main' into poc-cost-attribution-2
ying-jeanne Nov 18, 2024
4bf418a
update doc
ying-jeanne Nov 18, 2024
5e9e1c1
fix unittest
ying-jeanne Nov 18, 2024
bd3e112
fix ci
ying-jeanne Nov 18, 2024
cf16611
fix ci
ying-jeanne Nov 18, 2024
3c1f886
remove unrelated changes
ying-jeanne Nov 18, 2024
d0cb1f3
update purge logics
ying-jeanne Nov 18, 2024
5d4a2c4
fix ci
ying-jeanne Nov 18, 2024
6091493
fix ci
ying-jeanne Nov 18, 2024
7324a1d
update logic for overflow, purge other metrics than overflow
ying-jeanne Nov 18, 2024
5af48e4
add distributor benchmark test for push
ying-jeanne Nov 19, 2024
203689a
Improve logging at ha_tracker sync operation (#9958) (#9961)
grafanabot Nov 20, 2024
a2f009b
add benchmark in ingester
ying-jeanne Nov 20, 2024
791a75d
Merge remote-tracking branch 'origin/main' into poc-cost-attribution-2
ying-jeanne Nov 20, 2024
00d2092
refactory benchmark tests
ying-jeanne Nov 22, 2024
9fba531
MQE: fix issue where subqueries could return series with no points (#…
charleskorn Nov 25, 2024
12d7d79
fix service dependencies
ying-jeanne Nov 25, 2024
e0525e1
fix the distributor crashloop
ying-jeanne Nov 27, 2024
70c6099
Merge remote-tracking branch 'origin/r317' into poc-cost-attribution-2
ying-jeanne Nov 27, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions cmd/mimir/config-descriptor.json
Original file line number Diff line number Diff line change
Expand Up @@ -4347,6 +4347,28 @@
"fieldType": "int",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "cost_attribution_labels",
"required": false,
"desc": "List of labels used to define the cost attribution. This label will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. The label applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. Set to an empty string to disable cost attribution.",
"fieldValue": null,
"fieldDefaultValue": "",
"fieldFlag": "validation.cost-attribution-labels",
"fieldType": "string",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "max_cost_attribution_per_user",
"required": false,
"desc": "Maximum number of cost attribution labels allowed per user.",
"fieldValue": null,
"fieldDefaultValue": 0,
"fieldFlag": "validation.max-cost-attribution-per-user",
"fieldType": "int",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "ruler_evaluation_delay_duration",
Expand Down Expand Up @@ -18346,6 +18368,17 @@
"fieldValue": null,
"fieldDefaultValue": null
},
{
"kind": "field",
"name": "custom_registry_path",
"required": false,
"desc": "Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed.",
"fieldValue": null,
"fieldDefaultValue": "",
"fieldFlag": "custom-registry-path",
"fieldType": "string",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "timeseries_unmarshal_caching_optimization_enabled",
Expand All @@ -18356,6 +18389,28 @@
"fieldFlag": "timeseries-unmarshal-caching-optimization-enabled",
"fieldType": "boolean",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "cost_attribution_eviction_interval",
"required": false,
"desc": "Time interval at which inactive cost attributions will be evicted from the cache.",
"fieldValue": null,
"fieldDefaultValue": 1800000000000,
"fieldFlag": "cost-attribution-eviction-interval",
"fieldType": "duration",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "cost_attribution_cool_down_duration",
"required": false,
"desc": "Duration during which any cost attribution for a user will be marked as __overflow__ after exceeding the specified limit, prior to resetting the cache.",
"fieldValue": null,
"fieldDefaultValue": 1200000000000,
"fieldFlag": "cost-attribution-cool-down-duration",
"fieldType": "duration",
"fieldCategory": "experimental"
}
],
"fieldValue": null,
Expand Down
10 changes: 10 additions & 0 deletions cmd/mimir/help-all.txt.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -1139,6 +1139,12 @@ Usage of ./cmd/mimir/mimir:
Expands ${var} or $var in config according to the values of the environment variables.
-config.file value
Configuration file to load.
-cost-attribution-cool-down-duration duration
[experimental] Duration during which any cost attribution for a user will be marked as __overflow__ after exceeding the specified limit, prior to resetting the cache. (default 20m0s)
-cost-attribution-eviction-interval duration
[experimental] Time interval at which inactive cost attributions will be evicted from the cache. (default 30m0s)
colega marked this conversation as resolved.
Show resolved Hide resolved
-custom-registry-path string
Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed.
-debug.block-profile-rate int
Fraction of goroutine blocking events that are reported in the blocking profile. 1 to include every blocking event in the profile, 0 to disable.
-debug.mutex-profile-fraction int
Expand Down Expand Up @@ -3097,10 +3103,14 @@ Usage of ./cmd/mimir/mimir:
Enable anonymous usage reporting. (default true)
-usage-stats.installation-mode string
Installation mode. Supported values: custom, helm, jsonnet. (default "custom")
-validation.cost-attribution-labels comma-separated-list-of-strings
[experimental] List of labels used to define the cost attribution. This label will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. The label applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. Set to an empty string to disable cost attribution.
-validation.create-grace-period duration
Controls how far into the future incoming samples and exemplars are accepted compared to the wall clock. Any sample or exemplar will be rejected if its timestamp is greater than '(now + creation_grace_period)'. This configuration is enforced in the distributor and ingester. (default 10m)
-validation.enforce-metadata-metric-name
Enforce every metadata has a metric name. (default true)
-validation.max-cost-attribution-per-user int
[experimental] Maximum number of cost attribution labels allowed per user.
-validation.max-label-names-per-series int
Maximum number of label names per series. (default 30)
-validation.max-length-label-name int
Expand Down
159 changes: 159 additions & 0 deletions pkg/costattribution/manager.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
package costattribution

import (
"context"
"sort"
"sync"
"time"

"github.com/go-kit/log"
"github.com/grafana/dskit/services"
"github.com/prometheus/client_golang/prometheus"

"github.com/grafana/mimir/pkg/util/validation"
)

const (
missingValue = "__missing__"
overflowValue = "__overflow__"
)

type Manager struct {
services.Service
logger log.Logger
inactiveTimeout time.Duration
limits *validation.Overrides
cooldownTimeout time.Duration

// mu protects the trackersByUserID map
tlock sync.RWMutex
colega marked this conversation as resolved.
Show resolved Hide resolved
trackersByUserID map[string]*Tracker
}

// NewManager creates a new cost attribution manager. which is responsible for managing the cost attribution of series.
// It will clean up inactive series and update the cost attribution of series every 3 minutes.
func NewManager(cleanupInterval, inactiveTimeout time.Duration, cooldownTimeout time.Duration, logger log.Logger, limits *validation.Overrides) *Manager {
s := &Manager{
trackersByUserID: make(map[string]*Tracker),
limits: limits,
tlock: sync.RWMutex{},
cooldownTimeout: cooldownTimeout,
inactiveTimeout: inactiveTimeout,
logger: logger,
}

s.Service = services.NewTimerService(cleanupInterval, nil, s.iteration, nil).WithName("cost attribution manager")
return s
}

func (m *Manager) iteration(_ context.Context) error {
m.purgeInactiveAttributions(m.inactiveTimeout)
return nil
}

// EnabledForUser returns true if the cost attribution is enabled for the user
func (m *Manager) EnabledForUser(userID string) bool {
return len(m.limits.CostAttributionLabel(userID)) > 0
}

func (m *Manager) TrackerForUser(userID string) *Tracker {
// if cost attribution is not enabled, return nil
if !m.EnabledForUser(userID) {
return nil
}
m.tlock.Lock()
defer m.tlock.Unlock()

// if not exists, create a new tracker
if _, exists := m.trackersByUserID[userID]; !exists {
m.trackersByUserID[userID], _ = newTracker(m.limits.CostAttributionLabel(userID), m.limits.MaxCostAttributionPerUser(userID))
}
return m.trackersByUserID[userID]
}

func (m *Manager) Collect(out chan<- prometheus.Metric) {
m.tlock.RLock()
defer m.tlock.RUnlock()
for _, tracker := range m.trackersByUserID {
tracker.Collect(out)
}
}

// Describe implements prometheus.Collector.
func (m *Manager) Describe(chan<- *prometheus.Desc) {
// this is an unchecked collector
}

// deleteUserTracer is delete user tracker since the user is disabled for cost attribution
func (m *Manager) deleteUserTracer(userID string) {
m.tlock.Lock()
defer m.tlock.Unlock()
if _, exists := m.trackersByUserID[userID]; !exists {
return
}
// clean up tracker metrics and delete the tracker
m.trackersByUserID[userID].cleanupTracker(userID)
delete(m.trackersByUserID, userID)
}

func (m *Manager) purgeInactiveAttributions(inactiveTimeout time.Duration) {

// Get all userIDs from the map
m.tlock.RLock()
userIDs := make([]string, 0, len(m.trackersByUserID))
for userID := range m.trackersByUserID {
userIDs = append(userIDs, userID)
}
m.tlock.RUnlock()

// Iterate over all userIDs and purge inactive attributions of each user
currentTime := time.Now()
for _, userID := range userIDs {
// if cost attribution is not enabled for the user, delete the user tracker and continue
if len(m.limits.CostAttributionLabel(userID)) == 0 || m.limits.MaxCostAttributionPerUser(userID) <= 0 {
m.deleteUserTracer(userID)
continue
}
// get all inactive attributions for the user and clean up the tracker
inactiveObs := m.purgeInactiveObservationsForUser(userID, currentTime.Add(-inactiveTimeout).UnixNano())

for _, ob := range inactiveObs {
m.trackersByUserID[userID].cleanupTrackerAttribution(ob.lvalues)
}
}
}

// compare two sorted string slices
func compareStringSlice(a, b []string) bool {
if len(a) != len(b) {
return false
}
for i, v := range a {
if v != b[i] {
return false
}
}
return true
}

func (m *Manager) purgeInactiveObservationsForUser(userID string, deadline int64) []*observation {
cat := m.TrackerForUser(userID)
if cat == nil {
return nil
}

newTrackedLabels := sort.StringSlice(m.limits.CostAttributionLabel(userID))
// if they are different, we need to update the tracker, we don't mind, just reinitalized the tracker
if !compareStringSlice(cat.trackedLabels, newTrackedLabels) {
m.tlock.Lock()
m.trackersByUserID[userID], _ = newTracker(m.limits.CostAttributionLabel(userID), m.limits.MaxCostAttributionPerUser(userID))
// update the tracker with the new tracker
cat = m.trackersByUserID[userID]
m.tlock.Unlock()
} else if maxCardinality := m.limits.MaxCostAttributionPerUser(userID); cat.maxCardinality != maxCardinality {
// if the maxCardinality is different, update the tracker
cat.updateMaxCardinality(maxCardinality)
}

return cat.PurgeInactiveObservations(deadline)
}
Loading
Loading