grafana · ying-jeanne · Oct 24, 2024 · Oct 24, 2024 · Oct 25, 2024 · Oct 25, 2024
@@ -4347,6 +4347,28 @@
           "fieldType": "int",
           "fieldCategory": "experimental"
         },
+        {
+          "kind": "field",
+          "name": "cost_attribution_labels",
+          "required": false,
+          "desc": "List of labels used to define the cost attribution. This label will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. The label applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. Set to an empty string to disable cost attribution.",
+          "fieldValue": null,
+          "fieldDefaultValue": "",
+          "fieldFlag": "validation.cost-attribution-labels",
+          "fieldType": "string",
+          "fieldCategory": "experimental"
+        },
+        {
+          "kind": "field",
+          "name": "max_cost_attribution_cardinality_per_user",
+          "required": false,
+          "desc": "Maximum cardinality of cost attribution labels allowed per user.",
+          "fieldValue": null,
+          "fieldDefaultValue": 10000,
+          "fieldFlag": "validation.max-cost-attribution-cardinality-per-user",
+          "fieldType": "int",
+          "fieldCategory": "experimental"
+        },
         {
           "kind": "field",
           "name": "ruler_evaluation_delay_duration",
@@ -18346,6 +18368,17 @@
       "fieldValue": null,
       "fieldDefaultValue": null
     },
+    {
+      "kind": "field",
+      "name": "cost_attribution_registry_path",
+      "required": false,
+      "desc": "Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path. If not specified, cost attribution metrics won't be exposed.",
+      "fieldValue": null,
+      "fieldDefaultValue": "",
+      "fieldFlag": "cost-attribution.registry-path",
+      "fieldType": "string",
+      "fieldCategory": "advanced"
+    },
     {
       "kind": "field",
       "name": "timeseries_unmarshal_caching_optimization_enabled",
@@ -18356,6 +18389,17 @@
       "fieldFlag": "timeseries-unmarshal-caching-optimization-enabled",
       "fieldType": "boolean",
       "fieldCategory": "experimental"
+    },
+    {
+      "kind": "field",
+      "name": "cost_attribution_eviction_interval",
+      "required": false,
+      "desc": "Time interval at which inactive cost attributions will be evicted from the counter, so they won't be counted when checking max_cost_attribution_cardinality_per_user.",
+      "fieldValue": null,
+      "fieldDefaultValue": 1800000000000,
+      "fieldFlag": "cost-attribution.eviction-interval",
+      "fieldType": "duration",
+      "fieldCategory": "experimental"
     }
   ],
   "fieldValue": null,

@@ -1139,6 +1139,10 @@ Usage of ./cmd/mimir/mimir:
     	Expands ${var} or $var in config according to the values of the environment variables.
   -config.file value
     	Configuration file to load.
+  -cost-attribution.eviction-interval duration
+    	[experimental] Time interval at which inactive cost attributions will be evicted from the counter, so they won't be counted when checking max_cost_attribution_cardinality_per_user. (default 30m0s)
+  -cost-attribution.registry-path string
+    	Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path. If not specified, cost attribution metrics won't be exposed.
   -debug.block-profile-rate int
     	Fraction of goroutine blocking events that are reported in the blocking profile. 1 to include every blocking event in the profile, 0 to disable.
   -debug.mutex-profile-fraction int
@@ -3097,10 +3101,14 @@ Usage of ./cmd/mimir/mimir:
     	Enable anonymous usage reporting. (default true)
   -usage-stats.installation-mode string
     	Installation mode. Supported values: custom, helm, jsonnet. (default "custom")
+  -validation.cost-attribution-labels comma-separated-list-of-strings
+    	[experimental] List of labels used to define the cost attribution. This label will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. The label applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. Set to an empty string to disable cost attribution.
   -validation.create-grace-period duration
     	Controls how far into the future incoming samples and exemplars are accepted compared to the wall clock. Any sample or exemplar will be rejected if its timestamp is greater than '(now + creation_grace_period)'. This configuration is enforced in the distributor and ingester. (default 10m)
   -validation.enforce-metadata-metric-name
     	Enforce every metadata has a metric name. (default true)
+  -validation.max-cost-attribution-cardinality-per-user int
+    	[experimental] Maximum cardinality of cost attribution labels allowed per user. (default 10000)
   -validation.max-label-names-per-series int
     	Maximum number of label names per series. (default 30)
   -validation.max-length-label-name int

@@ -458,9 +458,21 @@ overrides_exporter:
 # time.
 [common: <common>]
 
+# (advanced) Defines a custom path for the registry. When specified, Mimir will
+# expose cost attribution metrics through this custom path. If not specified,
+# cost attribution metrics won't be exposed.
+# CLI flag: -cost-attribution.registry-path
+[cost_attribution_registry_path: <string> | default = ""]
+
 # (experimental) Enables optimized marshaling of timeseries.
 # CLI flag: -timeseries-unmarshal-caching-optimization-enabled
 [timeseries_unmarshal_caching_optimization_enabled: <boolean> | default = true]
+
+# (experimental) Time interval at which inactive cost attributions will be
+# evicted from the counter, so they won't be counted when checking
+# max_cost_attribution_cardinality_per_user.
+# CLI flag: -cost-attribution.eviction-interval
+[cost_attribution_eviction_interval: <duration> | default = 30m]
 ```
 
 ### common
@@ -3527,6 +3539,20 @@ The `limits` block configures default and per-tenant limits imposed by component
 # CLI flag: -querier.active-series-results-max-size-bytes
 [active_series_results_max_size_bytes: <int> | default = 419430400]
 
+# (experimental) List of labels used to define the cost attribution. This label
+# will be included in the specified distributor and ingester metrics for each
+# write request, allowing them to be distinguished by the label. The label
+# applies to the following metrics: cortex_distributor_received_samples_total,
+# cortex_ingester_active_series and cortex_discarded_samples_attribution_total.
+# Set to an empty string to disable cost attribution.
+# CLI flag: -validation.cost-attribution-labels
+[cost_attribution_labels: <string> | default = ""]
+
+# (experimental) Maximum cardinality of cost attribution labels allowed per
+# user.
+# CLI flag: -validation.max-cost-attribution-cardinality-per-user
+[max_cost_attribution_cardinality_per_user: <int> | default = 10000]
+
 # Duration to delay the evaluation of rules to ensure the underlying metrics
 # have been pushed.
 # CLI flag: -ruler.evaluation-delay-duration

@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: AGPL-3.0-only
+
+package costattribution
+
+import (
+	"context"
+	"sort"
+	"sync"
+	"time"
+
+	"github.com/go-kit/log"
+	"github.com/grafana/dskit/services"
+	"github.com/prometheus/client_golang/prometheus"
+
+	"github.com/grafana/mimir/pkg/util/validation"
+)
+
+const (
+	// missingValue is a sentinel string; presumably it stands in for an attribution label
+	// that is absent on a series — TODO confirm against the tracker implementation.
+	missingValue  = "__missing__"
+	// overflowValue is a sentinel string; presumably it is used once the per-user
+	// cardinality limit is exceeded — TODO confirm against the tracker implementation.
+	overflowValue = "__overflow__"
+)
+
+// Manager owns one cost attribution tracker per user and exposes them as a Prometheus collector.
+// The embedded services.Service is the timer service built in NewManager.
+type Manager struct {
+	services.Service
+	logger          log.Logger
+	// inactiveTimeout is subtracted from the current time to obtain the purge deadline on each iteration.
+	inactiveTimeout time.Duration
+	// limits supplies the per-user cost attribution labels and the per-user maximum cardinality.
+	limits          *validation.Overrides
+
+	// mtx protects the trackersByUserID map
+	mtx              sync.RWMutex
+	trackersByUserID map[string]*TrackerImp
+}
+
+// NewManager creates a new cost attribution manager, which is responsible for managing the cost attribution of series.
+//
+// The returned manager embeds a timer service named "cost attribution manager" that invokes the cleanup iteration
+// every cleanupInterval. Each iteration purges attributions using a deadline of inactiveTimeout before the current
+// time. logger is stored on the manager, and limits provides the per-user cost attribution labels and maximum
+// cardinality.
+func NewManager(cleanupInterval, inactiveTimeout time.Duration, logger log.Logger, limits *validation.Overrides) *Manager {
+	// mtx is intentionally left at its zero value: a zero sync.RWMutex is an unlocked, ready-to-use mutex.
+	m := &Manager{
+		logger:           logger,
+		inactiveTimeout:  inactiveTimeout,
+		limits:           limits,
+		trackersByUserID: make(map[string]*TrackerImp),
+	}
+
+	m.Service = services.NewTimerService(cleanupInterval, nil, m.iteration, nil).WithName("cost attribution manager")
+	return m
+}
+
+// iteration is the callback run by the timer service. It purges attributions using a deadline
+// (Unix seconds) that lies inactiveTimeout before the current time, and always returns nil.
+func (m *Manager) iteration(_ context.Context) error {
+	deadline := time.Now().Add(-m.inactiveTimeout).Unix()
+	m.purgeInactiveAttributionsUntil(deadline)
+	return nil
+}
+
+// EnabledForUser reports whether cost attribution is enabled for the user, i.e. whether at least one
+// cost attribution label is configured for that user in the limits.
+func (m *Manager) EnabledForUser(userID string) bool {
+	labels := m.limits.CostAttributionLabels(userID)
+	return len(labels) != 0
+}
+
+// TrackerForUser returns the cost attribution tracker for the given user, creating and caching it on first use.
+// A no-op tracker is returned when cost attribution is not enabled for the user or when a tracker cannot be created.
+func (m *Manager) TrackerForUser(userID string) Tracker {
+	// If cost attribution is not enabled, hand out a no-op tracker (not nil).
+	if !m.EnabledForUser(userID) {
+		return NewNoopTracker()
+	}
+	m.mtx.Lock()
+	defer m.mtx.Unlock()
+
+	// Reuse the cached tracker when there is one (single map lookup).
+	if tracker, exists := m.trackersByUserID[userID]; exists && tracker != nil {
+		return tracker
+	}
+
+	tracker, err := newTracker(userID, m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID))
+	if err != nil || tracker == nil {
+		// Never cache or return a nil *TrackerImp: wrapped in the Tracker interface it compares
+		// non-nil, so callers could not detect it before calling a method on it.
+		if m.logger != nil {
+			_ = m.logger.Log("msg", "failed to create cost attribution tracker", "user", userID, "err", err)
+		}
+		return NewNoopTracker()
+	}
+	m.trackersByUserID[userID] = tracker
+	return tracker
+}
+
+// Collect implements prometheus.Collector. It forwards the output channel to every per-user tracker
+// while holding the read lock on the tracker map.
+func (m *Manager) Collect(out chan<- prometheus.Metric) {
+	m.mtx.RLock()
+	defer m.mtx.RUnlock()
+
+	for _, userTracker := range m.trackersByUserID {
+		userTracker.Collect(out)
+	}
+}
+
+// Describe implements prometheus.Collector.
+// It deliberately sends no descriptors, which makes the Manager an unchecked collector.
+func (m *Manager) Describe(chan<- *prometheus.Desc) {
+	// this is an unchecked collector
+}
+
+// deleteUserTracer removes the tracker of a user for whom cost attribution is no longer enabled.
+// It is a no-op when the user has no tracker.
+func (m *Manager) deleteUserTracer(userID string) {
+	m.mtx.Lock()
+	defer m.mtx.Unlock()
+
+	tracker, found := m.trackersByUserID[userID]
+	if !found {
+		return
+	}
+
+	// Clean up the tracker's metrics first, then drop it from the map.
+	tracker.cleanupTracker(userID)
+	delete(m.trackersByUserID, userID)
+}
+
+// purgeInactiveAttributionsUntil walks every user that currently has a tracker. For each user it either
+// deletes the tracker (when no cost attribution labels are configured or the maximum cardinality is not
+// positive) or purges the observations selected by deadline and cleans up their attributions.
+// deadline is a Unix timestamp in seconds.
+func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) {
+	// Snapshot the user IDs under the read lock so the per-user work below runs without holding it.
+	m.mtx.RLock()
+	userIDs := make([]string, 0, len(m.trackersByUserID))
+	for userID := range m.trackersByUserID {
+		userIDs = append(userIDs, userID)
+	}
+	m.mtx.RUnlock()
+
+	// Iterate over all userIDs and purge inactive attributions of each user
+	for _, userID := range userIDs {
+		// if cost attribution is not enabled for the user, delete the user tracker and continue
+		if len(m.limits.CostAttributionLabels(userID)) == 0 || m.limits.MaxCostAttributionCardinalityPerUser(userID) <= 0 {
+			m.deleteUserTracer(userID)
+			continue
+		}
+
+		// get all inactive attributions for the user
+		inactiveObs := m.purgeInactiveObservationsForUser(userID, deadline)
+		if len(inactiveObs) == 0 {
+			continue
+		}
+
+		// The map may be written concurrently, so read the tracker once under the lock
+		// instead of indexing the map unguarded for every observation.
+		m.mtx.RLock()
+		tracker := m.trackersByUserID[userID]
+		m.mtx.RUnlock()
+		if tracker == nil {
+			// The tracker was removed in the meantime; there is nothing left to clean up.
+			continue
+		}
+
+		for _, ob := range inactiveObs {
+			tracker.cleanupTrackerAttribution(ob.lvalues)
+		}
+	}
+}
+
+// compareStringSlice reports whether a and b have the same length and hold equal strings at every index.
+// The comparison is positional, so callers wanting an order-insensitive result must pass sorted slices.
+func compareStringSlice(a, b []string) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for idx := 0; idx < len(a); idx++ {
+		if a[idx] != b[idx] {
+			return false
+		}
+	}
+	return true
+}
+
+// purgeInactiveObservationsForUser reconciles the user's tracker with the currently configured limits and then
+// purges its inactive observations, returning whatever the tracker's PurgeInactiveObservations returns for deadline.
+//
+// If the configured cost attribution labels differ from the tracker's labels, the tracker is replaced by a fresh
+// one. Otherwise, if only the maximum cardinality changed, the existing tracker is updated in place.
+// It returns nil when the user only has a no-op tracker.
+func (m *Manager) purgeInactiveObservationsForUser(userID string, deadline int64) []*Observation {
+	cat := m.TrackerForUser(userID)
+	if _, ok := cat.(*NoopTracker); ok {
+		// It's a noop implementation
+		return nil
+	}
+
+	// Sort a private copy: the slice returned by the limits is not owned by this function,
+	// so sorting it in place would silently reorder the configured labels.
+	newTrackedLabels := append([]string(nil), m.limits.CostAttributionLabels(userID)...)
+	sort.Strings(newTrackedLabels)
+	maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID)
+
+	switch {
+	case !compareStringSlice(cat.GetCALabels(), newTrackedLabels):
+		// The labels changed: previously tracked state is not carried over, the tracker is simply reinitialized.
+		m.mtx.Lock()
+		tracker, err := newTracker(userID, newTrackedLabels, maxCardinality)
+		if err == nil && tracker != nil {
+			m.trackersByUserID[userID] = tracker
+			cat = tracker
+		}
+		m.mtx.Unlock()
+		if (err != nil || tracker == nil) && m.logger != nil {
+			// Keep using the existing tracker rather than overwriting it with a nil one.
+			_ = m.logger.Log("msg", "failed to reinitialize cost attribution tracker", "user", userID, "err", err)
+		}
+	case cat.GetMaxCardinality() != maxCardinality:
+		// Only the cardinality limit changed: update the tracker in place.
+		cat.UpdateMaxCardinality(maxCardinality)
+	}
+
+	return cat.PurgeInactiveObservations(deadline)
+}