grafana · ying-jeanne · Oct 24, 2024 · Oct 24, 2024 · Oct 25, 2024 · Oct 25, 2024
@@ -4347,6 +4347,28 @@
           "fieldType": "int",
           "fieldCategory": "experimental"
         },
+        {
+          "kind": "field",
+          "name": "cost_attribution_labels",
+          "required": false,
+          "desc": "List of labels used to define the cost attribution. This label will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. The label applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. Set to an empty string to disable cost attribution.",
+          "fieldValue": null,
+          "fieldDefaultValue": "",
+          "fieldFlag": "validation.cost-attribution-labels",
+          "fieldType": "string",
+          "fieldCategory": "experimental"
+        },
+        {
+          "kind": "field",
+          "name": "max_cost_attribution_per_user",
+          "required": false,
+          "desc": "Maximum number of cost attribution labels allowed per user.",
+          "fieldValue": null,
+          "fieldDefaultValue": 0,
+          "fieldFlag": "validation.max-cost-attribution-per-user",
+          "fieldType": "int",
+          "fieldCategory": "experimental"
+        },
         {
           "kind": "field",
           "name": "ruler_evaluation_delay_duration",
@@ -18346,6 +18368,17 @@
       "fieldValue": null,
       "fieldDefaultValue": null
     },
+    {
+      "kind": "field",
+      "name": "custom_registry_path",
+      "required": false,
+      "desc": "Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed.",
+      "fieldValue": null,
+      "fieldDefaultValue": "",
+      "fieldFlag": "custom-registry-path",
+      "fieldType": "string",
+      "fieldCategory": "advanced"
+    },
     {
       "kind": "field",
       "name": "timeseries_unmarshal_caching_optimization_enabled",
@@ -18356,6 +18389,28 @@
       "fieldFlag": "timeseries-unmarshal-caching-optimization-enabled",
       "fieldType": "boolean",
       "fieldCategory": "experimental"
+    },
+    {
+      "kind": "field",
+      "name": "cost_attribution_eviction_interval",
+      "required": false,
+      "desc": "Time interval at which inactive cost attributions will be evicted from the cache.",
+      "fieldValue": null,
+      "fieldDefaultValue": 1800000000000,
+      "fieldFlag": "cost-attribution-eviction-interval",
+      "fieldType": "duration",
+      "fieldCategory": "experimental"
+    },
+    {
+      "kind": "field",
+      "name": "cost_attribution_cool_down_duration",
+      "required": false,
+      "desc": "Duration during which any cost attribution for a user will be marked as __overflow__ after exceeding the specified limit, prior to resetting the cache.",
+      "fieldValue": null,
+      "fieldDefaultValue": 1200000000000,
+      "fieldFlag": "cost-attribution-cool-down-duration",
+      "fieldType": "duration",
+      "fieldCategory": "experimental"
     }
   ],
   "fieldValue": null,

@@ -1139,6 +1139,12 @@ Usage of ./cmd/mimir/mimir:
     	Expands ${var} or $var in config according to the values of the environment variables.
   -config.file value
     	Configuration file to load.
+  -cost-attribution-cool-down-duration duration
+    	[experimental] Duration during which any cost attribution for a user will be marked as __overflow__ after exceeding the specified limit, prior to resetting the cache. (default 20m0s)
+  -cost-attribution-eviction-interval duration
+    	[experimental] Time interval at which inactive cost attributions will be evicted from the cache. (default 30m0s)
+  -custom-registry-path string
+    	Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed.
   -debug.block-profile-rate int
     	Fraction of goroutine blocking events that are reported in the blocking profile. 1 to include every blocking event in the profile, 0 to disable.
   -debug.mutex-profile-fraction int
@@ -3097,10 +3103,14 @@ Usage of ./cmd/mimir/mimir:
     	Enable anonymous usage reporting. (default true)
   -usage-stats.installation-mode string
     	Installation mode. Supported values: custom, helm, jsonnet. (default "custom")
+  -validation.cost-attribution-labels comma-separated-list-of-strings
+    	[experimental] List of labels used to define the cost attribution. This label will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. The label applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. Set to an empty string to disable cost attribution.
   -validation.create-grace-period duration
     	Controls how far into the future incoming samples and exemplars are accepted compared to the wall clock. Any sample or exemplar will be rejected if its timestamp is greater than '(now + creation_grace_period)'. This configuration is enforced in the distributor and ingester. (default 10m)
   -validation.enforce-metadata-metric-name
     	Enforce every metadata has a metric name. (default true)
+  -validation.max-cost-attribution-per-user int
+    	[experimental] Maximum number of cost attribution labels allowed per user.
   -validation.max-label-names-per-series int
     	Maximum number of label names per series. (default 30)
   -validation.max-length-label-name int

@@ -0,0 +1,159 @@
+package costattribution
+
+import (
+	"context"
+	"sort"
+	"sync"
+	"time"
+
+	"github.com/go-kit/log"
+	"github.com/grafana/dskit/services"
+	"github.com/prometheus/client_golang/prometheus"
+
+	"github.com/grafana/mimir/pkg/util/validation"
+)
+
+// missingValue is a reserved attribution value.
+// NOTE(review): presumably the placeholder used when a series lacks a tracked
+// label; its use is not visible in this file, so confirm against the tracker.
+const missingValue = "__missing__"
+
+// overflowValue is the reserved attribution value that attributions are marked
+// with once a user exceeds the configured cost attribution limit.
+const overflowValue = "__overflow__"
+
+// Manager owns the per-user cost attribution trackers. It embeds a service
+// (built in NewManager) whose periodic iteration purges inactive attributions.
+type Manager struct {
+	services.Service
+
+	logger log.Logger
+	limits *validation.Overrides
+
+	// inactiveTimeout is the age handed to purgeInactiveAttributions on every iteration.
+	inactiveTimeout time.Duration
+	// cooldownTimeout is stored by NewManager; nothing in this file reads it.
+	cooldownTimeout time.Duration
+
+	// tlock protects the trackersByUserID map.
+	tlock            sync.RWMutex
+	trackersByUserID map[string]*Tracker
+}
+
+// NewManager creates a new cost attribution manager, which is responsible for
+// managing the cost attribution trackers of all users.
+//
+// The returned Manager embeds a timer service that, once started, calls
+// purgeInactiveAttributions every cleanupInterval with inactiveTimeout as the
+// inactivity threshold. cooldownTimeout, logger and limits are stored on the
+// Manager as given.
+func NewManager(cleanupInterval, inactiveTimeout time.Duration, cooldownTimeout time.Duration, logger log.Logger, limits *validation.Overrides) *Manager {
+	// tlock is intentionally left at its zero value, which is a ready-to-use RWMutex.
+	m := &Manager{
+		trackersByUserID: make(map[string]*Tracker),
+		limits:           limits,
+		cooldownTimeout:  cooldownTimeout,
+		inactiveTimeout:  inactiveTimeout,
+		logger:           logger,
+	}
+
+	// No start or stop hooks are registered; only the periodic iteration does work.
+	m.Service = services.NewTimerService(cleanupInterval, nil, m.iteration, nil).WithName("cost attribution manager")
+	return m
+}
+
+// iteration is the periodic callback handed to the timer service in NewManager.
+// It purges attributions older than the manager's inactiveTimeout and always
+// returns nil.
+func (m *Manager) iteration(_ context.Context) error {
+	timeout := m.inactiveTimeout
+	m.purgeInactiveAttributions(timeout)
+	return nil
+}
+
+// EnabledForUser reports whether cost attribution is enabled for the user,
+// i.e. whether at least one cost attribution label is configured for them.
+func (m *Manager) EnabledForUser(userID string) bool {
+	labels := m.limits.CostAttributionLabel(userID)
+	return len(labels) != 0
+}
+
+// TrackerForUser returns the cost attribution tracker for userID, creating and
+// caching it on first use.
+//
+// It returns nil when cost attribution is not enabled for the user, or when a
+// new tracker cannot be constructed. In the latter case nothing is cached, so
+// the next call retries the construction.
+func (m *Manager) TrackerForUser(userID string) *Tracker {
+	// if cost attribution is not enabled, return nil
+	if !m.EnabledForUser(userID) {
+		return nil
+	}
+
+	m.tlock.Lock()
+	defer m.tlock.Unlock()
+
+	if tracker, exists := m.trackersByUserID[userID]; exists {
+		return tracker
+	}
+
+	// if not exists, create a new tracker
+	tracker, err := newTracker(m.limits.CostAttributionLabel(userID), m.limits.MaxCostAttributionPerUser(userID))
+	if err != nil {
+		// The value returned alongside an error is meaningless, so it must not
+		// be stored in the map and handed out to later callers.
+		if m.logger != nil {
+			_ = m.logger.Log("msg", "failed to create cost attribution tracker", "user", userID, "err", err)
+		}
+		return nil
+	}
+	m.trackersByUserID[userID] = tracker
+	return tracker
+}
+
+// Collect implements prometheus.Collector. It forwards the output channel to
+// the Collect method of every user tracker while holding the read lock.
+func (m *Manager) Collect(out chan<- prometheus.Metric) {
+	m.tlock.RLock()
+	defer m.tlock.RUnlock()
+
+	for userID := range m.trackersByUserID {
+		m.trackersByUserID[userID].Collect(out)
+	}
+}
+
+// Describe implements prometheus.Collector. It sends no descriptors on
+// purpose: the Manager is an unchecked collector.
+func (m *Manager) Describe(_ chan<- *prometheus.Desc) {
+	// intentionally empty, see the doc comment
+}
+
+// deleteUserTracer removes the tracker of a user for whom cost attribution is
+// no longer enabled. It is a no-op when the user has no tracker.
+func (m *Manager) deleteUserTracer(userID string) {
+	m.tlock.Lock()
+	defer m.tlock.Unlock()
+
+	tracker, found := m.trackersByUserID[userID]
+	if !found {
+		return
+	}
+
+	// Clean up the tracker's metrics first, then drop the tracker itself.
+	tracker.cleanupTracker(userID)
+	delete(m.trackersByUserID, userID)
+}
+
+// purgeInactiveAttributions walks every user that currently has a tracker and
+// either deletes the tracker (when cost attribution is no longer enabled for
+// the user) or purges the attributions that have been inactive for longer than
+// inactiveTimeout and cleans them up on the tracker.
+func (m *Manager) purgeInactiveAttributions(inactiveTimeout time.Duration) {
+	// Snapshot the user IDs so the lock is not held while each user is processed.
+	m.tlock.RLock()
+	userIDs := make([]string, 0, len(m.trackersByUserID))
+	for userID := range m.trackersByUserID {
+		userIDs = append(userIDs, userID)
+	}
+	m.tlock.RUnlock()
+
+	// The deadline is identical for every user, so compute it once.
+	deadline := time.Now().Add(-inactiveTimeout).UnixNano()
+	for _, userID := range userIDs {
+		// if cost attribution is not enabled for the user, delete the user tracker and continue
+		if len(m.limits.CostAttributionLabel(userID)) == 0 || m.limits.MaxCostAttributionPerUser(userID) <= 0 {
+			m.deleteUserTracer(userID)
+			continue
+		}
+
+		// get all inactive attributions for the user
+		inactiveObs := m.purgeInactiveObservationsForUser(userID, deadline)
+		if len(inactiveObs) == 0 {
+			continue
+		}
+
+		// The map may be written concurrently (TrackerForUser, deleteUserTracer),
+		// so it must only be read while holding the lock. The lookup happens
+		// after the purge because the purge may have replaced the tracker.
+		m.tlock.RLock()
+		tracker := m.trackersByUserID[userID]
+		m.tlock.RUnlock()
+		if tracker == nil {
+			// The tracker was removed in the meantime; nothing left to clean up.
+			continue
+		}
+
+		for _, ob := range inactiveObs {
+			tracker.cleanupTrackerAttribution(ob.lvalues)
+		}
+	}
+}
+
+// compareStringSlice reports whether a and b have the same length and hold
+// equal strings at every index. The comparison is positional, so callers that
+// want set-like equality must pass both slices in the same (sorted) order.
+func compareStringSlice(a, b []string) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for i := range a {
+		if a[i] != b[i] {
+			return false
+		}
+	}
+	return true
+}
+
+// purgeInactiveObservationsForUser brings the user's tracker in line with the
+// current limits and then purges its inactive observations.
+//
+// If the configured cost attribution labels differ from the ones the tracker
+// was built with, the tracker is replaced by a fresh one. Otherwise, if only
+// the maximum cardinality changed, the existing tracker is updated in place.
+// It returns the observations reported by PurgeInactiveObservations for the
+// given deadline, or nil when the user has no tracker.
+func (m *Manager) purgeInactiveObservationsForUser(userID string, deadline int64) []*observation {
+	cat := m.TrackerForUser(userID)
+	if cat == nil {
+		return nil
+	}
+
+	// Sort a copy of the configured labels so the comparison below does not
+	// depend on the order they were configured in. sort.StringSlice(x) is only
+	// a type conversion and does not sort anything.
+	// NOTE(review): this assumes cat.trackedLabels is kept sorted by newTracker — confirm.
+	labels := m.limits.CostAttributionLabel(userID)
+	newTrackedLabels := make([]string, len(labels))
+	copy(newTrackedLabels, labels)
+	sort.Strings(newTrackedLabels)
+
+	maxCardinality := m.limits.MaxCostAttributionPerUser(userID)
+	switch {
+	case !compareStringSlice(cat.trackedLabels, newTrackedLabels):
+		// The labels changed: the existing state is not worth migrating, so the
+		// tracker is simply reinitialized.
+		replacement, err := newTracker(labels, maxCardinality)
+		if err != nil {
+			// Keep the existing tracker rather than storing the meaningless
+			// result of a failed construction.
+			if m.logger != nil {
+				_ = m.logger.Log("msg", "failed to recreate cost attribution tracker", "user", userID, "err", err)
+			}
+			break
+		}
+		m.tlock.Lock()
+		m.trackersByUserID[userID] = replacement
+		m.tlock.Unlock()
+		cat = replacement
+	case cat.maxCardinality != maxCardinality:
+		// if the maxCardinality is different, update the tracker
+		cat.updateMaxCardinality(maxCardinality)
+	}
+
+	return cat.PurgeInactiveObservations(deadline)
+}