grafana · ying-jeanne · Oct 24, 2024 · Oct 24, 2024 · Oct 25, 2024 · Oct 25, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -19,7 +19,7 @@
 * [CHANGE] Ingester: remove experimental flags `-ingest-storage.kafka.ongoing-records-per-fetch` and `-ingest-storage.kafka.startup-records-per-fetch`. They are removed in favour of `-ingest-storage.kafka.max-buffered-bytes`. #9906
 * [CHANGE] Ingester: Replace `cortex_discarded_samples_total` label from `sample-out-of-bounds` to `sample-timestamp-too-old`. #9885
 * [CHANGE] Ruler: the `/prometheus/config/v1/rules` does not return an error anymore if a rule group is missing in the object storage after being successfully returned by listing the storage, because it could have been deleted in the meantime. #9936
-* [FEATURE] Querier: add experimental streaming PromQL engine, enabled with `-querier.query-engine=mimir`. #9367 #9368 #9398 #9399 #9403 #9417 #9418 #9419 #9420 #9482 #9504 #9505 #9507 #9518 #9531 #9532 #9533 #9553 #9558 #9588 #9589 #9639 #9641 #9642 #9651 #9664 #9681 #9717 #9719 #9724 #9874
+* [FEATURE] Querier: add experimental streaming PromQL engine, enabled with `-querier.query-engine=mimir`. #9367 #9368 #9398 #9399 #9403 #9417 #9418 #9419 #9420 #9482 #9504 #9505 #9507 #9518 #9531 #9532 #9533 #9553 #9558 #9588 #9589 #9639 #9641 #9642 #9651 #9664 #9681 #9717 #9719 #9724 #9874 #9998
 * [FEATURE] Distributor: Add support for `lz4` OTLP compression. #9763
 * [FEATURE] Query-frontend: added experimental configuration options `query-frontend.cache-errors` and `query-frontend.results-cache-ttl-for-errors` to allow non-transient responses to be cached. When set to `true` error responses from hitting limits or bad data are cached for a short TTL. #9028
 * [FEATURE] Query-frontend: add middleware to control access to specific PromQL experimental functions on a per-tenant basis. #9798

@@ -4358,6 +4358,50 @@
           "fieldType": "int",
           "fieldCategory": "experimental"
         },
+        {
+          "kind": "field",
+          "name": "cost_attribution_labels",
+          "required": false,
+          "desc": "List of labels used to define the cost attribution. These labels will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by label. They apply to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. Set to an empty string to disable cost attribution.",
+          "fieldValue": null,
+          "fieldDefaultValue": "",
+          "fieldFlag": "validation.cost-attribution-labels",
+          "fieldType": "string",
+          "fieldCategory": "experimental"
+        },
+        {
+          "kind": "field",
+          "name": "max_cost_attribution_labels_per_user",
+          "required": false,
+          "desc": "Maximum number of cost attribution labels allowed per user. 0 to disable.",
+          "fieldValue": null,
+          "fieldDefaultValue": 2,
+          "fieldFlag": "validation.max-cost-attribution-labels-per-user",
+          "fieldType": "int",
+          "fieldCategory": "experimental"
+        },
+        {
+          "kind": "field",
+          "name": "max_cost_attribution_cardinality_per_user",
+          "required": false,
+          "desc": "Maximum cardinality of cost attribution labels allowed per user.",
+          "fieldValue": null,
+          "fieldDefaultValue": 10000,
+          "fieldFlag": "validation.max-cost-attribution-cardinality-per-user",
+          "fieldType": "int",
+          "fieldCategory": "experimental"
+        },
+        {
+          "kind": "field",
+          "name": "cost_attribution_cooldown",
+          "required": false,
+          "desc": "Cooldown period for cost attribution labels. This specifies how long the cost attribution tracker remains in overflow before attempting a reset. If the tracker is still in overflow after this period, the cooldown will be extended. Set to 0 to disable the cooldown period.",
+          "fieldValue": null,
+          "fieldDefaultValue": 0,
+          "fieldFlag": "validation.cost-attribution-cooldown",
+          "fieldType": "duration",
+          "fieldCategory": "experimental"
+        },
         {
           "kind": "field",
           "name": "ruler_evaluation_delay_duration",
@@ -19524,6 +19568,17 @@
       "fieldValue": null,
       "fieldDefaultValue": null
     },
+    {
+      "kind": "field",
+      "name": "cost_attribution_registry_path",
+      "required": false,
+      "desc": "Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path. If not specified, cost attribution metrics won't be exposed.",
+      "fieldValue": null,
+      "fieldDefaultValue": "",
+      "fieldFlag": "cost-attribution.registry-path",
+      "fieldType": "string",
+      "fieldCategory": "advanced"
+    },
     {
       "kind": "field",
       "name": "timeseries_unmarshal_caching_optimization_enabled",
@@ -19534,6 +19589,17 @@
       "fieldFlag": "timeseries-unmarshal-caching-optimization-enabled",
       "fieldType": "boolean",
       "fieldCategory": "experimental"
+    },
+    {
+      "kind": "field",
+      "name": "cost_attribution_eviction_interval",
+      "required": false,
+      "desc": "Time interval at which inactive cost attributions will be evicted from the counter, so they won't be counted when checking max_cost_attribution_cardinality_per_user.",
+      "fieldValue": null,
+      "fieldDefaultValue": 1800000000000,
+      "fieldFlag": "cost-attribution.eviction-interval",
+      "fieldType": "duration",
+      "fieldCategory": "experimental"
     }
   ],
   "fieldValue": null,

@@ -1283,6 +1283,10 @@ Usage of ./cmd/mimir/mimir:
     	Expands ${var} or $var in config according to the values of the environment variables.
   -config.file value
     	Configuration file to load.
+  -cost-attribution.eviction-interval duration
+    	[experimental] Time interval at which inactive cost attributions will be evicted from the counter, so they won't be counted when checking max_cost_attribution_cardinality_per_user. (default 30m0s)
+  -cost-attribution.registry-path string
+    	Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path. If not specified, cost attribution metrics won't be exposed.
   -debug.block-profile-rate int
     	Fraction of goroutine blocking events that are reported in the blocking profile. 1 to include every blocking event in the profile, 0 to disable.
   -debug.mutex-profile-fraction int
@@ -3297,10 +3301,18 @@ Usage of ./cmd/mimir/mimir:
     	Enable anonymous usage reporting. (default true)
   -usage-stats.installation-mode string
     	Installation mode. Supported values: custom, helm, jsonnet. (default "custom")
+  -validation.cost-attribution-cooldown duration
+    	[experimental] Cooldown period for cost attribution labels. This specifies how long the cost attribution tracker remains in overflow before attempting a reset. If the tracker is still in overflow after this period, the cooldown will be extended. Set to 0 to disable the cooldown period.
+  -validation.cost-attribution-labels comma-separated-list-of-strings
+    	[experimental] List of labels used to define the cost attribution. These labels will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by label. They apply to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. Set to an empty string to disable cost attribution.
   -validation.create-grace-period duration
     	Controls how far into the future incoming samples and exemplars are accepted compared to the wall clock. Any sample or exemplar will be rejected if its timestamp is greater than '(now + creation_grace_period)'. This configuration is enforced in the distributor and ingester. (default 10m)
   -validation.enforce-metadata-metric-name
     	Enforce every metadata has a metric name. (default true)
+  -validation.max-cost-attribution-cardinality-per-user int
+    	[experimental] Maximum cardinality of cost attribution labels allowed per user. (default 10000)
+  -validation.max-cost-attribution-labels-per-user int
+    	[experimental] Maximum number of cost attribution labels allowed per user. 0 to disable. (default 2)
   -validation.max-label-names-per-series int
     	Maximum number of label names per series. (default 30)
   -validation.max-length-label-name int

@@ -1,4 +1,6 @@
 multitenancy_enabled: false
+cost_attribution_registry_path: "/usage-metrics"
+cost_attribution_eviction_interval: 10m
 
 distributor:
   ha_tracker:
@@ -184,5 +186,10 @@ limits:
   ha_replica_label: ha_replica
   ha_max_clusters: 10
 
+  cost_attribution_labels: "container"
+  max_cost_attribution_labels_per_user: 2
+  max_cost_attribution_cardinality_per_user: 100
+  cost_attribution_cooldown: 20m
+
 runtime_config:
-  file: ./config/runtime.yaml
+  file: ./config/runtime.yaml
@@ -1,4 +1,6 @@
 multitenancy_enabled: false
+cost_attribution_registry_path: "/usage-metrics"
+cost_attribution_eviction_interval: 10m
 
 distributor:
   pool:
@@ -180,5 +182,11 @@ limits:
   ha_replica_label: ha_replica
   ha_max_clusters: 10
 
+  cost_attribution_labels: "instance"
+  max_cost_attribution_labels_per_user: 2
+  max_cost_attribution_cardinality_per_user: 100
+  cost_attribution_cooldown: 20m
+
 runtime_config:
   file: ./config/runtime.yaml
+
@@ -458,9 +458,21 @@ overrides_exporter:
 # time.
 [common: <common>]
 
+# (advanced) Defines a custom path for the registry. When specified, Mimir will
+# expose cost attribution metrics through this custom path. If not specified,
+# cost attribution metrics won't be exposed.
+# CLI flag: -cost-attribution.registry-path
+[cost_attribution_registry_path: <string> | default = ""]
+
 # (experimental) Enables optimized marshaling of timeseries.
 # CLI flag: -timeseries-unmarshal-caching-optimization-enabled
 [timeseries_unmarshal_caching_optimization_enabled: <boolean> | default = true]
+
+# (experimental) Time interval at which inactive cost attributions will be
+# evicted from the counter, so they won't be counted when checking
+# max_cost_attribution_cardinality_per_user.
+# CLI flag: -cost-attribution.eviction-interval
+[cost_attribution_eviction_interval: <duration> | default = 30m]
 ```
 
 ### common
@@ -3539,6 +3551,32 @@ The `limits` block configures default and per-tenant limits imposed by component
 # CLI flag: -querier.active-series-results-max-size-bytes
 [active_series_results_max_size_bytes: <int> | default = 419430400]
 
+# (experimental) List of labels used to define the cost attribution. These
+# labels will be included in the specified distributor and ingester metrics for
+# each write request, allowing them to be distinguished by label. They apply to
+# the following metrics: cortex_distributor_received_samples_total,
+# cortex_ingester_active_series and cortex_discarded_samples_attribution_total.
+# Set to an empty string to disable cost attribution.
+# CLI flag: -validation.cost-attribution-labels
+[cost_attribution_labels: <string> | default = ""]
+
+# (experimental) Maximum number of cost attribution labels allowed per user. 0
+# to disable.
+# CLI flag: -validation.max-cost-attribution-labels-per-user
+[max_cost_attribution_labels_per_user: <int> | default = 2]
+
+# (experimental) Maximum cardinality of cost attribution labels allowed per
+# user.
+# CLI flag: -validation.max-cost-attribution-cardinality-per-user
+[max_cost_attribution_cardinality_per_user: <int> | default = 10000]
+
+# (experimental) Cooldown period for cost attribution labels. This specifies how
+# long the cost attribution tracker remains in overflow before attempting a
+# reset. If the tracker is still in overflow after this period, the cooldown
+# will be extended. Set to 0 to disable the cooldown period.
+# CLI flag: -validation.cost-attribution-cooldown
+[cost_attribution_cooldown: <duration> | default = 0s]
+
 # Duration to delay the evaluation of rules to ensure the underlying metrics
 # have been pushed.
 # CLI flag: -ruler.evaluation-delay-duration

@@ -20,6 +20,7 @@ import (
 	"github.com/grafana/dskit/middleware"
 	"github.com/grafana/dskit/server"
 	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
 
 	"github.com/grafana/mimir/pkg/alertmanager"
 	"github.com/grafana/mimir/pkg/alertmanager/alertmanagerpb"
@@ -278,6 +279,13 @@ func (a *API) RegisterDistributor(d *distributor.Distributor, pushConfig distrib
 	a.RegisterRoute("/distributor/ha_tracker", d.HATracker, false, true, "GET")
 }
 
+// RegisterUsageMetricsRoute registers a GET route at customRegistryPath whose handler serves the metrics held by reg.
+func (a *API) RegisterUsageMetricsRoute(customRegistryPath string, reg *prometheus.Registry) {
+	// Serve reg through promhttp with default HandlerOpts. NOTE(review): an empty customRegistryPath is not
+	// rejected here; presumably the caller only registers the route when the path is configured — confirm.
+	a.RegisterRoute(customRegistryPath, promhttp.HandlerFor(reg, promhttp.HandlerOpts{}), true, false, "GET")
+}
+
 // Ingester is defined as an interface to allow for alternative implementations
 // of ingesters to be passed into the API.RegisterIngester() method.
 type Ingester interface {

@@ -48,7 +48,7 @@ type TSDBBuilder struct {
 var softErrProcessor = mimir_storage.NewSoftAppendErrorProcessor(
 	func() {}, func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {},
 	func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {},
-	func() {}, func([]mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {},
+	func([]mimirpb.LabelAdapter) {}, func([]mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, // every callback passed in this call has an empty body (no-op)
 	func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {},
 	func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {},
 )